From 99ee4ca746dda71326db7645463b4075ac1d665c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 17:50:17 -0800 Subject: rcu: Suppress __mpol_dup() false positive from RCU lockdep Common code is used during task creation and after the task has started running. RCU protection is not needed during task creation because no other CPU has access to the under-construction task. Provide the RCU protection anyway to suppress the false positive, as there does not appear to be a good way for the common code to recognize that the task is only accessible to the CPU creating it. Signed-off-by: Paul E. McKenney Cc: Paul Menage Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267667418-32233-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- mm/mempolicy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5bf0440..3cec080faa23 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1730,10 +1730,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(old, &mems); } + rcu_read_unlock(); *new = *old; atomic_set(&new->refcnt, 1); return new; -- cgit v1.2.3 From 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 5 Mar 2010 13:41:57 -0800 Subject: mm: fix mbind vma merge problem Strangely, current mbind() doesn't merge vma with neighbor vma although it's possible. Unfortunately, many vma can reduce performance... This patch fixes it. reproduced program ---------------------------------------------------------------- #include #include #include #include #include #include #include static unsigned long pagesize; int main(int argc, char** argv) { void* addr; int ch; int node; struct bitmask *nmask = numa_allocate_nodemask(); int err; int node_set = 0; char buf[128]; while ((ch = getopt(argc, argv, "n:")) != -1){ switch (ch){ case 'n': node = strtol(optarg, NULL, 0); numa_bitmask_setbit(nmask, node); node_set = 1; break; default: ; } } argc -= optind; argv += optind; if (!node_set) numa_bitmask_setbit(nmask, 0); pagesize = getpagesize(); addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0); if (addr == MAP_FAILED) perror("mmap "), exit(1); fprintf(stderr, "pid = %d \n" "addr = %p\n", getpid(), addr); /* make page populate */ memset(addr, 0, pagesize*3); /* first mbind */ err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp, nmask->size, MPOL_MF_MOVE_ALL); if (err) error("mbind1 "); /* second mbind */ err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0); if (err) error("mbind2 "); sprintf(buf, "cat /proc/%d/maps", getpid()); system(buf); return 0; } ---------------------------------------------------------------- result without this patch addr = 0x7fe26ef09000 [snip] 7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0 7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0 7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0 7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0 => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas. result with this patch addr = 0x7fc9ebc76000 [snip] 7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0 7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack] => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma. [minchan.kim@gmail.com: fix file offset passed to vma_merge()] Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Nick Piggin Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Lee Schermerhorn Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5bf0440..44dd9d1521ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) } /* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct mempolicy *new) +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; - int err; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; - err = 0; - for (; vma && vma->vm_start < end; vma = next) { + vma = find_vma_prev(mm, start, &prev); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; - if (vma->vm_start < start) - err = split_vma(vma->vm_mm, vma, start, 1); - if (!err && vma->vm_end > end) - err = split_vma(vma->vm_mm, vma, end, 0); - if (!err) - err = policy_vma(vma, new); + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + continue; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + err = policy_vma(vma, new_pol); if (err) - break; + goto out; } + + out: return err; } @@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(vma, start, end, new); + err = mbind_range(mm, start, end, new); if (!list_empty(&pagelist)) nr_failed = migrate_pages(&pagelist, new_vma_page, -- cgit v1.2.3 From da0aa138944311e6745a00ac3d88f03e8d9a46c4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 5 Mar 2010 13:41:59 -0800 Subject: mm/mempolicy.c: fix indentation of the comments of do_migrate_pages Currently, do_migrate_pages() have very long comment and this is not indent properly. I often misunderstand it is function starting commnents and confused it. this patch fixes it. note: this patch doesn't break 80 column rule. I guess original author intended this indentaion, but an accident corrupted it. Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 60 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44dd9d1521ec..bda230e52acd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -888,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm, if (err) goto out; -/* - * Find a 'source' bit set in 'tmp' whose corresponding 'dest' - * bit in 'to' is not also set in 'tmp'. Clear the found 'source' - * bit in 'tmp', and return that pair for migration. - * The pair of nodemasks 'to' and 'from' define the map. - * - * If no pair of bits is found that way, fallback to picking some - * pair of 'source' and 'dest' bits that are not the same. If the - * 'source' and 'dest' bits are the same, this represents a node - * that will be migrating to itself, so no pages need move. - * - * If no bits are left in 'tmp', or if all remaining bits left - * in 'tmp' correspond to the same bit in 'to', return false - * (nothing left to migrate). - * - * This lets us pick a pair of nodes to migrate between, such that - * if possible the dest node is not already occupied by some other - * source node, minimizing the risk of overloading the memory on a - * node that would happen if we migrated incoming memory to a node - * before migrating outgoing memory source that same node. - * - * A single scan of tmp is sufficient. As we go, we remember the - * most recent pair that moved (s != d). If we find a pair - * that not only moved, but what's better, moved to an empty slot - * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the - * most recent pair that moved. If we get all the way through - * the scan of tmp without finding any node that moved, much less - * moved to an empty node, then there is nothing left worth migrating. - */ + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ tmp = *from_nodes; while (!nodes_empty(tmp)) { -- cgit v1.2.3