From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:36:59 -0700 Subject: mm: thp: tail page refcounting fix Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 5 +-- arch/x86/mm/gup.c | 5 +-- include/linux/mm.h | 56 +++++++++++++------------------- include/linux/mm_types.h | 21 +++++++++--- mm/huge_memory.c | 37 ++++++++++++++------- mm/internal.h | 46 +++++++++++++++++++++++++++ mm/memory.c | 2 +- mm/swap.c | 83 +++++++++++++++++++++++++++++++----------------- 8 files changed, 171 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index fec13200868f..b9e1c7ff5f6d 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } /* diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dbe34b931374..3b5032a62b0f 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b3e3b8bb706..f81b7b41930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * The atomic page->_mapcount, starts from -1: so that transitions + * both from it and to it can be tracked, using atomic_inc_and_test + * and atomic_add_negative(-1). + */ +static inline void reset_page_mapcount(struct page *page) +{ + atomic_set(&(page)->_mapcount, -1); +} + +static inline int page_mapcount(struct page *page) +{ + return atomic_read(&(page)->_mapcount) + 1; +} + static inline int page_count(struct page *page) { return atomic_read(&compound_head(page)->_count); } +extern bool __get_page_tail(struct page *page); + static inline void get_page(struct page *page) { + if (unlikely(PageTail(page))) + if (likely(__get_page_tail(page))) + return; /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. Only if - * we're getting a tail page, the elevated page->_count is - * required only in the head page, so for tail pages the - * bugcheck only verifies that the page->_count isn't - * negative. + * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); + VM_BUG_ON(atomic_read(&page->_count) <= 0); atomic_inc(&page->_count); - /* - * Getting a tail page will elevate both the head and tail - * page->_count(s). - */ - if (unlikely(PageTail(page))) { - /* - * This is safe only because - * __split_huge_page_refcount can't run under - * get_page(). - */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - atomic_inc(&page->first_page->_count); - } } static inline struct page *virt_to_head_page(const void *x) @@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -/* - * The atomic page->_mapcount, like _count, starts from -1: - * so that transitions both from it and to it can be tracked, - * using atomic_inc_and_test and atomic_add_negative(-1). - */ -static inline void reset_page_mapcount(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -static inline int page_mapcount(struct page *page) -{ - return atomic_read(&(page)->_mapcount) + 1; -} - /* * Return true if this page is mapped into pagetables. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e01a19a91e8..5b42f1b34eb7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -62,10 +62,23 @@ struct page { struct { union { - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ + /* + * Count of ptes mapped in + * mms, to show when page is + * mapped & limit reverse map + * searches. + * + * Used also for tail pages + * refcounting instead of + * _count. Tail pages cannot + * be mapped and keeping the + * tail page _count zero at + * all times guarantees + * get_page_unless_zero() will + * never succeed on tail + * pages. + */ + atomic_t _mapcount; struct { unsigned inuse:16; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ec211ddd6..4298abaae153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); diff --git a/mm/internal.h b/mm/internal.h index d071d380fb49..2189af491783 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b2..b2b87315cdc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,7 +1503,7 @@ split_fallthrough: } if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/swap.c b/mm/swap.c index 3a442f18b0b3..87627f181c3f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) { if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ - struct page *page_head = page->first_page; - smp_rmb(); - /* - * If PageTail is still set after smp_rmb() we can be sure - * that the page->first_page we read wasn't a dangling pointer. - * See __split_huge_page_refcount() smp_wmb(). - */ - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { unsigned long flags; /* - * Verify that our page_head wasn't converted - * to a a regular page before we got a - * reference on it. + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. */ - if (unlikely(!PageHead(page_head))) { - /* PageHead is cleared after PageTail */ - smp_rmb(); - VM_BUG_ON(PageTail(page)); - goto out_put_head; - } - /* - * Only run compound_lock on a valid PageHead, - * after having it pinned with - * get_page_unless_zero() above. - */ - smp_mb(); - /* page_head wasn't a dangling pointer */ flags = compound_lock_irqsave(page_head); if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); VM_BUG_ON(PageHead(page_head)); - out_put_head: if (put_page_testzero(page_head)) __put_single_page(page_head); out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) VM_BUG_ON(page_head != page->first_page); /* * We can release the refcount taken by - * get_page_unless_zero now that - * split_huge_page_refcount is blocked on the - * compound_lock. + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on + * the compound_lock. */ if (put_page_testzero(page_head)) VM_BUG_ON(1); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_dec(&page->_count); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). + */ + unsigned long flags; + bool got = false; + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru -- cgit v1.2.3 From 2839bdc1bfc0af76a2f0f11eca011590520a04fa Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:03 -0700 Subject: powerpc: remove superfluous PageTail checks on the pte gup_fast This part of gup_fast doesn't seem capable of handling hugetlbfs ptes, those should be handled by gup_hugepd only, so these checks are superfluous. Plus if this wasn't a noop, it would have oopsed because, the insistence of using the speculative refcounting would trigger a VM_BUG_ON if a tail page was encountered in the page_cache_get_speculative(). Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index b9e1c7ff5f6d..d7efdbf640c7 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,17 +16,6 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -58,8 +47,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } - if (PageTail(page)) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From 405e44f2e312dd5dd63e5a9f459bffcbcd4368ef Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:08 -0700 Subject: powerpc: get_hugepte() don't put_page() the wrong page "page" may have changed to point to the next hugepage after the loop completed, The references have been taken on the head page, so the put_page must happen there too. This is a longstanding issue pre-thp inclusion. It's totally unclear how these page_cache_add_speculative and pte_val(pte) != pte_val(*ptep) checks are necessary across all the powerpc gup_fast code, when x86 doesn't need any of that: there's no way the page can be freed with irq disabled so we're guaranteed the atomic_inc will happen on a page with page_count > 0 (so not needing the speculative check). The pte check is also meaningless on x86: no need to rollback on x86 if the pte changed, because the pte can still change a CPU tick after the check succeeded and it won't be rolled back in that case. The important thing is we got a reference on a valid page that was mapped there a CPU tick ago. So not knowing the soft tlb refill code of ppc64 in great detail I'm not removing the "speculative" page_count increase and the pte checks across all the code, but unless there's a strong reason for it they should be later cleaned up too. If a pte can change from huge to non-huge (like it could happen with THP) passing a pte_t *ptep to gup_hugepte() would also require to repeat the is_hugepd in gup_hugepte(), but that shouldn't happen with hugetlbfs only so I'm not altering that. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b9..b649c288af90 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -429,7 +429,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ while (*nr) { - put_page(page); + put_page(head); (*nr)--; } } -- cgit v1.2.3 From 8596468487e2062cae2aad56e973784e03959245 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:11 -0700 Subject: powerpc: gup_hugepte() avoid freeing the head page too many times We only taken "refs" pins on the head page not "*nr" pins. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b649c288af90..78b14abded65 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -428,10 +428,9 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { + *nr -= refs; + while (refs--) put_page(head); - (*nr)--; - } } return 1; -- cgit v1.2.3 From 3526741f0964c88bc2ce511e1078359052bf225b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:15 -0700 Subject: powerpc: gup_hugepte() support THP based tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 78b14abded65..a618ef01bfad 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,12 +385,23 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -413,6 +424,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -431,6 +443,16 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From cf592bf768c4fa40282b8fce58a80820065de2cb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:19 -0700 Subject: powerpc: gup_huge_pmd() return 0 if pte changes powerpc didn't return 0 in that case, if it's rolling back the *nr pointer it should also return zero to avoid adding pages to the array at the wrong offset. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a618ef01bfad..1c59d94f5942 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -443,16 +443,17 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From 220a2eb228d032acde60e9fd044ca802706ff583 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:25 -0700 Subject: s390: gup_huge_pmd() support THP tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gup.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 45b405ca2567..668dda964f20 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,11 +48,22 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask, result; - struct page *head, *page; + struct page *head, *page, *tail; int refs; result = write ? 0 : _SEGMENT_ENTRY_RO; @@ -64,6 +75,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -81,6 +93,16 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From 0693bc9ce2cc4f6a1b9c3c05790fc149a74c0b87 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:28 -0700 Subject: s390: gup_huge_pmd() return 0 if pte changes s390 didn't return 0 in that case, if it's rolling back the *nr pointer it should also return zero to avoid adding pages to the array at the wrong offset. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 668dda964f20..da33a0281d9d 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -93,16 +93,17 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From e0d85a366c2300efd230ef82a9b22110b0658331 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:31 -0700 Subject: sparc: gup_pte_range() support THP based tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Acked-by: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/gup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index a986b5d05712..afcebac144fb 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,6 +12,17 @@ #include #include +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -56,6 +67,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(head); return 0; } + if (head != page) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:36 -0700 Subject: thp: share get_huge_page_tail() This avoids duplicating the function in every arch gup_fast. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 11 ----------- arch/s390/mm/gup.c | 11 ----------- arch/sparc/mm/gup.c | 11 ----------- arch/x86/mm/gup.c | 11 ----------- include/linux/mm.h | 11 +++++++++++ 5 files changed, 11 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1c59d94f5942..da5eb3885702 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,17 +385,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index da33a0281d9d..65cb06e2af4e 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,17 +48,6 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index afcebac144fb..42c55df3aec3 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,17 +12,6 @@ #include #include -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 3b5032a62b0f..ea305856151c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f81b7b41930c..3dc3a8c2c485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -376,6 +376,17 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + extern bool __get_page_tail(struct page *page); static inline void get_page(struct page *page) -- cgit v1.2.3 From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 2 Nov 2011 13:37:41 -0700 Subject: binfmt_elf: fix PIE execution with randomization disabled The case of address space randomization being disabled in runtime through randomize_va_space sysctl is not treated properly in load_elf_binary(), resulting in SIGKILL coming at exec() time for certain PIE-linked binaries in case the randomization has been disabled at runtime prior to calling exec(). Handle the randomize_va_space == 0 case the same way as if we were not supporting .text randomization at all. Based on original patch by H.J. Lu and Josh Boyer. Signed-off-by: Jiri Kosina Cc: Ingo Molnar Cc: Russell King Cc: H.J. Lu Cc: Tested-by: Josh Boyer Acked-by: Nicolas Pitre Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d38..21ac5ee4b43f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif -- cgit v1.2.3 From 0620d9193cb976ba635d56a6cfd11cb81616d02b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 2 Nov 2011 13:37:45 -0700 Subject: ramfs: remove module leftovers Since ramfs is hard-selected to "y", the module leftovers make no sense. Signed-off-by: Richard Weinberger Reviewed-by: WANG Cong Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166fb259..462ceb38fec6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient. */ -#include #include #include #include @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL"); -- cgit v1.2.3 From f919b9235f930e649b374a50009c6c268bd9a073 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 2 Nov 2011 13:37:47 -0700 Subject: init/do_mounts_rd.c: fix ramdisk identification for padded cramfs When a cramfs ramdisk padded with 512 bytes is given to the kernel, the current identify_ramdisk_image function fails to identify it. Tested with a padded cramfs image on an ARM based board. Signed-off-by: Neil Armstrong Cc: Namhyung Kim Cc: Davidlohr Bueso Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts_rd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index fe9acb0ae480..887629e24c54 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -119,6 +119,20 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) goto done; } + /* + * Read 512 bytes further to check if cramfs is padded + */ + sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); + sys_read(fd, buf, size); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + /* * Read block 1 to test for minix and ext2 superblock */ -- cgit v1.2.3 From 6d03d06db8881f4f9da87d5c77234b98c40a30e9 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 2 Nov 2011 13:37:49 -0700 Subject: drivers/rtc/class.c: convert idr to ida and use ida_simple_get() This is the one use of an ida that doesn't retry on receiving -EAGAIN. I'm assuming do so will cause no harm and may help on a rare occasion. Signed-off-by: Jonathan Cameron Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/class.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 01a7df5317c1..e8326f26fa2f 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -21,16 +21,13 @@ #include "rtc-core.h" -static DEFINE_IDR(rtc_idr); -static DEFINE_MUTEX(idr_lock); +static DEFINE_IDA(rtc_ida); struct class *rtc_class; static void rtc_device_release(struct device *dev) { struct rtc_device *rtc = to_rtc_device(dev); - mutex_lock(&idr_lock); - idr_remove(&rtc_idr, rtc->id); - mutex_unlock(&idr_lock); + ida_simple_remove(&rtc_ida, rtc->id); kfree(rtc); } @@ -146,25 +143,16 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, struct rtc_wkalrm alrm; int id, err; - if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) { - err = -ENOMEM; + id = ida_simple_get(&rtc_ida, 0, 0, GFP_KERNEL); + if (id < 0) { + err = id; goto exit; } - - mutex_lock(&idr_lock); - err = idr_get_new(&rtc_idr, NULL, &id); - mutex_unlock(&idr_lock); - - if (err < 0) - goto exit; - - id = id & MAX_ID_MASK; - rtc = kzalloc(sizeof(struct rtc_device), GFP_KERNEL); if (rtc == NULL) { err = -ENOMEM; - goto exit_idr; + goto exit_ida; } rtc->id = id; @@ -222,10 +210,8 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, exit_kfree: kfree(rtc); -exit_idr: - mutex_lock(&idr_lock); - idr_remove(&rtc_idr, id); - mutex_unlock(&idr_lock); +exit_ida: + ida_simple_remove(&rtc_ida, id); exit: dev_err(dev, "rtc core: unable to register %s, err = %d\n", @@ -276,7 +262,7 @@ static void __exit rtc_exit(void) { rtc_dev_exit(); class_destroy(rtc_class); - idr_destroy(&rtc_idr); + ida_destroy(&rtc_ida); } subsys_initcall(rtc_init); -- cgit v1.2.3 From 43fcb81550f7a16be192b19c77a379c9b27b1585 Mon Sep 17 00:00:00 2001 From: David Anders Date: Wed, 2 Nov 2011 13:37:53 -0700 Subject: rtc: add initial support for mcp7941x parts Add initial support for the microchip mcp7941x series of real time clocks. The mcp7941x series is generally compatible with the ds1307 and ds1337 rtc devices from dallas semiconductor. minor differences include a backup battery enable bit, and the polarity of the oscillator enable bit. Signed-off-by: David Anders Cc: Alessandro Zummo Reviewed-by: Wolfram Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1307.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index b2005b44e4f7..62b0763b7b9a 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -34,6 +34,7 @@ enum ds_type { ds_1388, ds_3231, m41t00, + mcp7941x, rx_8025, // rs5c372 too? different address... }; @@ -43,6 +44,7 @@ enum ds_type { #define DS1307_REG_SECS 0x00 /* 00-59 */ # define DS1307_BIT_CH 0x80 # define DS1340_BIT_nEOSC 0x80 +# define MCP7941X_BIT_ST 0x80 #define DS1307_REG_MIN 0x01 /* 00-59 */ #define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */ # define DS1307_BIT_12HR 0x40 /* in REG_HOUR */ @@ -50,6 +52,7 @@ enum ds_type { # define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */ # define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */ #define DS1307_REG_WDAY 0x03 /* 01-07 */ +# define MCP7941X_BIT_VBATEN 0x08 #define DS1307_REG_MDAY 0x04 /* 01-31 */ #define DS1307_REG_MONTH 0x05 /* 01-12 */ # define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */ @@ -137,6 +140,8 @@ static const struct chip_desc chips[] = { }, [m41t00] = { }, +[mcp7941x] = { +}, [rx_8025] = { }, }; @@ -149,6 +154,7 @@ static const struct i2c_device_id ds1307_id[] = { { "ds1340", ds_1340 }, { "ds3231", ds_3231 }, { "m41t00", m41t00 }, + { "mcp7941x", mcp7941x }, { "pt7c4338", ds_1307 }, { "rx8025", rx_8025 }, { } @@ -365,6 +371,10 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN | DS1340_BIT_CENTURY; break; + case mcp7941x: + buf[DS1307_REG_SECS] |= MCP7941X_BIT_ST; + buf[DS1307_REG_WDAY] |= MCP7941X_BIT_VBATEN; + break; default: break; } @@ -808,6 +818,23 @@ read_rtc: i2c_smbus_write_byte_data(client, DS1340_REG_FLAG, 0); dev_warn(&client->dev, "SET TIME!\n"); } + break; + case mcp7941x: + /* make sure that the backup battery is enabled */ + if (!(ds1307->regs[DS1307_REG_WDAY] & MCP7941X_BIT_VBATEN)) { + i2c_smbus_write_byte_data(client, DS1307_REG_WDAY, + ds1307->regs[DS1307_REG_WDAY] + | MCP7941X_BIT_VBATEN); + } + + /* clock halted? turn it on, so clock can tick. */ + if (!(tmp & MCP7941X_BIT_ST)) { + i2c_smbus_write_byte_data(client, DS1307_REG_SECS, + MCP7941X_BIT_ST); + dev_warn(&client->dev, "SET TIME!\n"); + goto read_rtc; + } + break; case rx_8025: case ds_1337: -- cgit v1.2.3 From db5cf8d1ac4ac3fa06d89345154ce20068aeb097 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 2 Nov 2011 13:37:56 -0700 Subject: drivers/rtc/rtc-mc13xxx.c: move probe and remove callbacks to .init.text and .exit.text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver is added using platform_driver_probe(), so the callbacks can be discarded more aggessively. Signed-off-by: Uwe Kleine-König Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index a1a278bc340d..9d0c3b478d55 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -309,7 +309,7 @@ static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) return IRQ_HANDLED; } -static int __devinit mc13xxx_rtc_probe(struct platform_device *pdev) +static int __init mc13xxx_rtc_probe(struct platform_device *pdev) { int ret; struct mc13xxx_rtc *priv; @@ -378,7 +378,7 @@ err_reset_irq_request: return ret; } -static int __devexit mc13xxx_rtc_remove(struct platform_device *pdev) +static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) { struct mc13xxx_rtc *priv = platform_get_drvdata(pdev); @@ -410,7 +410,7 @@ const struct platform_device_id mc13xxx_rtc_idtable[] = { static struct platform_driver mc13xxx_rtc_driver = { .id_table = mc13xxx_rtc_idtable, - .remove = __devexit_p(mc13xxx_rtc_remove), + .remove = __exit_p(mc13xxx_rtc_remove), .driver = { .name = DRIVER_NAME, .owner = THIS_MODULE, -- cgit v1.2.3 From b6eb48d02dc73d19bebc396a9e92dd64a65d3199 Mon Sep 17 00:00:00 2001 From: Sami Kerola Date: Wed, 2 Nov 2011 13:37:58 -0700 Subject: minix: describe usage of different magic numbers One can get this information from minix/inode.c, but adding the explanations at the definition sites is more appropriate. Signed-off-by: Sami Kerola Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/magic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/magic.h b/include/linux/magic.h index 1e5df2af8d84..2d4beab0d5b7 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -30,11 +30,11 @@ #define ANON_INODE_FS_MAGIC 0x09041934 #define PSTOREFS_MAGIC 0x6165676C -#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ -#define MINIX_SUPER_MAGIC2 0x138F /* minix fs, 30 char names */ -#define MINIX2_SUPER_MAGIC 0x2468 /* minix V2 fs */ -#define MINIX2_SUPER_MAGIC2 0x2478 /* minix V2 fs, 30 char names */ -#define MINIX3_SUPER_MAGIC 0x4d5a /* minix V3 fs */ +#define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ +#define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ +#define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ +#define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ +#define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ -- cgit v1.2.3 From 3069083cc8def2ffad8520f0f24c6f95f140aac5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 2 Nov 2011 13:38:00 -0700 Subject: isofs: add readpages support Use mpage_readpages() instead of multiple calls to isofs_readpage() to reduce the CPU utilization and make performance higher. Signed-off-by: Namjae Jeon Cc: Al Viro Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..46844ff39d61 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "isofs.h" #include "zisofs.h" @@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); +} + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; -- cgit v1.2.3 From 434a964daa14b9db083ce20404a4a2add54d037a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 2 Nov 2011 13:38:01 -0700 Subject: hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops Clement Lecigne reports a filesystem which causes a kernel oops in hfs_find_init() trying to dereference sb->ext_tree which is NULL. This proves to be because the filesystem has a corrupted MDB extent record, where the extents file does not fit into the first three extents in the file record (the first blocks). In hfs_get_block() when looking up the blocks for the extent file (HFS_EXT_CNID), it fails the first blocks special case, and falls through to the extent code (which ultimately calls hfs_find_init()) which is in the process of being initialised. Hfs avoids this scenario by always having the extents b-tree fitting into the first blocks (the extents B-tree can't have overflow extents). The fix is to check at mount time that the B-tree fits into first blocks, i.e. fail if HFS_I(inode)->alloc_blocks >= HFS_I(inode)->first_blocks Note, the existing commit 47f365eb57573 ("hfs: fix oops on mount with corrupted btree extent records") becomes subsumed into this as a special case, but only for the extents B-tree (HFS_EXT_CNID), it is perfectly acceptable for the catalog B-Tree file to grow beyond three extents, with the remaining extent descriptors in the extents overfow. This fixes CVE-2011-2203 Reported-by: Clement LECIGNE Signed-off-by: Phillip Lougher Cc: Jeff Mahoney Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437736fe..1cbdeea1db44 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.2.3 From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:05 -0700 Subject: cgroups: more safe tasklist locking in cgroup_attach_proc Fix unstable tasklist locking in cgroup_attach_proc. According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is not sufficient to guarantee the tasklist is stable w.r.t. de_thread and exit. Taking tasklist_lock for reading, instead of rcu_read_lock, ensures proper exclusion. Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 453100a4159d..64b0e73402df 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ - rcu_read_lock(); + read_lock(&tasklist_lock); if (!thread_group_leader(leader)) { /* * a race with de_thread from another thread's exec() may strip @@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * throw this task away and try again (from cgroup_procs_write); * this is "double-double-toil-and-trouble-check locking". */ - rcu_read_unlock(); + read_unlock(&tasklist_lock); retval = -EAGAIN; goto out_free_group_list; } @@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; - rcu_read_unlock(); + read_unlock(&tasklist_lock); /* * step 1: check that we can legitimately attach to the cgroup. -- cgit v1.2.3 From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:07 -0700 Subject: cgroups: don't attach task to subsystem if migration failed If a task has exited to the point it has called cgroup_exit() already, then we can't migrate it to another cgroup anymore. This can happen when we are attaching a task to a new cgroup between the call to ->can_attach_task() on subsystems and the migration that is eventually tried in cgroup_task_migrate(). In this case cgroup_task_migrate() returns -ESRCH and we don't want to attach the task to the subsystems because the attachment to the new cgroup itself failed. Fix this by only calling ->attach_task() on the subsystems if the cgroup migration succeeded. Reported-by: Oleg Nesterov Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64b0e73402df..8386b21224ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) continue; - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } /* if the thread is PF_EXITING, it can just get skipped. */ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - BUG_ON(retval != 0 && retval != -ESRCH); + if (retval == 0) { + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + } else { + BUG_ON(retval != -ESRCH); + } } /* nothing is sensitive to fork() after this point. */ -- cgit v1.2.3 From ff7ee93f47151e23601856e7eb5510babf956571 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 13:38:11 -0700 Subject: cgroup/kmemleak: Annotate alloc_page() for cgroup allocations When the cgroup base was allocated with kmalloc, it was necessary to annotate the variable with kmemleak_not_leak(). But because it has recently been changed to be allocated with alloc_page() (which skips kmemleak checks) causes a warning on boot up. I was triggering this output: allocated 8388608 bytes of page_cgroup please try 'cgroup_disable=memory' option if you don't want memory cgroups kmemleak: Trying to color unknown object at 0xf5840000 as Grey Pid: 0, comm: swapper Not tainted 3.0.0-test #12 Call Trace: [] ? printk+0x1d/0x1f^M [] paint_ptr+0x4f/0x78 [] kmemleak_not_leak+0x58/0x7d [] ? __rcu_read_unlock+0x9/0x7d [] kmemleak_init+0x19d/0x1e9 [] start_kernel+0x346/0x3ec [] ? loglevel+0x18/0x18 [] i386_start_kernel+0xaa/0xb0 After a bit of debugging I tracked the object 0xf840000 (and others) down to the cgroup code. The change from allocating base with kmalloc to alloc_page() has the base not calling kmemleak_alloc() which adds the pointer to the object_tree_root, but kmemleak_not_leak() adds it to the crt_early_log[] table. On kmemleak_init(), the entry is found in the early_log[] but not the object_tree_root, and this error message is displayed. If alloc_page() fails then it defaults back to vmalloc() which still uses the kmemleak_alloc() which makes us still need the kmemleak_not_leak() call. The solution is to call the kmemleak_alloc() directly if the alloc_page() succeeds. Reviewed-by: Michal Hocko Signed-off-by: Steven Rostedt Acked-by: Catalin Marinas Signed-off-by: Jonathan Nieder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6bdc67dbbc28..3749ae15a8c8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) static void *__meminit alloc_page_cgroup(size_t size, int nid) { void *addr = NULL; + gfp_t flags = GFP_KERNEL | __GFP_NOWARN; - addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); - if (addr) + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); return addr; + } if (node_state(nid, N_HIGH_MEMORY)) addr = vmalloc_node(size, nid); -- cgit v1.2.3 From c0ff4b8540a5c158b8e5bafb7d767298b67b0b92 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Nov 2011 13:38:15 -0700 Subject: memcg: rename mem variable to memcg The memcg code sometimes uses "struct mem_cgroup *mem" and sometimes uses "struct mem_cgroup *memcg". Rename all mem variables to memcg in source file. Signed-off-by: Raghavendra K T Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 34 +- mm/memcontrol.c | 930 +++++++++++++++++++++++---------------------- 2 files changed, 485 insertions(+), 479 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ac797fa03ef8..05206aac5965 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -78,8 +78,8 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -88,19 +88,19 @@ extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); static inline int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference((mm)->owner)); + memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); rcu_read_unlock(); - return cgroup == mem; + return cgroup == memcg; } -extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); +extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); -extern void mem_cgroup_end_migration(struct mem_cgroup *mem, +extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); /* @@ -148,7 +148,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -244,18 +244,20 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm return NULL; } -static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) +static inline int mm_match_cgroup(struct mm_struct *mm, + struct mem_cgroup *memcg) { return 1; } static inline int task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *mem) + const struct mem_cgroup *memcg) { return 1; } -static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +static inline struct cgroup_subsys_state + *mem_cgroup_css(struct mem_cgroup *memcg) { return NULL; } @@ -267,22 +269,22 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, return 0; } -static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, +static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { } -static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) +static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) { return 0; } -static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, +static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg, int priority) { } -static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, +static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg, int priority) { } @@ -348,7 +350,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, } static inline -u64 mem_cgroup_get_limit(struct mem_cgroup *mem) +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d5755544afe..9e38abdbfd95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -201,8 +201,8 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; -static void mem_cgroup_threshold(struct mem_cgroup *mem); -static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* * The memory controller data structure. The memory controller controls both @@ -362,29 +362,29 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) -static void mem_cgroup_get(struct mem_cgroup *mem); -static void mem_cgroup_put(struct mem_cgroup *mem); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(struct mem_cgroup *mem); +static void mem_cgroup_get(struct mem_cgroup *memcg); +static void mem_cgroup_put(struct mem_cgroup *memcg); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); +static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) { - return &mem->css; + return &memcg->css; } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) +page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(mem, nid, zid); + return mem_cgroup_zoneinfo(memcg, nid, zid); } static struct mem_cgroup_tree_per_zone * @@ -403,7 +403,7 @@ soft_limit_tree_from_page(struct page *page) } static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, unsigned long long new_usage_in_excess) @@ -437,7 +437,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, } static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { @@ -448,17 +448,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, } static void -mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); spin_unlock(&mctz->lock); } -static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long long excess; struct mem_cgroup_per_zone *mz; @@ -471,9 +471,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ - for (; mem; mem = parent_mem_cgroup(mem)) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - excess = res_counter_soft_limit_excess(&mem->res); + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -482,18 +482,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); spin_unlock(&mctz->lock); } } } -static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { int node, zone; struct mem_cgroup_per_zone *mz; @@ -501,9 +501,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) for_each_node_state(node, N_POSSIBLE) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(mem, node, zone); + mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(mem, mz, mctz); + mem_cgroup_remove_exceeded(memcg, mz, mctz); } } } @@ -564,7 +564,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static long mem_cgroup_read_stat(struct mem_cgroup *mem, +static long mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { long val = 0; @@ -572,81 +572,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, get_online_cpus(); for_each_online_cpu(cpu) - val += per_cpu(mem->stat->count[idx], cpu); + val += per_cpu(memcg->stat->count[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.count[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.count[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif put_online_cpus(); return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); } -void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); } -static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, +static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { unsigned long val = 0; int cpu; for_each_online_cpu(cpu) - val += per_cpu(mem->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.events[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.events[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif return val; } -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, +static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, bool file, int nr_pages) { preempt_disable(); if (file) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + nr_pages); /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; enum lru_list l; unsigned long ret = 0; - mz = mem_cgroup_zoneinfo(mem, nid, zid); + mz = mem_cgroup_zoneinfo(memcg, nid, zid); for_each_lru(l) { if (BIT(l) & lru_mask) @@ -656,44 +658,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, } static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { u64 total = 0; int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); + total += mem_cgroup_zone_nr_lru_pages(memcg, + nid, zid, lru_mask); return total; } -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { int nid; u64 total = 0; for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); + total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int target) +static bool __memcg_event_check(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - next = this_cpu_read(mem->stat->targets[target]); + val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ return ((long)next - (long)val < 0); } -static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); switch (target) { case MEM_CGROUP_TARGET_THRESH: @@ -709,30 +712,30 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) return; } - this_cpu_write(mem->stat->targets[target], next); + this_cpu_write(memcg->stat->targets[target], next); } /* * Check events in order. * */ -static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { - mem_cgroup_threshold(mem); - __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + mem_cgroup_threshold(memcg); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_SOFTLIMIT))) { - mem_cgroup_update_tree(mem, page); - __mem_cgroup_target_update(mem, + mem_cgroup_update_tree(memcg, page); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_NUMAINFO))) { - atomic_inc(&mem->numainfo_events); - __mem_cgroup_target_update(mem, + atomic_inc(&memcg->numainfo_events); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_NUMAINFO); } #endif @@ -762,7 +765,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; if (!mm) return NULL; @@ -773,25 +776,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) */ rcu_read_lock(); do { - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) break; - } while (!css_tryget(&mem->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); - return mem; + return memcg; } /* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; int found; - if (!mem) /* ROOT cgroup has the smallest ID */ + if (!memcg) /* ROOT cgroup has the smallest ID */ return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!mem->use_hierarchy) { - if (css_tryget(&mem->css)) - return mem; + if (!memcg->use_hierarchy) { + if (css_tryget(&memcg->css)) + return memcg; return NULL; } rcu_read_lock(); @@ -799,13 +802,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) * searching a memory cgroup which has the smallest ID under given * ROOT cgroup. (ID >= 1) */ - css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + memcg = container_of(css, struct mem_cgroup, css); else - mem = NULL; + memcg = NULL; rcu_read_unlock(); - return mem; + return memcg; } static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, @@ -859,29 +862,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, for_each_mem_cgroup_tree_cond(iter, NULL, true) -static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { - return (mem == root_mem_cgroup); + return (memcg == root_mem_cgroup); } void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; if (!mm) return; rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) goto out; switch (idx) { case PGMAJFAULT: - mem_cgroup_pgmajfault(mem, 1); + mem_cgroup_pgmajfault(memcg, 1); break; case PGFAULT: - mem_cgroup_pgfault(mem, 1); + mem_cgroup_pgfault(memcg, 1); break; default: BUG(); @@ -1063,21 +1066,21 @@ void mem_cgroup_move_lists(struct page *page, } /* - * Checks whether given mem is same or in the root_mem's + * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, - struct mem_cgroup *mem) +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) { - if (root_mem != mem) { - return (root_mem->use_hierarchy && - css_is_ancestor(&mem->css, &root_mem->css)); + if (root_memcg != memcg) { + return (root_memcg->use_hierarchy && + css_is_ancestor(&memcg->css, &root_memcg->css)); } return true; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) { int ret; struct mem_cgroup *curr = NULL; @@ -1091,12 +1094,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) if (!curr) return 0; /* - * We should check use_hierarchy of "mem" not "curr". Because checking + * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "mem" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "mem"). + * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "memcg"). */ - ret = mem_cgroup_same_or_subtree(mem, curr); + ret = mem_cgroup_same_or_subtree(memcg, curr); css_put(&curr->css); return ret; } @@ -1254,13 +1257,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * Returns the maximum amount of memory @mem can be charged with, in * pages. */ -static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) +static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { unsigned long long margin; - margin = res_counter_margin(&mem->res); + margin = res_counter_margin(&memcg->res); if (do_swap_account) - margin = min(margin, res_counter_margin(&mem->memsw)); + margin = min(margin, res_counter_margin(&memcg->memsw)); return margin >> PAGE_SHIFT; } @@ -1275,33 +1278,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } -static void mem_cgroup_start_move(struct mem_cgroup *mem) +static void mem_cgroup_start_move(struct mem_cgroup *memcg) { int cpu; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); synchronize_rcu(); } -static void mem_cgroup_end_move(struct mem_cgroup *mem) +static void mem_cgroup_end_move(struct mem_cgroup *memcg) { int cpu; - if (!mem) + if (!memcg) return; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); } /* @@ -1316,13 +1319,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem) * waiting at hith-memory prressure caused by "move". */ -static bool mem_cgroup_stealed(struct mem_cgroup *mem) +static bool mem_cgroup_stealed(struct mem_cgroup *memcg) { VM_BUG_ON(!rcu_read_lock_held()); - return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; + return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; } -static bool mem_cgroup_under_move(struct mem_cgroup *mem) +static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; struct mem_cgroup *to; @@ -1337,17 +1340,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(mem, from) - || mem_cgroup_same_or_subtree(mem, to); + ret = mem_cgroup_same_or_subtree(memcg, from) + || mem_cgroup_same_or_subtree(memcg, to); unlock: spin_unlock(&mc.lock); return ret; } -static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) { if (mc.moving_task && current != mc.moving_task) { - if (mem_cgroup_under_move(mem)) { + if (mem_cgroup_under_move(memcg)) { DEFINE_WAIT(wait); prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); /* moving charge context might have finished. */ @@ -1431,12 +1434,12 @@ done: * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. */ -static int mem_cgroup_count_children(struct mem_cgroup *mem) +static int mem_cgroup_count_children(struct mem_cgroup *memcg) { int num = 0; struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) num++; return num; } @@ -1466,21 +1469,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_mem) +mem_cgroup_select_victim(struct mem_cgroup *root_memcg) { struct mem_cgroup *ret = NULL; struct cgroup_subsys_state *css; int nextid, found; - if (!root_mem->use_hierarchy) { - css_get(&root_mem->css); - ret = root_mem; + if (!root_memcg->use_hierarchy) { + css_get(&root_memcg->css); + ret = root_memcg; } while (!ret) { rcu_read_lock(); - nextid = root_mem->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + nextid = root_memcg->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, &found); if (css && css_tryget(css)) ret = container_of(css, struct mem_cgroup, css); @@ -1489,9 +1492,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) /* Updates scanning parameter */ if (!css) { /* this means start scan from ID:1 */ - root_mem->last_scanned_child = 0; + root_memcg->last_scanned_child = 0; } else - root_mem->last_scanned_child = found; + root_memcg->last_scanned_child = found; } return ret; @@ -1507,14 +1510,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * reclaimable pages on a node. Returns true if there are any reclaimable * pages in the node. */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) return true; return false; @@ -1527,29 +1530,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, * nodes based on the zonelist. So update the list loosely once per 10 secs. * */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) { int nid; /* * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET * pagein/pageout changes since the last update. */ - if (!atomic_read(&mem->numainfo_events)) + if (!atomic_read(&memcg->numainfo_events)) return; - if (atomic_inc_return(&mem->numainfo_updating) > 1) + if (atomic_inc_return(&memcg->numainfo_updating) > 1) return; /* make a nodemask where this memcg uses memory from */ - mem->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_HIGH_MEMORY]; for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { - if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) - node_clear(nid, mem->scan_nodes); + if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) + node_clear(nid, memcg->scan_nodes); } - atomic_set(&mem->numainfo_events, 0); - atomic_set(&mem->numainfo_updating, 0); + atomic_set(&memcg->numainfo_events, 0); + atomic_set(&memcg->numainfo_updating, 0); } /* @@ -1564,16 +1567,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) * * Now, we use round-robin. Better algorithm is welcomed. */ -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { int node; - mem_cgroup_may_update_nodemask(mem); - node = mem->last_scanned_node; + mem_cgroup_may_update_nodemask(memcg); + node = memcg->last_scanned_node; - node = next_node(node, mem->scan_nodes); + node = next_node(node, memcg->scan_nodes); if (node == MAX_NUMNODES) - node = first_node(mem->scan_nodes); + node = first_node(memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. * No LRU may hold pages because all pages are UNEVICTABLE or @@ -1583,7 +1586,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); - mem->last_scanned_node = node; + memcg->last_scanned_node = node; return node; } @@ -1593,7 +1596,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) * unused nodes. But scan_nodes is lazily updated and may not cotain * enough new information. We need to do double check. */ -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { int nid; @@ -1601,12 +1604,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) * quick check...making use of scan_node. * We can skip unused nodes. */ - if (!nodes_empty(mem->scan_nodes)) { - for (nid = first_node(mem->scan_nodes); + if (!nodes_empty(memcg->scan_nodes)) { + for (nid = first_node(memcg->scan_nodes); nid < MAX_NUMNODES; - nid = next_node(nid, mem->scan_nodes)) { + nid = next_node(nid, memcg->scan_nodes)) { - if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) return true; } } @@ -1614,23 +1617,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) * Check rest of nodes. */ for_each_node_state(nid, N_HIGH_MEMORY) { - if (node_isset(nid, mem->scan_nodes)) + if (node_isset(nid, memcg->scan_nodes)) continue; - if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) return true; } return false; } #else -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { - return test_mem_cgroup_node_reclaimable(mem, 0, noswap); + return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); } #endif @@ -1639,14 +1642,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) * we reclaimed from, so that we don't end up penalizing one child extensively * based on its position in the children list. * - * root_mem is the original ancestor that we've been reclaim from. + * root_memcg is the original ancestor that we've been reclaim from. * - * We give up and return to the caller when we visit root_mem twice. + * We give up and return to the caller when we visit root_memcg twice. * (other groups can be removed while we're walking....) * * If shrink==true, for avoiding to free too much, this returns immedieately. */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, +static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, struct zone *zone, gfp_t gfp_mask, unsigned long reclaim_options, @@ -1661,15 +1664,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, unsigned long excess; unsigned long nr_scanned; - excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_memcg->memsw_is_minimum) noswap = true; while (1) { - victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) { + victim = mem_cgroup_select_victim(root_memcg); + if (victim == root_memcg) { loop++; /* * We are not draining per cpu cached charges during @@ -1678,7 +1681,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * charges will not give any. */ if (!check_soft && loop >= 1) - drain_all_stock_async(root_mem); + drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1725,9 +1728,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (!res_counter_soft_limit_excess(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_memcg->res)) return total; - } else if (mem_cgroup_margin(root_mem)) + } else if (mem_cgroup_margin(root_memcg)) return total; } return total; @@ -1738,12 +1741,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * If someone is running, return false. * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) +static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; bool cond = true; - for_each_mem_cgroup_tree_cond(iter, mem, cond) { + for_each_mem_cgroup_tree_cond(iter, memcg, cond) { if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked @@ -1763,7 +1766,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) * what we set up to the failing subtree */ cond = true; - for_each_mem_cgroup_tree_cond(iter, mem, cond) { + for_each_mem_cgroup_tree_cond(iter, memcg, cond) { if (iter == failed) { cond = false; continue; @@ -1776,24 +1779,24 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) /* * Has to be called with memcg_oom_lock */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) +static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; return 0; } -static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) atomic_inc(&iter->under_oom); } -static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; @@ -1802,7 +1805,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) atomic_add_unless(&iter->under_oom, -1, 0); } @@ -1817,80 +1820,80 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, - *oom_wait_mem; + struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, + *oom_wait_memcg; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); - oom_wait_mem = oom_wait_info->mem; + oom_wait_memcg = oom_wait_info->mem; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ - if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) - && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) + if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) + && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *mem) +static void memcg_wakeup_oom(struct mem_cgroup *memcg) { - /* for filtering, pass "mem" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); + /* for filtering, pass "memcg" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void memcg_oom_recover(struct mem_cgroup *mem) +static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (mem && atomic_read(&mem->under_oom)) - memcg_wakeup_oom(mem); + if (memcg && atomic_read(&memcg->under_oom)) + memcg_wakeup_oom(memcg); } /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) { struct oom_wait_info owait; bool locked, need_to_kill; - owait.mem = mem; + owait.mem = memcg; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; - mem_cgroup_mark_under_oom(mem); + mem_cgroup_mark_under_oom(memcg); - /* At first, try to OOM lock hierarchy under mem.*/ + /* At first, try to OOM lock hierarchy under memcg.*/ spin_lock(&memcg_oom_lock); - locked = mem_cgroup_oom_lock(mem); + locked = mem_cgroup_oom_lock(memcg); /* * Even if signal_pending(), we can't quit charge() loop without * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * under OOM is always welcomed, use TASK_KILLABLE here. */ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - if (!locked || mem->oom_kill_disable) + if (!locked || memcg->oom_kill_disable) need_to_kill = false; if (locked) - mem_cgroup_oom_notify(mem); + mem_cgroup_oom_notify(memcg); spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(mem, mask); + mem_cgroup_out_of_memory(memcg, mask); } else { schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } spin_lock(&memcg_oom_lock); if (locked) - mem_cgroup_oom_unlock(mem); - memcg_wakeup_oom(mem); + mem_cgroup_oom_unlock(memcg); + memcg_wakeup_oom(memcg); spin_unlock(&memcg_oom_lock); - mem_cgroup_unmark_under_oom(mem); + mem_cgroup_unmark_under_oom(memcg); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; @@ -1926,7 +1929,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; unsigned long uninitialized_var(flags); @@ -1935,16 +1938,16 @@ void mem_cgroup_update_page_stat(struct page *page, return; rcu_read_lock(); - mem = pc->mem_cgroup; - if (unlikely(!mem || !PageCgroupUsed(pc))) + memcg = pc->mem_cgroup; + if (unlikely(!memcg || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { + if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ move_lock_page_cgroup(pc, &flags); need_unlock = true; - mem = pc->mem_cgroup; - if (!mem || !PageCgroupUsed(pc)) + memcg = pc->mem_cgroup; + if (!memcg || !PageCgroupUsed(pc)) goto out; } @@ -1960,7 +1963,7 @@ void mem_cgroup_update_page_stat(struct page *page, BUG(); } - this_cpu_add(mem->stat->count[idx], val); + this_cpu_add(memcg->stat->count[idx], val); out: if (unlikely(need_unlock)) @@ -1991,13 +1994,13 @@ static DEFINE_MUTEX(percpu_charge_mutex); * cgroup which is not current target, returns false. This stock will be * refilled. */ -static bool consume_stock(struct mem_cgroup *mem) +static bool consume_stock(struct mem_cgroup *memcg) { struct memcg_stock_pcp *stock; bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->nr_pages) + if (memcg == stock->cached && stock->nr_pages) stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; @@ -2038,24 +2041,24 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); - if (stock->cached != mem) { /* reset if necessary */ + if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); - stock->cached = mem; + stock->cached = memcg; } stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } /* - * Drains all per-CPU charge caches for given root_mem resp. subtree + * Drains all per-CPU charge caches for given root_memcg resp. subtree * of the hierarchy under it. sync flag says whether we should block * until the work is done. */ -static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) { int cpu, curcpu; @@ -2064,12 +2067,12 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - struct mem_cgroup *mem; + struct mem_cgroup *memcg; - mem = stock->cached; - if (!mem || !stock->nr_pages) + memcg = stock->cached; + if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_mem, mem)) + if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) @@ -2098,23 +2101,23 @@ out: * expects some charges will be back to res_counter later but cannot wait for * it. */ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_all_stock_async(struct mem_cgroup *root_memcg) { /* * If someone calls draining, avoid adding more kworker runs. */ if (!mutex_trylock(&percpu_charge_mutex)) return; - drain_all_stock(root_mem, false); + drain_all_stock(root_memcg, false); mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. */ -static void drain_all_stock_sync(struct mem_cgroup *root_mem) +static void drain_all_stock_sync(struct mem_cgroup *root_memcg) { /* called when force_empty is called */ mutex_lock(&percpu_charge_mutex); - drain_all_stock(root_mem, true); + drain_all_stock(root_memcg, true); mutex_unlock(&percpu_charge_mutex); } @@ -2122,35 +2125,35 @@ static void drain_all_stock_sync(struct mem_cgroup *root_mem) * This function drains percpu counter value from DEAD cpu and * move it to local cpu. Note that this function can be preempted. */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) { int i; - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - long x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(memcg->stat->count[i], cpu); - per_cpu(mem->stat->count[i], cpu) = 0; - mem->nocpu_base.count[i] += x; + per_cpu(memcg->stat->count[i], cpu) = 0; + memcg->nocpu_base.count[i] += x; } for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(mem->stat->events[i], cpu); + unsigned long x = per_cpu(memcg->stat->events[i], cpu); - per_cpu(mem->stat->events[i], cpu) = 0; - mem->nocpu_base.events[i] += x; + per_cpu(memcg->stat->events[i], cpu) = 0; + memcg->nocpu_base.events[i] += x; } /* need to clear ON_MOVE value, works as a kind of lock. */ - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; + spin_unlock(&memcg->pcp_counter_lock); } -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) { int idx = MEM_CGROUP_ON_MOVE; - spin_lock(&mem->pcp_counter_lock); - per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; + spin_unlock(&memcg->pcp_counter_lock); } static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, @@ -2188,7 +2191,7 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, +static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages, bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; @@ -2197,16 +2200,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, unsigned long flags = 0; int ret; - ret = res_counter_charge(&mem->res, csize, &fail_res); + ret = res_counter_charge(&memcg->res, csize, &fail_res); if (likely(!ret)) { if (!do_swap_account) return CHARGE_OK; - ret = res_counter_charge(&mem->memsw, csize, &fail_res); + ret = res_counter_charge(&memcg->memsw, csize, &fail_res); if (likely(!ret)) return CHARGE_OK; - res_counter_uncharge(&mem->res, csize); + res_counter_uncharge(&memcg->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else @@ -2264,12 +2267,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, unsigned int nr_pages, - struct mem_cgroup **memcg, + struct mem_cgroup **ptr, bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; int ret; /* @@ -2287,17 +2290,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!*memcg && !mm) + if (!*ptr && !mm) goto bypass; again: - if (*memcg) { /* css should be a valid one */ - mem = *memcg; - VM_BUG_ON(css_is_removed(&mem->css)); - if (mem_cgroup_is_root(mem)) + if (*ptr) { /* css should be a valid one */ + memcg = *ptr; + VM_BUG_ON(css_is_removed(&memcg->css)); + if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(mem)) + if (nr_pages == 1 && consume_stock(memcg)) goto done; - css_get(&mem->css); + css_get(&memcg->css); } else { struct task_struct *p; @@ -2305,7 +2308,7 @@ again: p = rcu_dereference(mm->owner); /* * Because we don't have task_lock(), "p" can exit. - * In that case, "mem" can point to root or p can be NULL with + * In that case, "memcg" can point to root or p can be NULL with * race with swapoff. Then, we have small risk of mis-accouning. * But such kind of mis-account by race always happens because * we don't have cgroup_mutex(). It's overkill and we allo that @@ -2313,12 +2316,12 @@ again: * (*) swapoff at el will charge against mm-struct not against * task-struct. So, mm->owner can be NULL. */ - mem = mem_cgroup_from_task(p); - if (!mem || mem_cgroup_is_root(mem)) { + memcg = mem_cgroup_from_task(p); + if (!memcg || mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(memcg)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2331,7 +2334,7 @@ again: goto done; } /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&mem->css)) { + if (!css_tryget(&memcg->css)) { rcu_read_unlock(); goto again; } @@ -2343,7 +2346,7 @@ again: /* If killed, bypass charge */ if (fatal_signal_pending(current)) { - css_put(&mem->css); + css_put(&memcg->css); goto bypass; } @@ -2353,43 +2356,43 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&mem->css); - mem = NULL; + css_put(&memcg->css); + memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&mem->css); + css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ if (!oom) { - css_put(&mem->css); + css_put(&memcg->css); goto nomem; } /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&mem->css); + css_put(&memcg->css); goto bypass; } } while (ret != CHARGE_OK); if (batch > nr_pages) - refill_stock(mem, batch - nr_pages); - css_put(&mem->css); + refill_stock(memcg, batch - nr_pages); + css_put(&memcg->css); done: - *memcg = mem; + *ptr = memcg; return 0; nomem: - *memcg = NULL; + *ptr = NULL; return -ENOMEM; bypass: - *memcg = NULL; + *ptr = NULL; return 0; } @@ -2398,15 +2401,15 @@ bypass: * This function is for that and do uncharge, put css's refcnt. * gotten by try_charge(). */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, +static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(mem)) { + if (!mem_cgroup_is_root(memcg)) { unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&mem->res, bytes); + res_counter_uncharge(&memcg->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, bytes); + res_counter_uncharge(&memcg->memsw, bytes); } } @@ -2431,7 +2434,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; @@ -2441,23 +2444,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { - mem = pc->mem_cgroup; - if (mem && !css_tryget(&mem->css)) - mem = NULL; + memcg = pc->mem_cgroup; + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); - mem = mem_cgroup_lookup(id); - if (mem && !css_tryget(&mem->css)) - mem = NULL; + memcg = mem_cgroup_lookup(id); + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; rcu_read_unlock(); } unlock_page_cgroup(pc); - return mem; + return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, +static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, struct page_cgroup *pc, @@ -2466,14 +2469,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - __mem_cgroup_cancel_charge(mem, nr_pages); + __mem_cgroup_cancel_charge(memcg, nr_pages); return; } /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. */ - pc->mem_cgroup = mem; + pc->mem_cgroup = memcg; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). * Especially when a page_cgroup is taken from a page, pc->mem_cgroup @@ -2496,14 +2499,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, page); + memcg_check_events(memcg, page); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2690,7 +2693,7 @@ out: static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; bool oom = true; @@ -2709,11 +2712,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, pc = lookup_page_cgroup(page); BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); - if (ret || !mem) + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); + if (ret || !memcg) return ret; - __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); return 0; } @@ -2742,7 +2745,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { struct page_cgroup *pc = lookup_page_cgroup(page); @@ -2752,7 +2755,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, * LRU. Take care of it. */ mem_cgroup_lru_del_before_commit(page); - __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); mem_cgroup_lru_add_after_commit(page); return; } @@ -2760,7 +2763,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; int ret; if (mem_cgroup_disabled()) @@ -2772,8 +2775,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, mm = &init_mm; if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); - if (ret || !mem) + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); + if (ret || !memcg) return ret; /* @@ -2781,15 +2784,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, * put that would remove them from the LRU list, make * sure that they get relinked properly. */ - __mem_cgroup_commit_charge_lrucare(page, mem, + __mem_cgroup_commit_charge_lrucare(page, memcg, MEM_CGROUP_CHARGE_TYPE_CACHE); return ret; } /* shmem */ if (PageSwapCache(page)) { - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, mem, + __mem_cgroup_commit_charge_swapin(page, memcg, MEM_CGROUP_CHARGE_TYPE_SHMEM); } else ret = mem_cgroup_charge_common(page, mm, gfp_mask, @@ -2808,7 +2811,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t mask, struct mem_cgroup **ptr) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; int ret; *ptr = NULL; @@ -2826,12 +2829,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, */ if (!PageSwapCache(page)) goto charge_cur_mm; - mem = try_get_mem_cgroup_from_page(page); - if (!mem) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) goto charge_cur_mm; - *ptr = mem; + *ptr = memcg; ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); - css_put(&mem->css); + css_put(&memcg->css); return ret; charge_cur_mm: if (unlikely(!mm)) @@ -2891,16 +2894,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) MEM_CGROUP_CHARGE_TYPE_MAPPED); } -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return; - if (!mem) + if (!memcg) return; - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(memcg, 1); } -static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, +static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages, const enum charge_type ctype) { @@ -2918,7 +2921,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, * uncharges. Then, it's ok to ignore memcg's refcnt. */ if (!batch->memcg) - batch->memcg = mem; + batch->memcg = memcg; /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. * In those cases, all pages freed continuously can be expected to be in @@ -2938,7 +2941,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, * merge a series of uncharges to an uncharge of res_counter. * If not, we uncharge res_counter ony by one. */ - if (batch->memcg != mem) + if (batch->memcg != memcg) goto direct_uncharge; /* remember freed charge and uncharge it later */ batch->nr_pages++; @@ -2946,11 +2949,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); + res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != mem)) - memcg_oom_recover(mem); + res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); + if (unlikely(batch->memcg != memcg)) + memcg_oom_recover(memcg); return; } @@ -2960,7 +2963,7 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; @@ -2983,7 +2986,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) lock_page_cgroup(pc); - mem = pc->mem_cgroup; + memcg = pc->mem_cgroup; if (!PageCgroupUsed(pc)) goto unlock_out; @@ -3006,7 +3009,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -3018,18 +3021,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) unlock_page_cgroup(pc); /* - * even after unlock, we have mem->res.usage here and this memcg + * even after unlock, we have memcg->res.usage here and this memcg * will never be freed. */ - memcg_check_events(mem, page); + memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(mem, true); - mem_cgroup_get(mem); + mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_get(memcg); } - if (!mem_cgroup_is_root(mem)) - mem_cgroup_do_uncharge(mem, nr_pages, ctype); + if (!mem_cgroup_is_root(memcg)) + mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - return mem; + return memcg; unlock_out: unlock_page_cgroup(pc); @@ -3219,7 +3222,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; @@ -3233,8 +3236,8 @@ int mem_cgroup_prepare_migration(struct page *page, pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { - mem = pc->mem_cgroup; - css_get(&mem->css); + memcg = pc->mem_cgroup; + css_get(&memcg->css); /* * At migrating an anonymous page, its mapcount goes down * to 0 and uncharge() will be called. But, even if it's fully @@ -3272,12 +3275,12 @@ int mem_cgroup_prepare_migration(struct page *page, * If the page is not charged at this point, * we return here. */ - if (!mem) + if (!memcg) return 0; - *ptr = mem; + *ptr = memcg; ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); - css_put(&mem->css);/* drop extra refcnt */ + css_put(&memcg->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { lock_page_cgroup(pc); @@ -3303,21 +3306,21 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *mem, +void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; - if (!mem) + if (!memcg) return; /* blocks rmdir() */ - cgroup_exclude_rmdir(&mem->css); + cgroup_exclude_rmdir(&memcg->css); if (!migration_ok) { used = oldpage; unused = newpage; @@ -3353,7 +3356,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&mem->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } #ifdef CONFIG_DEBUG_VM @@ -3432,7 +3435,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee mem->res.limit < mem->memsw.limit. + * We have to guarantee memcg->res.limit < memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); @@ -3494,7 +3497,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee mem->res.limit < mem->memsw.limit. + * We have to guarantee memcg->res.limit < memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -3632,7 +3635,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ -static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, +static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct zone *zone; @@ -3643,7 +3646,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; - mz = mem_cgroup_zoneinfo(mem, node, zid); + mz = mem_cgroup_zoneinfo(memcg, node, zid); list = &mz->lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); @@ -3670,7 +3673,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, page = lookup_cgroup_page(pc); - ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); + ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -3691,14 +3694,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, * make mem_cgroup's charge to be 0 if there is no task. * This enables deleting this mem_cgroup. */ -static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) +static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) { int ret; int node, zid, shrink; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = mem->css.cgroup; + struct cgroup *cgrp = memcg->css.cgroup; - css_get(&mem->css); + css_get(&memcg->css); shrink = 0; /* should free all ? */ @@ -3714,14 +3717,14 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); - drain_all_stock_sync(mem); + drain_all_stock_sync(memcg); ret = 0; - mem_cgroup_start_move(mem); + mem_cgroup_start_move(memcg); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; for_each_lru(l) { - ret = mem_cgroup_force_empty_list(mem, + ret = mem_cgroup_force_empty_list(memcg, node, zid, l); if (ret) break; @@ -3730,16 +3733,16 @@ move_account: if (ret) break; } - mem_cgroup_end_move(mem); - memcg_oom_recover(mem); + mem_cgroup_end_move(memcg); + memcg_oom_recover(memcg); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; cond_resched(); /* "ret" should also be checked to ensure all lists are empty. */ - } while (mem->res.usage > 0 || ret); + } while (memcg->res.usage > 0 || ret); out: - css_put(&mem->css); + css_put(&memcg->css); return ret; try_to_free: @@ -3752,14 +3755,14 @@ try_to_free: lru_add_drain_all(); /* try to free all pages in this cgroup */ shrink = 1; - while (nr_retries && mem->res.usage > 0) { + while (nr_retries && memcg->res.usage > 0) { int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } - progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, false); if (!progress) { nr_retries--; @@ -3788,12 +3791,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_mem = NULL; + struct mem_cgroup *parent_memcg = NULL; if (parent) - parent_mem = mem_cgroup_from_cont(parent); + parent_memcg = mem_cgroup_from_cont(parent); cgroup_lock(); /* @@ -3804,10 +3807,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, * For the root cgroup, parent_mem is NULL, we allow value to be * set if there are no children. */ - if ((!parent_mem || !parent_mem->use_hierarchy) && + if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { if (list_empty(&cont->children)) - mem->use_hierarchy = val; + memcg->use_hierarchy = val; else retval = -EBUSY; } else @@ -3818,14 +3821,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; long val = 0; /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) val += mem_cgroup_read_stat(iter, idx); if (val < 0) /* race ? */ @@ -3833,29 +3836,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { u64 val; - if (!mem_cgroup_is_root(mem)) { + if (!mem_cgroup_is_root(memcg)) { if (!swap) - return res_counter_read_u64(&mem->res, RES_USAGE); + return res_counter_read_u64(&memcg->res, RES_USAGE); else - return res_counter_read_u64(&mem->memsw, RES_USAGE); + return res_counter_read_u64(&memcg->memsw, RES_USAGE); } - val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); u64 val; int type, name; @@ -3864,15 +3867,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) switch (type) { case _MEM: if (name == RES_USAGE) - val = mem_cgroup_usage(mem, false); + val = mem_cgroup_usage(memcg, false); else - val = res_counter_read_u64(&mem->res, name); + val = res_counter_read_u64(&memcg->res, name); break; case _MEMSWAP: if (name == RES_USAGE) - val = mem_cgroup_usage(mem, true); + val = mem_cgroup_usage(memcg, true); else - val = res_counter_read_u64(&mem->memsw, name); + val = res_counter_read_u64(&memcg->memsw, name); break; default: BUG(); @@ -3960,24 +3963,24 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; int type, name; - mem = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_cont(cont); type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); switch (name) { case RES_MAX_USAGE: if (type == _MEM) - res_counter_reset_max(&mem->res); + res_counter_reset_max(&memcg->res); else - res_counter_reset_max(&mem->memsw); + res_counter_reset_max(&memcg->memsw); break; case RES_FAILCNT: if (type == _MEM) - res_counter_reset_failcnt(&mem->res); + res_counter_reset_failcnt(&memcg->res); else - res_counter_reset_failcnt(&mem->memsw); + res_counter_reset_failcnt(&memcg->memsw); break; } @@ -3994,7 +3997,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, static int mem_cgroup_move_charge_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -4004,7 +4007,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, * inconsistent. */ cgroup_lock(); - mem->move_charge_at_immigrate = val; + memcg->move_charge_at_immigrate = val; cgroup_unlock(); return 0; @@ -4061,49 +4064,49 @@ struct { static void -mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) { s64 val; /* per cpu stat */ - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); s->stat[MCS_PGFAULT] += val; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; } static void -mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) mem_cgroup_get_local_stat(iter, s); } @@ -4327,20 +4330,20 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) { struct mem_cgroup_eventfd_list *ev; - list_for_each_entry(ev, &mem->oom_notify, list) + list_for_each_entry(ev, &memcg->oom_notify, list) eventfd_signal(ev->eventfd, 1); return 0; } -static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) mem_cgroup_oom_notify_cb(iter); } @@ -4530,7 +4533,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; int type = MEMFILE_TYPE(cft->private); @@ -4538,7 +4541,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_lock(&memcg_oom_lock); - list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { if (ev->eventfd == eventfd) { list_del(&ev->list); kfree(ev); @@ -4551,11 +4554,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, static int mem_cgroup_oom_control_read(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); - if (atomic_read(&mem->under_oom)) + if (atomic_read(&memcg->under_oom)) cb->fill(cb, "under_oom", 1); else cb->fill(cb, "under_oom", 0); @@ -4565,7 +4568,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent; /* cannot set to root cgroup and only 0 and 1 are allowed */ @@ -4577,13 +4580,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, cgroup_lock(); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || - (mem->use_hierarchy && !list_empty(&cgrp->children))) { + (memcg->use_hierarchy && !list_empty(&cgrp->children))) { cgroup_unlock(); return -EINVAL; } - mem->oom_kill_disable = val; + memcg->oom_kill_disable = val; if (!val) - memcg_oom_recover(mem); + memcg_oom_recover(memcg); cgroup_unlock(); return 0; } @@ -4719,7 +4722,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) } #endif -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; @@ -4739,21 +4742,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) if (!pn) return 1; - mem->info.nodeinfo[node] = pn; + memcg->info.nodeinfo[node] = pn; for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; - mz->mem = mem; + mz->mem = memcg; } return 0; } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(mem->info.nodeinfo[node]); + kfree(memcg->info.nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) @@ -4795,51 +4798,51 @@ out_free: * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void __mem_cgroup_free(struct mem_cgroup *mem) +static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - mem_cgroup_remove_from_trees(mem); - free_css_id(&mem_cgroup_subsys, &mem->css); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node_state(node, N_POSSIBLE) - free_mem_cgroup_per_zone_info(mem, node); + free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(mem->stat); + free_percpu(memcg->stat); if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree(mem); + kfree(memcg); else - vfree(mem); + vfree(memcg); } -static void mem_cgroup_get(struct mem_cgroup *mem) +static void mem_cgroup_get(struct mem_cgroup *memcg) { - atomic_inc(&mem->refcnt); + atomic_inc(&memcg->refcnt); } -static void __mem_cgroup_put(struct mem_cgroup *mem, int count) +static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { - if (atomic_sub_and_test(count, &mem->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(mem); - __mem_cgroup_free(mem); + if (atomic_sub_and_test(count, &memcg->refcnt)) { + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + __mem_cgroup_free(memcg); if (parent) mem_cgroup_put(parent); } } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void mem_cgroup_put(struct mem_cgroup *memcg) { - __mem_cgroup_put(mem, 1); + __mem_cgroup_put(memcg, 1); } /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!mem->res.parent) + if (!memcg->res.parent) return NULL; - return mem_cgroup_from_res_counter(mem->res.parent, res); + return mem_cgroup_from_res_counter(memcg->res.parent, res); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4882,16 +4885,16 @@ static int mem_cgroup_soft_limit_tree_init(void) static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem, *parent; + struct mem_cgroup *memcg, *parent; long error = -ENOMEM; int node; - mem = mem_cgroup_alloc(); - if (!mem) + memcg = mem_cgroup_alloc(); + if (!memcg) return ERR_PTR(error); for_each_node_state(node, N_POSSIBLE) - if (alloc_mem_cgroup_per_zone_info(mem, node)) + if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; /* root ? */ @@ -4899,7 +4902,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = mem; + root_mem_cgroup = memcg; if (mem_cgroup_soft_limit_tree_init()) goto free_out; for_each_possible_cpu(cpu) { @@ -4910,13 +4913,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); - mem->use_hierarchy = parent->use_hierarchy; - mem->oom_kill_disable = parent->oom_kill_disable; + memcg->use_hierarchy = parent->use_hierarchy; + memcg->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { - res_counter_init(&mem->res, &parent->res); - res_counter_init(&mem->memsw, &parent->memsw); + res_counter_init(&memcg->res, &parent->res); + res_counter_init(&memcg->memsw, &parent->memsw); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -4925,21 +4928,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) */ mem_cgroup_get(parent); } else { - res_counter_init(&mem->res, NULL); - res_counter_init(&mem->memsw, NULL); + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); } - mem->last_scanned_child = 0; - mem->last_scanned_node = MAX_NUMNODES; - INIT_LIST_HEAD(&mem->oom_notify); + memcg->last_scanned_child = 0; + memcg->last_scanned_node = MAX_NUMNODES; + INIT_LIST_HEAD(&memcg->oom_notify); if (parent) - mem->swappiness = mem_cgroup_swappiness(parent); - atomic_set(&mem->refcnt, 1); - mem->move_charge_at_immigrate = 0; - mutex_init(&mem->thresholds_lock); - return &mem->css; + memcg->swappiness = mem_cgroup_swappiness(parent); + atomic_set(&memcg->refcnt, 1); + memcg->move_charge_at_immigrate = 0; + mutex_init(&memcg->thresholds_lock); + return &memcg->css; free_out: - __mem_cgroup_free(mem); + __mem_cgroup_free(memcg); root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -4947,17 +4950,17 @@ free_out: static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - return mem_cgroup_force_empty(mem, false); + return mem_cgroup_force_empty(memcg, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - mem_cgroup_put(mem); + mem_cgroup_put(memcg); } static int mem_cgroup_populate(struct cgroup_subsys *ss, @@ -4980,9 +4983,9 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret = 0; int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *mem = mc.to; + struct mem_cgroup *memcg = mc.to; - if (mem_cgroup_is_root(mem)) { + if (mem_cgroup_is_root(memcg)) { mc.precharge += count; /* we don't need css_get for root */ return ret; @@ -4991,16 +4994,16 @@ static int mem_cgroup_do_precharge(unsigned long count) if (count > 1) { struct res_counter *dummy; /* - * "mem" cannot be under rmdir() because we've already checked + * "memcg" cannot be under rmdir() because we've already checked * by cgroup_lock_live_cgroup() that it is not removed and we * are still under the same cgroup_mutex. So we can postpone * css_get(). */ - if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) + if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) goto one_by_one; - if (do_swap_account && res_counter_charge(&mem->memsw, + if (do_swap_account && res_counter_charge(&memcg->memsw, PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + res_counter_uncharge(&memcg->res, PAGE_SIZE * count); goto one_by_one; } mc.precharge += count; @@ -5017,8 +5020,9 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); - if (ret || !mem) + ret = __mem_cgroup_try_charge(NULL, + GFP_KERNEL, 1, &memcg, false); + if (ret || !memcg) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; mc.precharge++; @@ -5292,13 +5296,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct task_struct *p) { int ret = 0; - struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); - if (mem->move_charge_at_immigrate) { + if (memcg->move_charge_at_immigrate) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); - VM_BUG_ON(from == mem); + VM_BUG_ON(from == memcg); mm = get_task_mm(p); if (!mm) @@ -5313,7 +5317,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; - mc.to = mem; + mc.to = memcg; spin_unlock(&mc.lock); /* We set mc.moving_task later */ -- cgit v1.2.3 From 715a5ee82ab3c07430f748630044354132add5ad Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 2 Nov 2011 13:38:18 -0700 Subject: memcg: fix oom schedule_timeout() Before calling schedule_timeout(), task state should be changed. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e38abdbfd95..c02d87028b9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1898,7 +1898,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; /* Give chance to dying process */ - schedule_timeout(1); + schedule_timeout_uninterruptible(1); return true; } -- cgit v1.2.3 From 0a619e58703b86d53d07e938eade9a91a4a863c6 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 2 Nov 2011 13:38:21 -0700 Subject: memcg: do not expose uninitialized mem_cgroup_per_node to world If somebody is touching data too early, it might be easier to diagnose a problem when dereferencing NULL at mem->info.nodeinfo[node] than trying to understand why mem_cgroup_per_zone is [un|partly]initialized. Signed-off-by: Igor Mammedov Acked-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c02d87028b9f..f6c4beb4db56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4742,7 +4742,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - memcg->info.nodeinfo[node] = pn; for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4751,6 +4750,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->mem = memcg; } + memcg->info.nodeinfo[node] = pn; return 0; } -- cgit v1.2.3 From 9b272977e3b99a8699361d214b51f98c8a9e0e7b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 2 Nov 2011 13:38:23 -0700 Subject: memcg: skip scanning active lists based on individual size Reclaim decides to skip scanning an active list when the corresponding inactive list is above a certain size in comparison to leave the assumed working set alone while there are still enough reclaim candidates around. The memcg implementation of comparing those lists instead reports whether the whole memcg is low on the requested type of inactive pages, considering all nodes and zones. This can lead to an oversized active list not being scanned because of the state of the other lists in the memcg, as well as an active list being scanned while its corresponding inactive list has enough pages. Not only is this wrong, it's also a scalability hazard, because the global memory state over all nodes and zones has to be gathered for each memcg and zone scanned. Make these calculations purely based on the size of the two LRU lists that are actually affected by the outcome of the decision. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Reviewed-by: Minchan Kim Reviewed-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 1 - include/linux/memcontrol.h | 10 ++++---- mm/memcontrol.c | 51 ++++++++++++++-------------------------- mm/vmscan.c | 4 ++-- 4 files changed, 25 insertions(+), 41 deletions(-) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 06eb6d957c83..cc0ebc5241b3 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -418,7 +418,6 @@ total_unevictable - sum of all children's "unevictable" # The following additional stats are dependent on CONFIG_DEBUG_VM. -inactive_ratio - VM internal parameter. (see mm/page_alloc.c) recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) recent_rotated_file - VM internal parameter. (see mm/vmscan.c) recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 05206aac5965..b87068a1a09e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -106,8 +106,10 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, /* * For memory reclaim. */ -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, + struct zone *zone); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, + struct zone *zone); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lrumask); @@ -295,13 +297,13 @@ static inline bool mem_cgroup_disabled(void) } static inline int -mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) +mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { return 1; } static inline int -mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) { return 1; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6c4beb4db56..ce7b35d024e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1104,15 +1104,19 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) return ret; } -static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { - unsigned long active; + unsigned long inactive_ratio; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); unsigned long inactive; + unsigned long active; unsigned long gb; - unsigned long inactive_ratio; - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1120,39 +1124,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ else inactive_ratio = 1; - if (present_pages) { - present_pages[0] = inactive; - present_pages[1] = active; - } - - return inactive_ratio; + return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) -{ - unsigned long active; - unsigned long inactive; - unsigned long present_pages[2]; - unsigned long inactive_ratio; - - inactive_ratio = calc_inactive_ratio(memcg, present_pages); - - inactive = present_pages[0]; - active = present_pages[1]; - - if (inactive * inactive_ratio < active) - return 1; - - return 0; -} - -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) { unsigned long active; unsigned long inactive; + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_FILE)); return (active > inactive); } @@ -4192,8 +4177,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } #ifdef CONFIG_DEBUG_VM - cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); - { int nid, zid; struct mem_cgroup_per_zone *mz; diff --git a/mm/vmscan.c b/mm/vmscan.c index a90c603a8d02..132d1ddb2238 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1767,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); return low; } #else @@ -1810,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_file_is_low_global(zone); else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); return low; } -- cgit v1.2.3 From a61ed3cec51cfd4877855c24890ab8d3e2b143e3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 2 Nov 2011 13:38:29 -0700 Subject: memcg: close race between charge and putback There is a potential race between a thread charging a page and another thread putting it back to the LRU list: charge: putback: SetPageCgroupUsed SetPageLRU PageLRU && add to memcg LRU PageCgroupUsed && add to memcg LRU The order of setting one flag and checking the other is crucial, otherwise the charge may observe !PageLRU while the putback observes !PageCgroupUsed and the page is not linked to the memcg LRU at all. Global memory pressure may fix this by trying to isolate and putback the page for reclaim, where that putback would link it to the memcg LRU again. Without that, the memory cgroup is undeletable due to a charge whose physical page can not be found and moved out. Signed-off-by: Johannes Weiner Cc: Ying Han Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce7b35d024e9..01e0c725de65 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -993,6 +993,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); if (!PageCgroupUsed(pc)) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ @@ -1044,7 +1054,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); - + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); /* taking care of that the page is added to LRU while we commit it */ if (likely(!PageLRU(page))) return; -- cgit v1.2.3 From 4799401fef9d5951b2da384c5eb08034c48e08a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 13:38:33 -0700 Subject: memcg: Fix race condition in memcg_check_events() with this_cpu usage Various code in memcontrol.c () calls this_cpu_read() on the calculations to be done from two different percpu variables, or does an open-coded read-modify-write on a single percpu variable. Disable preemption throughout these operations so that the writes go to the correct palces. [hannes@cmpxchg.org: added this_cpu to __this_cpu conversion] Signed-off-by: Johannes Weiner Signed-off-by: Steven Rostedt Cc: Greg Thelen Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01e0c725de65..7af1d5ee1598 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -686,8 +686,8 @@ static bool __memcg_event_check(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); - next = this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ return ((long)next - (long)val < 0); } @@ -696,7 +696,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); switch (target) { case MEM_CGROUP_TARGET_THRESH: @@ -712,7 +712,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) return; } - this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->stat->targets[target], next); } /* @@ -721,6 +721,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { + preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { mem_cgroup_threshold(memcg); @@ -740,6 +741,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) } #endif } + preempt_enable(); } static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -- cgit v1.2.3 From 61600f578fbd2e8ad0c90bddb9c729e7628d3813 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Wed, 2 Nov 2011 13:38:36 -0700 Subject: mm/page_cgroup.c: quiet sparse noise warning: symbol 'swap_cgroup_ctrl' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Cc: Paul Menage Cc: Li Zefan Acked-by: Balbir Singh Cc: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3749ae15a8c8..2d123f94a8df 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -360,7 +360,7 @@ struct swap_cgroup_ctrl { spinlock_t lock; }; -struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; struct swap_cgroup { unsigned short id; -- cgit v1.2.3 From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Nov 2011 13:38:39 -0700 Subject: cpusets: avoid looping when storing to mems_allowed if one node remains set {get,put}_mems_allowed() exist so that general kernel code may locklessly access a task's set of allowable nodes without having the chance that a concurrent write will cause the nodemask to be empty on configurations where MAX_NUMNODES > BITS_PER_LONG. This could incur a significant delay, however, especially in low memory conditions because the page allocator is blocking and reclaim requires get_mems_allowed() itself. It is not atypical to see writes to cpuset.mems take over 2 seconds to complete, for example. In low memory conditions, this is problematic because it's one of the most imporant times to change cpuset.mems in the first place! The only way a task's set of allowable nodes may change is through cpusets by writing to cpuset.mems and when attaching a task to a generic code is not reading the nodemask with get_mems_allowed() at the same time, and then clearing all the old nodes. This prevents the possibility that a reader will see an empty nodemask at the same time the writer is storing a new nodemask. If at least one node remains unchanged, though, it's possible to simply set all new nodes and then clear all the old nodes. Changing a task's nodemask is protected by cgroup_mutex so it's guaranteed that two threads are not changing the same task's nodemask at the same time, so the nodemask is guaranteed to be stored before another thread changes it and determines whether a node remains set or not. Signed-off-by: David Rientjes Cc: Miao Xie Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..ed0ff443f036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -963,7 +965,6 @@ repeat: nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -980,9 +981,11 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. + * for the read-side. No wait is necessary, however, if at least one + * node remains unchanged. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (masks_disjoint && + ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.2.3 From 887df07891de0435c25cffb92268fea2c621f99c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 2 Nov 2011 13:38:42 -0700 Subject: procfs: report EISDIR when reading sysctl dirs in proc On reading sysctl dirs we should return -EISDIR instead of -EINVAL. Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbef226f..dacd840a675a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -370,6 +370,7 @@ static const struct file_operations proc_sys_file_operations = { }; static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, .readdir = proc_sys_readdir, .llseek = generic_file_llseek, }; -- cgit v1.2.3 From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Wed, 2 Nov 2011 13:38:44 -0700 Subject: proc: fix races against execve() of /proc/PID/fd** fd* files are restricted to the task's owner, and other users may not get direct access to them. But one may open any of these files and run any setuid program, keeping opened file descriptors. As there are permission checks on open(), but not on readdir() and read(), operations on the kept file descriptors will not be checked. It makes it possible to violate procfs permission model. Reading fdinfo/* may disclosure current fds' position and flags, reading directory contents of fdinfo/ and fd/ may disclosure the number of opened files by the target task. This information is not sensible per se, but it can reveal some private information (like length of a password stored in a file) under certain conditions. Used existing (un)lock_trace functions to check for ptrace_may_access(), but instead of using EPERM return code from it use EACCES to be consistent with existing proc_pid_follow_link()/proc_pid_readlink() return code. If they differ, attacker can guess what fds exist by analyzing stat() return code. Patched handlers: stat() for fd/*, stat() and read() for fdindo/*, readdir() and lookup() for fd/ and fdinfo/. Signed-off-by: Vasiliy Kulikov Cc: Cyrill Gorcunov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f0087e20e16..d4f4913f00db 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,12 +1652,46 @@ out: return error; } +static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + int rc; + + if (task == NULL) + return -ESRCH; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + generic_fillattr(inode, stat); + unlock_trace(task); + rc = 0; +out_task: + put_task_struct(task); + return rc; +} + static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; +static const struct inode_operations proc_fdinfo_link_inode_operations = { + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + +static const struct inode_operations proc_fd_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + .getattr = proc_pid_fd_link_getattr, +}; + /* building an inode */ @@ -1889,49 +1923,61 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; + struct task_struct *task; + struct files_struct *files; struct file *file; int fd = proc_fd(inode); + int rc; - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; + task = get_proc_task(inode); + if (!task) + return -ENOENT; + + rc = -EACCES; + if (lock_trace(task)) + goto out_task; + + rc = -ENOENT; + files = get_files_struct(task); + if (files == NULL) + goto out_unlock; + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); } - spin_unlock(&files->file_lock); - put_files_struct(files); - } - return -ENOENT; + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + rc = 0; + } else + rc = -ENOENT; + spin_unlock(&files->file_lock); + put_files_struct(files); + +out_unlock: + unlock_trace(task); +out_task: + put_task_struct(task); + return rc; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_fd_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out; + result = instantiate(dir, dentry, task, &fd); + unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; + + retval = -EACCES; + if (lock_trace(p)) + goto out; + retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; + goto out_unlock; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out; + goto out_unlock; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } + +out_unlock: + unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; + inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.2.3 From 46cbc1d3981ee753518fbf9198a14f71a9f6841e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Nov 2011 13:38:46 -0700 Subject: ida: make ida_simple_get/put() IRQ safe It's often convenient to be able to release resource from IRQ context. Make ida_simple_*() use irqsave/restore spin ops so that they are IRQ safe. Signed-off-by: Tejun Heo Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index bbf211aea4eb..ed055b297c81 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -944,6 +944,7 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, { int ret, id; unsigned int max; + unsigned long flags; BUG_ON((int)start < 0); BUG_ON((int)end < 0); @@ -959,7 +960,7 @@ again: if (!ida_pre_get(ida, gfp_mask)) return -ENOMEM; - spin_lock(&simple_ida_lock); + spin_lock_irqsave(&simple_ida_lock, flags); ret = ida_get_new_above(ida, start, &id); if (!ret) { if (id > max) { @@ -969,7 +970,7 @@ again: ret = id; } } - spin_unlock(&simple_ida_lock); + spin_unlock_irqrestore(&simple_ida_lock, flags); if (unlikely(ret == -EAGAIN)) goto again; @@ -985,10 +986,12 @@ EXPORT_SYMBOL(ida_simple_get); */ void ida_simple_remove(struct ida *ida, unsigned int id) { + unsigned long flags; + BUG_ON((int)id < 0); - spin_lock(&simple_ida_lock); + spin_lock_irqsave(&simple_ida_lock, flags); ida_remove(ida, id); - spin_unlock(&simple_ida_lock); + spin_unlock_irqrestore(&simple_ida_lock, flags); } EXPORT_SYMBOL(ida_simple_remove); -- cgit v1.2.3 From 3c24783bb2deafaa106b7e69a97540071afc590c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:50 -0700 Subject: ipc/sem.c: fix return code race with semop vs. semop +semctl(IPC_RMID) sys_semtimedop() may return -EIDRM although the semaphore operation completed successfully: thread 1: thread 2: semtimedop(), sleeps semop(): * acquires sem_lock() semtimedop() woken up due to timeout sem_lock() loops * notices that thread 2 could be completed. * performs the operations that thread 2 is sleeping on. * marks the semaphore operation as IN_WAKEUP * drops sem_lock(), does wakeup, sets return code to 0 * thread delayed due to interrupt, whatever * returns to user space * thread still delayed semctl(IPC_RMID) * acquires sem_lock() * ipc_rmid(), ipcp->deleted=1 * drops sem_lock() * thread finally continues - but seem_lock() now fails due to ipcp->deleted == 1 * returns -EIDRM instead of 0 The fix is trivial: Always use the return code in queue.status. In real world, the race probably doesn't matter: If the semaphore array is destroyed, the app is probably not interested if the last operation succeeded or was already cancelled. Signed-off-by: Manfred Spraul Cc: Thomas Gleixner Cc: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index c8e00f8b4be1..fb13be17945b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1460,7 +1460,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * Array removed? If yes, leave without sem_unlock(). */ if (IS_ERR(sma)) { - error = -EIDRM; goto out_free; } -- cgit v1.2.3 From 0b0577f6080c0645b079dcc03fdbaf40d928beb8 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:52 -0700 Subject: ipc/sem.c: handle spurious wakeups semtimedop() does not handle spurious wakeups, it returns -EINTR to user space. Most other schedule() users would just loop and not return to user space. The patch adds such a loop to semtimedop() Signed-off-by: Manfred Spraul Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ipc/sem.c b/ipc/sem.c index fb13be17945b..227948f28ce6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1426,6 +1426,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.status = -EINTR; queue.sleeper = current; + +sleep_again: current->state = TASK_INTERRUPTIBLE; sem_unlock(sma); @@ -1478,6 +1480,13 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, */ if (timeout && jiffies_left == 0) error = -EAGAIN; + + /* + * If the wakeup was spurious, just retry + */ + if (error == -EINTR && !signal_pending(current)) + goto sleep_again; + unlink_queue(sma, &queue); out_unlock_free: -- cgit v1.2.3 From e57940d719e9fc5223d133b631f8cb5232d6064e Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:54 -0700 Subject: ipc/sem.c: remove private structures from public header file include/linux/sem.h contains several structures that are only used within ipc/sem.c. The patch moves them into ipc/sem.c - there is no need to expose the structures to the whole kernel. No functional changes, only whitespace cleanups and 80-char per line fixes. Signed-off-by: Manfred Spraul Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 42 ------------------------------------------ ipc/sem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/include/linux/sem.h b/include/linux/sem.h index 1feb2de2ee57..464842621a4a 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -83,13 +83,6 @@ struct seminfo { struct task_struct; -/* One semaphore structure for each semaphore in the system. */ -struct sem { - int semval; /* current value */ - int sempid; /* pid of last operation */ - struct list_head sem_pending; /* pending single-sop operations */ -}; - /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm ____cacheline_aligned_in_smp @@ -103,41 +96,6 @@ struct sem_array { int complex_count; /* pending complex operations */ }; -/* One queue for each sleeping process in the system. */ -struct sem_queue { - struct list_head simple_list; /* queue of pending operations */ - struct list_head list; /* queue of pending operations */ - struct task_struct *sleeper; /* this process */ - struct sem_undo *undo; /* undo structure */ - int pid; /* process id of requesting process */ - int status; /* completion status of operation */ - struct sembuf *sops; /* array of pending operations */ - int nsops; /* number of operations */ - int alter; /* does the operation alter the array? */ -}; - -/* Each task has a list of undo requests. They are executed automatically - * when the process exits. - */ -struct sem_undo { - struct list_head list_proc; /* per-process list: all undos from one process. */ - /* rcu protected */ - struct rcu_head rcu; /* rcu struct for sem_undo() */ - struct sem_undo_list *ulp; /* sem_undo_list for the process */ - struct list_head list_id; /* per semaphore array list: all undos for one array */ - int semid; /* semaphore set identifier */ - short * semadj; /* array of adjustments, one per semaphore */ -}; - -/* sem_undo_list controls shared access to the list of sem_undo structures - * that may be shared among all a CLONE_SYSVSEM task group. - */ -struct sem_undo_list { - atomic_t refcnt; - spinlock_t lock; - struct list_head list_proc; -}; - struct sysv_sem { struct sem_undo_list *undo_list; }; diff --git a/ipc/sem.c b/ipc/sem.c index 227948f28ce6..5215a81420df 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -90,6 +90,52 @@ #include #include "util.h" +/* One semaphore structure for each semaphore in the system. */ +struct sem { + int semval; /* current value */ + int sempid; /* pid of last operation */ + struct list_head sem_pending; /* pending single-sop operations */ +}; + +/* One queue for each sleeping process in the system. */ +struct sem_queue { + struct list_head simple_list; /* queue of pending operations */ + struct list_head list; /* queue of pending operations */ + struct task_struct *sleeper; /* this process */ + struct sem_undo *undo; /* undo structure */ + int pid; /* process id of requesting process */ + int status; /* completion status of operation */ + struct sembuf *sops; /* array of pending operations */ + int nsops; /* number of operations */ + int alter; /* does *sops alter the array? */ +}; + +/* Each task has a list of undo requests. They are executed automatically + * when the process exits. + */ +struct sem_undo { + struct list_head list_proc; /* per-process list: * + * all undos from one process + * rcu protected */ + struct rcu_head rcu; /* rcu struct for sem_undo */ + struct sem_undo_list *ulp; /* back ptr to sem_undo_list */ + struct list_head list_id; /* per semaphore array list: + * all undos for one array */ + int semid; /* semaphore set identifier */ + short *semadj; /* array of adjustments */ + /* one per semaphore */ +}; + +/* sem_undo_list controls shared access to the list of sem_undo structures + * that may be shared among all a CLONE_SYSVSEM task group. + */ +struct sem_undo_list { + atomic_t refcnt; + spinlock_t lock; + struct list_head list_proc; +}; + + #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -- cgit v1.2.3 From f567a18590742b811287b7512fb0908deac4eef7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:56 -0700 Subject: include/linux/sem.h: make sysv_sem empty if SYSVIPC is disabled For the sysvsem undo, each task struct contains a sysv_sem structure with a pointer to the undo information. This pointer is only necessary if sysvipc is enabled - thus the pointer can be made conditional on CONFIG_SYSVIPC. Signed-off-by: Manfred Spraul Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/sem.h b/include/linux/sem.h index 464842621a4a..10d6b226afc5 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -96,16 +96,21 @@ struct sem_array { int complex_count; /* pending complex operations */ }; +#ifdef CONFIG_SYSVIPC + struct sysv_sem { struct sem_undo_list *undo_list; }; -#ifdef CONFIG_SYSVIPC - extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else + +struct sysv_sem { + /* empty */ +}; + static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) { return 0; -- cgit v1.2.3 From 79975f1327850ef198ada994c2fc44b7d1ea8935 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Wed, 2 Nov 2011 13:38:59 -0700 Subject: init: add root=PARTUUID=UUID/PARTNROFF=%d support Expand root=PARTUUID=UUID syntax to support selecting a root partition by integer offset from a known, unique partition. This approach provides similar properties to specifying a device and partition number, but using the UUID as the unique path prior to evaluating the offset. For example, root=PARTUUID=99DE9194-FC15-4223-9192-FC243948F88B/PARTNROFF=1 selects the partition with UUID 99DE.. then select the next partition. This change is motivated by a particular usecase in Chromium OS where the bootloader can easily determine what partition it is on (by UUID) but doesn't perform general partition table walking. That said, support for this model provides a direct mechanism for the user to modify the root partition to boot without specifically needing to extract each UUID or update the bootloader explicitly when the root partition UUID is changed (if it is recreated to be larger, for instance). Pinning to a /boot-style partition UUID allows the arbitrary root partition reconfiguration/modifications with slightly less ambiguity than just [dev][partition] and less stringency than the specific root partition UUID. [sfr@canb.auug.org.au: fix init sections warning] Signed-off-by: Will Drewry Cc: Kay Sievers Cc: Randy Dunlap Cc: Namhyung Kim Cc: Trond Myklebust Cc: Jens Axboe Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index c0851a8e030c..0f6e1d985a3b 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -28,7 +28,7 @@ int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */ int root_mountflags = MS_RDONLY | MS_SILENT; static char * __initdata root_device_name; static char __initdata saved_root_name[64]; -static int __initdata root_wait; +static int root_wait; dev_t ROOT_DEV; @@ -85,12 +85,15 @@ no_match: /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid: 36 byte char array containing a hex ascii UUID + * @uuid: min 36 byte char array containing a hex ascii UUID * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search * by filesystem UUIDs. * + * If @uuid is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * * Returns the matching dev_t on success or 0 on failure. */ static dev_t devt_from_partuuid(char *uuid_str) @@ -98,6 +101,28 @@ static dev_t devt_from_partuuid(char *uuid_str) dev_t res = 0; struct device *dev = NULL; u8 uuid[16]; + struct gendisk *disk; + struct hd_struct *part; + int offset = 0; + + if (strlen(uuid_str) < 36) + goto done; + + /* Check for optional partition number offset attributes. */ + if (uuid_str[36]) { + char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(&uuid_str[36], + "/PARTNROFF=%d%c", &offset, &c) != 1) { + printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + printk(KERN_ERR + "Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + goto done; + } + } /* Pack the requested UUID in the expected format. */ part_pack_uuid(uuid_str, uuid); @@ -107,8 +132,21 @@ static dev_t devt_from_partuuid(char *uuid_str) goto done; res = dev->devt; - put_device(dev); + /* Attempt to find the partition by offset. */ + if (!offset) + goto no_offset; + + res = 0; + disk = part_to_disk(dev_to_part(dev)); + part = disk_get_part(disk, dev_to_part(dev)->partno + offset); + if (part) { + res = part_devt(part); + put_device(part_to_dev(part)); + } + +no_offset: + put_device(dev); done: return res; } @@ -126,6 +164,8 @@ done: * used when disk name of partitioned disk ends on a digit. * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the * unique id of a partition if the partition table provides it. + * 7) PARTUUID=/PARTNROFF= to select a partition in relation to + * a partition with a known unique id. * * If name doesn't have fall into the categories above, we return (0,0). * block_class is used to check if something is a disk name. If the disk @@ -143,8 +183,6 @@ dev_t name_to_dev_t(char *name) #ifdef CONFIG_BLOCK if (strncmp(name, "PARTUUID=", 9) == 0) { name += 9; - if (strlen(name) != 36) - goto fail; res = devt_from_partuuid(name); if (!res) goto fail; -- cgit v1.2.3 From a571259f4874023306db36e83054d093833b1902 Mon Sep 17 00:00:00 2001 From: Liu Gang Date: Wed, 2 Nov 2011 13:39:05 -0700 Subject: drivers/rapidio/rio-scan.c: use discovered bit to test if enumeration is complete The discovered bit in PGCCSR register indicates if the device has been discovered by system host. In Rapidio systems, some agent devices can also be master devices. They can issue requests into the system. Signed-off-by: Liu Gang Acked-by: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index ebe77dd87daf..0914f49b0a53 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -923,7 +923,7 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port, * rio_enum_complete- Tests if enumeration of a network is complete * @port: Master port to send transaction * - * Tests the Component Tag CSR for non-zero value (enumeration + * Tests the PGCCSR discovered bit for non-zero value (enumeration * complete flag). Return %1 if enumeration is complete or %0 if * enumeration is incomplete. */ @@ -933,7 +933,7 @@ static int rio_enum_complete(struct rio_mport *port) rio_local_read_config_32(port, port->phys_efptr + RIO_PORT_GEN_CTL_CSR, ®val); - return (regval & RIO_PORT_GEN_MASTER) ? 1 : 0; + return (regval & RIO_PORT_GEN_DISCOVERED) ? 1 : 0; } /** -- cgit v1.2.3 From e80dd9a7bca4057d5a09d1ba94a7ba0791e7426a Mon Sep 17 00:00:00 2001 From: Liu Gang Date: Wed, 2 Nov 2011 13:39:07 -0700 Subject: arch/powerpc/sysdev/fsl_rio.c: release rapidio port I/O region resource if port failed to initialize The "struct rio_mport" contains a member of master port I/O memory resource structure "struct resource iores". This resource will be read from device tree and be used for rapidio R/W transaction memory space. Rapidio requests the port I/O memory resource under the root resource "iomem_resource". struct rio_mport *port; port = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); request_resource(&iomem_resource, &port->iores); When port failed to initialize, allocated "rio_mport" structure memory will be freed, and the port I/O memory resource structure pointer "&port->iores" will be invalid. If other requests resource under "iomem_resource", "&port->iores" node may be operated in the child resources list and this will cause the system to crash. So the requested port I/O memory resource should be released before freeing allocated "rio_mport" structure. Signed-off-by: Liu Gang Acked-by: Alexandre Bounine Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/fsl_rio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index c65f75aa7ff7..22ffccd8bef5 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -1608,6 +1608,7 @@ int fsl_rio_setup(struct platform_device *dev) return 0; err: iounmap(priv->regs_win); + release_resource(&port->iores); err_res: kfree(priv); err_priv: -- cgit v1.2.3 From 48618fb4e522d9d02e217ac05f52749545c1af20 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 2 Nov 2011 13:39:09 -0700 Subject: RapidIO: add mport driver for Tsi721 bridge Add RapidIO mport driver for IDT TSI721 PCI Express-to-SRIO bridge device. The driver provides full set of callback functions defined for mport devices in RapidIO subsystem. It also is compatible with current version of RIONET driver (Ethernet over RapidIO messaging services). This patch is applicable to kernel versions starting from 2.6.39. Signed-off-by: Alexandre Bounine Signed-off-by: Chul Kim Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/tsi721.txt | 49 + drivers/rapidio/Kconfig | 6 +- drivers/rapidio/Makefile | 1 + drivers/rapidio/devices/Kconfig | 10 + drivers/rapidio/devices/Makefile | 5 + drivers/rapidio/devices/tsi721.c | 2360 ++++++++++++++++++++++++++++++++++++++ drivers/rapidio/devices/tsi721.h | 766 +++++++++++++ include/linux/rio_ids.h | 1 + 8 files changed, 3196 insertions(+), 2 deletions(-) create mode 100644 Documentation/rapidio/tsi721.txt create mode 100644 drivers/rapidio/devices/Kconfig create mode 100644 drivers/rapidio/devices/Makefile create mode 100644 drivers/rapidio/devices/tsi721.c create mode 100644 drivers/rapidio/devices/tsi721.h diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt new file mode 100644 index 000000000000..335f3c6087dc --- /dev/null +++ b/Documentation/rapidio/tsi721.txt @@ -0,0 +1,49 @@ +RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge. +========================================================================= + +I. Overview + +This driver implements all currently defined RapidIO mport callback functions. +It supports maintenance read and write operations, inbound and outbound RapidIO +doorbells, inbound maintenance port-writes and RapidIO messaging. + +To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA +channels. This mechanism provides access to larger range of hop counts and +destination IDs without need for changes in outbound window translation. + +RapidIO messaging support uses dedicated messaging channels for each mailbox. +For inbound messages this driver uses destination ID matching to forward messages +into the corresponding message queue. Messaging callbacks are implemented to be +fully compatible with RIONET driver (Ethernet over RapidIO messaging services). + +II. Known problems + + None. + +III. To do + + Add DMA data transfers (non-messaging). + Add inbound region (SRIO-to-PCIe) mapping. + +IV. Version History + + 1.0.0 - Initial driver release. + +V. License +----------------------------------------------- + + Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index 070211a5955c..bc8719238793 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -1,6 +1,8 @@ # # RapidIO configuration # +source "drivers/rapidio/devices/Kconfig" + config RAPIDIO_DISC_TIMEOUT int "Discovery timeout duration (seconds)" depends on RAPIDIO @@ -20,8 +22,6 @@ config RAPIDIO_ENABLE_RX_TX_PORTS ports for Input/Output direction to allow other traffic than Maintenance transfers. -source "drivers/rapidio/switches/Kconfig" - config RAPIDIO_DEBUG bool "RapidIO subsystem debug messages" depends on RAPIDIO @@ -32,3 +32,5 @@ config RAPIDIO_DEBUG going on. If you are unsure about this, say N here. + +source "drivers/rapidio/switches/Kconfig" diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile index 89b8eca825b5..ec3fb8121004 100644 --- a/drivers/rapidio/Makefile +++ b/drivers/rapidio/Makefile @@ -4,5 +4,6 @@ obj-y += rio.o rio-access.o rio-driver.o rio-scan.o rio-sysfs.o obj-$(CONFIG_RAPIDIO) += switches/ +obj-$(CONFIG_RAPIDIO) += devices/ subdir-ccflags-$(CONFIG_RAPIDIO_DEBUG) := -DDEBUG diff --git a/drivers/rapidio/devices/Kconfig b/drivers/rapidio/devices/Kconfig new file mode 100644 index 000000000000..12a9d7f7040b --- /dev/null +++ b/drivers/rapidio/devices/Kconfig @@ -0,0 +1,10 @@ +# +# RapidIO master port configuration +# + +config RAPIDIO_TSI721 + bool "IDT Tsi721 PCI Express SRIO Controller support" + depends on RAPIDIO && PCIEPORTBUS + default "n" + ---help--- + Include support for IDT Tsi721 PCI Express Serial RapidIO controller. diff --git a/drivers/rapidio/devices/Makefile b/drivers/rapidio/devices/Makefile new file mode 100644 index 000000000000..3b7b4e2dff7c --- /dev/null +++ b/drivers/rapidio/devices/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for RapidIO devices +# + +obj-$(CONFIG_RAPIDIO_TSI721) += tsi721.o diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c new file mode 100644 index 000000000000..5225930a10cd --- /dev/null +++ b/drivers/rapidio/devices/tsi721.c @@ -0,0 +1,2360 @@ +/* + * RapidIO mport driver for Tsi721 PCIExpress-to-SRIO bridge + * + * Copyright 2011 Integrated Device Technology, Inc. + * Alexandre Bounine + * Chul Kim + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tsi721.h" + +#define DEBUG_PW /* Inbound Port-Write debugging */ + +static void tsi721_omsg_handler(struct tsi721_device *priv, int ch); +static void tsi721_imsg_handler(struct tsi721_device *priv, int ch); + +/** + * tsi721_lcread - read from local SREP config space + * @mport: RapidIO master port info + * @index: ID of RapdiIO interface + * @offset: Offset into configuration space + * @len: Length (in bytes) of the maintenance transaction + * @data: Value to be read into + * + * Generates a local SREP space read. Returns %0 on + * success or %-EINVAL on failure. + */ +static int tsi721_lcread(struct rio_mport *mport, int index, u32 offset, + int len, u32 *data) +{ + struct tsi721_device *priv = mport->priv; + + if (len != sizeof(u32)) + return -EINVAL; /* only 32-bit access is supported */ + + *data = ioread32(priv->regs + offset); + + return 0; +} + +/** + * tsi721_lcwrite - write into local SREP config space + * @mport: RapidIO master port info + * @index: ID of RapdiIO interface + * @offset: Offset into configuration space + * @len: Length (in bytes) of the maintenance transaction + * @data: Value to be written + * + * Generates a local write into SREP configuration space. Returns %0 on + * success or %-EINVAL on failure. + */ +static int tsi721_lcwrite(struct rio_mport *mport, int index, u32 offset, + int len, u32 data) +{ + struct tsi721_device *priv = mport->priv; + + if (len != sizeof(u32)) + return -EINVAL; /* only 32-bit access is supported */ + + iowrite32(data, priv->regs + offset); + + return 0; +} + +/** + * tsi721_maint_dma - Helper function to generate RapidIO maintenance + * transactions using designated Tsi721 DMA channel. + * @priv: pointer to tsi721 private data + * @sys_size: RapdiIO transport system size + * @destid: Destination ID of transaction + * @hopcount: Number of hops to target device + * @offset: Offset into configuration space + * @len: Length (in bytes) of the maintenance transaction + * @data: Location to be read from or write into + * @do_wr: Operation flag (1 == MAINT_WR) + * + * Generates a RapidIO maintenance transaction (Read or Write). + * Returns %0 on success and %-EINVAL or %-EFAULT on failure. + */ +static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, + u16 destid, u8 hopcount, u32 offset, int len, + u32 *data, int do_wr) +{ + struct tsi721_dma_desc *bd_ptr; + u32 rd_count, swr_ptr, ch_stat; + int i, err = 0; + u32 op = do_wr ? MAINT_WR : MAINT_RD; + + if (offset > (RIO_MAINT_SPACE_SZ - len) || (len != sizeof(u32))) + return -EINVAL; + + bd_ptr = priv->bdma[TSI721_DMACH_MAINT].bd_base; + + rd_count = ioread32( + priv->regs + TSI721_DMAC_DRDCNT(TSI721_DMACH_MAINT)); + + /* Initialize DMA descriptor */ + bd_ptr[0].type_id = cpu_to_le32((DTYPE2 << 29) | (op << 19) | destid); + bd_ptr[0].bcount = cpu_to_le32((sys_size << 26) | 0x04); + bd_ptr[0].raddr_lo = cpu_to_le32((hopcount << 24) | offset); + bd_ptr[0].raddr_hi = 0; + if (do_wr) + bd_ptr[0].data[0] = cpu_to_be32p(data); + else + bd_ptr[0].data[0] = 0xffffffff; + + mb(); + + /* Start DMA operation */ + iowrite32(rd_count + 2, + priv->regs + TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT)); + ioread32(priv->regs + TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT)); + i = 0; + + /* Wait until DMA transfer is finished */ + while ((ch_stat = ioread32(priv->regs + + TSI721_DMAC_STS(TSI721_DMACH_MAINT))) & TSI721_DMAC_STS_RUN) { + udelay(1); + if (++i >= 5000000) { + dev_dbg(&priv->pdev->dev, + "%s : DMA[%d] read timeout ch_status=%x\n", + __func__, TSI721_DMACH_MAINT, ch_stat); + if (!do_wr) + *data = 0xffffffff; + err = -EIO; + goto err_out; + } + } + + if (ch_stat & TSI721_DMAC_STS_ABORT) { + /* If DMA operation aborted due to error, + * reinitialize DMA channel + */ + dev_dbg(&priv->pdev->dev, "%s : DMA ABORT ch_stat=%x\n", + __func__, ch_stat); + dev_dbg(&priv->pdev->dev, "OP=%d : destid=%x hc=%x off=%x\n", + do_wr ? MAINT_WR : MAINT_RD, destid, hopcount, offset); + iowrite32(TSI721_DMAC_INT_ALL, + priv->regs + TSI721_DMAC_INT(TSI721_DMACH_MAINT)); + iowrite32(TSI721_DMAC_CTL_INIT, + priv->regs + TSI721_DMAC_CTL(TSI721_DMACH_MAINT)); + udelay(10); + iowrite32(0, priv->regs + + TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT)); + udelay(1); + if (!do_wr) + *data = 0xffffffff; + err = -EIO; + goto err_out; + } + + if (!do_wr) + *data = be32_to_cpu(bd_ptr[0].data[0]); + + /* + * Update descriptor status FIFO RD pointer. + * NOTE: Skipping check and clear FIFO entries because we are waiting + * for transfer to be completed. + */ + swr_ptr = ioread32(priv->regs + TSI721_DMAC_DSWP(TSI721_DMACH_MAINT)); + iowrite32(swr_ptr, priv->regs + TSI721_DMAC_DSRP(TSI721_DMACH_MAINT)); +err_out: + + return err; +} + +/** + * tsi721_cread_dma - Generate a RapidIO maintenance read transaction + * using Tsi721 BDMA engine. + * @mport: RapidIO master port control structure + * @index: ID of RapdiIO interface + * @destid: Destination ID of transaction + * @hopcount: Number of hops to target device + * @offset: Offset into configuration space + * @len: Length (in bytes) of the maintenance transaction + * @val: Location to be read into + * + * Generates a RapidIO maintenance read transaction. + * Returns %0 on success and %-EINVAL or %-EFAULT on failure. + */ +static int tsi721_cread_dma(struct rio_mport *mport, int index, u16 destid, + u8 hopcount, u32 offset, int len, u32 *data) +{ + struct tsi721_device *priv = mport->priv; + + return tsi721_maint_dma(priv, mport->sys_size, destid, hopcount, + offset, len, data, 0); +} + +/** + * tsi721_cwrite_dma - Generate a RapidIO maintenance write transaction + * using Tsi721 BDMA engine + * @mport: RapidIO master port control structure + * @index: ID of RapdiIO interface + * @destid: Destination ID of transaction + * @hopcount: Number of hops to target device + * @offset: Offset into configuration space + * @len: Length (in bytes) of the maintenance transaction + * @val: Value to be written + * + * Generates a RapidIO maintenance write transaction. + * Returns %0 on success and %-EINVAL or %-EFAULT on failure. + */ +static int tsi721_cwrite_dma(struct rio_mport *mport, int index, u16 destid, + u8 hopcount, u32 offset, int len, u32 data) +{ + struct tsi721_device *priv = mport->priv; + u32 temp = data; + + return tsi721_maint_dma(priv, mport->sys_size, destid, hopcount, + offset, len, &temp, 1); +} + +/** + * tsi721_pw_handler - Tsi721 inbound port-write interrupt handler + * @mport: RapidIO master port structure + * + * Handles inbound port-write interrupts. Copies PW message from an internal + * buffer into PW message FIFO and schedules deferred routine to process + * queued messages. + */ +static int +tsi721_pw_handler(struct rio_mport *mport) +{ + struct tsi721_device *priv = mport->priv; + u32 pw_stat; + u32 pw_buf[TSI721_RIO_PW_MSG_SIZE/sizeof(u32)]; + + + pw_stat = ioread32(priv->regs + TSI721_RIO_PW_RX_STAT); + + if (pw_stat & TSI721_RIO_PW_RX_STAT_PW_VAL) { + pw_buf[0] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(0)); + pw_buf[1] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(1)); + pw_buf[2] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(2)); + pw_buf[3] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(3)); + + /* Queue PW message (if there is room in FIFO), + * otherwise discard it. + */ + spin_lock(&priv->pw_fifo_lock); + if (kfifo_avail(&priv->pw_fifo) >= TSI721_RIO_PW_MSG_SIZE) + kfifo_in(&priv->pw_fifo, pw_buf, + TSI721_RIO_PW_MSG_SIZE); + else + priv->pw_discard_count++; + spin_unlock(&priv->pw_fifo_lock); + } + + /* Clear pending PW interrupts */ + iowrite32(TSI721_RIO_PW_RX_STAT_PW_DISC | TSI721_RIO_PW_RX_STAT_PW_VAL, + priv->regs + TSI721_RIO_PW_RX_STAT); + + schedule_work(&priv->pw_work); + + return 0; +} + +static void tsi721_pw_dpc(struct work_struct *work) +{ + struct tsi721_device *priv = container_of(work, struct tsi721_device, + pw_work); + u32 msg_buffer[RIO_PW_MSG_SIZE/sizeof(u32)]; /* Use full size PW message + buffer for RIO layer */ + + /* + * Process port-write messages + */ + while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)msg_buffer, + TSI721_RIO_PW_MSG_SIZE, &priv->pw_fifo_lock)) { + /* Process one message */ +#ifdef DEBUG_PW + { + u32 i; + pr_debug("%s : Port-Write Message:", __func__); + for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32); ) { + pr_debug("0x%02x: %08x %08x %08x %08x", i*4, + msg_buffer[i], msg_buffer[i + 1], + msg_buffer[i + 2], msg_buffer[i + 3]); + i += 4; + } + pr_debug("\n"); + } +#endif + /* Pass the port-write message to RIO core for processing */ + rio_inb_pwrite_handler((union rio_pw_msg *)msg_buffer); + } +} + +/** + * tsi721_pw_enable - enable/disable port-write interface init + * @mport: Master port implementing the port write unit + * @enable: 1=enable; 0=disable port-write message handling + */ +static int tsi721_pw_enable(struct rio_mport *mport, int enable) +{ + struct tsi721_device *priv = mport->priv; + u32 rval; + + rval = ioread32(priv->regs + TSI721_RIO_EM_INT_ENABLE); + + if (enable) + rval |= TSI721_RIO_EM_INT_ENABLE_PW_RX; + else + rval &= ~TSI721_RIO_EM_INT_ENABLE_PW_RX; + + /* Clear pending PW interrupts */ + iowrite32(TSI721_RIO_PW_RX_STAT_PW_DISC | TSI721_RIO_PW_RX_STAT_PW_VAL, + priv->regs + TSI721_RIO_PW_RX_STAT); + /* Update enable bits */ + iowrite32(rval, priv->regs + TSI721_RIO_EM_INT_ENABLE); + + return 0; +} + +/** + * tsi721_dsend - Send a RapidIO doorbell + * @mport: RapidIO master port info + * @index: ID of RapidIO interface + * @destid: Destination ID of target device + * @data: 16-bit info field of RapidIO doorbell + * + * Sends a RapidIO doorbell message. Always returns %0. + */ +static int tsi721_dsend(struct rio_mport *mport, int index, + u16 destid, u16 data) +{ + struct tsi721_device *priv = mport->priv; + u32 offset; + + offset = (((mport->sys_size) ? RIO_TT_CODE_16 : RIO_TT_CODE_8) << 18) | + (destid << 2); + + dev_dbg(&priv->pdev->dev, + "Send Doorbell 0x%04x to destID 0x%x\n", data, destid); + iowrite16be(data, priv->odb_base + offset); + + return 0; +} + +/** + * tsi721_dbell_handler - Tsi721 doorbell interrupt handler + * @mport: RapidIO master port structure + * + * Handles inbound doorbell interrupts. Copies doorbell entry from an internal + * buffer into DB message FIFO and schedules deferred routine to process + * queued DBs. + */ +static int +tsi721_dbell_handler(struct rio_mport *mport) +{ + struct tsi721_device *priv = mport->priv; + u32 regval; + + /* Disable IDB interrupts */ + regval = ioread32(priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); + regval &= ~TSI721_SR_CHINT_IDBQRCV; + iowrite32(regval, + priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); + + schedule_work(&priv->idb_work); + + return 0; +} + +static void tsi721_db_dpc(struct work_struct *work) +{ + struct tsi721_device *priv = container_of(work, struct tsi721_device, + idb_work); + struct rio_mport *mport; + struct rio_dbell *dbell; + int found = 0; + u32 wr_ptr, rd_ptr; + u64 *idb_entry; + u32 regval; + union { + u64 msg; + u8 bytes[8]; + } idb; + + /* + * Process queued inbound doorbells + */ + mport = priv->mport; + + wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)); + rd_ptr = ioread32(priv->regs + TSI721_IDQ_RP(IDB_QUEUE)); + + while (wr_ptr != rd_ptr) { + idb_entry = (u64 *)(priv->idb_base + + (TSI721_IDB_ENTRY_SIZE * rd_ptr)); + rd_ptr++; + idb.msg = *idb_entry; + *idb_entry = 0; + + /* Process one doorbell */ + list_for_each_entry(dbell, &mport->dbells, node) { + if ((dbell->res->start <= DBELL_INF(idb.bytes)) && + (dbell->res->end >= DBELL_INF(idb.bytes))) { + found = 1; + break; + } + } + + if (found) { + dbell->dinb(mport, dbell->dev_id, DBELL_SID(idb.bytes), + DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); + } else { + dev_dbg(&priv->pdev->dev, + "spurious inb doorbell, sid %2.2x tid %2.2x" + " info %4.4x\n", DBELL_SID(idb.bytes), + DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); + } + } + + iowrite32(rd_ptr & (IDB_QSIZE - 1), + priv->regs + TSI721_IDQ_RP(IDB_QUEUE)); + + /* Re-enable IDB interrupts */ + regval = ioread32(priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); + regval |= TSI721_SR_CHINT_IDBQRCV; + iowrite32(regval, + priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); +} + +/** + * tsi721_irqhandler - Tsi721 interrupt handler + * @irq: Linux interrupt number + * @ptr: Pointer to interrupt-specific data (mport structure) + * + * Handles Tsi721 interrupts signaled using MSI and INTA. Checks reported + * interrupt events and calls an event-specific handler(s). + */ +static irqreturn_t tsi721_irqhandler(int irq, void *ptr) +{ + struct rio_mport *mport = (struct rio_mport *)ptr; + struct tsi721_device *priv = mport->priv; + u32 dev_int; + u32 dev_ch_int; + u32 intval; + u32 ch_inte; + + dev_int = ioread32(priv->regs + TSI721_DEV_INT); + if (!dev_int) + return IRQ_NONE; + + dev_ch_int = ioread32(priv->regs + TSI721_DEV_CHAN_INT); + + if (dev_int & TSI721_DEV_INT_SR2PC_CH) { + /* Service SR2PC Channel interrupts */ + if (dev_ch_int & TSI721_INT_SR2PC_CHAN(IDB_QUEUE)) { + /* Service Inbound Doorbell interrupt */ + intval = ioread32(priv->regs + + TSI721_SR_CHINT(IDB_QUEUE)); + if (intval & TSI721_SR_CHINT_IDBQRCV) + tsi721_dbell_handler(mport); + else + dev_info(&priv->pdev->dev, + "Unsupported SR_CH_INT %x\n", intval); + + /* Clear interrupts */ + iowrite32(intval, + priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + } + } + + if (dev_int & TSI721_DEV_INT_SMSG_CH) { + int ch; + + /* + * Service channel interrupts from Messaging Engine + */ + + if (dev_ch_int & TSI721_INT_IMSG_CHAN_M) { /* Inbound Msg */ + /* Disable signaled OB MSG Channel interrupts */ + ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + ch_inte &= ~(dev_ch_int & TSI721_INT_IMSG_CHAN_M); + iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE); + + /* + * Process Inbound Message interrupt for each MBOX + */ + for (ch = 4; ch < RIO_MAX_MBOX + 4; ch++) { + if (!(dev_ch_int & TSI721_INT_IMSG_CHAN(ch))) + continue; + tsi721_imsg_handler(priv, ch); + } + } + + if (dev_ch_int & TSI721_INT_OMSG_CHAN_M) { /* Outbound Msg */ + /* Disable signaled OB MSG Channel interrupts */ + ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + ch_inte &= ~(dev_ch_int & TSI721_INT_OMSG_CHAN_M); + iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE); + + /* + * Process Outbound Message interrupts for each MBOX + */ + + for (ch = 0; ch < RIO_MAX_MBOX; ch++) { + if (!(dev_ch_int & TSI721_INT_OMSG_CHAN(ch))) + continue; + tsi721_omsg_handler(priv, ch); + } + } + } + + if (dev_int & TSI721_DEV_INT_SRIO) { + /* Service SRIO MAC interrupts */ + intval = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT); + if (intval & TSI721_RIO_EM_INT_STAT_PW_RX) + tsi721_pw_handler(mport); + } + + return IRQ_HANDLED; +} + +static void tsi721_interrupts_init(struct tsi721_device *priv) +{ + u32 intr; + + /* Enable IDB interrupts */ + iowrite32(TSI721_SR_CHINT_ALL, + priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + iowrite32(TSI721_SR_CHINT_IDBQRCV, + priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); + iowrite32(TSI721_INT_SR2PC_CHAN(IDB_QUEUE), + priv->regs + TSI721_DEV_CHAN_INTE); + + /* Enable SRIO MAC interrupts */ + iowrite32(TSI721_RIO_EM_DEV_INT_EN_INT, + priv->regs + TSI721_RIO_EM_DEV_INT_EN); + + if (priv->flags & TSI721_USING_MSIX) + intr = TSI721_DEV_INT_SRIO; + else + intr = TSI721_DEV_INT_SR2PC_CH | TSI721_DEV_INT_SRIO | + TSI721_DEV_INT_SMSG_CH; + + iowrite32(intr, priv->regs + TSI721_DEV_INTE); + ioread32(priv->regs + TSI721_DEV_INTE); +} + +#ifdef CONFIG_PCI_MSI +/** + * tsi721_omsg_msix - MSI-X interrupt handler for outbound messaging + * @irq: Linux interrupt number + * @ptr: Pointer to interrupt-specific data (mport structure) + * + * Handles outbound messaging interrupts signaled using MSI-X. + */ +static irqreturn_t tsi721_omsg_msix(int irq, void *ptr) +{ + struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + int mbox; + + mbox = (irq - priv->msix[TSI721_VECT_OMB0_DONE].vector) % RIO_MAX_MBOX; + tsi721_omsg_handler(priv, mbox); + return IRQ_HANDLED; +} + +/** + * tsi721_imsg_msix - MSI-X interrupt handler for inbound messaging + * @irq: Linux interrupt number + * @ptr: Pointer to interrupt-specific data (mport structure) + * + * Handles inbound messaging interrupts signaled using MSI-X. + */ +static irqreturn_t tsi721_imsg_msix(int irq, void *ptr) +{ + struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + int mbox; + + mbox = (irq - priv->msix[TSI721_VECT_IMB0_RCV].vector) % RIO_MAX_MBOX; + tsi721_imsg_handler(priv, mbox + 4); + return IRQ_HANDLED; +} + +/** + * tsi721_srio_msix - Tsi721 MSI-X SRIO MAC interrupt handler + * @irq: Linux interrupt number + * @ptr: Pointer to interrupt-specific data (mport structure) + * + * Handles Tsi721 interrupts from SRIO MAC. + */ +static irqreturn_t tsi721_srio_msix(int irq, void *ptr) +{ + struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + u32 srio_int; + + /* Service SRIO MAC interrupts */ + srio_int = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT); + if (srio_int & TSI721_RIO_EM_INT_STAT_PW_RX) + tsi721_pw_handler((struct rio_mport *)ptr); + + return IRQ_HANDLED; +} + +/** + * tsi721_sr2pc_ch_msix - Tsi721 MSI-X SR2PC Channel interrupt handler + * @irq: Linux interrupt number + * @ptr: Pointer to interrupt-specific data (mport structure) + * + * Handles Tsi721 interrupts from SR2PC Channel. + * NOTE: At this moment services only one SR2PC channel associated with inbound + * doorbells. + */ +static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr) +{ + struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + u32 sr_ch_int; + + /* Service Inbound DB interrupt from SR2PC channel */ + sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + if (sr_ch_int & TSI721_SR_CHINT_IDBQRCV) + tsi721_dbell_handler((struct rio_mport *)ptr); + + /* Clear interrupts */ + iowrite32(sr_ch_int, priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + /* Read back to ensure that interrupt was cleared */ + sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); + + return IRQ_HANDLED; +} + +/** + * tsi721_request_msix - register interrupt service for MSI-X mode. + * @mport: RapidIO master port structure + * + * Registers MSI-X interrupt service routines for interrupts that are active + * immediately after mport initialization. Messaging interrupt service routines + * should be registered during corresponding open requests. + */ +static int tsi721_request_msix(struct rio_mport *mport) +{ + struct tsi721_device *priv = mport->priv; + int err = 0; + + err = request_irq(priv->msix[TSI721_VECT_IDB].vector, + tsi721_sr2pc_ch_msix, 0, + priv->msix[TSI721_VECT_IDB].irq_name, (void *)mport); + if (err) + goto out; + + err = request_irq(priv->msix[TSI721_VECT_PWRX].vector, + tsi721_srio_msix, 0, + priv->msix[TSI721_VECT_PWRX].irq_name, (void *)mport); + if (err) + free_irq( + priv->msix[TSI721_VECT_IDB].vector, + (void *)mport); +out: + return err; +} + +/** + * tsi721_enable_msix - Attempts to enable MSI-X support for Tsi721. + * @priv: pointer to tsi721 private data + * + * Configures MSI-X support for Tsi721. Supports only an exact number + * of requested vectors. + */ +static int tsi721_enable_msix(struct tsi721_device *priv) +{ + struct msix_entry entries[TSI721_VECT_MAX]; + int err; + int i; + + entries[TSI721_VECT_IDB].entry = TSI721_MSIX_SR2PC_IDBQ_RCV(IDB_QUEUE); + entries[TSI721_VECT_PWRX].entry = TSI721_MSIX_SRIO_MAC_INT; + + /* + * Initialize MSI-X entries for Messaging Engine: + * this driver supports four RIO mailboxes (inbound and outbound) + * NOTE: Inbound message MBOX 0...4 use IB channels 4...7. Therefore + * offset +4 is added to IB MBOX number. + */ + for (i = 0; i < RIO_MAX_MBOX; i++) { + entries[TSI721_VECT_IMB0_RCV + i].entry = + TSI721_MSIX_IMSG_DQ_RCV(i + 4); + entries[TSI721_VECT_IMB0_INT + i].entry = + TSI721_MSIX_IMSG_INT(i + 4); + entries[TSI721_VECT_OMB0_DONE + i].entry = + TSI721_MSIX_OMSG_DONE(i); + entries[TSI721_VECT_OMB0_INT + i].entry = + TSI721_MSIX_OMSG_INT(i); + } + + err = pci_enable_msix(priv->pdev, entries, ARRAY_SIZE(entries)); + if (err) { + if (err > 0) + dev_info(&priv->pdev->dev, + "Only %d MSI-X vectors available, " + "not using MSI-X\n", err); + return err; + } + + /* + * Copy MSI-X vector information into tsi721 private structure + */ + priv->msix[TSI721_VECT_IDB].vector = entries[TSI721_VECT_IDB].vector; + snprintf(priv->msix[TSI721_VECT_IDB].irq_name, IRQ_DEVICE_NAME_MAX, + DRV_NAME "-idb@pci:%s", pci_name(priv->pdev)); + priv->msix[TSI721_VECT_PWRX].vector = entries[TSI721_VECT_PWRX].vector; + snprintf(priv->msix[TSI721_VECT_PWRX].irq_name, IRQ_DEVICE_NAME_MAX, + DRV_NAME "-pwrx@pci:%s", pci_name(priv->pdev)); + + for (i = 0; i < RIO_MAX_MBOX; i++) { + priv->msix[TSI721_VECT_IMB0_RCV + i].vector = + entries[TSI721_VECT_IMB0_RCV + i].vector; + snprintf(priv->msix[TSI721_VECT_IMB0_RCV + i].irq_name, + IRQ_DEVICE_NAME_MAX, DRV_NAME "-imbr%d@pci:%s", + i, pci_name(priv->pdev)); + + priv->msix[TSI721_VECT_IMB0_INT + i].vector = + entries[TSI721_VECT_IMB0_INT + i].vector; + snprintf(priv->msix[TSI721_VECT_IMB0_INT + i].irq_name, + IRQ_DEVICE_NAME_MAX, DRV_NAME "-imbi%d@pci:%s", + i, pci_name(priv->pdev)); + + priv->msix[TSI721_VECT_OMB0_DONE + i].vector = + entries[TSI721_VECT_OMB0_DONE + i].vector; + snprintf(priv->msix[TSI721_VECT_OMB0_DONE + i].irq_name, + IRQ_DEVICE_NAME_MAX, DRV_NAME "-ombd%d@pci:%s", + i, pci_name(priv->pdev)); + + priv->msix[TSI721_VECT_OMB0_INT + i].vector = + entries[TSI721_VECT_OMB0_INT + i].vector; + snprintf(priv->msix[TSI721_VECT_OMB0_INT + i].irq_name, + IRQ_DEVICE_NAME_MAX, DRV_NAME "-ombi%d@pci:%s", + i, pci_name(priv->pdev)); + } + + return 0; +} +#endif /* CONFIG_PCI_MSI */ + +static int tsi721_request_irq(struct rio_mport *mport) +{ + struct tsi721_device *priv = mport->priv; + int err; + +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) + err = tsi721_request_msix(mport); + else +#endif + err = request_irq(priv->pdev->irq, tsi721_irqhandler, + (priv->flags & TSI721_USING_MSI) ? 0 : IRQF_SHARED, + DRV_NAME, (void *)mport); + + if (err) + dev_err(&priv->pdev->dev, + "Unable to allocate interrupt, Error: %d\n", err); + + return err; +} + +/** + * tsi721_init_pc2sr_mapping - initializes outbound (PCIe->SRIO) + * translation regions. + * @priv: pointer to tsi721 private data + * + * Disables SREP translation regions. + */ +static void tsi721_init_pc2sr_mapping(struct tsi721_device *priv) +{ + int i; + + /* Disable all PC2SR translation windows */ + for (i = 0; i < TSI721_OBWIN_NUM; i++) + iowrite32(0, priv->regs + TSI721_OBWINLB(i)); +} + +/** + * tsi721_init_sr2pc_mapping - initializes inbound (SRIO->PCIe) + * translation regions. + * @priv: pointer to tsi721 private data + * + * Disables inbound windows. + */ +static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv) +{ + int i; + + /* Disable all SR2PC inbound windows */ + for (i = 0; i < TSI721_IBWIN_NUM; i++) + iowrite32(0, priv->regs + TSI721_IBWINLB(i)); +} + +/** + * tsi721_port_write_init - Inbound port write interface init + * @priv: pointer to tsi721 private data + * + * Initializes inbound port write handler. + * Returns %0 on success or %-ENOMEM on failure. + */ +static int tsi721_port_write_init(struct tsi721_device *priv) +{ + priv->pw_discard_count = 0; + INIT_WORK(&priv->pw_work, tsi721_pw_dpc); + spin_lock_init(&priv->pw_fifo_lock); + if (kfifo_alloc(&priv->pw_fifo, + TSI721_RIO_PW_MSG_SIZE * 32, GFP_KERNEL)) { + dev_err(&priv->pdev->dev, "PW FIFO allocation failed\n"); + return -ENOMEM; + } + + /* Use reliable port-write capture mode */ + iowrite32(TSI721_RIO_PW_CTL_PWC_REL, priv->regs + TSI721_RIO_PW_CTL); + return 0; +} + +static int tsi721_doorbell_init(struct tsi721_device *priv) +{ + /* Outbound Doorbells do not require any setup. + * Tsi721 uses dedicated PCI BAR1 to generate doorbells. + * That BAR1 was mapped during the probe routine. + */ + + /* Initialize Inbound Doorbell processing DPC and queue */ + priv->db_discard_count = 0; + INIT_WORK(&priv->idb_work, tsi721_db_dpc); + + /* Allocate buffer for inbound doorbells queue */ + priv->idb_base = dma_alloc_coherent(&priv->pdev->dev, + IDB_QSIZE * TSI721_IDB_ENTRY_SIZE, + &priv->idb_dma, GFP_KERNEL); + if (!priv->idb_base) + return -ENOMEM; + + memset(priv->idb_base, 0, IDB_QSIZE * TSI721_IDB_ENTRY_SIZE); + + dev_dbg(&priv->pdev->dev, "Allocated IDB buffer @ %p (phys = %llx)\n", + priv->idb_base, (unsigned long long)priv->idb_dma); + + iowrite32(TSI721_IDQ_SIZE_VAL(IDB_QSIZE), + priv->regs + TSI721_IDQ_SIZE(IDB_QUEUE)); + iowrite32(((u64)priv->idb_dma >> 32), + priv->regs + TSI721_IDQ_BASEU(IDB_QUEUE)); + iowrite32(((u64)priv->idb_dma & TSI721_IDQ_BASEL_ADDR), + priv->regs + TSI721_IDQ_BASEL(IDB_QUEUE)); + /* Enable accepting all inbound doorbells */ + iowrite32(0, priv->regs + TSI721_IDQ_MASK(IDB_QUEUE)); + + iowrite32(TSI721_IDQ_INIT, priv->regs + TSI721_IDQ_CTL(IDB_QUEUE)); + + iowrite32(0, priv->regs + TSI721_IDQ_RP(IDB_QUEUE)); + + return 0; +} + +static void tsi721_doorbell_free(struct tsi721_device *priv) +{ + if (priv->idb_base == NULL) + return; + + /* Free buffer allocated for inbound doorbell queue */ + dma_free_coherent(&priv->pdev->dev, IDB_QSIZE * TSI721_IDB_ENTRY_SIZE, + priv->idb_base, priv->idb_dma); + priv->idb_base = NULL; +} + +static int tsi721_bdma_ch_init(struct tsi721_device *priv, int chnum) +{ + struct tsi721_dma_desc *bd_ptr; + u64 *sts_ptr; + dma_addr_t bd_phys, sts_phys; + int sts_size; + int bd_num = priv->bdma[chnum].bd_num; + + dev_dbg(&priv->pdev->dev, "Init Block DMA Engine, CH%d\n", chnum); + + /* + * Initialize DMA channel for maintenance requests + */ + + /* Allocate space for DMA descriptors */ + bd_ptr = dma_alloc_coherent(&priv->pdev->dev, + bd_num * sizeof(struct tsi721_dma_desc), + &bd_phys, GFP_KERNEL); + if (!bd_ptr) + return -ENOMEM; + + priv->bdma[chnum].bd_phys = bd_phys; + priv->bdma[chnum].bd_base = bd_ptr; + + memset(bd_ptr, 0, bd_num * sizeof(struct tsi721_dma_desc)); + + dev_dbg(&priv->pdev->dev, "DMA descriptors @ %p (phys = %llx)\n", + bd_ptr, (unsigned long long)bd_phys); + + /* Allocate space for descriptor status FIFO */ + sts_size = (bd_num >= TSI721_DMA_MINSTSSZ) ? + bd_num : TSI721_DMA_MINSTSSZ; + sts_size = roundup_pow_of_two(sts_size); + sts_ptr = dma_alloc_coherent(&priv->pdev->dev, + sts_size * sizeof(struct tsi721_dma_sts), + &sts_phys, GFP_KERNEL); + if (!sts_ptr) { + /* Free space allocated for DMA descriptors */ + dma_free_coherent(&priv->pdev->dev, + bd_num * sizeof(struct tsi721_dma_desc), + bd_ptr, bd_phys); + priv->bdma[chnum].bd_base = NULL; + return -ENOMEM; + } + + priv->bdma[chnum].sts_phys = sts_phys; + priv->bdma[chnum].sts_base = sts_ptr; + priv->bdma[chnum].sts_size = sts_size; + + memset(sts_ptr, 0, sts_size); + + dev_dbg(&priv->pdev->dev, + "desc status FIFO @ %p (phys = %llx) size=0x%x\n", + sts_ptr, (unsigned long long)sts_phys, sts_size); + + /* Initialize DMA descriptors ring */ + bd_ptr[bd_num - 1].type_id = cpu_to_le32(DTYPE3 << 29); + bd_ptr[bd_num - 1].next_lo = cpu_to_le32((u64)bd_phys & + TSI721_DMAC_DPTRL_MASK); + bd_ptr[bd_num - 1].next_hi = cpu_to_le32((u64)bd_phys >> 32); + + /* Setup DMA descriptor pointers */ + iowrite32(((u64)bd_phys >> 32), + priv->regs + TSI721_DMAC_DPTRH(chnum)); + iowrite32(((u64)bd_phys & TSI721_DMAC_DPTRL_MASK), + priv->regs + TSI721_DMAC_DPTRL(chnum)); + + /* Setup descriptor status FIFO */ + iowrite32(((u64)sts_phys >> 32), + priv->regs + TSI721_DMAC_DSBH(chnum)); + iowrite32(((u64)sts_phys & TSI721_DMAC_DSBL_MASK), + priv->regs + TSI721_DMAC_DSBL(chnum)); + iowrite32(TSI721_DMAC_DSSZ_SIZE(sts_size), + priv->regs + TSI721_DMAC_DSSZ(chnum)); + + /* Clear interrupt bits */ + iowrite32(TSI721_DMAC_INT_ALL, + priv->regs + TSI721_DMAC_INT(chnum)); + + ioread32(priv->regs + TSI721_DMAC_INT(chnum)); + + /* Toggle DMA channel initialization */ + iowrite32(TSI721_DMAC_CTL_INIT, priv->regs + TSI721_DMAC_CTL(chnum)); + ioread32(priv->regs + TSI721_DMAC_CTL(chnum)); + udelay(10); + + return 0; +} + +static int tsi721_bdma_ch_free(struct tsi721_device *priv, int chnum) +{ + u32 ch_stat; + + if (priv->bdma[chnum].bd_base == NULL) + return 0; + + /* Check if DMA channel still running */ + ch_stat = ioread32(priv->regs + TSI721_DMAC_STS(chnum)); + if (ch_stat & TSI721_DMAC_STS_RUN) + return -EFAULT; + + /* Put DMA channel into init state */ + iowrite32(TSI721_DMAC_CTL_INIT, + priv->regs + TSI721_DMAC_CTL(chnum)); + + /* Free space allocated for DMA descriptors */ + dma_free_coherent(&priv->pdev->dev, + priv->bdma[chnum].bd_num * sizeof(struct tsi721_dma_desc), + priv->bdma[chnum].bd_base, priv->bdma[chnum].bd_phys); + priv->bdma[chnum].bd_base = NULL; + + /* Free space allocated for status FIFO */ + dma_free_coherent(&priv->pdev->dev, + priv->bdma[chnum].sts_size * sizeof(struct tsi721_dma_sts), + priv->bdma[chnum].sts_base, priv->bdma[chnum].sts_phys); + priv->bdma[chnum].sts_base = NULL; + return 0; +} + +static int tsi721_bdma_init(struct tsi721_device *priv) +{ + /* Initialize BDMA channel allocated for RapidIO maintenance read/write + * request generation + */ + priv->bdma[TSI721_DMACH_MAINT].bd_num = 2; + if (tsi721_bdma_ch_init(priv, TSI721_DMACH_MAINT)) { + dev_err(&priv->pdev->dev, "Unable to initialize maintenance DMA" + " channel %d, aborting\n", TSI721_DMACH_MAINT); + return -ENOMEM; + } + + return 0; +} + +static void tsi721_bdma_free(struct tsi721_device *priv) +{ + tsi721_bdma_ch_free(priv, TSI721_DMACH_MAINT); +} + +/* Enable Inbound Messaging Interrupts */ +static void +tsi721_imsg_interrupt_enable(struct tsi721_device *priv, int ch, + u32 inte_mask) +{ + u32 rval; + + if (!inte_mask) + return; + + /* Clear pending Inbound Messaging interrupts */ + iowrite32(inte_mask, priv->regs + TSI721_IBDMAC_INT(ch)); + + /* Enable Inbound Messaging interrupts */ + rval = ioread32(priv->regs + TSI721_IBDMAC_INTE(ch)); + iowrite32(rval | inte_mask, priv->regs + TSI721_IBDMAC_INTE(ch)); + + if (priv->flags & TSI721_USING_MSIX) + return; /* Finished if we are in MSI-X mode */ + + /* + * For MSI and INTA interrupt signalling we need to enable next levels + */ + + /* Enable Device Channel Interrupt */ + rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + iowrite32(rval | TSI721_INT_IMSG_CHAN(ch), + priv->regs + TSI721_DEV_CHAN_INTE); +} + +/* Disable Inbound Messaging Interrupts */ +static void +tsi721_imsg_interrupt_disable(struct tsi721_device *priv, int ch, + u32 inte_mask) +{ + u32 rval; + + if (!inte_mask) + return; + + /* Clear pending Inbound Messaging interrupts */ + iowrite32(inte_mask, priv->regs + TSI721_IBDMAC_INT(ch)); + + /* Disable Inbound Messaging interrupts */ + rval = ioread32(priv->regs + TSI721_IBDMAC_INTE(ch)); + rval &= ~inte_mask; + iowrite32(rval, priv->regs + TSI721_IBDMAC_INTE(ch)); + + if (priv->flags & TSI721_USING_MSIX) + return; /* Finished if we are in MSI-X mode */ + + /* + * For MSI and INTA interrupt signalling we need to disable next levels + */ + + /* Disable Device Channel Interrupt */ + rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + rval &= ~TSI721_INT_IMSG_CHAN(ch); + iowrite32(rval, priv->regs + TSI721_DEV_CHAN_INTE); +} + +/* Enable Outbound Messaging interrupts */ +static void +tsi721_omsg_interrupt_enable(struct tsi721_device *priv, int ch, + u32 inte_mask) +{ + u32 rval; + + if (!inte_mask) + return; + + /* Clear pending Outbound Messaging interrupts */ + iowrite32(inte_mask, priv->regs + TSI721_OBDMAC_INT(ch)); + + /* Enable Outbound Messaging channel interrupts */ + rval = ioread32(priv->regs + TSI721_OBDMAC_INTE(ch)); + iowrite32(rval | inte_mask, priv->regs + TSI721_OBDMAC_INTE(ch)); + + if (priv->flags & TSI721_USING_MSIX) + return; /* Finished if we are in MSI-X mode */ + + /* + * For MSI and INTA interrupt signalling we need to enable next levels + */ + + /* Enable Device Channel Interrupt */ + rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + iowrite32(rval | TSI721_INT_OMSG_CHAN(ch), + priv->regs + TSI721_DEV_CHAN_INTE); +} + +/* Disable Outbound Messaging interrupts */ +static void +tsi721_omsg_interrupt_disable(struct tsi721_device *priv, int ch, + u32 inte_mask) +{ + u32 rval; + + if (!inte_mask) + return; + + /* Clear pending Outbound Messaging interrupts */ + iowrite32(inte_mask, priv->regs + TSI721_OBDMAC_INT(ch)); + + /* Disable Outbound Messaging interrupts */ + rval = ioread32(priv->regs + TSI721_OBDMAC_INTE(ch)); + rval &= ~inte_mask; + iowrite32(rval, priv->regs + TSI721_OBDMAC_INTE(ch)); + + if (priv->flags & TSI721_USING_MSIX) + return; /* Finished if we are in MSI-X mode */ + + /* + * For MSI and INTA interrupt signalling we need to disable next levels + */ + + /* Disable Device Channel Interrupt */ + rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + rval &= ~TSI721_INT_OMSG_CHAN(ch); + iowrite32(rval, priv->regs + TSI721_DEV_CHAN_INTE); +} + +/** + * tsi721_add_outb_message - Add message to the Tsi721 outbound message queue + * @mport: Master port with outbound message queue + * @rdev: Target of outbound message + * @mbox: Outbound mailbox + * @buffer: Message to add to outbound queue + * @len: Length of message + */ +static int +tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox, + void *buffer, size_t len) +{ + struct tsi721_device *priv = mport->priv; + struct tsi721_omsg_desc *desc; + u32 tx_slot; + + if (!priv->omsg_init[mbox] || + len > TSI721_MSG_MAX_SIZE || len < 8) + return -EINVAL; + + tx_slot = priv->omsg_ring[mbox].tx_slot; + + /* Copy copy message into transfer buffer */ + memcpy(priv->omsg_ring[mbox].omq_base[tx_slot], buffer, len); + + if (len & 0x7) + len += 8; + + /* Build descriptor associated with buffer */ + desc = priv->omsg_ring[mbox].omd_base; + desc[tx_slot].type_id = cpu_to_le32((DTYPE4 << 29) | rdev->destid); + if (tx_slot % 4 == 0) + desc[tx_slot].type_id |= cpu_to_le32(TSI721_OMD_IOF); + + desc[tx_slot].msg_info = + cpu_to_le32((mport->sys_size << 26) | (mbox << 22) | + (0xe << 12) | (len & 0xff8)); + desc[tx_slot].bufptr_lo = + cpu_to_le32((u64)priv->omsg_ring[mbox].omq_phys[tx_slot] & + 0xffffffff); + desc[tx_slot].bufptr_hi = + cpu_to_le32((u64)priv->omsg_ring[mbox].omq_phys[tx_slot] >> 32); + + priv->omsg_ring[mbox].wr_count++; + + /* Go to next descriptor */ + if (++priv->omsg_ring[mbox].tx_slot == priv->omsg_ring[mbox].size) { + priv->omsg_ring[mbox].tx_slot = 0; + /* Move through the ring link descriptor at the end */ + priv->omsg_ring[mbox].wr_count++; + } + + mb(); + + /* Set new write count value */ + iowrite32(priv->omsg_ring[mbox].wr_count, + priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); + ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); + + return 0; +} + +/** + * tsi721_omsg_handler - Outbound Message Interrupt Handler + * @priv: pointer to tsi721 private data + * @ch: number of OB MSG channel to service + * + * Services channel interrupts from outbound messaging engine. + */ +static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) +{ + u32 omsg_int; + + spin_lock(&priv->omsg_ring[ch].lock); + + omsg_int = ioread32(priv->regs + TSI721_OBDMAC_INT(ch)); + + if (omsg_int & TSI721_OBDMAC_INT_ST_FULL) + dev_info(&priv->pdev->dev, + "OB MBOX%d: Status FIFO is full\n", ch); + + if (omsg_int & (TSI721_OBDMAC_INT_DONE | TSI721_OBDMAC_INT_IOF_DONE)) { + u32 srd_ptr; + u64 *sts_ptr, last_ptr = 0, prev_ptr = 0; + int i, j; + u32 tx_slot; + + /* + * Find last successfully processed descriptor + */ + + /* Check and clear descriptor status FIFO entries */ + srd_ptr = priv->omsg_ring[ch].sts_rdptr; + sts_ptr = priv->omsg_ring[ch].sts_base; + j = srd_ptr * 8; + while (sts_ptr[j]) { + for (i = 0; i < 8 && sts_ptr[j]; i++, j++) { + prev_ptr = last_ptr; + last_ptr = le64_to_cpu(sts_ptr[j]); + sts_ptr[j] = 0; + } + + ++srd_ptr; + srd_ptr %= priv->omsg_ring[ch].sts_size; + j = srd_ptr * 8; + } + + if (last_ptr == 0) + goto no_sts_update; + + priv->omsg_ring[ch].sts_rdptr = srd_ptr; + iowrite32(srd_ptr, priv->regs + TSI721_OBDMAC_DSRP(ch)); + + if (!priv->mport->outb_msg[ch].mcback) + goto no_sts_update; + + /* Inform upper layer about transfer completion */ + + tx_slot = (last_ptr - (u64)priv->omsg_ring[ch].omd_phys)/ + sizeof(struct tsi721_omsg_desc); + + /* + * Check if this is a Link Descriptor (LD). + * If yes, ignore LD and use descriptor processed + * before LD. + */ + if (tx_slot == priv->omsg_ring[ch].size) { + if (prev_ptr) + tx_slot = (prev_ptr - + (u64)priv->omsg_ring[ch].omd_phys)/ + sizeof(struct tsi721_omsg_desc); + else + goto no_sts_update; + } + + /* Move slot index to the next message to be sent */ + ++tx_slot; + if (tx_slot == priv->omsg_ring[ch].size) + tx_slot = 0; + BUG_ON(tx_slot >= priv->omsg_ring[ch].size); + priv->mport->outb_msg[ch].mcback(priv->mport, + priv->omsg_ring[ch].dev_id, ch, + tx_slot); + } + +no_sts_update: + + if (omsg_int & TSI721_OBDMAC_INT_ERROR) { + /* + * Outbound message operation aborted due to error, + * reinitialize OB MSG channel + */ + + dev_dbg(&priv->pdev->dev, "OB MSG ABORT ch_stat=%x\n", + ioread32(priv->regs + TSI721_OBDMAC_STS(ch))); + + iowrite32(TSI721_OBDMAC_INT_ERROR, + priv->regs + TSI721_OBDMAC_INT(ch)); + iowrite32(TSI721_OBDMAC_CTL_INIT, + priv->regs + TSI721_OBDMAC_CTL(ch)); + ioread32(priv->regs + TSI721_OBDMAC_CTL(ch)); + + /* Inform upper level to clear all pending tx slots */ + if (priv->mport->outb_msg[ch].mcback) + priv->mport->outb_msg[ch].mcback(priv->mport, + priv->omsg_ring[ch].dev_id, ch, + priv->omsg_ring[ch].tx_slot); + /* Synch tx_slot tracking */ + iowrite32(priv->omsg_ring[ch].tx_slot, + priv->regs + TSI721_OBDMAC_DRDCNT(ch)); + ioread32(priv->regs + TSI721_OBDMAC_DRDCNT(ch)); + priv->omsg_ring[ch].wr_count = priv->omsg_ring[ch].tx_slot; + priv->omsg_ring[ch].sts_rdptr = 0; + } + + /* Clear channel interrupts */ + iowrite32(omsg_int, priv->regs + TSI721_OBDMAC_INT(ch)); + + if (!(priv->flags & TSI721_USING_MSIX)) { + u32 ch_inte; + + /* Re-enable channel interrupts */ + ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + ch_inte |= TSI721_INT_OMSG_CHAN(ch); + iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE); + } + + spin_unlock(&priv->omsg_ring[ch].lock); +} + +/** + * tsi721_open_outb_mbox - Initialize Tsi721 outbound mailbox + * @mport: Master port implementing Outbound Messaging Engine + * @dev_id: Device specific pointer to pass on event + * @mbox: Mailbox to open + * @entries: Number of entries in the outbound mailbox ring + */ +static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, + int mbox, int entries) +{ + struct tsi721_device *priv = mport->priv; + struct tsi721_omsg_desc *bd_ptr; + int i, rc = 0; + + if ((entries < TSI721_OMSGD_MIN_RING_SIZE) || + (entries > (TSI721_OMSGD_RING_SIZE)) || + (!is_power_of_2(entries)) || mbox >= RIO_MAX_MBOX) { + rc = -EINVAL; + goto out; + } + + priv->omsg_ring[mbox].dev_id = dev_id; + priv->omsg_ring[mbox].size = entries; + priv->omsg_ring[mbox].sts_rdptr = 0; + spin_lock_init(&priv->omsg_ring[mbox].lock); + + /* Outbound Msg Buffer allocation based on + the number of maximum descriptor entries */ + for (i = 0; i < entries; i++) { + priv->omsg_ring[mbox].omq_base[i] = + dma_alloc_coherent( + &priv->pdev->dev, TSI721_MSG_BUFFER_SIZE, + &priv->omsg_ring[mbox].omq_phys[i], + GFP_KERNEL); + if (priv->omsg_ring[mbox].omq_base[i] == NULL) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate OB MSG data buffer for" + " MBOX%d\n", mbox); + rc = -ENOMEM; + goto out_buf; + } + } + + /* Outbound message descriptor allocation */ + priv->omsg_ring[mbox].omd_base = dma_alloc_coherent( + &priv->pdev->dev, + (entries + 1) * sizeof(struct tsi721_omsg_desc), + &priv->omsg_ring[mbox].omd_phys, GFP_KERNEL); + if (priv->omsg_ring[mbox].omd_base == NULL) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate OB MSG descriptor memory " + "for MBOX%d\n", mbox); + rc = -ENOMEM; + goto out_buf; + } + + priv->omsg_ring[mbox].tx_slot = 0; + + /* Outbound message descriptor status FIFO allocation */ + priv->omsg_ring[mbox].sts_size = roundup_pow_of_two(entries + 1); + priv->omsg_ring[mbox].sts_base = dma_alloc_coherent(&priv->pdev->dev, + priv->omsg_ring[mbox].sts_size * + sizeof(struct tsi721_dma_sts), + &priv->omsg_ring[mbox].sts_phys, GFP_KERNEL); + if (priv->omsg_ring[mbox].sts_base == NULL) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate OB MSG descriptor status FIFO " + "for MBOX%d\n", mbox); + rc = -ENOMEM; + goto out_desc; + } + + memset(priv->omsg_ring[mbox].sts_base, 0, + entries * sizeof(struct tsi721_dma_sts)); + + /* + * Configure Outbound Messaging Engine + */ + + /* Setup Outbound Message descriptor pointer */ + iowrite32(((u64)priv->omsg_ring[mbox].omd_phys >> 32), + priv->regs + TSI721_OBDMAC_DPTRH(mbox)); + iowrite32(((u64)priv->omsg_ring[mbox].omd_phys & + TSI721_OBDMAC_DPTRL_MASK), + priv->regs + TSI721_OBDMAC_DPTRL(mbox)); + + /* Setup Outbound Message descriptor status FIFO */ + iowrite32(((u64)priv->omsg_ring[mbox].sts_phys >> 32), + priv->regs + TSI721_OBDMAC_DSBH(mbox)); + iowrite32(((u64)priv->omsg_ring[mbox].sts_phys & + TSI721_OBDMAC_DSBL_MASK), + priv->regs + TSI721_OBDMAC_DSBL(mbox)); + iowrite32(TSI721_DMAC_DSSZ_SIZE(priv->omsg_ring[mbox].sts_size), + priv->regs + (u32)TSI721_OBDMAC_DSSZ(mbox)); + + /* Enable interrupts */ + +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) { + /* Request interrupt service if we are in MSI-X mode */ + rc = request_irq( + priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, + tsi721_omsg_msix, 0, + priv->msix[TSI721_VECT_OMB0_DONE + mbox].irq_name, + (void *)mport); + + if (rc) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate MSI-X interrupt for " + "OBOX%d-DONE\n", mbox); + goto out_stat; + } + + rc = request_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector, + tsi721_omsg_msix, 0, + priv->msix[TSI721_VECT_OMB0_INT + mbox].irq_name, + (void *)mport); + + if (rc) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate MSI-X interrupt for " + "MBOX%d-INT\n", mbox); + free_irq( + priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, + (void *)mport); + goto out_stat; + } + } +#endif /* CONFIG_PCI_MSI */ + + tsi721_omsg_interrupt_enable(priv, mbox, TSI721_OBDMAC_INT_ALL); + + /* Initialize Outbound Message descriptors ring */ + bd_ptr = priv->omsg_ring[mbox].omd_base; + bd_ptr[entries].type_id = cpu_to_le32(DTYPE5 << 29); + bd_ptr[entries].msg_info = 0; + bd_ptr[entries].next_lo = + cpu_to_le32((u64)priv->omsg_ring[mbox].omd_phys & + TSI721_OBDMAC_DPTRL_MASK); + bd_ptr[entries].next_hi = + cpu_to_le32((u64)priv->omsg_ring[mbox].omd_phys >> 32); + priv->omsg_ring[mbox].wr_count = 0; + mb(); + + /* Initialize Outbound Message engine */ + iowrite32(TSI721_OBDMAC_CTL_INIT, priv->regs + TSI721_OBDMAC_CTL(mbox)); + ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); + udelay(10); + + priv->omsg_init[mbox] = 1; + + return 0; + +#ifdef CONFIG_PCI_MSI +out_stat: + dma_free_coherent(&priv->pdev->dev, + priv->omsg_ring[mbox].sts_size * sizeof(struct tsi721_dma_sts), + priv->omsg_ring[mbox].sts_base, + priv->omsg_ring[mbox].sts_phys); + + priv->omsg_ring[mbox].sts_base = NULL; +#endif /* CONFIG_PCI_MSI */ + +out_desc: + dma_free_coherent(&priv->pdev->dev, + (entries + 1) * sizeof(struct tsi721_omsg_desc), + priv->omsg_ring[mbox].omd_base, + priv->omsg_ring[mbox].omd_phys); + + priv->omsg_ring[mbox].omd_base = NULL; + +out_buf: + for (i = 0; i < priv->omsg_ring[mbox].size; i++) { + if (priv->omsg_ring[mbox].omq_base[i]) { + dma_free_coherent(&priv->pdev->dev, + TSI721_MSG_BUFFER_SIZE, + priv->omsg_ring[mbox].omq_base[i], + priv->omsg_ring[mbox].omq_phys[i]); + + priv->omsg_ring[mbox].omq_base[i] = NULL; + } + } + +out: + return rc; +} + +/** + * tsi721_close_outb_mbox - Close Tsi721 outbound mailbox + * @mport: Master port implementing the outbound message unit + * @mbox: Mailbox to close + */ +static void tsi721_close_outb_mbox(struct rio_mport *mport, int mbox) +{ + struct tsi721_device *priv = mport->priv; + u32 i; + + if (!priv->omsg_init[mbox]) + return; + priv->omsg_init[mbox] = 0; + + /* Disable Interrupts */ + + tsi721_omsg_interrupt_disable(priv, mbox, TSI721_OBDMAC_INT_ALL); + +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) { + free_irq(priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, + (void *)mport); + free_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector, + (void *)mport); + } +#endif /* CONFIG_PCI_MSI */ + + /* Free OMSG Descriptor Status FIFO */ + dma_free_coherent(&priv->pdev->dev, + priv->omsg_ring[mbox].sts_size * sizeof(struct tsi721_dma_sts), + priv->omsg_ring[mbox].sts_base, + priv->omsg_ring[mbox].sts_phys); + + priv->omsg_ring[mbox].sts_base = NULL; + + /* Free OMSG descriptors */ + dma_free_coherent(&priv->pdev->dev, + (priv->omsg_ring[mbox].size + 1) * + sizeof(struct tsi721_omsg_desc), + priv->omsg_ring[mbox].omd_base, + priv->omsg_ring[mbox].omd_phys); + + priv->omsg_ring[mbox].omd_base = NULL; + + /* Free message buffers */ + for (i = 0; i < priv->omsg_ring[mbox].size; i++) { + if (priv->omsg_ring[mbox].omq_base[i]) { + dma_free_coherent(&priv->pdev->dev, + TSI721_MSG_BUFFER_SIZE, + priv->omsg_ring[mbox].omq_base[i], + priv->omsg_ring[mbox].omq_phys[i]); + + priv->omsg_ring[mbox].omq_base[i] = NULL; + } + } +} + +/** + * tsi721_imsg_handler - Inbound Message Interrupt Handler + * @priv: pointer to tsi721 private data + * @ch: inbound message channel number to service + * + * Services channel interrupts from inbound messaging engine. + */ +static void tsi721_imsg_handler(struct tsi721_device *priv, int ch) +{ + u32 mbox = ch - 4; + u32 imsg_int; + + spin_lock(&priv->imsg_ring[mbox].lock); + + imsg_int = ioread32(priv->regs + TSI721_IBDMAC_INT(ch)); + + if (imsg_int & TSI721_IBDMAC_INT_SRTO) + dev_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout\n", + mbox); + + if (imsg_int & TSI721_IBDMAC_INT_PC_ERROR) + dev_info(&priv->pdev->dev, "IB MBOX%d PCIe error\n", + mbox); + + if (imsg_int & TSI721_IBDMAC_INT_FQ_LOW) + dev_info(&priv->pdev->dev, + "IB MBOX%d IB free queue low\n", mbox); + + /* Clear IB channel interrupts */ + iowrite32(imsg_int, priv->regs + TSI721_IBDMAC_INT(ch)); + + /* If an IB Msg is received notify the upper layer */ + if (imsg_int & TSI721_IBDMAC_INT_DQ_RCV && + priv->mport->inb_msg[mbox].mcback) + priv->mport->inb_msg[mbox].mcback(priv->mport, + priv->imsg_ring[mbox].dev_id, mbox, -1); + + if (!(priv->flags & TSI721_USING_MSIX)) { + u32 ch_inte; + + /* Re-enable channel interrupts */ + ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE); + ch_inte |= TSI721_INT_IMSG_CHAN(ch); + iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE); + } + + spin_unlock(&priv->imsg_ring[mbox].lock); +} + +/** + * tsi721_open_inb_mbox - Initialize Tsi721 inbound mailbox + * @mport: Master port implementing the Inbound Messaging Engine + * @dev_id: Device specific pointer to pass on event + * @mbox: Mailbox to open + * @entries: Number of entries in the inbound mailbox ring + */ +static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, + int mbox, int entries) +{ + struct tsi721_device *priv = mport->priv; + int ch = mbox + 4; + int i; + u64 *free_ptr; + int rc = 0; + + if ((entries < TSI721_IMSGD_MIN_RING_SIZE) || + (entries > TSI721_IMSGD_RING_SIZE) || + (!is_power_of_2(entries)) || mbox >= RIO_MAX_MBOX) { + rc = -EINVAL; + goto out; + } + + /* Initialize IB Messaging Ring */ + priv->imsg_ring[mbox].dev_id = dev_id; + priv->imsg_ring[mbox].size = entries; + priv->imsg_ring[mbox].rx_slot = 0; + priv->imsg_ring[mbox].desc_rdptr = 0; + priv->imsg_ring[mbox].fq_wrptr = 0; + for (i = 0; i < priv->imsg_ring[mbox].size; i++) + priv->imsg_ring[mbox].imq_base[i] = NULL; + spin_lock_init(&priv->imsg_ring[mbox].lock); + + /* Allocate buffers for incoming messages */ + priv->imsg_ring[mbox].buf_base = + dma_alloc_coherent(&priv->pdev->dev, + entries * TSI721_MSG_BUFFER_SIZE, + &priv->imsg_ring[mbox].buf_phys, + GFP_KERNEL); + + if (priv->imsg_ring[mbox].buf_base == NULL) { + dev_err(&priv->pdev->dev, + "Failed to allocate buffers for IB MBOX%d\n", mbox); + rc = -ENOMEM; + goto out; + } + + /* Allocate memory for circular free list */ + priv->imsg_ring[mbox].imfq_base = + dma_alloc_coherent(&priv->pdev->dev, + entries * 8, + &priv->imsg_ring[mbox].imfq_phys, + GFP_KERNEL); + + if (priv->imsg_ring[mbox].imfq_base == NULL) { + dev_err(&priv->pdev->dev, + "Failed to allocate free queue for IB MBOX%d\n", mbox); + rc = -ENOMEM; + goto out_buf; + } + + /* Allocate memory for Inbound message descriptors */ + priv->imsg_ring[mbox].imd_base = + dma_alloc_coherent(&priv->pdev->dev, + entries * sizeof(struct tsi721_imsg_desc), + &priv->imsg_ring[mbox].imd_phys, GFP_KERNEL); + + if (priv->imsg_ring[mbox].imd_base == NULL) { + dev_err(&priv->pdev->dev, + "Failed to allocate descriptor memory for IB MBOX%d\n", + mbox); + rc = -ENOMEM; + goto out_dma; + } + + /* Fill free buffer pointer list */ + free_ptr = priv->imsg_ring[mbox].imfq_base; + for (i = 0; i < entries; i++) + free_ptr[i] = cpu_to_le64( + (u64)(priv->imsg_ring[mbox].buf_phys) + + i * 0x1000); + + mb(); + + /* + * For mapping of inbound SRIO Messages into appropriate queues we need + * to set Inbound Device ID register in the messaging engine. We do it + * once when first inbound mailbox is requested. + */ + if (!(priv->flags & TSI721_IMSGID_SET)) { + iowrite32((u32)priv->mport->host_deviceid, + priv->regs + TSI721_IB_DEVID); + priv->flags |= TSI721_IMSGID_SET; + } + + /* + * Configure Inbound Messaging channel (ch = mbox + 4) + */ + + /* Setup Inbound Message free queue */ + iowrite32(((u64)priv->imsg_ring[mbox].imfq_phys >> 32), + priv->regs + TSI721_IBDMAC_FQBH(ch)); + iowrite32(((u64)priv->imsg_ring[mbox].imfq_phys & + TSI721_IBDMAC_FQBL_MASK), + priv->regs+TSI721_IBDMAC_FQBL(ch)); + iowrite32(TSI721_DMAC_DSSZ_SIZE(entries), + priv->regs + TSI721_IBDMAC_FQSZ(ch)); + + /* Setup Inbound Message descriptor queue */ + iowrite32(((u64)priv->imsg_ring[mbox].imd_phys >> 32), + priv->regs + TSI721_IBDMAC_DQBH(ch)); + iowrite32(((u32)priv->imsg_ring[mbox].imd_phys & + (u32)TSI721_IBDMAC_DQBL_MASK), + priv->regs+TSI721_IBDMAC_DQBL(ch)); + iowrite32(TSI721_DMAC_DSSZ_SIZE(entries), + priv->regs + TSI721_IBDMAC_DQSZ(ch)); + + /* Enable interrupts */ + +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) { + /* Request interrupt service if we are in MSI-X mode */ + rc = request_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, + tsi721_imsg_msix, 0, + priv->msix[TSI721_VECT_IMB0_RCV + mbox].irq_name, + (void *)mport); + + if (rc) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate MSI-X interrupt for " + "IBOX%d-DONE\n", mbox); + goto out_desc; + } + + rc = request_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector, + tsi721_imsg_msix, 0, + priv->msix[TSI721_VECT_IMB0_INT + mbox].irq_name, + (void *)mport); + + if (rc) { + dev_dbg(&priv->pdev->dev, + "Unable to allocate MSI-X interrupt for " + "IBOX%d-INT\n", mbox); + free_irq( + priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, + (void *)mport); + goto out_desc; + } + } +#endif /* CONFIG_PCI_MSI */ + + tsi721_imsg_interrupt_enable(priv, ch, TSI721_IBDMAC_INT_ALL); + + /* Initialize Inbound Message Engine */ + iowrite32(TSI721_IBDMAC_CTL_INIT, priv->regs + TSI721_IBDMAC_CTL(ch)); + ioread32(priv->regs + TSI721_IBDMAC_CTL(ch)); + udelay(10); + priv->imsg_ring[mbox].fq_wrptr = entries - 1; + iowrite32(entries - 1, priv->regs + TSI721_IBDMAC_FQWP(ch)); + + priv->imsg_init[mbox] = 1; + return 0; + +#ifdef CONFIG_PCI_MSI +out_desc: + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * sizeof(struct tsi721_imsg_desc), + priv->imsg_ring[mbox].imd_base, + priv->imsg_ring[mbox].imd_phys); + + priv->imsg_ring[mbox].imd_base = NULL; +#endif /* CONFIG_PCI_MSI */ + +out_dma: + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * 8, + priv->imsg_ring[mbox].imfq_base, + priv->imsg_ring[mbox].imfq_phys); + + priv->imsg_ring[mbox].imfq_base = NULL; + +out_buf: + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * TSI721_MSG_BUFFER_SIZE, + priv->imsg_ring[mbox].buf_base, + priv->imsg_ring[mbox].buf_phys); + + priv->imsg_ring[mbox].buf_base = NULL; + +out: + return rc; +} + +/** + * tsi721_close_inb_mbox - Shut down Tsi721 inbound mailbox + * @mport: Master port implementing the Inbound Messaging Engine + * @mbox: Mailbox to close + */ +static void tsi721_close_inb_mbox(struct rio_mport *mport, int mbox) +{ + struct tsi721_device *priv = mport->priv; + u32 rx_slot; + int ch = mbox + 4; + + if (!priv->imsg_init[mbox]) /* mbox isn't initialized yet */ + return; + priv->imsg_init[mbox] = 0; + + /* Disable Inbound Messaging Engine */ + + /* Disable Interrupts */ + tsi721_imsg_interrupt_disable(priv, ch, TSI721_OBDMAC_INT_MASK); + +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) { + free_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, + (void *)mport); + free_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector, + (void *)mport); + } +#endif /* CONFIG_PCI_MSI */ + + /* Clear Inbound Buffer Queue */ + for (rx_slot = 0; rx_slot < priv->imsg_ring[mbox].size; rx_slot++) + priv->imsg_ring[mbox].imq_base[rx_slot] = NULL; + + /* Free memory allocated for message buffers */ + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * TSI721_MSG_BUFFER_SIZE, + priv->imsg_ring[mbox].buf_base, + priv->imsg_ring[mbox].buf_phys); + + priv->imsg_ring[mbox].buf_base = NULL; + + /* Free memory allocated for free pointr list */ + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * 8, + priv->imsg_ring[mbox].imfq_base, + priv->imsg_ring[mbox].imfq_phys); + + priv->imsg_ring[mbox].imfq_base = NULL; + + /* Free memory allocated for RX descriptors */ + dma_free_coherent(&priv->pdev->dev, + priv->imsg_ring[mbox].size * sizeof(struct tsi721_imsg_desc), + priv->imsg_ring[mbox].imd_base, + priv->imsg_ring[mbox].imd_phys); + + priv->imsg_ring[mbox].imd_base = NULL; +} + +/** + * tsi721_add_inb_buffer - Add buffer to the Tsi721 inbound message queue + * @mport: Master port implementing the Inbound Messaging Engine + * @mbox: Inbound mailbox number + * @buf: Buffer to add to inbound queue + */ +static int tsi721_add_inb_buffer(struct rio_mport *mport, int mbox, void *buf) +{ + struct tsi721_device *priv = mport->priv; + u32 rx_slot; + int rc = 0; + + rx_slot = priv->imsg_ring[mbox].rx_slot; + if (priv->imsg_ring[mbox].imq_base[rx_slot]) { + dev_err(&priv->pdev->dev, + "Error adding inbound buffer %d, buffer exists\n", + rx_slot); + rc = -EINVAL; + goto out; + } + + priv->imsg_ring[mbox].imq_base[rx_slot] = buf; + + if (++priv->imsg_ring[mbox].rx_slot == priv->imsg_ring[mbox].size) + priv->imsg_ring[mbox].rx_slot = 0; + +out: + return rc; +} + +/** + * tsi721_get_inb_message - Fetch inbound message from the Tsi721 MSG Queue + * @mport: Master port implementing the Inbound Messaging Engine + * @mbox: Inbound mailbox number + * + * Returns pointer to the message on success or NULL on failure. + */ +static void *tsi721_get_inb_message(struct rio_mport *mport, int mbox) +{ + struct tsi721_device *priv = mport->priv; + struct tsi721_imsg_desc *desc; + u32 rx_slot; + void *rx_virt = NULL; + u64 rx_phys; + void *buf = NULL; + u64 *free_ptr; + int ch = mbox + 4; + int msg_size; + + if (!priv->imsg_init[mbox]) + return NULL; + + desc = priv->imsg_ring[mbox].imd_base; + desc += priv->imsg_ring[mbox].desc_rdptr; + + if (!(le32_to_cpu(desc->msg_info) & TSI721_IMD_HO)) + goto out; + + rx_slot = priv->imsg_ring[mbox].rx_slot; + while (priv->imsg_ring[mbox].imq_base[rx_slot] == NULL) { + if (++rx_slot == priv->imsg_ring[mbox].size) + rx_slot = 0; + } + + rx_phys = ((u64)le32_to_cpu(desc->bufptr_hi) << 32) | + le32_to_cpu(desc->bufptr_lo); + + rx_virt = priv->imsg_ring[mbox].buf_base + + (rx_phys - (u64)priv->imsg_ring[mbox].buf_phys); + + buf = priv->imsg_ring[mbox].imq_base[rx_slot]; + msg_size = le32_to_cpu(desc->msg_info) & TSI721_IMD_BCOUNT; + if (msg_size == 0) + msg_size = RIO_MAX_MSG_SIZE; + + memcpy(buf, rx_virt, msg_size); + priv->imsg_ring[mbox].imq_base[rx_slot] = NULL; + + desc->msg_info &= cpu_to_le32(~TSI721_IMD_HO); + if (++priv->imsg_ring[mbox].desc_rdptr == priv->imsg_ring[mbox].size) + priv->imsg_ring[mbox].desc_rdptr = 0; + + iowrite32(priv->imsg_ring[mbox].desc_rdptr, + priv->regs + TSI721_IBDMAC_DQRP(ch)); + + /* Return free buffer into the pointer list */ + free_ptr = priv->imsg_ring[mbox].imfq_base; + free_ptr[priv->imsg_ring[mbox].fq_wrptr] = cpu_to_le64(rx_phys); + + if (++priv->imsg_ring[mbox].fq_wrptr == priv->imsg_ring[mbox].size) + priv->imsg_ring[mbox].fq_wrptr = 0; + + iowrite32(priv->imsg_ring[mbox].fq_wrptr, + priv->regs + TSI721_IBDMAC_FQWP(ch)); +out: + return buf; +} + +/** + * tsi721_messages_init - Initialization of Messaging Engine + * @priv: pointer to tsi721 private data + * + * Configures Tsi721 messaging engine. + */ +static int tsi721_messages_init(struct tsi721_device *priv) +{ + int ch; + + iowrite32(0, priv->regs + TSI721_SMSG_ECC_LOG); + iowrite32(0, priv->regs + TSI721_RETRY_GEN_CNT); + iowrite32(0, priv->regs + TSI721_RETRY_RX_CNT); + + /* Set SRIO Message Request/Response Timeout */ + iowrite32(TSI721_RQRPTO_VAL, priv->regs + TSI721_RQRPTO); + + /* Initialize Inbound Messaging Engine Registers */ + for (ch = 0; ch < TSI721_IMSG_CHNUM; ch++) { + /* Clear interrupt bits */ + iowrite32(TSI721_IBDMAC_INT_MASK, + priv->regs + TSI721_IBDMAC_INT(ch)); + /* Clear Status */ + iowrite32(0, priv->regs + TSI721_IBDMAC_STS(ch)); + + iowrite32(TSI721_SMSG_ECC_COR_LOG_MASK, + priv->regs + TSI721_SMSG_ECC_COR_LOG(ch)); + iowrite32(TSI721_SMSG_ECC_NCOR_MASK, + priv->regs + TSI721_SMSG_ECC_NCOR(ch)); + } + + return 0; +} + +/** + * tsi721_disable_ints - disables all device interrupts + * @priv: pointer to tsi721 private data + */ +static void tsi721_disable_ints(struct tsi721_device *priv) +{ + int ch; + + /* Disable all device level interrupts */ + iowrite32(0, priv->regs + TSI721_DEV_INTE); + + /* Disable all Device Channel interrupts */ + iowrite32(0, priv->regs + TSI721_DEV_CHAN_INTE); + + /* Disable all Inbound Msg Channel interrupts */ + for (ch = 0; ch < TSI721_IMSG_CHNUM; ch++) + iowrite32(0, priv->regs + TSI721_IBDMAC_INTE(ch)); + + /* Disable all Outbound Msg Channel interrupts */ + for (ch = 0; ch < TSI721_OMSG_CHNUM; ch++) + iowrite32(0, priv->regs + TSI721_OBDMAC_INTE(ch)); + + /* Disable all general messaging interrupts */ + iowrite32(0, priv->regs + TSI721_SMSG_INTE); + + /* Disable all BDMA Channel interrupts */ + for (ch = 0; ch < TSI721_DMA_MAXCH; ch++) + iowrite32(0, priv->regs + TSI721_DMAC_INTE(ch)); + + /* Disable all general BDMA interrupts */ + iowrite32(0, priv->regs + TSI721_BDMA_INTE); + + /* Disable all SRIO Channel interrupts */ + for (ch = 0; ch < TSI721_SRIO_MAXCH; ch++) + iowrite32(0, priv->regs + TSI721_SR_CHINTE(ch)); + + /* Disable all general SR2PC interrupts */ + iowrite32(0, priv->regs + TSI721_SR2PC_GEN_INTE); + + /* Disable all PC2SR interrupts */ + iowrite32(0, priv->regs + TSI721_PC2SR_INTE); + + /* Disable all I2C interrupts */ + iowrite32(0, priv->regs + TSI721_I2C_INT_ENABLE); + + /* Disable SRIO MAC interrupts */ + iowrite32(0, priv->regs + TSI721_RIO_EM_INT_ENABLE); + iowrite32(0, priv->regs + TSI721_RIO_EM_DEV_INT_EN); +} + +/** + * tsi721_setup_mport - Setup Tsi721 as RapidIO subsystem master port + * @priv: pointer to tsi721 private data + * + * Configures Tsi721 as RapidIO master port. + */ +static int __devinit tsi721_setup_mport(struct tsi721_device *priv) +{ + struct pci_dev *pdev = priv->pdev; + int err = 0; + struct rio_ops *ops; + + struct rio_mport *mport; + + ops = kzalloc(sizeof(struct rio_ops), GFP_KERNEL); + if (!ops) { + dev_dbg(&pdev->dev, "Unable to allocate memory for rio_ops\n"); + return -ENOMEM; + } + + ops->lcread = tsi721_lcread; + ops->lcwrite = tsi721_lcwrite; + ops->cread = tsi721_cread_dma; + ops->cwrite = tsi721_cwrite_dma; + ops->dsend = tsi721_dsend; + ops->open_inb_mbox = tsi721_open_inb_mbox; + ops->close_inb_mbox = tsi721_close_inb_mbox; + ops->open_outb_mbox = tsi721_open_outb_mbox; + ops->close_outb_mbox = tsi721_close_outb_mbox; + ops->add_outb_message = tsi721_add_outb_message; + ops->add_inb_buffer = tsi721_add_inb_buffer; + ops->get_inb_message = tsi721_get_inb_message; + + mport = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); + if (!mport) { + kfree(ops); + dev_dbg(&pdev->dev, "Unable to allocate memory for mport\n"); + return -ENOMEM; + } + + mport->ops = ops; + mport->index = 0; + mport->sys_size = 0; /* small system */ + mport->phy_type = RIO_PHY_SERIAL; + mport->priv = (void *)priv; + mport->phys_efptr = 0x100; + + INIT_LIST_HEAD(&mport->dbells); + + rio_init_dbell_res(&mport->riores[RIO_DOORBELL_RESOURCE], 0, 0xffff); + rio_init_mbox_res(&mport->riores[RIO_INB_MBOX_RESOURCE], 0, 0); + rio_init_mbox_res(&mport->riores[RIO_OUTB_MBOX_RESOURCE], 0, 0); + strcpy(mport->name, "Tsi721 mport"); + + /* Hook up interrupt handler */ + +#ifdef CONFIG_PCI_MSI + if (!tsi721_enable_msix(priv)) + priv->flags |= TSI721_USING_MSIX; + else if (!pci_enable_msi(pdev)) + priv->flags |= TSI721_USING_MSI; + else + dev_info(&pdev->dev, + "MSI/MSI-X is not available. Using legacy INTx.\n"); +#endif /* CONFIG_PCI_MSI */ + + err = tsi721_request_irq(mport); + + if (!err) { + tsi721_interrupts_init(priv); + ops->pwenable = tsi721_pw_enable; + } else + dev_err(&pdev->dev, "Unable to get assigned PCI IRQ " + "vector %02X err=0x%x\n", pdev->irq, err); + + /* Enable SRIO link */ + iowrite32(ioread32(priv->regs + TSI721_DEVCTL) | + TSI721_DEVCTL_SRBOOT_CMPL, + priv->regs + TSI721_DEVCTL); + + rio_register_mport(mport); + priv->mport = mport; + + if (mport->host_deviceid >= 0) + iowrite32(RIO_PORT_GEN_HOST | RIO_PORT_GEN_MASTER | + RIO_PORT_GEN_DISCOVERED, + priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR)); + else + iowrite32(0, priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR)); + + return 0; +} + +static int __devinit tsi721_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct tsi721_device *priv; + int i; + int err; + u32 regval; + + priv = kzalloc(sizeof(struct tsi721_device), GFP_KERNEL); + if (priv == NULL) { + dev_err(&pdev->dev, "Failed to allocate memory for device\n"); + err = -ENOMEM; + goto err_exit; + } + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Failed to enable PCI device\n"); + goto err_clean; + } + + priv->pdev = pdev; + +#ifdef DEBUG + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n", + i, (unsigned long long)pci_resource_start(pdev, i), + (unsigned long)pci_resource_len(pdev, i), + pci_resource_flags(pdev, i)); + } +#endif + /* + * Verify BAR configuration + */ + + /* BAR_0 (registers) must be 512KB+ in 32-bit address space */ + if (!(pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM) || + pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM_64 || + pci_resource_len(pdev, BAR_0) < TSI721_REG_SPACE_SIZE) { + dev_err(&pdev->dev, + "Missing or misconfigured CSR BAR0, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + /* BAR_1 (outbound doorbells) must be 16MB+ in 32-bit address space */ + if (!(pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM) || + pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM_64 || + pci_resource_len(pdev, BAR_1) < TSI721_DB_WIN_SIZE) { + dev_err(&pdev->dev, + "Missing or misconfigured Doorbell BAR1, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + /* + * BAR_2 and BAR_4 (outbound translation) must be in 64-bit PCIe address + * space. + * NOTE: BAR_2 and BAR_4 are not used by this version of driver. + * It may be a good idea to keep them disabled using HW configuration + * to save PCI memory space. + */ + if ((pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM) && + (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64)) { + dev_info(&pdev->dev, "Outbound BAR2 is not used but enabled.\n"); + } + + if ((pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM) && + (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64)) { + dev_info(&pdev->dev, "Outbound BAR4 is not used but enabled.\n"); + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, " + "aborting.\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + priv->regs = pci_ioremap_bar(pdev, BAR_0); + if (!priv->regs) { + dev_err(&pdev->dev, + "Unable to map device registers space, aborting\n"); + err = -ENOMEM; + goto err_free_res; + } + + priv->odb_base = pci_ioremap_bar(pdev, BAR_1); + if (!priv->odb_base) { + dev_err(&pdev->dev, + "Unable to map outbound doorbells space, aborting\n"); + err = -ENOMEM; + goto err_unmap_bars; + } + + /* Configure DMA attributes. */ + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { + dev_info(&pdev->dev, "Unable to set DMA mask\n"); + goto err_unmap_bars; + } + + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) + dev_info(&pdev->dev, "Unable to set consistent DMA mask\n"); + } else { + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) + dev_info(&pdev->dev, "Unable to set consistent DMA mask\n"); + } + + /* Clear "no snoop" and "relaxed ordering" bits. */ + pci_read_config_dword(pdev, 0x40 + PCI_EXP_DEVCTL, ®val); + regval &= ~(PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN); + pci_write_config_dword(pdev, 0x40 + PCI_EXP_DEVCTL, regval); + + /* + * FIXUP: correct offsets of MSI-X tables in the MSI-X Capability Block + */ + pci_write_config_dword(pdev, TSI721_PCIECFG_EPCTL, 0x01); + pci_write_config_dword(pdev, TSI721_PCIECFG_MSIXTBL, + TSI721_MSIXTBL_OFFSET); + pci_write_config_dword(pdev, TSI721_PCIECFG_MSIXPBA, + TSI721_MSIXPBA_OFFSET); + pci_write_config_dword(pdev, TSI721_PCIECFG_EPCTL, 0); + /* End of FIXUP */ + + tsi721_disable_ints(priv); + + tsi721_init_pc2sr_mapping(priv); + tsi721_init_sr2pc_mapping(priv); + + if (tsi721_bdma_init(priv)) { + dev_err(&pdev->dev, "BDMA initialization failed, aborting\n"); + err = -ENOMEM; + goto err_unmap_bars; + } + + err = tsi721_doorbell_init(priv); + if (err) + goto err_free_bdma; + + tsi721_port_write_init(priv); + + err = tsi721_messages_init(priv); + if (err) + goto err_free_consistent; + + err = tsi721_setup_mport(priv); + if (err) + goto err_free_consistent; + + return 0; + +err_free_consistent: + tsi721_doorbell_free(priv); +err_free_bdma: + tsi721_bdma_free(priv); +err_unmap_bars: + if (priv->regs) + iounmap(priv->regs); + if (priv->odb_base) + iounmap(priv->odb_base); +err_free_res: + pci_release_regions(pdev); + pci_clear_master(pdev); +err_disable_pdev: + pci_disable_device(pdev); +err_clean: + kfree(priv); +err_exit: + return err; +} + +static DEFINE_PCI_DEVICE_TABLE(tsi721_pci_tbl) = { + { PCI_DEVICE(PCI_VENDOR_ID_IDT, PCI_DEVICE_ID_TSI721) }, + { 0, } /* terminate list */ +}; + +MODULE_DEVICE_TABLE(pci, tsi721_pci_tbl); + +static struct pci_driver tsi721_driver = { + .name = "tsi721", + .id_table = tsi721_pci_tbl, + .probe = tsi721_probe, +}; + +static int __init tsi721_init(void) +{ + return pci_register_driver(&tsi721_driver); +} + +static void __exit tsi721_exit(void) +{ + pci_unregister_driver(&tsi721_driver); +} + +device_initcall(tsi721_init); diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h new file mode 100644 index 000000000000..58be4deb1402 --- /dev/null +++ b/drivers/rapidio/devices/tsi721.h @@ -0,0 +1,766 @@ +/* + * Tsi721 PCIExpress-to-SRIO bridge definitions + * + * Copyright 2011, Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __TSI721_H +#define __TSI721_H + +#define DRV_NAME "tsi721" + +#define DEFAULT_HOPCOUNT 0xff +#define DEFAULT_DESTID 0xff + +/* PCI device ID */ +#define PCI_DEVICE_ID_TSI721 0x80ab + +#define BAR_0 0 +#define BAR_1 1 +#define BAR_2 2 +#define BAR_4 4 + +#define TSI721_PC2SR_BARS 2 +#define TSI721_PC2SR_WINS 8 +#define TSI721_PC2SR_ZONES 8 +#define TSI721_MAINT_WIN 0 /* Window for outbound maintenance requests */ +#define IDB_QUEUE 0 /* Inbound Doorbell Queue to use */ +#define IDB_QSIZE 512 /* Inbound Doorbell Queue size */ + +/* Memory space sizes */ +#define TSI721_REG_SPACE_SIZE (512 * 1024) /* 512K */ +#define TSI721_DB_WIN_SIZE (16 * 1024 * 1024) /* 16MB */ + +#define RIO_TT_CODE_8 0x00000000 +#define RIO_TT_CODE_16 0x00000001 + +#define TSI721_DMA_MAXCH 8 +#define TSI721_DMA_MINSTSSZ 32 +#define TSI721_DMA_STSBLKSZ 8 + +#define TSI721_SRIO_MAXCH 8 + +#define DBELL_SID(buf) (((u8)buf[2] << 8) | (u8)buf[3]) +#define DBELL_TID(buf) (((u8)buf[4] << 8) | (u8)buf[5]) +#define DBELL_INF(buf) (((u8)buf[0] << 8) | (u8)buf[1]) + +#define TSI721_RIO_PW_MSG_SIZE 16 /* Tsi721 saves only 16 bytes of PW msg */ + +/* Register definitions */ + +/* + * Registers in PCIe configuration space + */ + +#define TSI721_PCIECFG_MSIXTBL 0x0a4 +#define TSI721_MSIXTBL_OFFSET 0x2c000 +#define TSI721_PCIECFG_MSIXPBA 0x0a8 +#define TSI721_MSIXPBA_OFFSET 0x2a000 +#define TSI721_PCIECFG_EPCTL 0x400 + +/* + * Event Management Registers + */ + +#define TSI721_RIO_EM_INT_STAT 0x10910 +#define TSI721_RIO_EM_INT_STAT_PW_RX 0x00010000 + +#define TSI721_RIO_EM_INT_ENABLE 0x10914 +#define TSI721_RIO_EM_INT_ENABLE_PW_RX 0x00010000 + +#define TSI721_RIO_EM_DEV_INT_EN 0x10930 +#define TSI721_RIO_EM_DEV_INT_EN_INT 0x00000001 + +/* + * Port-Write Block Registers + */ + +#define TSI721_RIO_PW_CTL 0x10a04 +#define TSI721_RIO_PW_CTL_PW_TIMER 0xf0000000 +#define TSI721_RIO_PW_CTL_PWT_DIS (0 << 28) +#define TSI721_RIO_PW_CTL_PWT_103 (1 << 28) +#define TSI721_RIO_PW_CTL_PWT_205 (1 << 29) +#define TSI721_RIO_PW_CTL_PWT_410 (1 << 30) +#define TSI721_RIO_PW_CTL_PWT_820 (1 << 31) +#define TSI721_RIO_PW_CTL_PWC_MODE 0x01000000 +#define TSI721_RIO_PW_CTL_PWC_CONT 0x00000000 +#define TSI721_RIO_PW_CTL_PWC_REL 0x01000000 + +#define TSI721_RIO_PW_RX_STAT 0x10a10 +#define TSI721_RIO_PW_RX_STAT_WR_SIZE 0x0000f000 +#define TSI_RIO_PW_RX_STAT_WDPTR 0x00000100 +#define TSI721_RIO_PW_RX_STAT_PW_SHORT 0x00000008 +#define TSI721_RIO_PW_RX_STAT_PW_TRUNC 0x00000004 +#define TSI721_RIO_PW_RX_STAT_PW_DISC 0x00000002 +#define TSI721_RIO_PW_RX_STAT_PW_VAL 0x00000001 + +#define TSI721_RIO_PW_RX_CAPT(x) (0x10a20 + (x)*4) + +/* + * Inbound Doorbells + */ + +#define TSI721_IDB_ENTRY_SIZE 64 + +#define TSI721_IDQ_CTL(x) (0x20000 + (x) * 1000) +#define TSI721_IDQ_SUSPEND 0x00000002 +#define TSI721_IDQ_INIT 0x00000001 + +#define TSI721_IDQ_STS(x) (0x20004 + (x) * 1000) +#define TSI721_IDQ_RUN 0x00200000 + +#define TSI721_IDQ_MASK(x) (0x20008 + (x) * 1000) +#define TSI721_IDQ_MASK_MASK 0xffff0000 +#define TSI721_IDQ_MASK_PATT 0x0000ffff + +#define TSI721_IDQ_RP(x) (0x2000c + (x) * 1000) +#define TSI721_IDQ_RP_PTR 0x0007ffff + +#define TSI721_IDQ_WP(x) (0x20010 + (x) * 1000) +#define TSI721_IDQ_WP_PTR 0x0007ffff + +#define TSI721_IDQ_BASEL(x) (0x20014 + (x) * 1000) +#define TSI721_IDQ_BASEL_ADDR 0xffffffc0 +#define TSI721_IDQ_BASEU(x) (0x20018 + (x) * 1000) +#define TSI721_IDQ_SIZE(x) (0x2001c + (x) * 1000) +#define TSI721_IDQ_SIZE_VAL(size) (__fls(size) - 4) +#define TSI721_IDQ_SIZE_MIN 512 +#define TSI721_IDQ_SIZE_MAX (512 * 1024) + +#define TSI721_SR_CHINT(x) (0x20040 + (x) * 1000) +#define TSI721_SR_CHINTE(x) (0x20044 + (x) * 1000) +#define TSI721_SR_CHINTSET(x) (0x20048 + (x) * 1000) +#define TSI721_SR_CHINT_ODBOK 0x00000020 +#define TSI721_SR_CHINT_IDBQRCV 0x00000010 +#define TSI721_SR_CHINT_SUSP 0x00000008 +#define TSI721_SR_CHINT_ODBTO 0x00000004 +#define TSI721_SR_CHINT_ODBRTRY 0x00000002 +#define TSI721_SR_CHINT_ODBERR 0x00000001 +#define TSI721_SR_CHINT_ALL 0x0000003f + +#define TSI721_IBWIN_NUM 8 + +#define TSI721_IBWINLB(x) (0x29000 + (x) * 20) +#define TSI721_IBWINLB_BA 0xfffff000 +#define TSI721_IBWINLB_WEN 0x00000001 + +#define TSI721_SR2PC_GEN_INTE 0x29800 +#define TSI721_SR2PC_PWE 0x29804 +#define TSI721_SR2PC_GEN_INT 0x29808 + +#define TSI721_DEV_INTE 0x29840 +#define TSI721_DEV_INT 0x29844 +#define TSI721_DEV_INTSET 0x29848 +#define TSI721_DEV_INT_SMSG_CH 0x00000800 +#define TSI721_DEV_INT_SMSG_NCH 0x00000400 +#define TSI721_DEV_INT_SR2PC_CH 0x00000200 +#define TSI721_DEV_INT_SRIO 0x00000020 + +#define TSI721_DEV_CHAN_INTE 0x2984c +#define TSI721_DEV_CHAN_INT 0x29850 + +#define TSI721_INT_SR2PC_CHAN_M 0xff000000 +#define TSI721_INT_SR2PC_CHAN(x) (1 << (24 + (x))) +#define TSI721_INT_IMSG_CHAN_M 0x00ff0000 +#define TSI721_INT_IMSG_CHAN(x) (1 << (16 + (x))) +#define TSI721_INT_OMSG_CHAN_M 0x0000ff00 +#define TSI721_INT_OMSG_CHAN(x) (1 << (8 + (x))) + +/* + * PC2SR block registers + */ +#define TSI721_OBWIN_NUM TSI721_PC2SR_WINS + +#define TSI721_OBWINLB(x) (0x40000 + (x) * 20) +#define TSI721_OBWINLB_BA 0xffff8000 +#define TSI721_OBWINLB_WEN 0x00000001 + +#define TSI721_OBWINUB(x) (0x40004 + (x) * 20) + +#define TSI721_OBWINSZ(x) (0x40008 + (x) * 20) +#define TSI721_OBWINSZ_SIZE 0x00001f00 +#define TSI721_OBWIN_SIZE(size) (__fls(size) - 15) + +#define TSI721_ZONE_SEL 0x41300 +#define TSI721_ZONE_SEL_RD_WRB 0x00020000 +#define TSI721_ZONE_SEL_GO 0x00010000 +#define TSI721_ZONE_SEL_WIN 0x00000038 +#define TSI721_ZONE_SEL_ZONE 0x00000007 + +#define TSI721_LUT_DATA0 0x41304 +#define TSI721_LUT_DATA0_ADD 0xfffff000 +#define TSI721_LUT_DATA0_RDTYPE 0x00000f00 +#define TSI721_LUT_DATA0_NREAD 0x00000100 +#define TSI721_LUT_DATA0_MNTRD 0x00000200 +#define TSI721_LUT_DATA0_RDCRF 0x00000020 +#define TSI721_LUT_DATA0_WRCRF 0x00000010 +#define TSI721_LUT_DATA0_WRTYPE 0x0000000f +#define TSI721_LUT_DATA0_NWR 0x00000001 +#define TSI721_LUT_DATA0_MNTWR 0x00000002 +#define TSI721_LUT_DATA0_NWR_R 0x00000004 + +#define TSI721_LUT_DATA1 0x41308 + +#define TSI721_LUT_DATA2 0x4130c +#define TSI721_LUT_DATA2_HC 0xff000000 +#define TSI721_LUT_DATA2_ADD65 0x000c0000 +#define TSI721_LUT_DATA2_TT 0x00030000 +#define TSI721_LUT_DATA2_DSTID 0x0000ffff + +#define TSI721_PC2SR_INTE 0x41310 + +#define TSI721_DEVCTL 0x48004 +#define TSI721_DEVCTL_SRBOOT_CMPL 0x00000004 + +#define TSI721_I2C_INT_ENABLE 0x49120 + +/* + * Block DMA Engine Registers + * x = 0..7 + */ + +#define TSI721_DMAC_DWRCNT(x) (0x51000 + (x) * 0x1000) +#define TSI721_DMAC_DRDCNT(x) (0x51004 + (x) * 0x1000) + +#define TSI721_DMAC_CTL(x) (0x51008 + (x) * 0x1000) +#define TSI721_DMAC_CTL_SUSP 0x00000002 +#define TSI721_DMAC_CTL_INIT 0x00000001 + +#define TSI721_DMAC_INT(x) (0x5100c + (x) * 0x1000) +#define TSI721_DMAC_INT_STFULL 0x00000010 +#define TSI721_DMAC_INT_DONE 0x00000008 +#define TSI721_DMAC_INT_SUSP 0x00000004 +#define TSI721_DMAC_INT_ERR 0x00000002 +#define TSI721_DMAC_INT_IOFDONE 0x00000001 +#define TSI721_DMAC_INT_ALL 0x0000001f + +#define TSI721_DMAC_INTSET(x) (0x51010 + (x) * 0x1000) + +#define TSI721_DMAC_STS(x) (0x51014 + (x) * 0x1000) +#define TSI721_DMAC_STS_ABORT 0x00400000 +#define TSI721_DMAC_STS_RUN 0x00200000 +#define TSI721_DMAC_STS_CS 0x001f0000 + +#define TSI721_DMAC_INTE(x) (0x51018 + (x) * 0x1000) + +#define TSI721_DMAC_DPTRL(x) (0x51024 + (x) * 0x1000) +#define TSI721_DMAC_DPTRL_MASK 0xffffffe0 + +#define TSI721_DMAC_DPTRH(x) (0x51028 + (x) * 0x1000) + +#define TSI721_DMAC_DSBL(x) (0x5102c + (x) * 0x1000) +#define TSI721_DMAC_DSBL_MASK 0xffffffc0 + +#define TSI721_DMAC_DSBH(x) (0x51030 + (x) * 0x1000) + +#define TSI721_DMAC_DSSZ(x) (0x51034 + (x) * 0x1000) +#define TSI721_DMAC_DSSZ_SIZE_M 0x0000000f +#define TSI721_DMAC_DSSZ_SIZE(size) (__fls(size) - 4) + + +#define TSI721_DMAC_DSRP(x) (0x51038 + (x) * 0x1000) +#define TSI721_DMAC_DSRP_MASK 0x0007ffff + +#define TSI721_DMAC_DSWP(x) (0x5103c + (x) * 0x1000) +#define TSI721_DMAC_DSWP_MASK 0x0007ffff + +#define TSI721_BDMA_INTE 0x5f000 + +/* + * Messaging definitions + */ +#define TSI721_MSG_BUFFER_SIZE RIO_MAX_MSG_SIZE +#define TSI721_MSG_MAX_SIZE RIO_MAX_MSG_SIZE +#define TSI721_IMSG_MAXCH 8 +#define TSI721_IMSG_CHNUM TSI721_IMSG_MAXCH +#define TSI721_IMSGD_MIN_RING_SIZE 32 +#define TSI721_IMSGD_RING_SIZE 512 + +#define TSI721_OMSG_CHNUM 4 /* One channel per MBOX */ +#define TSI721_OMSGD_MIN_RING_SIZE 32 +#define TSI721_OMSGD_RING_SIZE 512 + +/* + * Outbound Messaging Engine Registers + * x = 0..7 + */ + +#define TSI721_OBDMAC_DWRCNT(x) (0x61000 + (x) * 0x1000) + +#define TSI721_OBDMAC_DRDCNT(x) (0x61004 + (x) * 0x1000) + +#define TSI721_OBDMAC_CTL(x) (0x61008 + (x) * 0x1000) +#define TSI721_OBDMAC_CTL_MASK 0x00000007 +#define TSI721_OBDMAC_CTL_RETRY_THR 0x00000004 +#define TSI721_OBDMAC_CTL_SUSPEND 0x00000002 +#define TSI721_OBDMAC_CTL_INIT 0x00000001 + +#define TSI721_OBDMAC_INT(x) (0x6100c + (x) * 0x1000) +#define TSI721_OBDMAC_INTSET(x) (0x61010 + (x) * 0x1000) +#define TSI721_OBDMAC_INTE(x) (0x61018 + (x) * 0x1000) +#define TSI721_OBDMAC_INT_MASK 0x0000001F +#define TSI721_OBDMAC_INT_ST_FULL 0x00000010 +#define TSI721_OBDMAC_INT_DONE 0x00000008 +#define TSI721_OBDMAC_INT_SUSPENDED 0x00000004 +#define TSI721_OBDMAC_INT_ERROR 0x00000002 +#define TSI721_OBDMAC_INT_IOF_DONE 0x00000001 +#define TSI721_OBDMAC_INT_ALL TSI721_OBDMAC_INT_MASK + +#define TSI721_OBDMAC_STS(x) (0x61014 + (x) * 0x1000) +#define TSI721_OBDMAC_STS_MASK 0x007f0000 +#define TSI721_OBDMAC_STS_ABORT 0x00400000 +#define TSI721_OBDMAC_STS_RUN 0x00200000 +#define TSI721_OBDMAC_STS_CS 0x001f0000 + +#define TSI721_OBDMAC_PWE(x) (0x6101c + (x) * 0x1000) +#define TSI721_OBDMAC_PWE_MASK 0x00000002 +#define TSI721_OBDMAC_PWE_ERROR_EN 0x00000002 + +#define TSI721_OBDMAC_DPTRL(x) (0x61020 + (x) * 0x1000) +#define TSI721_OBDMAC_DPTRL_MASK 0xfffffff0 + +#define TSI721_OBDMAC_DPTRH(x) (0x61024 + (x) * 0x1000) +#define TSI721_OBDMAC_DPTRH_MASK 0xffffffff + +#define TSI721_OBDMAC_DSBL(x) (0x61040 + (x) * 0x1000) +#define TSI721_OBDMAC_DSBL_MASK 0xffffffc0 + +#define TSI721_OBDMAC_DSBH(x) (0x61044 + (x) * 0x1000) +#define TSI721_OBDMAC_DSBH_MASK 0xffffffff + +#define TSI721_OBDMAC_DSSZ(x) (0x61048 + (x) * 0x1000) +#define TSI721_OBDMAC_DSSZ_MASK 0x0000000f + +#define TSI721_OBDMAC_DSRP(x) (0x6104c + (x) * 0x1000) +#define TSI721_OBDMAC_DSRP_MASK 0x0007ffff + +#define TSI721_OBDMAC_DSWP(x) (0x61050 + (x) * 0x1000) +#define TSI721_OBDMAC_DSWP_MASK 0x0007ffff + +#define TSI721_RQRPTO 0x60010 +#define TSI721_RQRPTO_MASK 0x00ffffff +#define TSI721_RQRPTO_VAL 400 /* Response TO value */ + +/* + * Inbound Messaging Engine Registers + * x = 0..7 + */ + +#define TSI721_IB_DEVID_GLOBAL 0xffff +#define TSI721_IBDMAC_FQBL(x) (0x61200 + (x) * 0x1000) +#define TSI721_IBDMAC_FQBL_MASK 0xffffffc0 + +#define TSI721_IBDMAC_FQBH(x) (0x61204 + (x) * 0x1000) +#define TSI721_IBDMAC_FQBH_MASK 0xffffffff + +#define TSI721_IBDMAC_FQSZ_ENTRY_INX TSI721_IMSGD_RING_SIZE +#define TSI721_IBDMAC_FQSZ(x) (0x61208 + (x) * 0x1000) +#define TSI721_IBDMAC_FQSZ_MASK 0x0000000f + +#define TSI721_IBDMAC_FQRP(x) (0x6120c + (x) * 0x1000) +#define TSI721_IBDMAC_FQRP_MASK 0x0007ffff + +#define TSI721_IBDMAC_FQWP(x) (0x61210 + (x) * 0x1000) +#define TSI721_IBDMAC_FQWP_MASK 0x0007ffff + +#define TSI721_IBDMAC_FQTH(x) (0x61214 + (x) * 0x1000) +#define TSI721_IBDMAC_FQTH_MASK 0x0007ffff + +#define TSI721_IB_DEVID 0x60020 +#define TSI721_IB_DEVID_MASK 0x0000ffff + +#define TSI721_IBDMAC_CTL(x) (0x61240 + (x) * 0x1000) +#define TSI721_IBDMAC_CTL_MASK 0x00000003 +#define TSI721_IBDMAC_CTL_SUSPEND 0x00000002 +#define TSI721_IBDMAC_CTL_INIT 0x00000001 + +#define TSI721_IBDMAC_STS(x) (0x61244 + (x) * 0x1000) +#define TSI721_IBDMAC_STS_MASK 0x007f0000 +#define TSI721_IBSMAC_STS_ABORT 0x00400000 +#define TSI721_IBSMAC_STS_RUN 0x00200000 +#define TSI721_IBSMAC_STS_CS 0x001f0000 + +#define TSI721_IBDMAC_INT(x) (0x61248 + (x) * 0x1000) +#define TSI721_IBDMAC_INTSET(x) (0x6124c + (x) * 0x1000) +#define TSI721_IBDMAC_INTE(x) (0x61250 + (x) * 0x1000) +#define TSI721_IBDMAC_INT_MASK 0x0000100f +#define TSI721_IBDMAC_INT_SRTO 0x00001000 +#define TSI721_IBDMAC_INT_SUSPENDED 0x00000008 +#define TSI721_IBDMAC_INT_PC_ERROR 0x00000004 +#define TSI721_IBDMAC_INT_FQ_LOW 0x00000002 +#define TSI721_IBDMAC_INT_DQ_RCV 0x00000001 +#define TSI721_IBDMAC_INT_ALL TSI721_IBDMAC_INT_MASK + +#define TSI721_IBDMAC_PWE(x) (0x61254 + (x) * 0x1000) +#define TSI721_IBDMAC_PWE_MASK 0x00001700 +#define TSI721_IBDMAC_PWE_SRTO 0x00001000 +#define TSI721_IBDMAC_PWE_ILL_FMT 0x00000400 +#define TSI721_IBDMAC_PWE_ILL_DEC 0x00000200 +#define TSI721_IBDMAC_PWE_IMP_SP 0x00000100 + +#define TSI721_IBDMAC_DQBL(x) (0x61300 + (x) * 0x1000) +#define TSI721_IBDMAC_DQBL_MASK 0xffffffc0 +#define TSI721_IBDMAC_DQBL_ADDR 0xffffffc0 + +#define TSI721_IBDMAC_DQBH(x) (0x61304 + (x) * 0x1000) +#define TSI721_IBDMAC_DQBH_MASK 0xffffffff + +#define TSI721_IBDMAC_DQRP(x) (0x61308 + (x) * 0x1000) +#define TSI721_IBDMAC_DQRP_MASK 0x0007ffff + +#define TSI721_IBDMAC_DQWR(x) (0x6130c + (x) * 0x1000) +#define TSI721_IBDMAC_DQWR_MASK 0x0007ffff + +#define TSI721_IBDMAC_DQSZ(x) (0x61314 + (x) * 0x1000) +#define TSI721_IBDMAC_DQSZ_MASK 0x0000000f + +/* + * Messaging Engine Interrupts + */ + +#define TSI721_SMSG_PWE 0x6a004 + +#define TSI721_SMSG_INTE 0x6a000 +#define TSI721_SMSG_INT 0x6a008 +#define TSI721_SMSG_INTSET 0x6a010 +#define TSI721_SMSG_INT_MASK 0x0086ffff +#define TSI721_SMSG_INT_UNS_RSP 0x00800000 +#define TSI721_SMSG_INT_ECC_NCOR 0x00040000 +#define TSI721_SMSG_INT_ECC_COR 0x00020000 +#define TSI721_SMSG_INT_ECC_NCOR_CH 0x0000ff00 +#define TSI721_SMSG_INT_ECC_COR_CH 0x000000ff + +#define TSI721_SMSG_ECC_LOG 0x6a014 +#define TSI721_SMSG_ECC_LOG_MASK 0x00070007 +#define TSI721_SMSG_ECC_LOG_ECC_NCOR_M 0x00070000 +#define TSI721_SMSG_ECC_LOG_ECC_COR_M 0x00000007 + +#define TSI721_RETRY_GEN_CNT 0x6a100 +#define TSI721_RETRY_GEN_CNT_MASK 0xffffffff + +#define TSI721_RETRY_RX_CNT 0x6a104 +#define TSI721_RETRY_RX_CNT_MASK 0xffffffff + +#define TSI721_SMSG_ECC_COR_LOG(x) (0x6a300 + (x) * 4) +#define TSI721_SMSG_ECC_COR_LOG_MASK 0x000000ff + +#define TSI721_SMSG_ECC_NCOR(x) (0x6a340 + (x) * 4) +#define TSI721_SMSG_ECC_NCOR_MASK 0x000000ff + +/* + * Block DMA Descriptors + */ + +struct tsi721_dma_desc { + __le32 type_id; + +#define TSI721_DMAD_DEVID 0x0000ffff +#define TSI721_DMAD_CRF 0x00010000 +#define TSI721_DMAD_PRIO 0x00060000 +#define TSI721_DMAD_RTYPE 0x00780000 +#define TSI721_DMAD_IOF 0x08000000 +#define TSI721_DMAD_DTYPE 0xe0000000 + + __le32 bcount; + +#define TSI721_DMAD_BCOUNT1 0x03ffffff /* if DTYPE == 1 */ +#define TSI721_DMAD_BCOUNT2 0x0000000f /* if DTYPE == 2 */ +#define TSI721_DMAD_TT 0x0c000000 +#define TSI721_DMAD_RADDR0 0xc0000000 + + union { + __le32 raddr_lo; /* if DTYPE == (1 || 2) */ + __le32 next_lo; /* if DTYPE == 3 */ + }; + +#define TSI721_DMAD_CFGOFF 0x00ffffff +#define TSI721_DMAD_HOPCNT 0xff000000 + + union { + __le32 raddr_hi; /* if DTYPE == (1 || 2) */ + __le32 next_hi; /* if DTYPE == 3 */ + }; + + union { + struct { /* if DTYPE == 1 */ + __le32 bufptr_lo; + __le32 bufptr_hi; + __le32 s_dist; + __le32 s_size; + } t1; + __le32 data[4]; /* if DTYPE == 2 */ + u32 reserved[4]; /* if DTYPE == 3 */ + }; +} __aligned(32); + +/* + * Inbound Messaging Descriptor + */ +struct tsi721_imsg_desc { + __le32 type_id; + +#define TSI721_IMD_DEVID 0x0000ffff +#define TSI721_IMD_CRF 0x00010000 +#define TSI721_IMD_PRIO 0x00060000 +#define TSI721_IMD_TT 0x00180000 +#define TSI721_IMD_DTYPE 0xe0000000 + + __le32 msg_info; + +#define TSI721_IMD_BCOUNT 0x00000ff8 +#define TSI721_IMD_SSIZE 0x0000f000 +#define TSI721_IMD_LETER 0x00030000 +#define TSI721_IMD_XMBOX 0x003c0000 +#define TSI721_IMD_MBOX 0x00c00000 +#define TSI721_IMD_CS 0x78000000 +#define TSI721_IMD_HO 0x80000000 + + __le32 bufptr_lo; + __le32 bufptr_hi; + u32 reserved[12]; + +} __aligned(64); + +/* + * Outbound Messaging Descriptor + */ +struct tsi721_omsg_desc { + __le32 type_id; + +#define TSI721_OMD_DEVID 0x0000ffff +#define TSI721_OMD_CRF 0x00010000 +#define TSI721_OMD_PRIO 0x00060000 +#define TSI721_OMD_IOF 0x08000000 +#define TSI721_OMD_DTYPE 0xe0000000 +#define TSI721_OMD_RSRVD 0x17f80000 + + __le32 msg_info; + +#define TSI721_OMD_BCOUNT 0x00000ff8 +#define TSI721_OMD_SSIZE 0x0000f000 +#define TSI721_OMD_LETER 0x00030000 +#define TSI721_OMD_XMBOX 0x003c0000 +#define TSI721_OMD_MBOX 0x00c00000 +#define TSI721_OMD_TT 0x0c000000 + + union { + __le32 bufptr_lo; /* if DTYPE == 4 */ + __le32 next_lo; /* if DTYPE == 5 */ + }; + + union { + __le32 bufptr_hi; /* if DTYPE == 4 */ + __le32 next_hi; /* if DTYPE == 5 */ + }; + +} __aligned(16); + +struct tsi721_dma_sts { + __le64 desc_sts[8]; +} __aligned(64); + +struct tsi721_desc_sts_fifo { + union { + __le64 da64; + struct { + __le32 lo; + __le32 hi; + } da32; + } stat[8]; +} __aligned(64); + +/* Descriptor types for BDMA and Messaging blocks */ +enum dma_dtype { + DTYPE1 = 1, /* Data Transfer DMA Descriptor */ + DTYPE2 = 2, /* Immediate Data Transfer DMA Descriptor */ + DTYPE3 = 3, /* Block Pointer DMA Descriptor */ + DTYPE4 = 4, /* Outbound Msg DMA Descriptor */ + DTYPE5 = 5, /* OB Messaging Block Pointer Descriptor */ + DTYPE6 = 6 /* Inbound Messaging Descriptor */ +}; + +enum dma_rtype { + NREAD = 0, + LAST_NWRITE_R = 1, + ALL_NWRITE = 2, + ALL_NWRITE_R = 3, + MAINT_RD = 4, + MAINT_WR = 5 +}; + +/* + * mport Driver Definitions + */ +#define TSI721_DMA_CHNUM TSI721_DMA_MAXCH + +#define TSI721_DMACH_MAINT 0 /* DMA channel for maint requests */ +#define TSI721_DMACH_MAINT_NBD 32 /* Number of BDs for maint requests */ + +#define MSG_DMA_ENTRY_INX_TO_SIZE(x) ((0x10 << (x)) & 0xFFFF0) + +enum tsi721_smsg_int_flag { + SMSG_INT_NONE = 0x00000000, + SMSG_INT_ECC_COR_CH = 0x000000ff, + SMSG_INT_ECC_NCOR_CH = 0x0000ff00, + SMSG_INT_ECC_COR = 0x00020000, + SMSG_INT_ECC_NCOR = 0x00040000, + SMSG_INT_UNS_RSP = 0x00800000, + SMSG_INT_ALL = 0x0006ffff +}; + +/* Structures */ + +struct tsi721_bdma_chan { + int bd_num; /* number of buffer descriptors */ + void *bd_base; /* start of DMA descriptors */ + dma_addr_t bd_phys; + void *sts_base; /* start of DMA BD status FIFO */ + dma_addr_t sts_phys; + int sts_size; +}; + +struct tsi721_imsg_ring { + u32 size; + /* VA/PA of data buffers for incoming messages */ + void *buf_base; + dma_addr_t buf_phys; + /* VA/PA of circular free buffer list */ + void *imfq_base; + dma_addr_t imfq_phys; + /* VA/PA of Inbound message descriptors */ + void *imd_base; + dma_addr_t imd_phys; + /* Inbound Queue buffer pointers */ + void *imq_base[TSI721_IMSGD_RING_SIZE]; + + u32 rx_slot; + void *dev_id; + u32 fq_wrptr; + u32 desc_rdptr; + spinlock_t lock; +}; + +struct tsi721_omsg_ring { + u32 size; + /* VA/PA of OB Msg descriptors */ + void *omd_base; + dma_addr_t omd_phys; + /* VA/PA of OB Msg data buffers */ + void *omq_base[TSI721_OMSGD_RING_SIZE]; + dma_addr_t omq_phys[TSI721_OMSGD_RING_SIZE]; + /* VA/PA of OB Msg descriptor status FIFO */ + void *sts_base; + dma_addr_t sts_phys; + u32 sts_size; /* # of allocated status entries */ + u32 sts_rdptr; + + u32 tx_slot; + void *dev_id; + u32 wr_count; + spinlock_t lock; +}; + +enum tsi721_flags { + TSI721_USING_MSI = (1 << 0), + TSI721_USING_MSIX = (1 << 1), + TSI721_IMSGID_SET = (1 << 2), +}; + +#ifdef CONFIG_PCI_MSI +/* + * MSI-X Table Entries (0 ... 69) + */ +#define TSI721_MSIX_DMACH_DONE(x) (0 + (x)) +#define TSI721_MSIX_DMACH_INT(x) (8 + (x)) +#define TSI721_MSIX_BDMA_INT 16 +#define TSI721_MSIX_OMSG_DONE(x) (17 + (x)) +#define TSI721_MSIX_OMSG_INT(x) (25 + (x)) +#define TSI721_MSIX_IMSG_DQ_RCV(x) (33 + (x)) +#define TSI721_MSIX_IMSG_INT(x) (41 + (x)) +#define TSI721_MSIX_MSG_INT 49 +#define TSI721_MSIX_SR2PC_IDBQ_RCV(x) (50 + (x)) +#define TSI721_MSIX_SR2PC_CH_INT(x) (58 + (x)) +#define TSI721_MSIX_SR2PC_INT 66 +#define TSI721_MSIX_PC2SR_INT 67 +#define TSI721_MSIX_SRIO_MAC_INT 68 +#define TSI721_MSIX_I2C_INT 69 + +/* MSI-X vector and init table entry indexes */ +enum tsi721_msix_vect { + TSI721_VECT_IDB, + TSI721_VECT_PWRX, /* PW_RX is part of SRIO MAC Interrupt reporting */ + TSI721_VECT_OMB0_DONE, + TSI721_VECT_OMB1_DONE, + TSI721_VECT_OMB2_DONE, + TSI721_VECT_OMB3_DONE, + TSI721_VECT_OMB0_INT, + TSI721_VECT_OMB1_INT, + TSI721_VECT_OMB2_INT, + TSI721_VECT_OMB3_INT, + TSI721_VECT_IMB0_RCV, + TSI721_VECT_IMB1_RCV, + TSI721_VECT_IMB2_RCV, + TSI721_VECT_IMB3_RCV, + TSI721_VECT_IMB0_INT, + TSI721_VECT_IMB1_INT, + TSI721_VECT_IMB2_INT, + TSI721_VECT_IMB3_INT, + TSI721_VECT_MAX +}; + +#define IRQ_DEVICE_NAME_MAX 64 + +struct msix_irq { + u16 vector; + char irq_name[IRQ_DEVICE_NAME_MAX]; +}; +#endif /* CONFIG_PCI_MSI */ + +struct tsi721_device { + struct pci_dev *pdev; + struct rio_mport *mport; + u32 flags; + void __iomem *regs; +#ifdef CONFIG_PCI_MSI + struct msix_irq msix[TSI721_VECT_MAX]; +#endif + /* Doorbells */ + void __iomem *odb_base; + void *idb_base; + dma_addr_t idb_dma; + struct work_struct idb_work; + u32 db_discard_count; + + /* Inbound Port-Write */ + struct work_struct pw_work; + struct kfifo pw_fifo; + spinlock_t pw_fifo_lock; + u32 pw_discard_count; + + /* BDMA Engine */ + struct tsi721_bdma_chan bdma[TSI721_DMA_CHNUM]; + + /* Inbound Messaging */ + int imsg_init[TSI721_IMSG_CHNUM]; + struct tsi721_imsg_ring imsg_ring[TSI721_IMSG_CHNUM]; + + /* Outbound Messaging */ + int omsg_init[TSI721_OMSG_CHNUM]; + struct tsi721_omsg_ring omsg_ring[TSI721_OMSG_CHNUM]; +}; + +#endif diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h index 0cee0152aca9..b66d13d1bdc0 100644 --- a/include/linux/rio_ids.h +++ b/include/linux/rio_ids.h @@ -39,5 +39,6 @@ #define RIO_DID_IDTCPS1616 0x0379 #define RIO_DID_IDTVPS1616 0x0377 #define RIO_DID_IDTSPS1616 0x0378 +#define RIO_DID_TSI721 0x80ab #endif /* LINUX_RIO_IDS_H */ -- cgit v1.2.3 From 166c050bda80bb5dd627a287b6efcdfb68d172b4 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 2 Nov 2011 13:39:11 -0700 Subject: RapidIO: fix potential null deref in rio_setup_device() The "goto cleanup" path can deference "rswitch" when it is NULL. Reported-by: Dan Carpenter Signed-off-by: Alexandre Bounine Cc: Dan Carpenter Cc: Kumar Gala Cc: Matt Porter Cc: Chul Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 0914f49b0a53..2bebd791a092 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -516,7 +516,7 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, return rdev; cleanup: - if (rio_is_switch(rdev)) + if (rswitch) kfree(rswitch->route_table); kfree(rdev); -- cgit v1.2.3 From e0c87bd95e8dad455c23bc56513af8dcb1737e55 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 2 Nov 2011 13:39:15 -0700 Subject: drivers/net/rionet.c: fix ethernet address macros for LE platforms Modify Ethernet addess macros to be compatible with BE/LE platforms Signed-off-by: Alexandre Bounine Cc: Chul Kim Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: [2.6.39+] Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 3bb131137033..7145714a5ec9 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -88,8 +88,8 @@ static struct rio_dev **rionet_active; #define dev_rionet_capable(dev) \ is_rionet_capable(dev->src_ops, dev->dst_ops) -#define RIONET_MAC_MATCH(x) (*(u32 *)x == 0x00010001) -#define RIONET_GET_DESTID(x) (*(u16 *)(x + 4)) +#define RIONET_MAC_MATCH(x) (!memcmp((x), "\00\01\00\01", 4)) +#define RIONET_GET_DESTID(x) ((*((u8 *)x + 4) << 8) | *((u8 *)x + 5)) static int rionet_rx_clean(struct net_device *ndev) { -- cgit v1.2.3 From 088024b1deee206cd37eff980138e918837aabdb Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 2 Nov 2011 13:39:19 -0700 Subject: RapidIO: documentation update Update rapidio.txt to reflect changes from recent patch. See http://marc.info/?l=linux-kernel&m=131285620113589&w=2 for details. Signed-off-by: Alexandre Bounine Cc: Liu Gang Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/rapidio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt index be70ee15f8ca..c75694b35d08 100644 --- a/Documentation/rapidio/rapidio.txt +++ b/Documentation/rapidio/rapidio.txt @@ -144,7 +144,7 @@ and the default device ID in order to access the device on the active port. After the host has completed enumeration of the entire network it releases devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint -in the system, it sets the Master Enable bit in the Port General Control CSR +in the system, it sets the Discovered bit in the Port General Control CSR to indicate that enumeration is completed and agents are allowed to execute passive discovery of the network. -- cgit v1.2.3 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds a infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 22 ++++++++++++++++++++++ include/linux/utsname.h | 16 ++++++++++++++++ kernel/sys.c | 2 ++ kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++ 5 files changed, 108 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dacd840a675a..df594803f45a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include #include @@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned long event = (unsigned long)filp->private_data; + unsigned int ret = DEFAULT_POLLMASK; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + return ret; +} static int proc_sys_fill_cache(struct file *filp, void *dirent, filldir_t filldir, @@ -364,6 +407,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, .llseek = default_llseek, diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 9a1ec10fd504..703cfa33a3ca 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -931,6 +931,7 @@ enum #ifdef __KERNEL__ #include #include +#include /* For the /proc/sys support */ struct ctl_table; @@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, * cover common cases. */ +/* Support for userspace poll() to watch for changes */ +struct ctl_table_poll { + atomic_t event; + wait_queue_head_t wait; +}; + +static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) +{ + return (void *)(unsigned long)atomic_read(&poll->event); +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +#define __CTL_TABLE_POLL_INITIALIZER(name) { \ + .event = ATOMIC_INIT(0), \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } + +#define DEFINE_CTL_TABLE_POLL(name) \ + struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) + /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { @@ -1021,6 +1042,7 @@ struct ctl_table struct ctl_table *child; struct ctl_table *parent; /* Automatically set */ proc_handler *proc_handler; /* Callback for text formatting */ + struct ctl_table_poll *poll; void *extra1; void *extra2; }; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 4e5b0213fdc1..c714ed75eae2 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -37,6 +37,14 @@ struct new_utsname { #include #include +enum uts_proc { + UTS_PROC_OSTYPE, + UTS_PROC_OSRELEASE, + UTS_PROC_VERSION, + UTS_PROC_HOSTNAME, + UTS_PROC_DOMAINNAME, +}; + struct user_namespace; extern struct user_namespace init_user_ns; @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, } #endif +#ifdef CONFIG_PROC_SYSCTL +extern void uts_proc_notify(enum uts_proc proc); +#else +static inline void uts_proc_notify(enum uts_proc proc) +{ +} +#endif + static inline struct new_utsname *utsname(void) { return ¤t->nsproxy->uts_ns->name; diff --git a/kernel/sys.c b/kernel/sys.c index 58459509b14c..d06c091e0345 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..3b0d48ebf81d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -13,6 +13,7 @@ #include #include #include +#include static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); -- cgit v1.2.3 From c736de60aed869df8a9aba512cdaf89e32545b00 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 2 Nov 2011 13:39:25 -0700 Subject: sysctl: make CONFIG_SYSCTL_SYSCALL default to n When I tried to send a patch to remove it, Andi told me we still need to keep compabitlies for old libc, so we can't remove this completely. Then just make it default to n and remove the doc from feature-removal-schedule.txt. Signed-off-by: WANG Cong Cc: Eric Biederman Cc: Andi Kleen Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 35 ------------------------------ init/Kconfig | 4 ++-- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7c799fc5b88e..3d849122b5b1 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -133,41 +133,6 @@ Who: Pavel Machek --------------------------- -What: sys_sysctl -When: September 2010 -Option: CONFIG_SYSCTL_SYSCALL -Why: The same information is available in a more convenient from - /proc/sys, and none of the sysctl variables appear to be - important performance wise. - - Binary sysctls are a long standing source of subtle kernel - bugs and security issues. - - When I looked several months ago all I could find after - searching several distributions were 5 user space programs and - glibc (which falls back to /proc/sys) using this syscall. - - The man page for sysctl(2) documents it as unusable for user - space programs. - - sysctl(2) is not generally ABI compatible to a 32bit user - space application on a 64bit and a 32bit kernel. - - For the last several months the policy has been no new binary - sysctls and no one has put forward an argument to use them. - - Binary sysctls issues seem to keep happening appearing so - properly deprecating them (with a warning to user space) and a - 2 year grace warning period will mean eventually we can kill - them and end the pain. - - In the mean time individual binary sysctls can be dealt with - in a piecewise fashion. - -Who: Eric Biederman - ---------------------------- - What: /proc//oom_adj When: August 2012 Why: /proc//oom_adj allows userspace to influence the oom killer's diff --git a/init/Kconfig b/init/Kconfig index 31ba0fd0f36b..43298f9810fb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -947,7 +947,7 @@ config UID16 config SYSCTL_SYSCALL bool "Sysctl syscall support" if EXPERT depends on PROC_SYSCTL - default y + default n select SYSCTL ---help--- sys_sysctl uses binary paths that have been found challenging @@ -959,7 +959,7 @@ config SYSCTL_SYSCALL trying to save some space it is probably safe to disable this, making your kernel marginally smaller. - If unsure say Y here. + If unsure say N here. config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT -- cgit v1.2.3 From 842fa69f3e0c9a178b294e7af7c07f4c9d9e7af2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Nov 2011 13:39:33 -0700 Subject: include/linux/dma-mapping.h: add dma_zalloc_coherent() Lots of driver code does a dma_alloc_coherent() and then zeroes out the memory with a memset. Make it easy for them. Cc: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DMA-API.txt | 7 +++++++ include/linux/dma-mapping.h | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index fe2326906610..66bd97a95f10 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -50,6 +50,13 @@ specify the GFP_ flags (see kmalloc) for the allocation (the implementation may choose to ignore flags that affect the location of the returned memory, like GFP_DMA). +void * +dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) + +Wraps dma_alloc_coherent() and also zeroes the returned memory if the +allocation attempt succeeded. + void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 347fdc32177a..be86ae13893f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -1,6 +1,7 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H +#include #include #include #include @@ -117,6 +118,15 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline void *dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = dma_alloc_coherent(dev, size, dma_handle, flag); + if (ret) + memset(ret, 0, size); + return ret; +} + #ifdef CONFIG_HAS_DMA static inline int dma_get_cache_alignment(void) { -- cgit v1.2.3 From 437c53418616973071fd2d7c87497780944d8fdb Mon Sep 17 00:00:00 2001 From: James Nuss Date: Wed, 2 Nov 2011 13:39:34 -0700 Subject: pps: default echo function A default echo function has been provided so it is no longer an error when you specify PPS_ECHOASSERT or PPS_ECHOCLEAR without an explicit echo function. This allows some code re-use and also makes it easier to write client drivers since the default echo function does not normally need to change. Signed-off-by: James Nuss Reviewed-by: Ben Gardiner Acked-by: Rodolfo Giometti Cc: Ricardo Martins Cc: Alexander Gordeev Cc: Igor Plyatov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/clients/pps-ktimer.c | 12 ------------ drivers/pps/clients/pps_parport.c | 9 --------- drivers/pps/kapi.c | 20 +++++++++++++------- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c index 82583b0ff82d..436b4e4e71a1 100644 --- a/drivers/pps/clients/pps-ktimer.c +++ b/drivers/pps/clients/pps-ktimer.c @@ -51,17 +51,6 @@ static void pps_ktimer_event(unsigned long ptr) mod_timer(&ktimer, jiffies + HZ); } -/* - * The echo function - */ - -static void pps_ktimer_echo(struct pps_device *pps, int event, void *data) -{ - dev_info(pps->dev, "echo %s %s\n", - event & PPS_CAPTUREASSERT ? "assert" : "", - event & PPS_CAPTURECLEAR ? "clear" : ""); -} - /* * The PPS info struct */ @@ -72,7 +61,6 @@ static struct pps_source_info pps_ktimer_info = { .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | PPS_ECHOASSERT | PPS_CANWAIT | PPS_TSFMT_TSPEC, - .echo = pps_ktimer_echo, .owner = THIS_MODULE, }; diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index c571d6dd8f61..e1b4705ae3ec 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -133,14 +133,6 @@ out_both: return; } -/* the PPS echo function */ -static void pps_echo(struct pps_device *pps, int event, void *data) -{ - dev_info(pps->dev, "echo %s %s\n", - event & PPS_CAPTUREASSERT ? "assert" : "", - event & PPS_CAPTURECLEAR ? "clear" : ""); -} - static void parport_attach(struct parport *port) { struct pps_client_pp *device; @@ -151,7 +143,6 @@ static void parport_attach(struct parport *port) PPS_OFFSETASSERT | PPS_OFFSETCLEAR | \ PPS_ECHOASSERT | PPS_ECHOCLEAR | \ PPS_CANWAIT | PPS_TSFMT_TSPEC, - .echo = pps_echo, .owner = THIS_MODULE, .dev = NULL }; diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c index a4e8eb9fece6..f197e8ea185c 100644 --- a/drivers/pps/kapi.c +++ b/drivers/pps/kapi.c @@ -52,6 +52,14 @@ static void pps_add_offset(struct pps_ktime *ts, struct pps_ktime *offset) ts->sec += offset->sec; } +static void pps_echo_client_default(struct pps_device *pps, int event, + void *data) +{ + dev_info(pps->dev, "echo %s %s\n", + event & PPS_CAPTUREASSERT ? "assert" : "", + event & PPS_CAPTURECLEAR ? "clear" : ""); +} + /* * Exported functions */ @@ -80,13 +88,6 @@ struct pps_device *pps_register_source(struct pps_source_info *info, err = -EINVAL; goto pps_register_source_exit; } - if ((info->mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) != 0 && - info->echo == NULL) { - pr_err("%s: echo function is not defined\n", - info->name); - err = -EINVAL; - goto pps_register_source_exit; - } if ((info->mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) { pr_err("%s: unspecified time format\n", info->name); @@ -108,6 +109,11 @@ struct pps_device *pps_register_source(struct pps_source_info *info, pps->params.mode = default_params; pps->info = *info; + /* check for default echo function */ + if ((pps->info.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) && + pps->info.echo == NULL) + pps->info.echo = pps_echo_client_default; + init_waitqueue_head(&pps->queue); spin_lock_init(&pps->lock); -- cgit v1.2.3 From 161520451dfacd0eb79d501933f47d3fb7464938 Mon Sep 17 00:00:00 2001 From: James Nuss Date: Wed, 2 Nov 2011 13:39:38 -0700 Subject: pps: new client driver using GPIO This client driver allows you to use a GPIO pin as a source for PPS signals. Platform data [1] are used to specify the GPIO pin number, label, assert event edge type, and whether clear events are captured. This driver is based on the work by Ricardo Martins who submitted an initial implementation [2] of a PPS IRQ client driver to the linuxpps mailing-list on Dec 3 2010. [1] include/linux/pps-gpio.h [2] http://ml.enneenne.com/pipermail/linuxpps/2010-December/004155.html [akpm@linux-foundation.org: remove unneeded cast of void*] Signed-off-by: James Nuss Cc: Ricardo Martins Acked-by: Rodolfo Giometti Signed-off-by: Ricardo Martins Cc: Alexander Gordeev Cc: Igor Plyatov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/clients/Kconfig | 9 ++ drivers/pps/clients/Makefile | 1 + drivers/pps/clients/pps-gpio.c | 227 +++++++++++++++++++++++++++++++++++++++++ include/linux/pps-gpio.h | 32 ++++++ 4 files changed, 269 insertions(+) create mode 100644 drivers/pps/clients/pps-gpio.c create mode 100644 include/linux/pps-gpio.h diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig index 8520a7f4dd62..c2e0f1e97bcd 100644 --- a/drivers/pps/clients/Kconfig +++ b/drivers/pps/clients/Kconfig @@ -29,4 +29,13 @@ config PPS_CLIENT_PARPORT If you say yes here you get support for a PPS source connected with the interrupt pin of your parallel port. +config PPS_CLIENT_GPIO + tristate "PPS client using GPIO" + depends on PPS + help + If you say yes here you get support for a PPS source using + GPIO. To be useful you must also register a platform device + specifying the GPIO pin and other options, usually in your board + setup. + endif diff --git a/drivers/pps/clients/Makefile b/drivers/pps/clients/Makefile index 4feb7e9e71ee..a461d15f4a2e 100644 --- a/drivers/pps/clients/Makefile +++ b/drivers/pps/clients/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_PPS_CLIENT_KTIMER) += pps-ktimer.o obj-$(CONFIG_PPS_CLIENT_LDISC) += pps-ldisc.o obj-$(CONFIG_PPS_CLIENT_PARPORT) += pps_parport.o +obj-$(CONFIG_PPS_CLIENT_GPIO) += pps-gpio.o ccflags-$(CONFIG_PPS_DEBUG) := -DDEBUG diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c new file mode 100644 index 000000000000..655055545479 --- /dev/null +++ b/drivers/pps/clients/pps-gpio.c @@ -0,0 +1,227 @@ +/* + * pps-gpio.c -- PPS client driver using GPIO + * + * + * Copyright (C) 2010 Ricardo Martins + * Copyright (C) 2011 James Nuss + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define PPS_GPIO_NAME "pps-gpio" +#define pr_fmt(fmt) PPS_GPIO_NAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Info for each registered platform device */ +struct pps_gpio_device_data { + int irq; /* IRQ used as PPS source */ + struct pps_device *pps; /* PPS source device */ + struct pps_source_info info; /* PPS source information */ + const struct pps_gpio_platform_data *pdata; +}; + +/* + * Report the PPS event + */ + +static irqreturn_t pps_gpio_irq_handler(int irq, void *data) +{ + const struct pps_gpio_device_data *info; + struct pps_event_time ts; + int rising_edge; + + /* Get the time stamp first */ + pps_get_ts(&ts); + + info = data; + + rising_edge = gpio_get_value(info->pdata->gpio_pin); + if ((rising_edge && !info->pdata->assert_falling_edge) || + (!rising_edge && info->pdata->assert_falling_edge)) + pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL); + else if (info->pdata->capture_clear && + ((rising_edge && info->pdata->assert_falling_edge) || + (!rising_edge && !info->pdata->assert_falling_edge))) + pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL); + + return IRQ_HANDLED; +} + +static int pps_gpio_setup(struct platform_device *pdev) +{ + int ret; + const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; + + ret = gpio_request(pdata->gpio_pin, pdata->gpio_label); + if (ret) { + pr_warning("failed to request GPIO %u\n", pdata->gpio_pin); + return -EINVAL; + } + + ret = gpio_direction_input(pdata->gpio_pin); + if (ret) { + pr_warning("failed to set pin direction\n"); + gpio_free(pdata->gpio_pin); + return -EINVAL; + } + + return 0; +} + +static unsigned long +get_irqf_trigger_flags(const struct pps_gpio_platform_data *pdata) +{ + unsigned long flags = pdata->assert_falling_edge ? + IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; + + if (pdata->capture_clear) { + flags |= ((flags & IRQF_TRIGGER_RISING) ? + IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING); + } + + return flags; +} + +static int pps_gpio_probe(struct platform_device *pdev) +{ + struct pps_gpio_device_data *data; + int irq; + int ret; + int err; + int pps_default_params; + const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; + + + /* GPIO setup */ + ret = pps_gpio_setup(pdev); + if (ret) + return -EINVAL; + + /* IRQ setup */ + irq = gpio_to_irq(pdata->gpio_pin); + if (irq < 0) { + pr_err("failed to map GPIO to IRQ: %d\n", irq); + err = -EINVAL; + goto return_error; + } + + /* allocate space for device info */ + data = kzalloc(sizeof(struct pps_gpio_device_data), GFP_KERNEL); + if (data == NULL) { + err = -ENOMEM; + goto return_error; + } + + /* initialize PPS specific parts of the bookkeeping data structure. */ + data->info.mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | + PPS_ECHOASSERT | PPS_CANWAIT | PPS_TSFMT_TSPEC; + if (pdata->capture_clear) + data->info.mode |= PPS_CAPTURECLEAR | PPS_OFFSETCLEAR | + PPS_ECHOCLEAR; + data->info.owner = THIS_MODULE; + snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d", + pdev->name, pdev->id); + + /* register PPS source */ + pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT; + if (pdata->capture_clear) + pps_default_params |= PPS_CAPTURECLEAR | PPS_OFFSETCLEAR; + data->pps = pps_register_source(&data->info, pps_default_params); + if (data->pps == NULL) { + kfree(data); + pr_err("failed to register IRQ %d as PPS source\n", irq); + err = -EINVAL; + goto return_error; + } + + data->irq = irq; + data->pdata = pdata; + + /* register IRQ interrupt handler */ + ret = request_irq(irq, pps_gpio_irq_handler, + get_irqf_trigger_flags(pdata), data->info.name, data); + if (ret) { + pps_unregister_source(data->pps); + kfree(data); + pr_err("failed to acquire IRQ %d\n", irq); + err = -EINVAL; + goto return_error; + } + + platform_set_drvdata(pdev, data); + dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", irq); + + return 0; + +return_error: + gpio_free(pdata->gpio_pin); + return err; +} + +static int pps_gpio_remove(struct platform_device *pdev) +{ + struct pps_gpio_device_data *data = platform_get_drvdata(pdev); + const struct pps_gpio_platform_data *pdata = data->pdata; + + platform_set_drvdata(pdev, NULL); + free_irq(data->irq, data); + gpio_free(pdata->gpio_pin); + pps_unregister_source(data->pps); + pr_info("removed IRQ %d as PPS source\n", data->irq); + kfree(data); + return 0; +} + +static struct platform_driver pps_gpio_driver = { + .probe = pps_gpio_probe, + .remove = __devexit_p(pps_gpio_remove), + .driver = { + .name = PPS_GPIO_NAME, + .owner = THIS_MODULE + }, +}; + +static int __init pps_gpio_init(void) +{ + int ret = platform_driver_register(&pps_gpio_driver); + if (ret < 0) + pr_err("failed to register platform driver\n"); + return ret; +} + +static void __exit pps_gpio_exit(void) +{ + platform_driver_unregister(&pps_gpio_driver); + pr_debug("unregistered platform driver\n"); +} + +module_init(pps_gpio_init); +module_exit(pps_gpio_exit); + +MODULE_AUTHOR("Ricardo Martins "); +MODULE_AUTHOR("James Nuss "); +MODULE_DESCRIPTION("Use GPIO pin as PPS source"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h new file mode 100644 index 000000000000..0035abe41b9a --- /dev/null +++ b/include/linux/pps-gpio.h @@ -0,0 +1,32 @@ +/* + * pps-gpio.h -- PPS client for GPIOs + * + * + * Copyright (C) 2011 James Nuss + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _PPS_GPIO_H +#define _PPS_GPIO_H + +struct pps_gpio_platform_data { + bool assert_falling_edge; + bool capture_clear; + unsigned int gpio_pin; + const char *gpio_label; +}; + +#endif -- cgit v1.2.3 From 79bc57463be2ad5020a53accbf26898e8ac04550 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Nov 2011 13:39:41 -0700 Subject: pps gpio client: add missing dependency Add "depends on GENERIC_HARDIRQS" to avoid compile breakage on s390: drivers/built-in.o: In function `pps_gpio_remove': linux-next/drivers/pps/clients/pps-gpio.c:189: undefined reference to `free_irq' Signed-off-by: Heiko Carstens Cc: James Nuss Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/clients/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig index c2e0f1e97bcd..445197d4a8c4 100644 --- a/drivers/pps/clients/Kconfig +++ b/drivers/pps/clients/Kconfig @@ -31,7 +31,7 @@ config PPS_CLIENT_PARPORT config PPS_CLIENT_GPIO tristate "PPS client using GPIO" - depends on PPS + depends on PPS && GENERIC_HARDIRQS help If you say yes here you get support for a PPS source using GPIO. To be useful you must also register a platform device -- cgit v1.2.3 From 3e5428177c74df7f3b8c59b2f27f46b82b077e94 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 2 Nov 2011 13:39:43 -0700 Subject: w1: ds2760 and ds2780, use ida for id and ida_simple_get() to get it Straightforward. As an aside, the ida_init calls are not needed as far as I can see needed. (DEFINE_IDA does the same already). Signed-off-by: Jonathan Cameron Cc: Evgeniy Polyakov Acked-by: Clifton Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/slaves/w1_ds2760.c | 48 ++++++------------------------------------- drivers/w1/slaves/w1_ds2780.c | 48 ++++++------------------------------------- 2 files changed, 12 insertions(+), 84 deletions(-) diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c index 483d45180911..5754c9a4f58b 100644 --- a/drivers/w1/slaves/w1_ds2760.c +++ b/drivers/w1/slaves/w1_ds2760.c @@ -114,43 +114,7 @@ static struct bin_attribute w1_ds2760_bin_attr = { .read = w1_ds2760_read_bin, }; -static DEFINE_IDR(bat_idr); -static DEFINE_MUTEX(bat_idr_lock); - -static int new_bat_id(void) -{ - int ret; - - while (1) { - int id; - - ret = idr_pre_get(&bat_idr, GFP_KERNEL); - if (ret == 0) - return -ENOMEM; - - mutex_lock(&bat_idr_lock); - ret = idr_get_new(&bat_idr, NULL, &id); - mutex_unlock(&bat_idr_lock); - - if (ret == 0) { - ret = id & MAX_ID_MASK; - break; - } else if (ret == -EAGAIN) { - continue; - } else { - break; - } - } - - return ret; -} - -static void release_bat_id(int id) -{ - mutex_lock(&bat_idr_lock); - idr_remove(&bat_idr, id); - mutex_unlock(&bat_idr_lock); -} +static DEFINE_IDA(bat_ida); static int w1_ds2760_add_slave(struct w1_slave *sl) { @@ -158,7 +122,7 @@ static int w1_ds2760_add_slave(struct w1_slave *sl) int id; struct platform_device *pdev; - id = new_bat_id(); + id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL); if (id < 0) { ret = id; goto noid; @@ -187,7 +151,7 @@ bin_attr_failed: pdev_add_failed: platform_device_unregister(pdev); pdev_alloc_failed: - release_bat_id(id); + ida_simple_remove(&bat_ida, id); noid: success: return ret; @@ -199,7 +163,7 @@ static void w1_ds2760_remove_slave(struct w1_slave *sl) int id = pdev->id; platform_device_unregister(pdev); - release_bat_id(id); + ida_simple_remove(&bat_ida, id); sysfs_remove_bin_file(&sl->dev.kobj, &w1_ds2760_bin_attr); } @@ -217,14 +181,14 @@ static int __init w1_ds2760_init(void) { printk(KERN_INFO "1-Wire driver for the DS2760 battery monitor " " chip - (c) 2004-2005, Szabolcs Gyurko\n"); - idr_init(&bat_idr); + ida_init(&bat_ida); return w1_register_family(&w1_ds2760_family); } static void __exit w1_ds2760_exit(void) { w1_unregister_family(&w1_ds2760_family); - idr_destroy(&bat_idr); + ida_destroy(&bat_ida); } EXPORT_SYMBOL(w1_ds2760_read); diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c index 274c8f38303f..a134b38e34b2 100644 --- a/drivers/w1/slaves/w1_ds2780.c +++ b/drivers/w1/slaves/w1_ds2780.c @@ -99,43 +99,7 @@ static struct bin_attribute w1_ds2780_bin_attr = { .read = w1_ds2780_read_bin, }; -static DEFINE_IDR(bat_idr); -static DEFINE_MUTEX(bat_idr_lock); - -static int new_bat_id(void) -{ - int ret; - - while (1) { - int id; - - ret = idr_pre_get(&bat_idr, GFP_KERNEL); - if (ret == 0) - return -ENOMEM; - - mutex_lock(&bat_idr_lock); - ret = idr_get_new(&bat_idr, NULL, &id); - mutex_unlock(&bat_idr_lock); - - if (ret == 0) { - ret = id & MAX_ID_MASK; - break; - } else if (ret == -EAGAIN) { - continue; - } else { - break; - } - } - - return ret; -} - -static void release_bat_id(int id) -{ - mutex_lock(&bat_idr_lock); - idr_remove(&bat_idr, id); - mutex_unlock(&bat_idr_lock); -} +static DEFINE_IDA(bat_ida); static int w1_ds2780_add_slave(struct w1_slave *sl) { @@ -143,7 +107,7 @@ static int w1_ds2780_add_slave(struct w1_slave *sl) int id; struct platform_device *pdev; - id = new_bat_id(); + id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL); if (id < 0) { ret = id; goto noid; @@ -172,7 +136,7 @@ bin_attr_failed: pdev_add_failed: platform_device_unregister(pdev); pdev_alloc_failed: - release_bat_id(id); + ida_simple_remove(&bat_ida, id); noid: return ret; } @@ -183,7 +147,7 @@ static void w1_ds2780_remove_slave(struct w1_slave *sl) int id = pdev->id; platform_device_unregister(pdev); - release_bat_id(id); + ida_simple_remove(&bat_ida, id); sysfs_remove_bin_file(&sl->dev.kobj, &w1_ds2780_bin_attr); } @@ -199,14 +163,14 @@ static struct w1_family w1_ds2780_family = { static int __init w1_ds2780_init(void) { - idr_init(&bat_idr); + ida_init(&bat_ida); return w1_register_family(&w1_ds2780_family); } static void __exit w1_ds2780_exit(void) { w1_unregister_family(&w1_ds2780_family); - idr_destroy(&bat_idr); + ida_destroy(&bat_ida); } module_init(w1_ds2780_init); -- cgit v1.2.3 From 853eee72f74f449797f0500ea19fc1bf497428d8 Mon Sep 17 00:00:00 2001 From: Clifton Barnes Date: Wed, 2 Nov 2011 13:39:50 -0700 Subject: drivers/power/ds2780_battery.c: create central point for calling w1 interface Simply creates one point to call the w1 interface. Signed-off-by: Clifton Barnes Cc: Evgeniy Polyakov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/power/ds2780_battery.c | 77 +++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/power/ds2780_battery.c b/drivers/power/ds2780_battery.c index 1fefe82e12e3..2571b4143784 100644 --- a/drivers/power/ds2780_battery.c +++ b/drivers/power/ds2780_battery.c @@ -49,8 +49,8 @@ enum current_types { static const char model[] = "DS2780"; static const char manufacturer[] = "Maxim/Dallas"; -static inline struct ds2780_device_info *to_ds2780_device_info( - struct power_supply *psy) +static inline struct ds2780_device_info * +to_ds2780_device_info(struct power_supply *psy) { return container_of(psy, struct ds2780_device_info, bat); } @@ -60,17 +60,25 @@ static inline struct power_supply *to_power_supply(struct device *dev) return dev_get_drvdata(dev); } -static inline int ds2780_read8(struct device *dev, u8 *val, int addr) +static inline int ds2780_battery_io(struct ds2780_device_info *dev_info, + char *buf, int addr, size_t count, int io) { - return w1_ds2780_io(dev, val, addr, sizeof(u8), 0); + return w1_ds2780_io(dev_info->w1_dev, buf, addr, count, io); } -static int ds2780_read16(struct device *dev, s16 *val, int addr) +static inline int ds2780_read8(struct ds2780_device_info *dev_info, u8 *val, + int addr) +{ + return ds2780_battery_io(dev_info, val, addr, sizeof(u8), 0); +} + +static int ds2780_read16(struct ds2780_device_info *dev_info, s16 *val, + int addr) { int ret; u8 raw[2]; - ret = w1_ds2780_io(dev, raw, addr, sizeof(u8) * 2, 0); + ret = ds2780_battery_io(dev_info, raw, addr, sizeof(raw), 0); if (ret < 0) return ret; @@ -79,16 +87,16 @@ static int ds2780_read16(struct device *dev, s16 *val, int addr) return 0; } -static inline int ds2780_read_block(struct device *dev, u8 *val, int addr, - size_t count) +static inline int ds2780_read_block(struct ds2780_device_info *dev_info, + u8 *val, int addr, size_t count) { - return w1_ds2780_io(dev, val, addr, count, 0); + return ds2780_battery_io(dev_info, val, addr, count, 0); } -static inline int ds2780_write(struct device *dev, u8 *val, int addr, - size_t count) +static inline int ds2780_write(struct ds2780_device_info *dev_info, u8 *val, + int addr, size_t count) { - return w1_ds2780_io(dev, val, addr, count, 1); + return ds2780_battery_io(dev_info, val, addr, count, 1); } static inline int ds2780_store_eeprom(struct device *dev, int addr) @@ -122,7 +130,7 @@ static int ds2780_set_sense_register(struct ds2780_device_info *dev_info, { int ret; - ret = ds2780_write(dev_info->w1_dev, &conductance, + ret = ds2780_write(dev_info, &conductance, DS2780_RSNSP_REG, sizeof(u8)); if (ret < 0) return ret; @@ -134,7 +142,7 @@ static int ds2780_set_sense_register(struct ds2780_device_info *dev_info, static int ds2780_get_rsgain_register(struct ds2780_device_info *dev_info, u16 *rsgain) { - return ds2780_read16(dev_info->w1_dev, rsgain, DS2780_RSGAIN_MSB_REG); + return ds2780_read16(dev_info, rsgain, DS2780_RSGAIN_MSB_REG); } /* Set RSGAIN value from 0 to 1.999 in steps of 0.001 */ @@ -144,8 +152,8 @@ static int ds2780_set_rsgain_register(struct ds2780_device_info *dev_info, int ret; u8 raw[] = {rsgain >> 8, rsgain & 0xFF}; - ret = ds2780_write(dev_info->w1_dev, raw, - DS2780_RSGAIN_MSB_REG, sizeof(u8) * 2); + ret = ds2780_write(dev_info, raw, + DS2780_RSGAIN_MSB_REG, sizeof(raw)); if (ret < 0) return ret; @@ -167,7 +175,7 @@ static int ds2780_get_voltage(struct ds2780_device_info *dev_info, * Bits 2 - 0 of the voltage value are in bits 7 - 5 of the * voltage LSB register */ - ret = ds2780_read16(dev_info->w1_dev, &voltage_raw, + ret = ds2780_read16(dev_info, &voltage_raw, DS2780_VOLT_MSB_REG); if (ret < 0) return ret; @@ -196,7 +204,7 @@ static int ds2780_get_temperature(struct ds2780_device_info *dev_info, * Bits 2 - 0 of the temperature value are in bits 7 - 5 of the * temperature LSB register */ - ret = ds2780_read16(dev_info->w1_dev, &temperature_raw, + ret = ds2780_read16(dev_info, &temperature_raw, DS2780_TEMP_MSB_REG); if (ret < 0) return ret; @@ -222,13 +230,13 @@ static int ds2780_get_current(struct ds2780_device_info *dev_info, * The units of measurement for current are dependent on the value of * the sense resistor. */ - ret = ds2780_read8(dev_info->w1_dev, &sense_res_raw, DS2780_RSNSP_REG); + ret = ds2780_read8(dev_info, &sense_res_raw, DS2780_RSNSP_REG); if (ret < 0) return ret; if (sense_res_raw == 0) { dev_err(dev_info->dev, "sense resistor value is 0\n"); - return -ENXIO; + return -EINVAL; } sense_res = 1000 / sense_res_raw; @@ -248,7 +256,7 @@ static int ds2780_get_current(struct ds2780_device_info *dev_info, * Bits 7 - 0 of the current value are in bits 7 - 0 of the current * LSB register */ - ret = ds2780_read16(dev_info->w1_dev, ¤t_raw, reg_msb); + ret = ds2780_read16(dev_info, ¤t_raw, reg_msb); if (ret < 0) return ret; @@ -267,7 +275,7 @@ static int ds2780_get_accumulated_current(struct ds2780_device_info *dev_info, * The units of measurement for accumulated current are dependent on * the value of the sense resistor. */ - ret = ds2780_read8(dev_info->w1_dev, &sense_res_raw, DS2780_RSNSP_REG); + ret = ds2780_read8(dev_info, &sense_res_raw, DS2780_RSNSP_REG); if (ret < 0) return ret; @@ -285,7 +293,7 @@ static int ds2780_get_accumulated_current(struct ds2780_device_info *dev_info, * Bits 7 - 0 of the ACR value are in bits 7 - 0 of the ACR * LSB register */ - ret = ds2780_read16(dev_info->w1_dev, ¤t_raw, DS2780_ACR_MSB_REG); + ret = ds2780_read16(dev_info, ¤t_raw, DS2780_ACR_MSB_REG); if (ret < 0) return ret; @@ -299,7 +307,7 @@ static int ds2780_get_capacity(struct ds2780_device_info *dev_info, int ret; u8 raw; - ret = ds2780_read8(dev_info->w1_dev, &raw, DS2780_RARC_REG); + ret = ds2780_read8(dev_info, &raw, DS2780_RARC_REG); if (ret < 0) return ret; @@ -345,7 +353,7 @@ static int ds2780_get_charge_now(struct ds2780_device_info *dev_info, * Bits 7 - 0 of the RAAC value are in bits 7 - 0 of the RAAC * LSB register */ - ret = ds2780_read16(dev_info->w1_dev, &charge_raw, DS2780_RAAC_MSB_REG); + ret = ds2780_read16(dev_info, &charge_raw, DS2780_RAAC_MSB_REG); if (ret < 0) return ret; @@ -356,7 +364,7 @@ static int ds2780_get_charge_now(struct ds2780_device_info *dev_info, static int ds2780_get_control_register(struct ds2780_device_info *dev_info, u8 *control_reg) { - return ds2780_read8(dev_info->w1_dev, control_reg, DS2780_CONTROL_REG); + return ds2780_read8(dev_info, control_reg, DS2780_CONTROL_REG); } static int ds2780_set_control_register(struct ds2780_device_info *dev_info, @@ -364,7 +372,7 @@ static int ds2780_set_control_register(struct ds2780_device_info *dev_info, { int ret; - ret = ds2780_write(dev_info->w1_dev, &control_reg, + ret = ds2780_write(dev_info, &control_reg, DS2780_CONTROL_REG, sizeof(u8)); if (ret < 0) return ret; @@ -503,7 +511,7 @@ static ssize_t ds2780_get_sense_resistor_value(struct device *dev, struct power_supply *psy = to_power_supply(dev); struct ds2780_device_info *dev_info = to_ds2780_device_info(psy); - ret = ds2780_read8(dev_info->w1_dev, &sense_resistor, DS2780_RSNSP_REG); + ret = ds2780_read8(dev_info, &sense_resistor, DS2780_RSNSP_REG); if (ret < 0) return ret; @@ -584,7 +592,7 @@ static ssize_t ds2780_get_pio_pin(struct device *dev, struct power_supply *psy = to_power_supply(dev); struct ds2780_device_info *dev_info = to_ds2780_device_info(psy); - ret = ds2780_read8(dev_info->w1_dev, &sfr, DS2780_SFR_REG); + ret = ds2780_read8(dev_info, &sfr, DS2780_SFR_REG); if (ret < 0) return ret; @@ -611,7 +619,7 @@ static ssize_t ds2780_set_pio_pin(struct device *dev, return -EINVAL; } - ret = ds2780_write(dev_info->w1_dev, &new_setting, + ret = ds2780_write(dev_info, &new_setting, DS2780_SFR_REG, sizeof(u8)); if (ret < 0) return ret; @@ -632,7 +640,7 @@ static ssize_t ds2780_read_param_eeprom_bin(struct file *filp, DS2780_EEPROM_BLOCK1_END - DS2780_EEPROM_BLOCK1_START + 1 - off); - return ds2780_read_block(dev_info->w1_dev, buf, + return ds2780_read_block(dev_info, buf, DS2780_EEPROM_BLOCK1_START + off, count); } @@ -650,7 +658,7 @@ static ssize_t ds2780_write_param_eeprom_bin(struct file *filp, DS2780_EEPROM_BLOCK1_END - DS2780_EEPROM_BLOCK1_START + 1 - off); - ret = ds2780_write(dev_info->w1_dev, buf, + ret = ds2780_write(dev_info, buf, DS2780_EEPROM_BLOCK1_START + off, count); if (ret < 0) return ret; @@ -685,9 +693,8 @@ static ssize_t ds2780_read_user_eeprom_bin(struct file *filp, DS2780_EEPROM_BLOCK0_END - DS2780_EEPROM_BLOCK0_START + 1 - off); - return ds2780_read_block(dev_info->w1_dev, buf, + return ds2780_read_block(dev_info, buf, DS2780_EEPROM_BLOCK0_START + off, count); - } static ssize_t ds2780_write_user_eeprom_bin(struct file *filp, @@ -704,7 +711,7 @@ static ssize_t ds2780_write_user_eeprom_bin(struct file *filp, DS2780_EEPROM_BLOCK0_END - DS2780_EEPROM_BLOCK0_START + 1 - off); - ret = ds2780_write(dev_info->w1_dev, buf, + ret = ds2780_write(dev_info, buf, DS2780_EEPROM_BLOCK0_START + off, count); if (ret < 0) return ret; -- cgit v1.2.3 From 9fe678fa2feb4aaac0b4220de63e1b7f8ccebae6 Mon Sep 17 00:00:00 2001 From: Clifton Barnes Date: Wed, 2 Nov 2011 13:39:52 -0700 Subject: drivers/power/ds2780_battery.c: add a nolock function to w1 interface Adds a nolock function to the w1 interface to avoid locking the mutex if needed. Signed-off-by: Clifton Barnes Cc: Evgeniy Polyakov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/slaves/w1_ds2780.c | 48 +++++++++++++++++++++++++++++++------------ drivers/w1/slaves/w1_ds2780.h | 2 ++ 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c index a134b38e34b2..39f78c0b143c 100644 --- a/drivers/w1/slaves/w1_ds2780.c +++ b/drivers/w1/slaves/w1_ds2780.c @@ -26,20 +26,14 @@ #include "../w1_family.h" #include "w1_ds2780.h" -int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count, - int io) +static int w1_ds2780_do_io(struct device *dev, char *buf, int addr, + size_t count, int io) { struct w1_slave *sl = container_of(dev, struct w1_slave, dev); - if (!dev) - return -ENODEV; + if (addr > DS2780_DATA_SIZE || addr < 0) + return 0; - mutex_lock(&sl->master->mutex); - - if (addr > DS2780_DATA_SIZE || addr < 0) { - count = 0; - goto out; - } count = min_t(int, count, DS2780_DATA_SIZE - addr); if (w1_reset_select_slave(sl) == 0) { @@ -47,7 +41,6 @@ int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count, w1_write_8(sl->master, W1_DS2780_WRITE_DATA); w1_write_8(sl->master, addr); w1_write_block(sl->master, buf, count); - /* XXX w1_write_block returns void, not n_written */ } else { w1_write_8(sl->master, W1_DS2780_READ_DATA); w1_write_8(sl->master, addr); @@ -55,13 +48,42 @@ int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count, } } -out: + return count; +} + +int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count, + int io) +{ + struct w1_slave *sl = container_of(dev, struct w1_slave, dev); + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&sl->master->mutex); + + ret = w1_ds2780_do_io(dev, buf, addr, count, io); + mutex_unlock(&sl->master->mutex); - return count; + return ret; } EXPORT_SYMBOL(w1_ds2780_io); +int w1_ds2780_io_nolock(struct device *dev, char *buf, int addr, size_t count, + int io) +{ + int ret; + + if (!dev) + return -ENODEV; + + ret = w1_ds2780_do_io(dev, buf, addr, count, io); + + return ret; +} +EXPORT_SYMBOL(w1_ds2780_io_nolock); + int w1_ds2780_eeprom_cmd(struct device *dev, int addr, int cmd) { struct w1_slave *sl = container_of(dev, struct w1_slave, dev); diff --git a/drivers/w1/slaves/w1_ds2780.h b/drivers/w1/slaves/w1_ds2780.h index a1fba79eb1b5..737379365021 100644 --- a/drivers/w1/slaves/w1_ds2780.h +++ b/drivers/w1/slaves/w1_ds2780.h @@ -124,6 +124,8 @@ extern int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count, int io); +extern int w1_ds2780_io_nolock(struct device *dev, char *buf, int addr, + size_t count, int io); extern int w1_ds2780_eeprom_cmd(struct device *dev, int addr, int cmd); #endif /* !_W1_DS2780_H */ -- cgit v1.2.3 From 0e053fcbbbc4d945247cb32cad2767b483cb65f8 Mon Sep 17 00:00:00 2001 From: Clifton Barnes Date: Wed, 2 Nov 2011 13:39:55 -0700 Subject: drivers/power/ds2780_battery.c: fix deadlock upon insertion and removal Fixes the deadlock when inserting and removing the ds2780. Signed-off-by: Clifton Barnes Cc: Evgeniy Polyakov Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/power/ds2780_battery.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/power/ds2780_battery.c b/drivers/power/ds2780_battery.c index 2571b4143784..91a783d72360 100644 --- a/drivers/power/ds2780_battery.c +++ b/drivers/power/ds2780_battery.c @@ -39,6 +39,7 @@ struct ds2780_device_info { struct device *dev; struct power_supply bat; struct device *w1_dev; + struct task_struct *mutex_holder; }; enum current_types { @@ -63,6 +64,9 @@ static inline struct power_supply *to_power_supply(struct device *dev) static inline int ds2780_battery_io(struct ds2780_device_info *dev_info, char *buf, int addr, size_t count, int io) { + if (dev_info->mutex_holder == current) + return w1_ds2780_io_nolock(dev_info->w1_dev, buf, addr, count, io); + else return w1_ds2780_io(dev_info->w1_dev, buf, addr, count, io); } @@ -775,6 +779,7 @@ static int __devinit ds2780_battery_probe(struct platform_device *pdev) dev_info->bat.properties = ds2780_battery_props; dev_info->bat.num_properties = ARRAY_SIZE(ds2780_battery_props); dev_info->bat.get_property = ds2780_battery_get_property; + dev_info->mutex_holder = current; ret = power_supply_register(&pdev->dev, &dev_info->bat); if (ret) { @@ -804,6 +809,8 @@ static int __devinit ds2780_battery_probe(struct platform_device *pdev) goto fail_remove_bin_file; } + dev_info->mutex_holder = NULL; + return 0; fail_remove_bin_file: @@ -823,6 +830,8 @@ static int __devexit ds2780_battery_remove(struct platform_device *pdev) { struct ds2780_device_info *dev_info = platform_get_drvdata(pdev); + dev_info->mutex_holder = current; + /* remove attributes */ sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2780_attr_group); -- cgit v1.2.3 From 68a436aec345c2bcd05dbdafae1f5f608ff8f61f Mon Sep 17 00:00:00 2001 From: Florian Faber Date: Wed, 2 Nov 2011 13:39:59 -0700 Subject: drivers/w1/w1_int.c: multiple masters used same init_name When using multiple masters, w1_int.c would use the .init_name from w1.c for all entities, which will fail when creating a corresponding sysfs entry. This patch uses the unique name previously generated. WARNING: at fs/sysfs/dir.c:451 sysfs_add_one+0x48/0x64() sysfs: cannot create duplicate filename '/devices/w1 bus master' Modules linked in: Call trace: [<9001a604>] warn_slowpath_common+0x34/0x44 [<9001a64c>] warn_slowpath_fmt+0x14/0x18 [<90078020>] sysfs_add_one+0x48/0x64 [<900784ec>] create_dir+0x40/0x68 [<9007857a>] sysfs_create_dir+0x66/0x78 [<900c1a8a>] kobject_add_internal+0x6e/0x104 [<900c1bc0>] kobject_add_varg+0x20/0x2c [<900c1c1c>] kobject_add+0x30/0x3c [<900dbd66>] device_add+0x6a/0x378 [<900dbb4a>] device_initialize+0x12/0x48 [<900dc080>] device_register+0xc/0x10 [<900f99be>] w1_add_master_device+0x162/0x274 [<90008e7a>] w1_gpio_probe+0x66/0xb4 [<9000030c>] kernel_init+0x0/0xe8 [<900dde54>] platform_drv_probe+0xc/0xe [<9000030c>] kernel_init+0x0/0xe8 [<900dd4f8>] driver_probe_device+0x6c/0xdc [<900dd5fc>] __driver_attach+0x34/0x48 [<900dcce8>] bus_for_each_dev+0x2c/0x48 [<900dd5c8>] __driver_attach+0x0/0x48 [<900dd38c>] driver_attach+0x10/0x14 [<900dd16a>] bus_add_driver+0x6a/0x18c [<900dd768>] driver_register+0x60/0xb8 [<90011594>] __initcall_w1_therm_init6+0x0/0x4 [<90008e00>] w1_gpio_init+0x0/0x14 [<9000030c>] kernel_init+0x0/0xe8 [<900ddf48>] platform_driver_register+0x30/0x38 [<90011594>] __initcall_w1_therm_init6+0x0/0x4 [<90008e00>] w1_gpio_init+0x0/0x14 [<9000030c>] kernel_init+0x0/0xe8 [<900ddf5e>] platform_driver_probe+0xe/0x3c [<90008e0c>] w1_gpio_init+0xc/0x14 [<90011594>] __initcall_w1_therm_init6+0x0/0x4 [<90008e00>] w1_gpio_init+0x0/0x14 [<900126d4>] do_one_initcall+0x34/0x130 [<90000372>] kernel_init+0x66/0xe8 [<90011594>] __initcall_w1_therm_init6+0x0/0x4 [<9001ca3e>] do_exit+0x0/0x3a6 [<9000030c>] kernel_init+0x0/0xe8 [<9001ca3e>] do_exit+0x0/0x3a6 ---[ end trace 5a9233884fead918 ]--- kobject_add_internal failed for w1 bus master with -EEXIST, don't try to register things with the same name in the same directory. Signed-off-by: Florian Faber Cc: Evgeniy Polyakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/w1_int.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index d220bce2cee4..f79e62e54e8d 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -78,6 +78,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, memcpy(&dev->dev, device, sizeof(struct device)); dev_set_name(&dev->dev, "w1_bus_master%u", dev->id); snprintf(dev->name, sizeof(dev->name), "w1_bus_master%u", dev->id); + dev->dev.init_name = dev->name; dev->driver = driver; -- cgit v1.2.3 From 3fd306c85adcde7209281cb663dd8ea247e97cc3 Mon Sep 17 00:00:00 2001 From: Jan Weitzel Date: Wed, 2 Nov 2011 13:40:02 -0700 Subject: w1: disable irqs in critical section Interrupting w1_delay() in w1_read_bit() results in missing the low level on the w1 line and receiving "1" instead of "0". Add local_irq_save()/local_irq_restore() around the critical section Signed-off-by: Jan Weitzel Acked-by: Evgeniy Polyakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/w1_io.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c index 765b37b62a4f..3135b2c63998 100644 --- a/drivers/w1/w1_io.c +++ b/drivers/w1/w1_io.c @@ -158,13 +158,18 @@ EXPORT_SYMBOL_GPL(w1_write_8); static u8 w1_read_bit(struct w1_master *dev) { int result; + unsigned long flags; + /* sample timing is critical here */ + local_irq_save(flags); dev->bus_master->write_bit(dev->bus_master->data, 0); w1_delay(6); dev->bus_master->write_bit(dev->bus_master->data, 1); w1_delay(9); result = dev->bus_master->read_bit(dev->bus_master->data); + local_irq_restore(flags); + w1_delay(55); return result & 0x1; -- cgit v1.2.3 From 6d994a7e42ab219ba3c10d5ffccf20990252881e Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 2 Nov 2011 13:40:04 -0700 Subject: drivers/misc/vmw_balloon.c: determine page allocation flag can_sleep outside loop In vmballoon_reserve_page(), flags has been passed from the callee function (vmballoon_inflate here). So, we can determine can_sleep outside the loop. Signed-off-by: Rakib Mullick Acked-by: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/vmw_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 053d36caf955..6983d80563cb 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -412,6 +412,7 @@ static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep) gfp_t flags; unsigned int hv_status; bool locked = false; + flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP; do { if (!can_sleep) @@ -419,7 +420,6 @@ static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep) else STATS_INC(b->stats.sleep_alloc); - flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP; page = alloc_page(flags); if (!page) { if (!can_sleep) -- cgit v1.2.3 From 2ca02df6b098be2d33a99a65531dcd84a10b6e21 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 2 Nov 2011 13:40:07 -0700 Subject: drivers/misc/vmw_balloon.c: fix typo in code comment Fix typo in code comment. Signed-off-by: Rakib Mullick Acked-by: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/vmw_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 6983d80563cb..cd41d403c9df 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -151,7 +151,7 @@ MODULE_LICENSE("GPL"); struct vmballoon_stats { unsigned int timer; - /* allocation statustics */ + /* allocation statistics */ unsigned int alloc; unsigned int alloc_fail; unsigned int sleep_alloc; -- cgit v1.2.3 From 080d676de095a14ecba14c0b9a91acb5bbb634df Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 2 Nov 2011 13:40:10 -0700 Subject: aio: allocate kiocbs in batches In testing aio on a fast storage device, I found that the context lock takes up a fair amount of cpu time in the I/O submission path. The reason is that we take it for every I/O submitted (see __aio_get_req). Since we know how many I/Os are passed to io_submit, we can preallocate the kiocbs in batches, reducing the number of times we take and release the lock. In my testing, I was able to reduce the amount of time spent in _raw_spin_lock_irq by .56% (average of 3 runs). The command I used to test this was: aio-stress -O -o 2 -o 3 -r 8 -d 128 -b 32 -i 32 -s 16384 I also tested the patch with various numbers of events passed to io_submit, and I ran the xfstests aio group of tests to ensure I didn't break anything. Signed-off-by: Jeff Moyer Cc: Daniel Ehrenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 136 +++++++++++++++++++++++++++++++++++++++++----------- include/linux/aio.h | 1 + 2 files changed, 108 insertions(+), 29 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 632b235f4fbe..78c514cfd212 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. + */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. - */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index 2dcb72bff4b6..2314ad8b3c9c 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -117,6 +117,7 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ + struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, -- cgit v1.2.3 From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 2 Nov 2011 13:40:29 -0700 Subject: memcg: replace ss->id_lock with a rwlock While back-porting Johannes Weiner's patch "mm: memcg-aware global reclaim" for an internal effort, we noticed a significant performance regression during page-reclaim heavy workloads due to high contention of the ss->id_lock. This lock protects idr map, and serializes calls to idr_get_next() in css_get_next() (which is used during the memcg hierarchy walk). Since idr_get_next() is just doing a look up, we need only serialize it with respect to idr_remove()/idr_get_new(). By making the ss->id_lock a rwlock, contention is greatly reduced and performance improves. Tested: cat a 256m file from a ramdisk in a 128m container 50 times on each core (one file + container per core) in parallel on a NUMA machine. Result is the time for the test to complete in 1 of the containers. Both kernels included Johannes' memcg-aware global reclaim patches. Before rwlock patch: 1710.778s After rwlock patch: 152.227s Signed-off-by: Andrew Bresticker Cc: Paul Menage Cc: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index da7e4bc34e8c..1b7f9d525013 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -516,7 +516,7 @@ struct cgroup_subsys { struct list_head sibling; /* used when use_id == true */ struct idr idr; - spinlock_t id_lock; + rwlock_t id_lock; /* should be defined only by modular subsystems */ struct module *module; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8386b21224ef..d9d5648f3cdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - spin_lock_init(&ss->id_lock); + rwlock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); + read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); + read_unlock(&ss->id_lock); if (!tmp) break; -- cgit v1.2.3