From bd4cfb594bdea00c3920b31bd12f497fc4a2e79c Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Sun, 20 Nov 2005 21:11:31 -0800 Subject: [NETFILTER]: Remove ARRAY_SIZE duplicate Signed-off-by: Nicolas Kaiser Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ipt_sctp.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_sctp.h b/include/linux/netfilter_ipv4/ipt_sctp.h index e93a9ec99fc2..80b3dbacd193 100644 --- a/include/linux/netfilter_ipv4/ipt_sctp.h +++ b/include/linux/netfilter_ipv4/ipt_sctp.h @@ -7,8 +7,6 @@ #define IPT_SCTP_VALID_FLAGS 0x07 -#define ELEMCOUNT(x) (sizeof(x)/sizeof(x[0])) - struct ipt_sctp_flag_info { u_int8_t chunktype; @@ -59,21 +57,21 @@ struct ipt_sctp_info { #define SCTP_CHUNKMAP_RESET(chunkmap) \ do { \ int i; \ - for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + for (i = 0; i < ARRAY_SIZE(chunkmap); i++) \ chunkmap[i] = 0; \ } while (0) #define SCTP_CHUNKMAP_SET_ALL(chunkmap) \ do { \ int i; \ - for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + for (i = 0; i < ARRAY_SIZE(chunkmap); i++) \ chunkmap[i] = ~0; \ } while (0) #define SCTP_CHUNKMAP_COPY(destmap, srcmap) \ do { \ int i; \ - for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + for (i = 0; i < ARRAY_SIZE(chunkmap); i++) \ destmap[i] = srcmap[i]; \ } while (0) @@ -81,7 +79,7 @@ struct ipt_sctp_info { ({ \ int i; \ int flag = 1; \ - for (i = 0; i < ELEMCOUNT(chunkmap); i++) { \ + for (i = 0; i < ARRAY_SIZE(chunkmap); i++) { \ if (chunkmap[i]) { \ flag = 0; \ break; \ @@ -94,7 +92,7 @@ struct ipt_sctp_info { ({ \ int i; \ int flag = 1; \ - for (i = 0; i < ELEMCOUNT(chunkmap); i++) { \ + for (i = 0; i < ARRAY_SIZE(chunkmap); i++) { \ if (chunkmap[i] != ~0) { \ flag = 0; \ break; \ -- cgit v1.2.3 From b84f4cc977ec4a1260dc8d9165efc9319a93c2a2 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 20 Nov 2005 21:19:21 -0800 Subject: [NET]: Use unused bit for ipvs_property field in struct sk_buff Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0a8ea8b35816..e797d9ef0e28 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -264,16 +264,14 @@ struct sk_buff { nohdr:1, nfctinfo:3; __u8 pkt_type:3, - fclone:2; + fclone:2, + ipvs_property:1; __be16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER __u32 nfmark; struct nf_conntrack *nfct; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) - __u8 ipvs_property:1; -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct sk_buff *nfct_reasm; #endif -- cgit v1.2.3 From 461ddf3b90bb149b99c3f675959c1bd6b11ed936 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 20 Nov 2005 21:25:15 -0800 Subject: [NET]: kernel-doc fixes Fix kernel-doc warnings in network files. Signed-off-by: Randy Dunlap Signed-off-by: David S. 
Miller --- Documentation/DocBook/kernel-api.tmpl | 6 ++++-- include/linux/skbuff.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 096aed62c326..767433bdbc40 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -237,8 +237,10 @@ X!Ilib/string.c <sect1><title>Driver Support</title> !Enet/core/dev.c !Enet/ethernet/eth.c -!Einclude/linux/etherdevice.h -!Enet/core/wireless.c +!Iinclude/linux/etherdevice.h +<!-- FIXME: Removed for now since no structured comments in source +X!Enet/core/wireless.c +--> </sect1> <sect1><title>Synchronous PPP</title> !Edrivers/net/wan/syncppp.c diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e797d9ef0e28..8c5d6001a923 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -206,6 +206,7 @@ enum { * @nfct: Associated connection, if any * @ipvs_property: skbuff is owned by ipvs * @nfctinfo: Relationship of this skb to the connection + * @nfct_reasm: netfilter conntrack re-assembly pointer * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict -- cgit v1.2.3 From c243f1f1f6545985afcc6adf1fc085729029c3ee Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 21 Nov 2005 06:53:16 -0800 Subject: [AGPGART] Support VIA P4M800CE bridge. Signed-off-by: Dave Jones --- drivers/char/agp/via-agp.c | 6 ++++++ include/linux/pci_ids.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index c847df575cf5..97b0a890ba7f 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -371,6 +371,11 @@ static struct agp_device_ids via_agp_device_ids[] __devinitdata = .device_id = PCI_DEVICE_ID_VIA_3296_0, .chipset_name = "P4M800", }, + /* P4M800CE */ + { + .device_id = PCI_DEVICE_ID_VIA_P4M800CE, + .chipset_name = "P4M800CE", + }, { }, /* dummy final entry, always present */ }; @@ -511,6 +516,7 @@ static struct pci_device_id agp_via_pci_table[] = { ID(PCI_DEVICE_ID_VIA_3269_0), ID(PCI_DEVICE_ID_VIA_83_87XX_1), ID(PCI_DEVICE_ID_VIA_3296_0), + ID(PCI_DEVICE_ID_VIA_P4M800CE), { } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d4c1c8fd2925..c99a83f88dc9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1198,6 +1198,7 @@ #define PCI_DEVICE_ID_VIA_3269_0 0x0269 #define PCI_DEVICE_ID_VIA_K8T800PRO_0 0x0282 #define PCI_DEVICE_ID_VIA_8363_0 0x0305 +#define PCI_DEVICE_ID_VIA_P4M800CE 0x0314 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 #define PCI_DEVICE_ID_VIA_82C561 0x0561 -- cgit v1.2.3 From 664beed0190fae687ac51295694004902ddeb18e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Nov 2005 21:32:14 -0800 Subject: [PATCH] unpaged: unifdefed PageCompound It looks like snd_xxx is not the only nopage to be using PageReserved as a way of holding a high-order page together: which no longer works, but is masked by our failure to free from VM_RESERVED areas. We cannot fix that bug without first substituting another way to hold the high-order page together, while farming out the 0-order pages from within it. That's just what PageCompound is designed for, but it's been kept under CONFIG_HUGETLB_PAGE. Remove the #ifdefs: which saves some space (out-of-line put_page), doesn't slow down what most needs to be fast (already using hugetlb), and unifies the way we handle high-order pages.
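For driver authors, the upshot is that a high-order buffer which a nopage handler farms out page by page can simply be allocated as a compound page in the first place. A minimal sketch, assuming the 2.6.15-era alloc_pages() interface and its __GFP_COMP flag (the helper name is illustrative, not part of this patch):

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Illustrative only: allocate a 2^order-page buffer whose pieces a
 * nopage handler can hand out one page at a time.  __GFP_COMP links
 * the 0-order pages into one compound page, so get_page()/put_page()
 * on any of them pin the head page and the unit cannot be freed
 * piecemeal -- the job PageReserved used to do, unreliably.
 */
static struct page *alloc_nopage_buffer(unsigned int order)
{
	return alloc_pages(GFP_KERNEL | __GFP_COMP, order);
}

With the #ifdefs gone this behaves the same whether or not hugetlb is configured, which is exactly the unification the patch is after.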
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 ------------------- include/linux/page-flags.h | 4 ---- mm/page_alloc.c | 5 ----- mm/swap.c | 3 --- 4 files changed, 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0986d19be0b7..9701210c6680 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -311,8 +311,6 @@ struct page { extern void FASTCALL(__page_cache_release(struct page *)); -#ifdef CONFIG_HUGETLB_PAGE - static inline int page_count(struct page *page) { if (PageCompound(page)) @@ -329,23 +327,6 @@ static inline void get_page(struct page *page) void put_page(struct page *page); -#else /* CONFIG_HUGETLB_PAGE */ - -#define page_count(p) (atomic_read(&(p)->_count) + 1) - -static inline void get_page(struct page *page) -{ - atomic_inc(&page->_count); -} - -static inline void put_page(struct page *page) -{ - if (put_page_testzero(page)) - __page_cache_release(page); -} - -#endif /* CONFIG_HUGETLB_PAGE */ - /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f34767c5fc79..343083fec258 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -287,11 +287,7 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta); #define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags) #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) -#ifdef CONFIG_HUGETLB_PAGE #define PageCompound(page) test_bit(PG_compound, &(page)->flags) -#else -#define PageCompound(page) 0 -#endif #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd4de592dc23..23b84c4e1a57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -148,10 +148,6 @@ static void bad_page(const char *function, struct page *page) add_taint(TAINT_BAD_PAGE); } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -205,7 +201,6 @@ static void destroy_compound_page(struct page *page, unsigned long order) ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * function for dealing with page's order in buddy system. diff --git a/mm/swap.c b/mm/swap.c index d09cf7f03e76..73d351439ef6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,8 +34,6 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -#ifdef CONFIG_HUGETLB_PAGE - void put_page(struct page *page) { if (unlikely(PageCompound(page))) { @@ -52,7 +50,6 @@ void put_page(struct page *page) __page_cache_release(page); } EXPORT_SYMBOL(put_page); -#endif /* * Writeback is about to end against a page which has been marked for immediate -- cgit v1.2.3 From 0b14c179a483e71ea41df2aa4a661760063115bd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Nov 2005 21:32:15 -0800 Subject: [PATCH] unpaged: VM_UNPAGED Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few drivers set VM_RESERVED on areas which are then populated by nopage. 
The PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free pages in zap_pte_range, without changing those drivers not to set it: so their pages just leak away. Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core, to flag the special areas where the ptes may have no struct page, or if they have then it's not to be touched. Replace most instances of VM_RESERVED in core mm by VM_UNPAGED. Force it on in remap_pfn_range, and the sparc and sparc64 io_remap_pfn_range. Revert addition of VM_RESERVED to powerpc vdso, it's not needed there. Is it needed anywhere? It still governs the mm->reserved_vm statistic, and special vmas not to be merged, and areas not to be core dumped; but could probably be eliminated later (the drivers are probably specifying it because in 2.4 it kept swapout off the vma, but in 2.6 we work from the LRU, which these pages don't get on). Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no purpose whatsoever, and should be removed from drivers when we clean up. Signed-off-by: Hugh Dickins Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vdso.c | 3 +-- arch/sparc/mm/generic.c | 2 +- arch/sparc64/mm/generic.c | 2 +- include/linux/mm.h | 5 +++-- mm/fremap.c | 4 ++-- mm/madvise.c | 2 +- mm/memory.c | 30 ++++++++++++++++++------------ mm/mempolicy.c | 2 +- mm/msync.c | 4 ++-- 9 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 0d4d8bec0df4..b44b36e0c293 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -285,8 +285,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, * It's fine to use that for setting breakpoints in the vDSO code * pages though */ - vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | - VM_MAYEXEC | VM_RESERVED; + vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; vma->vm_flags |= mm->def_flags; vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; vma->vm_ops = &vdso_vmops; diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 9604893ffdbd..0410bae681f8 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c @@ -74,7 +74,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; /* See comment in mm/memory.c remap_pfn_range */ - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; prot = __pgprot(pg_iobits); offset -= from; diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index 112c316e7cd2..8fd4cb1f050a 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -128,7 +128,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; /* See comment in mm/memory.c remap_pfn_range */ - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; prot = __pgprot(pg_iobits); offset -= from; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9701210c6680..f0cdfd18db55 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -144,7 +144,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_GROWSUP 0x00000200 -#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_SHM 0x00000000 /* Means nothing: delete it later */ 
+#define VM_UNPAGED 0x00000400 /* Pages managed without map count */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_EXECUTABLE 0x00001000 @@ -157,7 +158,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Pages managed in a special way */ +#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ diff --git a/mm/fremap.c b/mm/fremap.c index d862be3bc3e3..94254c5d7a18 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -65,7 +65,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); + BUG_ON(vma->vm_flags & VM_UNPAGED); pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); @@ -122,7 +122,7 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); + BUG_ON(vma->vm_flags & VM_UNPAGED); pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); diff --git a/mm/madvise.c b/mm/madvise.c index 17aaf3e16449..328a3bcce527 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index cfce5f1f30f2..ece04963158e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -334,7 +334,7 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) /* * This function is called to print an error when a pte in a - * !VM_RESERVED region is found pointing to an invalid pfn (which + * !VM_UNPAGED region is found pointing to an invalid pfn (which * is an error. * * The calling function must still handle the error. @@ -381,15 +381,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - /* If the region is VM_RESERVED, the mapping is not + /* If the region is VM_UNPAGED, the mapping is not * mapped via rmap - duplicate the pte as is. */ - if (vm_flags & VM_RESERVED) + if (vm_flags & VM_UNPAGED) goto out_set_pte; pfn = pte_pfn(pte); /* If the pte points outside of valid memory but - * the region is not VM_RESERVED, we have a problem. + * the region is not VM_UNPAGED, we have a problem. */ if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); @@ -528,7 +528,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. 
*/ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { if (!vma->anon_vma) return 0; } @@ -572,7 +572,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, (*zap_work) -= PAGE_SIZE; - if (!(vma->vm_flags & VM_RESERVED)) { + if (!(vma->vm_flags & VM_UNPAGED)) { unsigned long pfn = pte_pfn(ptent); if (unlikely(!pfn_valid(pfn))) print_bad_pte(vma, ptent, addr); @@ -1191,10 +1191,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_UNPAGED tells the core MM not to "manage" these pages + * (e.g. refcount, mapcount, try to swap them out): in + * particular, zap_pte_range does not try to free them. */ - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1276,7 +1282,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret = VM_FAULT_MINOR; - BUG_ON(vma->vm_flags & VM_RESERVED); + BUG_ON(vma->vm_flags & VM_UNPAGED); if (unlikely(!pfn_valid(pfn))) { /* @@ -1924,7 +1930,7 @@ retry: inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!(vma->vm_flags & VM_RESERVED)) { + } else if (!(vma->vm_flags & VM_UNPAGED)) { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } @@ -2203,7 +2209,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = VM_RESERVED; + gate_vma.vm_flags = 0; return 0; } __initcall(gate_vma_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5abc57c2b8bd..5609a31bdf22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -269,7 +269,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_RESERVED) + if (first->vm_flags & VM_UNPAGED) return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { diff --git a/mm/msync.c b/mm/msync.c index 0e040e9c39d8..b3f4caf3010b 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -97,9 +97,9 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_RESERVED regions either. + * Can't do anything with VM_UNPAGED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) return; BUG_ON(addr >= end); -- cgit v1.2.3 From af2b4079ab154bd12e8c12b02db5f31b31babe63 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 22 Nov 2005 14:38:04 -0800 Subject: [NET]: Shut up warnings in net/core/flow.c Not really a network problem, more a !SMP issue. 
net/core/flow.c:295: warning: statement with no effect flow.c:295: smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); Fix this by converting the macro to an inline function, which also improves typechecking for !SMP builds. Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/smp.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 9dfa3ee769ae..b6069c8e1f04 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -94,7 +94,13 @@ void smp_prepare_boot_cpu(void); */ #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 -#define smp_call_function(func,info,retry,wait) ({ 0; }) + +static inline int smp_call_function(void (*func) (void *info), void *info, + int retry, int wait) +{ + return 0; +} + #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 -- cgit v1.2.3 From ac3461ad632e86e7debd871776683c05ef3ba4c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 22 Nov 2005 19:39:30 -0800 Subject: Fix up GFP_ZONEMASK for GFP_DMA32 usage There was some confusion about the different zone usage; this should fix up the resulting mess in the GFP zonemask handling. The different zone usage is still confusing (it's very easy to mix up the individual zone numbers with the GFP zone _list_ numbers), so we might want to clean up some of this in the future, but in the meantime this should fix the actual problems. Acked-by: Andi Kleen Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++++-- include/linux/mmzone.h | 18 ++++-------------- 2 files changed, 11 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 313dfe9b443a..8b2eab90abb6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -11,7 +11,7 @@ struct vm_area_struct; /* * GFP bitmasks.. */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */ #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #ifdef CONFIG_DMA_IS_DMA32 @@ -74,7 +74,12 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 -#define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK)) +static inline int gfp_zone(gfp_t gfp) +{ + int zone = GFP_ZONEMASK & (__force int) gfp; + BUG_ON(zone >= GFP_ZONETYPES); + return zone; +} /* * There is only one page-allocator function, and two main namespaces to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2c8edad5dccf..9f22090df7dd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -91,21 +91,11 @@ struct per_cpu_pageset { * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible * combinations of zone modifiers in "zone modifier space". + * + * NOTE! Make sure this matches the zones in <linux/gfp.h> */ -#define GFP_ZONEMASK 0x03 -/* - * As an optimisation any zone modifier bits which are only valid when - * no other zone modifier bits are set (loners) should be placed in - * the highest order bits of this field. This allows us to reduce the - * extent of the zonelists thus saving space. For example in the case - * of three zone modifier bits, we could require up to eight zonelists.
- If the left most zone modifier is a "loner" then the highest valid - zonelist would be four allowing us to allocate only five zonelists. - Use the first form when the left most bit is not a "loner", otherwise - use the second. - */ -/* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ -#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ +#define GFP_ZONEMASK 0x07 +#define GFP_ZONETYPES 5 /* * On machines where it is needed (eg PCs) we divide physical memory -- cgit v1.2.3 From 1778d55edb62753a92b979fa57072c2e1ff3d062 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 22 Nov 2005 21:58:37 -0800 Subject: compat-ioctl.c: fix compile with no CONFIG_JBD The ext3 compat-ioctl translation wants to translate data structures that are only declared when CONFIG_JBD is enabled. So make <linux/jbd.h> play nicely even when we don't actually end up using it. Acked-by: Andrew Morton Acked-by: Jeffrey Hundstad Acked-by: Zan Lynx Signed-off-by: Linus Torvalds --- include/linux/jbd.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index aa56172c6fed..dcde7adfdce5 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -16,8 +16,6 @@ #ifndef _LINUX_JBD_H #define _LINUX_JBD_H -#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__) - /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" @@ -1083,19 +1081,4 @@ extern int jbd_blocks_per_page(struct inode *inode); #endif /* __KERNEL__ */ -#endif /* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */ - -/* - * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD - * go here. - */ - -#if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)) - -#define J_ASSERT(expr) do {} while (0) -#define J_ASSERT_BH(bh, expr) do {} while (0) -#define buffer_jbd(bh) 0 -#define journal_buffer_journal_lru(bh) 0 - -#endif /* defined(__KERNEL__) && !defined(CONFIG_JBD) */ #endif /* _LINUX_JBD_H */ -- cgit v1.2.3 From 2d0ebb36038c0626cde662a3b06da9787cfb68c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Nov 2005 08:44:05 -0800 Subject: Revert "[NET]: Shut up warnings in net/core/flow.c" This reverts commit af2b4079ab154bd12e8c12b02db5f31b31babe63 Changing the #define to an inline function breaks on non-SMP builds, since quite a few places in the kernel do not implement the ipi handler when compiling for UP. Signed-off-by: Linus Torvalds --- include/linux/smp.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index b6069c8e1f04..9dfa3ee769ae 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -94,13 +94,7 @@ void smp_prepare_boot_cpu(void); */ #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 - -static inline int smp_call_function(void (*func) (void *info), void *info, - int retry, int wait) -{ - return 0; -} - +#define smp_call_function(func,info,retry,wait) ({ 0; }) #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 -- cgit v1.2.3 From 8dd396ec7bf706fe85d8c6792b478ee6f09e8de6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Nov 2005 15:45:53 -0800 Subject: [PATCH] USB: kernel-doc for linux/usb.h Fix kernel-doc warning in linux/usb.h.
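The warning here comes from scripts/kernel-doc, which expects every struct member to be described by an @name: line in the comment block above the struct. Roughly the expected shape (an illustrative struct, not the actual usb.h hunk below):

/**
 * struct example_endpoint - shape of a kernel-doc comment
 * @urb_list: urbs queued to this endpoint
 * @kobj: kobject for sysfs info; leaving a member like this one
 *        undocumented is what triggers the kernel-doc warning
 */
struct example_endpoint {
	struct list_head urb_list;
	struct kobject kobj;
};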
Signed-off-by: Randy Dunlap Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/usb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 856d232c7562..d81b050e5955 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -47,6 +47,7 @@ struct usb_driver; * @urb_list: urbs queued to this endpoint; maintained by usbcore * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH) * with one or more transfer descriptors (TDs) per urb + * @kobj: kobject for sysfs info * @extra: descriptors following this endpoint in the configuration * @extralen: how many bytes of "extra" are valid * -- cgit v1.2.3 From f5417612d787e6b619fd69616bbf95f1b895e900 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Mon, 28 Nov 2005 18:09:44 +0000 Subject: [ARM] 3181/1: add PORT_ identifier for Hilscher netx uart Patch from Sascha Hauer This patch adds PORT_NETX for supporting the Hilscher netx embedded UARTs. Signed-off-by: Sascha Hauer Signed-off-by: Russell King --- include/linux/serial_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index a3ac92b19aca..e3710d7e260a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -121,6 +121,9 @@ #define PORT_IP3106 70 +/* Hilscher netx */ +#define PORT_NETX 71 + #ifdef __KERNEL__ #include -- cgit v1.2.3 From 24117defabc849a6ad5081ad0fafd0664bf55f13 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Mon, 28 Nov 2005 21:00:29 +0000 Subject: [MMC] Fix protocol errors A review against MMC/SD specifications found some errors in the current implementation. Signed-off-by: Pierre Ossman Signed-off-by: Russell King --- drivers/mmc/mmc.c | 2 +- include/linux/mmc/protocol.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c index da528390acf8..d336a1d65dc7 100644 --- a/drivers/mmc/mmc.c +++ b/drivers/mmc/mmc.c @@ -816,7 +816,7 @@ static void mmc_discover_cards(struct mmc_host *host) cmd.opcode = SD_SEND_RELATIVE_ADDR; cmd.arg = 0; - cmd.flags = MMC_RSP_R1; + cmd.flags = MMC_RSP_R6; err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES); if (err != MMC_ERR_NONE) diff --git a/include/linux/mmc/protocol.h b/include/linux/mmc/protocol.h index f819cae92266..a14dc306545b 100644 --- a/include/linux/mmc/protocol.h +++ b/include/linux/mmc/protocol.h @@ -63,7 +63,7 @@ /* class 5 */ #define MMC_ERASE_GROUP_START 35 /* ac [31:0] data addr R1 */ #define MMC_ERASE_GROUP_END 36 /* ac [31:0] data addr R1 */ -#define MMC_ERASE 37 /* ac R1b */ +#define MMC_ERASE 38 /* ac R1b */ /* class 9 */ #define MMC_FAST_IO 39 /* ac R4 */ @@ -74,7 +74,7 @@ /* class 8 */ #define MMC_APP_CMD 55 /* ac [31:16] RCA R1 */ -#define MMC_GEN_CMD 56 /* adtc [0] RD/WR R1b */ +#define MMC_GEN_CMD 56 /* adtc [0] RD/WR R1 */ /* SD commands type argument response */ /* class 8 */ -- cgit v1.2.3 From 6aab341e0a28aff100a09831c5300a2994b8b986 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Nov 2005 14:34:23 -0800 Subject: mm: re-architect the VM_UNPAGED logic This replaces the (in my opinion horrible) VM_UNMAPPED logic with very explicit support for a "remapped page range" aka VM_PFNMAP. It allows a VM area to contain an arbitrary range of page table entries that the VM never touches, and never considers to be normal pages. 
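The test for "not a normal page" is purely arithmetic: in a linear VM_PFNMAP mapping, the pte for an address must hold exactly the frame number remap_pfn_range() put there, and anything else must be a COWed copy with a real struct page. A hedged restatement of that rule (the helper name is made up; the real check is vm_normal_page() in the diff below):

/*
 * For a VM_PFNMAP vma, vm_pgoff records the first mapped pfn, so the
 * pfn expected at addr follows from the offset into the vma.  A pte
 * holding this pfn is a raw mapping with no struct page behind it;
 * any other pfn there means the entry was COWed and is a normal page.
 */
static inline unsigned long pfnmap_expected_pfn(struct vm_area_struct *vma,
						unsigned long addr)
{
	return vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
}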
Any user of "remap_pfn_range()" automatically gets this new functionality, and doesn't even have to mark the pages reserved or indeed mark them any other way. It just works. As a side effect, doing mmap() on /dev/mem works for arbitrary ranges. Sparc update from David in the next commit. Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vdso.c | 6 +- drivers/char/mem.c | 2 +- fs/proc/task_mmu.c | 7 +- include/linux/mm.h | 5 +- mm/fremap.c | 22 ++---- mm/madvise.c | 2 +- mm/memory.c | 189 ++++++++++++++++++++++++--------------------- mm/mempolicy.c | 12 +-- mm/msync.c | 12 +-- mm/nommu.c | 2 +- mm/rmap.c | 14 +--- 11 files changed, 127 insertions(+), 146 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index b44b36e0c293..f0c47dab0903 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? - follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } @@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); struct page *upg = (vma && vma->vm_mm) ? - follow_page(vma->vm_mm, vma->vm_start + - i*PAGE_SIZE, 0) + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) : NULL; dump_one_vdso_page(pg, upg); } diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 29c3b631445a..91dd669273e0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size) if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) goto out_up; - if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED)) + if (vma->vm_flags & (VM_SHARED | VM_HUGETLB)) break; count = vma->vm_end - addr; if (count > size) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9ab97cef0daa..50bd5a8f0446 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -402,12 +402,11 @@ struct numa_maps { /* * Calculate numa node maps for a vma */ -static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) +static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) { + int i; struct page *page; unsigned long vaddr; - struct mm_struct *mm = vma->vm_mm; - int i; struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) @@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->node[i] =0; for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(mm, vaddr, 0); + page = follow_page(vma, vaddr, 0); if (page) { int count = page_mapcount(page); diff --git a/include/linux/mm.h b/include/linux/mm.h index f0cdfd18db55..6a75a7a78bf1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_GROWSUP 0x00000200 #define VM_SHM 0x00000000 /* Means nothing: delete it later */ -#define VM_UNPAGED 0x00000400 /* Pages managed without map count */ +#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ #define VM_EXECUTABLE 0x00001000 @@ -664,6 +664,7 @@ struct zap_details { unsigned long truncate_count; /* Compare vm_truncate_count */ }; +struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); unsigned long unmap_vmas(struct mmu_gather **tlb, @@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); -struct page *follow_page(struct mm_struct *, unsigned long address, +struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/mm/fremap.c b/mm/fremap.c index 007cbad9331e..f851775e09c2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) @@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) diff --git a/mm/madvise.c b/mm/madvise.c index 328a3bcce527..2b7cf0400a21 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index d1f46f4e4c8a..b57fbc636058 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_UNPAGED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. */ @@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) } /* - * page_is_anon applies strict checks for an anonymous page belonging to - * this vma at this address. 
It is used on VM_UNPAGED vmas, which are - * usually populated with shared originals (which must not be counted), - * but occasionally contain private COWed copies (when !VM_SHARED, or - * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window - * free pages, pages from other processes, or from other parts of this: - * it's tricky, but try not to be deceived by foreign anonymous pages. + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). */ -static inline int page_is_anon(struct page *page, - struct vm_area_struct *vma, unsigned long addr) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - return page && PageAnon(page) && page_mapped(page) && - page_address_in_vma(page, vma) == addr; + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); } /* @@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - pfn = pte_pfn(pte); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vm_flags & VM_UNPAGED)) - if (!page_is_anon(page, vma, addr)) - goto out_set_pte; - - /* - * If the pte points outside of valid memory but - * the region is not VM_UNPAGED, we have a problem. 
- */ - if (unlikely(!page)) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { if (!vma->anon_vma) return 0; } @@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } if (pte_present(ptent)) { struct page *page; - unsigned long pfn; (*zap_work) -= PAGE_SIZE; - pfn = pte_pfn(ptent); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) { - if (!page_is_anon(page, vma, addr)) - page = NULL; - } else if (unlikely(!page)) - print_bad_pte(vma, ptent, addr); - + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * in 2.6 the LRU scan won't even find its pages, so this * flag means no more than count its pages in reserved_vm, * and omit it from core dump, even when VM_IO turned off. - * VM_UNPAGED tells the core MM not to "manage" these pages - * (e.g. 
refcount, mapcount, try to swap them out): in - * particular, zap_pte_range does not try to free them. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. */ - vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); + if (left) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *src_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); pte_t entry; int ret = VM_FAULT_MINOR; - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - * Or it's an attempt to COW an out-of-map VM_UNPAGED - * entry, which copy_user_highpage does not support. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); src_page = old_page; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) - if (!page_is_anon(old_page, vma, address)) { - old_page = NULL; - goto gotten; - } + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); @@ -1351,7 +1373,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, src_page, address); + cow_user_page(new_page, src_page, address); } /* @@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - /* - * A VM_UNPAGED vma will normally be filled with present ptes - * by remap_pfn_range, and never arrive here; but it might have - * holes, or if !VM_DONTEXPAND, mremap might have expanded it. - * It's weird enough handling anon pages in unpaged vmas, we do - * not want to worry about ZERO_PAGEs too (it may or may not - * matter if their counts wrap): just give them anon pages. - */ - - if (write_access || (vma->vm_flags & VM_UNPAGED)) { + if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); @@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - BUG_ON(vma->vm_flags & VM_UNPAGED); - if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -1930,7 +1941,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + cow_user_page(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5609a31bdf22..bec88c81244e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_UNPAGED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) diff --git a/mm/msync.c b/mm/msync.c index b3f4caf3010b..1b5b6f662dcf 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ again: continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_UNPAGED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); diff --git a/mm/nommu.c b/mm/nommu.c index 6deb6ab3d6ad..c1196812876b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/mm/rmap.c b/mm/rmap.c index 2e034a0b89ab..6389cda02a20 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the * page matches the vma: currently only used on anon pages, by unuse_vma; - * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking - * care that an mmap of /dev/mem might window free and foreign pages. 
*/ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue; -- cgit v1.2.3 From a9d9baa1e819b2f92f9cfa5240f766c535e636a6 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Mon, 28 Nov 2005 13:43:46 -0800 Subject: [PATCH] clean up lock_cpu_hotplug() in cpufreq There are some callers in the cpufreq hotplug notify path where the lowest-level function calls lock_cpu_hotplug(). The lock is already held during cpu_up() and cpu_down() calls when the notify calls are broadcast to registered clients. Ideally, we could preempt_disable() at the highest caller and make sure we don't sleep in the path down into cpufreq->driver_target() calls, but the calls are so intertwined that this is cumbersome to clean up. Hence we consistently use lock_cpu_hotplug() and unlock_cpu_hotplug() in all places. - Removed export of cpucontrol semaphore and made it static. - Removed explicit uses of up/down with lock_cpu_hotplug(), so we can keep track of the callers in the same thread context and just keep refcounts, without calling a down() that causes a deadlock. - Removed current_in_cpu_hotplug() uses. - Removed PF_HOTPLUG_CPU in sched.h, introduced for the current_in_cpu_hotplug() temporary workaround. Tested with insmod of cpufreq_stat.ko, and logical online/offline, to make sure we don't have any hang situations. Signed-off-by: Ashok Raj Cc: Zwane Mwaikambo Cc: Shaohua Li Cc: "Siddha, Suresh B" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cpufreq/cpufreq.c | 12 ++----- include/linux/cpu.h | 7 ++-- include/linux/sched.h | 1 - kernel/cpu.c | 83 ++++++++++++++++++++++++++++------------------- 4 files changed, 54 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1c0f62d0f938..815902c2c856 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1113,21 +1113,13 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, { int retval = -EINVAL; - /* - * If we are already in context of hotplug thread, we dont need to - * acquire the hotplug lock. Otherwise acquire cpucontrol to prevent - * hotplug from removing this cpu that we are working on. - */ - if (!current_in_cpu_hotplug()) - lock_cpu_hotplug(); + lock_cpu_hotplug(); dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); - if (!current_in_cpu_hotplug()) - unlock_cpu_hotplug(); + unlock_cpu_hotplug(); return retval; } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 43c44530ef9d..0ed1d4853c69 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -65,10 +65,9 @@ extern struct sysdev_class cpu_sysdev_class; #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down.
*/ -extern struct semaphore cpucontrol; -#define lock_cpu_hotplug() down(&cpucontrol) -#define unlock_cpu_hotplug() up(&cpucontrol) -#define lock_cpu_hotplug_interruptible() down_interruptible(&cpucontrol) +extern void lock_cpu_hotplug(void); +extern void unlock_cpu_hotplug(void); +extern int lock_cpu_hotplug_interruptible(void); #define hotcpu_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2038bd27b041..b0ad6f30679e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -908,7 +908,6 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ -#define PF_HOTPLUG_CPU 0x01000000 /* Currently performing CPU hotplug */ /* * Only the _current_ task can read/write to tsk->flags, but other diff --git a/kernel/cpu.c b/kernel/cpu.c index d61ba88f34e5..e882c6babf41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,47 +16,76 @@ #include /* This protects CPUs going up and down... */ -DECLARE_MUTEX(cpucontrol); -EXPORT_SYMBOL_GPL(cpucontrol); +static DECLARE_MUTEX(cpucontrol); static struct notifier_block *cpu_chain; -/* - * Used to check by callers if they need to acquire the cpucontrol - * or not to protect a cpu from being removed. Its sometimes required to - * call these functions both for normal operations, and in response to - * a cpu being added/removed. If the context of the call is in the same - * thread context as a CPU hotplug thread, we dont need to take the lock - * since its already protected - * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj - */ +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *lock_cpu_hotplug_owner; +static int lock_cpu_hotplug_depth; -int current_in_cpu_hotplug(void) +static int __lock_cpu_hotplug(int interruptible) { - return (current->flags & PF_HOTPLUG_CPU); + int ret = 0; + + if (lock_cpu_hotplug_owner != current) { + if (interruptible) + ret = down_interruptible(&cpucontrol); + else + down(&cpucontrol); + } + + /* + * Set only if we succeed in locking + */ + if (!ret) { + lock_cpu_hotplug_depth++; + lock_cpu_hotplug_owner = current; + } + + return ret; } -EXPORT_SYMBOL_GPL(current_in_cpu_hotplug); +void lock_cpu_hotplug(void) +{ + __lock_cpu_hotplug(0); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug); +void unlock_cpu_hotplug(void) +{ + if (--lock_cpu_hotplug_depth == 0) { + lock_cpu_hotplug_owner = NULL; + up(&cpucontrol); + } +} +EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); + +int lock_cpu_hotplug_interruptible(void) +{ + return __lock_cpu_hotplug(1); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); +#endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ int register_cpu_notifier(struct notifier_block *nb) { int ret; if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; ret = notifier_chain_register(&cpu_chain, nb); unlock_cpu_hotplug(); return ret; } EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { lock_cpu_hotplug(); notifier_chain_unregister(&cpu_chain, nb); unlock_cpu_hotplug(); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -112,13 +141,6 @@ int cpu_down(unsigned int cpu) goto out; } - /* - * Leave a trace in current->flags indicating we are already in - * process of performing CPU hotplug. Callers can check if cpucontrol - * is already acquired by current thread, and if so not cause - * a dead lock by not acquiring the lock - */ - current->flags |= PF_HOTPLUG_CPU; err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { @@ -171,7 +193,6 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - current->flags &= ~PF_HOTPLUG_CPU; unlock_cpu_hotplug(); return err; } @@ -182,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; if (cpu_online(cpu) || !cpu_present(cpu)) { @@ -190,11 +211,6 @@ int __devinit cpu_up(unsigned int cpu) goto out; } - /* - * Leave a trace in current->flags indicating we are already in - * process of performing CPU hotplug. - */ - current->flags |= PF_HOTPLUG_CPU; ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", @@ -217,7 +233,6 @@ out_notify: if (ret != 0) notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - current->flags &= ~PF_HOTPLUG_CPU; - up(&cpucontrol); + unlock_cpu_hotplug(); return ret; } -- cgit v1.2.3 From ff88a3b2f56ae4f3296ea957ea38f99f8bd0e5a8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 28 Nov 2005 13:43:47 -0800 Subject: [PATCH] memory_sysdev_class is static So don't define it as extern in the header file. drivers/base/memory.c:28: error: static declaration of 'memory_sysdev_class' follows non-static declaration include/linux/memory.h:88: error: previous declaration of 'memory_sysdev_class' was here Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 9a424383e6c6..dc4081b6f161 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -85,7 +85,6 @@ struct notifier_block; extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -extern struct sysdev_class memory_sysdev_class; #endif /* CONFIG_MEMORY_HOTPLUG */ #define hotplug_memory_notifier(fn, pri) { \ -- cgit v1.2.3 From f7b7fd8f3ebbb2810d6893295aa984acd0fd30db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 28 Nov 2005 13:44:07 -0800 Subject: [PATCH] temporarily disable swap token on memory pressure Some users (hi Zwane) have seen a problem when running a workload that eats nearly all of physical memory - the system does an OOM kill, even when there is still a lot of swap free.
From f7b7fd8f3ebbb2810d6893295aa984acd0fd30db Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Mon, 28 Nov 2005 13:44:07 -0800
Subject: [PATCH] temporarily disable swap token on memory pressure

Some users (hi Zwane) have seen a problem when running a workload that
eats nearly all of physical memory - the system does an OOM kill, even
when there is still a lot of swap free.

The problem appears to be a very big task that is holding the swap
token, and the VM has a very hard time finding any other page in the
system that is swappable.

Instead of ignoring the swap token when sc->priority reaches 0, we could
simply take the swap token away from the memory hog and make sure we
don't give it back to the memory hog for a few seconds.

This patch resolves the problem Zwane ran into.

Signed-off-by: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/rmap.h |  4 ++--
 include/linux/swap.h |  6 ++++++
 mm/rmap.c            | 26 ++++++++++----------------
 mm/thrash.c          | 10 +++++++---
 mm/vmscan.c          | 11 +++++++++--
 5 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 35b30e6c8cf8..33261f1d2239 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,7 +89,7 @@ static inline void page_dup_rmap(struct page *page)
 /*
  * Called from mm/vmscan.c to handle paging out
  */
-int page_referenced(struct page *, int is_locked, int ignore_token);
+int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *);
 
 /*
@@ -109,7 +109,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 #define anon_vma_prepare(vma)	(0)
 #define anon_vma_link(vma)	do {} while (0)
 
-#define page_referenced(page,l,i) TestClearPageReferenced(page)
+#define page_referenced(page,l) TestClearPageReferenced(page)
 #define try_to_unmap(page)	SWAP_FAIL
 
 #endif	/* CONFIG_MMU */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 20c975642cab..508668f840b6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -239,6 +239,11 @@ static inline void put_swap_token(struct mm_struct *mm)
 		__put_swap_token(mm);
 }
 
+static inline void disable_swap_token(void)
+{
+	put_swap_token(swap_token_mm);
+}
+
 #else /* CONFIG_SWAP */
 
 #define total_swap_pages		0
@@ -283,6 +288,7 @@ static inline swp_entry_t get_swap_page(void)
 #define put_swap_token(x)		do { } while(0)
 #define grab_swap_token()		do { } while(0)
 #define has_swap_token(x)		0
+#define disable_swap_token()		do { } while(0)
 
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
diff --git a/mm/rmap.c b/mm/rmap.c
index 6389cda02a20..491ac350048f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -290,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
 static int page_referenced_one(struct page *page,
-	struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
+	struct vm_area_struct *vma, unsigned int *mapcount)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -311,7 +311,7 @@ static int page_referenced_one(struct page *page,
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault.
 	 */
-	if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+	if (mm != current->mm && has_swap_token(mm) &&
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
@@ -321,7 +321,7 @@ out:
 	return referenced;
 }
 
-static int page_referenced_anon(struct page *page, int ignore_token)
+static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
@@ -334,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
 
 	mapcount = page_mapcount(page);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		referenced += page_referenced_one(page, vma, &mapcount,
-							ignore_token);
+		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
 	}
@@ -354,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
  *
  * This function is only called from page_referenced for object-based pages.
  */
-static int page_referenced_file(struct page *page, int ignore_token)
+static int page_referenced_file(struct page *page)
 {
 	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
@@ -392,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
 			referenced++;
 			break;
 		}
-		referenced += page_referenced_one(page, vma, &mapcount,
-							ignore_token);
+		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
 	}
@@ -410,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of ptes which referenced the page.
  */
-int page_referenced(struct page *page, int is_locked, int ignore_token)
+int page_referenced(struct page *page, int is_locked)
 {
 	int referenced = 0;
 
-	if (!swap_token_default_timeout)
-		ignore_token = 1;
-
 	if (page_test_and_clear_young(page))
 		referenced++;
 
@@ -425,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 
 	if (page_mapped(page) && page->mapping) {
 		if (PageAnon(page))
-			referenced += page_referenced_anon(page, ignore_token);
+			referenced += page_referenced_anon(page);
 		else if (is_locked)
-			referenced += page_referenced_file(page, ignore_token);
+			referenced += page_referenced_file(page);
 		else if (TestSetPageLocked(page))
 			referenced++;
 		else {
 			if (page->mapping)
-				referenced += page_referenced_file(page,
-								ignore_token);
+				referenced += page_referenced_file(page);
 			unlock_page(page);
 		}
 	}
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33a1..f4c560b4a2b7 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -57,14 +57,17 @@ void grab_swap_token(void)
 	/* We have the token. Let others know we still need it. */
 	if (has_swap_token(current->mm)) {
 		current->mm->recent_pagein = 1;
+		if (unlikely(!swap_token_default_timeout))
+			disable_swap_token();
 		return;
 	}
 
 	if (time_after(jiffies, swap_token_check)) {
 
-		/* Can't get swapout protection if we exceed our RSS limit. */
-		// if (current->mm->rss > current->mm->rlimit_rss)
-		//	return;
+		if (!swap_token_default_timeout) {
+			swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+			return;
+		}
 
 		/* ... or if we recently held the token.
 		 */
 		if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
 {
 	spin_lock(&swap_token_lock);
 	if (likely(mm == swap_token_mm)) {
+		mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
 		swap_token_mm = &init_mm;
 		swap_token_check = jiffies;
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28130541270f..078cf920208a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -407,7 +407,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		referenced = page_referenced(page, 1, sc->priority <= 0);
+		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable?  Activate it. */
 		if (referenced && page_mapping_inuse(page))
 			goto activate_locked;
@@ -756,7 +756,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		if (page_mapped(page)) {
 			if (!reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->priority <= 0)) {
+			    page_referenced(page, 0)) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
@@ -960,6 +960,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_reclaimed = 0;
 		sc.priority = priority;
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+		if (!priority)
+			disable_swap_token();
 		shrink_caches(zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
@@ -1056,6 +1058,10 @@ loop_again:
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
 
+		/* The swap token gets in the way of swapout... */
+		if (!priority)
+			disable_swap_token();
+
 		all_zones_ok = 1;
 
 		if (nr_pages == 0) {
@@ -1360,6 +1366,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.nr_reclaimed = 0;
 	/* scan at the highest priority */
 	sc.priority = 0;
+	disable_swap_token();
 
 	if (nr_pages > SWAP_CLUSTER_MAX)
 		sc.swap_cluster_max = nr_pages;
-- 
cgit v1.2.3
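The fix has two cooperating halves, both visible in the hunks above: reclaim revokes the token whenever a scan reaches priority 0, and __put_swap_token timestamps the hand-back so grab_swap_token will not return the token to the same mm until SWAP_TOKEN_CHECK_INTERVAL has passed. Below is a condensed, compile-alone sketch of the reclaim half; scan_ctl, reclaim_pass, and the stubbed token state are invented for illustration, while the real logic lives in mm/vmscan.c and mm/thrash.c.

	/* Invented sketch of the "disable the token under pressure" half
	 * of this patch.  Nothing here is kernel code. */
	static int token_held = 1;	/* stands in for swap_token_mm */

	static void disable_swap_token_sketch(void)
	{
		token_held = 0;		/* real code: put_swap_token(swap_token_mm) */
	}

	struct scan_ctl {
		int priority;		/* DEF_PRIORITY down to 0; 0 = desperate */
	};

	static void reclaim_pass(struct scan_ctl *sc)
	{
		if (!sc->priority)	/* most desperate pass: */
			disable_swap_token_sketch();	/* nobody keeps swapout immunity */

		/* ...normal page aging follows; with the token gone, the
		 * former holder's pages stop being "pretend referenced"
		 * and can be reclaimed like anyone else's... */
	}

The timestamp half is what makes the revocation stick: without it, the memory hog would simply grab the token back on its next page fault and regain its swapout immunity while reclaim was still struggling.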