From 5e1f8c9e20a92743eefc9a82c2db835213905e26 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 13:21:55 -0400 Subject: ext3: Add support for non-native signed/unsigned htree hash algorithms The original ext3 hash algorithms assumed that variables of type char were signed, as God and K&R intended. Unfortunately, this assumption is not true on some architectures. Userspace support for marking filesystems with non-native signed/unsigned chars was added two years ago, but the kernel-side support was never added (until now). Signed-off-by: "Theodore Ts'o" Cc: akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org --- include/linux/ext3_fs.h | 28 +++++++++++++++++++++++++++- include/linux/ext3_fs_sb.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f02918483..9004794a35fe 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -353,6 +353,13 @@ struct ext3_inode { #define EXT3_ERROR_FS 0x0002 /* Errors detected */ #define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + /* * Mount flags */ @@ -489,7 +496,23 @@ struct ext3_super_block { __u16 s_reserved_word_pad; __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ - __u32 s_reserved[190]; /* Padding to the end of the block */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -694,6 +717,9 @@ static inline __le16 ext3_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index e024e38248ff..a4e9216b3a6d 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -57,6 +57,7 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; -- cgit v1.2.3
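To see why char signedness matters here, consider a minimal, self-contained sketch in the style of the legacy hash (toy constants and function names, not the real fs/ext3/hash.c code): a filename byte above 0x7f sign-extends where plain char is signed (x86) and zero-extends where it is unsigned (ARM, PowerPC), so the same directory entry hashes to different values.

#include <stdio.h>

/* Toy accumulator only -- the real algorithms live in fs/ext3/hash.c.
 * With plain 'char', name[i] sign-extends on some architectures and
 * zero-extends on others, changing the computed hash. */
static unsigned int toy_hash(const char *name, int len)
{
	unsigned int hash = 0x12a3fe2d;
	int i;

	for (i = 0; i < len; i++)
		hash = hash * 7 + name[i];	/* signedness-dependent */
	return hash;
}

static unsigned int toy_hash_unsigned(const char *name, int len)
{
	const unsigned char *ucp = (const unsigned char *)name;
	unsigned int hash = 0x12a3fe2d;
	int i;

	for (i = 0; i < len; i++)
		hash = hash * 7 + ucp[i];	/* always zero-extends */
	return hash;
}

int main(void)
{
	const char name[] = "caf\xe9";	/* high-bit byte exposes the difference */

	printf("char-dependent hash: %#x\n", toy_hash(name, 4));
	printf("unsigned-char hash:  %#x\n", toy_hash_unsigned(name, 4));
	return 0;
}

This is why DX_HASH_LEGACY_UNSIGNED, DX_HASH_HALF_MD4_UNSIGNED and DX_HASH_TEA_UNSIGNED are distinct on-disk hash versions: a directory indexed on an unsigned-char machine must keep using the unsigned variant no matter which architecture later mounts it.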
From a20c7ab570ffdce1d6f67c7acf8c1c502a3b3839 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 16 Oct 2008 18:43:48 +0400 Subject: [MTD] sharpsl-nand: use platform_data for model-specific values Add platform_data which holds all model-specific values, like badblocks pattern, oobinfo, partitions. Signed-off-by: Dmitry Baryshkov --- drivers/mtd/nand/sharpsl.c | 158 +++++++++++++++++++++++--------------------- include/linux/mtd/sharpsl.h | 20 ++++++ 2 files changed, 104 insertions(+), 74 deletions(-) create mode 100644 include/linux/mtd/sharpsl.h (limited to 'include/linux') diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c index 17625c0a8f61..698378ca8e0a 100644 --- a/drivers/mtd/nand/sharpsl.c +++ b/drivers/mtd/nand/sharpsl.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/mtd/sharpsl.h> #include #include @@ -53,31 +54,6 @@ struct sharpsl_nand { #define FLCLE (1 << 1) #define FLCE0 (1 << 0) -#ifdef CONFIG_MTD_PARTITIONS -/* - * Define partitions for flash device - */ -#define DEFAULT_NUM_PARTITIONS 3 - -static struct mtd_partition sharpsl_nand_default_partition_info[] = { - { - .name = "System Area", - .offset = 0, - .size = 7 * 1024 * 1024, - }, - { - .name = "Root Filesystem", - .offset = 7 * 1024 * 1024, - .size = 30 * 1024 * 1024, - }, - { - .name = "Home Filesystem", - .offset = MTDPART_OFS_APPEND, - .size = MTDPART_SIZ_FULL, - }, -}; -#endif - /* * hardware specific access to control-lines * ctrl: @@ -106,31 +82,6 @@ static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd, writeb(cmd, chip->IO_ADDR_W); } -static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; - -static struct nand_bbt_descr sharpsl_bbt = { - .options = 0, - .offs = 4, - .len = 2, - .pattern = scan_ff_pattern -}; - -static struct nand_bbt_descr sharpsl_akita_bbt = { - .options = 0, - .offs = 4, - .len = 1, - .pattern = scan_ff_pattern -}; - -static struct nand_ecclayout akita_oobinfo = { - .eccbytes = 24, - .eccpos = { - 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, - 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, - 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, - .oobfree = {{0x08, 0x09}} -}; - static int sharpsl_nand_dev_ready(struct mtd_info *mtd) { struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); @@ -169,6 +120,12 @@ static int __devinit sharpsl_nand_probe(struct platform_device *pdev) struct resource *r; int err = 0; struct sharpsl_nand *sharpsl; + struct sharpsl_nand_platform_data *data = pdev->dev.platform_data; + + if (!data) { + dev_err(&pdev->dev, "no platform data!\n"); + return -EINVAL; + } /* Allocate memory for MTD device structure and private data */ sharpsl = kzalloc(sizeof(struct sharpsl_nand), GFP_KERNEL); @@ -218,11 +175,8 @@ static int __devinit sharpsl_nand_probe(struct platform_device *pdev) this->ecc.mode = NAND_ECC_HW; this->ecc.size = 256; this->ecc.bytes = 3; - this->badblock_pattern = &sharpsl_bbt; - if (machine_is_akita() || machine_is_borzoi()) { - this->badblock_pattern = &sharpsl_akita_bbt; - this->ecc.layout = &akita_oobinfo; - } + this->badblock_pattern = data->badblock_pattern; + this->ecc.layout = data->ecc_layout; this->ecc.hwctl = sharpsl_nand_enable_hwecc; this->ecc.calculate = sharpsl_nand_calculate_ecc; this->ecc.correct = nand_correct_data; @@ -236,29 +190,16 @@ static int __devinit sharpsl_nand_probe(struct platform_device *pdev) sharpsl->mtd.name = "sharpsl-nand"; #ifdef CONFIG_MTD_PARTITIONS nr_partitions = parse_mtd_partitions(&sharpsl->mtd, part_probes, &sharpsl_partition_info, 0); - if (nr_partitions <= 0) { - nr_partitions = ARRAY_SIZE(sharpsl_nand_default_partition_info); - sharpsl_partition_info = sharpsl_nand_default_partition_info; - if (machine_is_poodle()) { - sharpsl_partition_info[1].size = 22 * 1024 * 1024; - } else if (machine_is_corgi() 
|| machine_is_shepherd()) { - sharpsl_partition_info[1].size = 25 * 1024 * 1024; - } else if (machine_is_husky()) { - sharpsl_partition_info[1].size = 53 * 1024 * 1024; - } else if (machine_is_spitz()) { - sharpsl_partition_info[1].size = 5 * 1024 * 1024; - } else if (machine_is_akita()) { - sharpsl_partition_info[1].size = 58 * 1024 * 1024; - } else if (machine_is_borzoi()) { - sharpsl_partition_info[1].size = 32 * 1024 * 1024; - } + nr_partitions = data->nr_partitions; + sharpsl_partition_info = data->partitions; } - err = add_mtd_partitions(&sharpsl->mtd, sharpsl_partition_info, nr_partitions); -#else - err = add_mtd_device(&sharpsl->mtd); + if (nr_partitions > 0) + err = add_mtd_partitions(&sharpsl->mtd, sharpsl_partition_info, nr_partitions); + else #endif + err = add_mtd_device(&sharpsl->mtd); if (err) goto err_add; @@ -306,6 +247,58 @@ static struct platform_driver sharpsl_nand_driver = { .remove = __devexit_p(sharpsl_nand_remove), }; +/* + * Define partitions for flash device + */ +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + .size = 30 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct nand_bbt_descr sharpsl_akita_bbt = { + .options = 0, + .offs = 4, + .len = 1, + .pattern = scan_ff_pattern +}; + +static struct nand_ecclayout akita_oobinfo = { + .eccbytes = 24, + .eccpos = { + 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, + 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, + 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, + .oobfree = {{0x08, 0x09}} +}; + +static struct sharpsl_nand_platform_data sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + static struct resource sharpsl_nand_resources[] = { { .start = 0x0C000000, @@ -319,10 +312,27 @@ static struct platform_device sharpsl_nand_device = { .id = -1, .resource = sharpsl_nand_resources, .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, }; static int __init sharpsl_nand_init(void) { + if (machine_is_poodle()) { + sharpsl_nand_partitions[1].size = 22 * 1024 * 1024; + } else if (machine_is_corgi() || machine_is_shepherd()) { + sharpsl_nand_partitions[1].size = 25 * 1024 * 1024; + } else if (machine_is_husky()) { + sharpsl_nand_partitions[1].size = 53 * 1024 * 1024; + } else if (machine_is_spitz()) { + sharpsl_nand_partitions[1].size = 5 * 1024 * 1024; + } else if (machine_is_akita()) { + sharpsl_nand_partitions[1].size = 58 * 1024 * 1024; + sharpsl_nand_platform_data.badblock_pattern = &sharpsl_akita_bbt; + sharpsl_nand_platform_data.ecc_layout = &akita_oobinfo; + } else if (machine_is_borzoi()) { + sharpsl_nand_partitions[1].size = 32 * 1024 * 1024; + } + platform_device_register(&sharpsl_nand_device); return platform_driver_register(&sharpsl_nand_driver); } diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h new file mode 100644 index 000000000000..25f4d2a845c1 --- /dev/null +++ b/include/linux/mtd/sharpsl.h @@ -0,0 +1,20 @@ +/* + * SharpSL NAND support + * + * Copyright (C) 2008 Dmitry Baryshkov + * + * This program is free software; 
you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> +#include <linux/mtd/partitions.h> + +struct sharpsl_nand_platform_data { + struct nand_bbt_descr *badblock_pattern; + struct nand_ecclayout *ecc_layout; + struct mtd_partition *partitions; + unsigned int nr_partitions; +}; -- cgit v1.2.3 From 171bbfbeab7730031eec8025341401fabe540bd5 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 25 Nov 2008 17:42:31 -0500 Subject: jbd2: Add BH_JBDPrivateStart Add this so that file systems using JBD2 can safely allocate unused b_state bits. In this case, we add it so that Ocfs2 can define a single bit for tracking the validation state of a buffer. Signed-off-by: Mark Fasheh Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c7d106ef22e2..f36645745489 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -329,6 +329,7 @@ enum jbd_state_bits { BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) -- cgit v1.2.3 From e07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Nov 2008 01:14:26 -0500 Subject: jbd2: improve jbd2 fsync batching This patch removes the static sleep time in favor of a more self-optimizing approach where we measure the average amount of time it takes to commit a transaction to disk and the amount of time a transaction has been running. If somebody does a sync write or an fsync() traditionally we would sleep for 1 jiffy, which depending on the value of HZ could be a significant amount of time compared to how long it takes to commit a transaction to the underlying storage. With this patch instead of sleeping for a jiffy, we check to see if the amount of time this transaction has been running is less than the average commit time, and if it is we sleep for the delta using schedule_hrtimeout to give us a higher precision sleep time. This greatly benefits high-end storage, where you could otherwise end up sleeping for longer than it takes to commit the transaction and therefore sitting idle; keeping the sleep time to a minimum ensures you are always doing useful work. 
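As a stand-alone model of the heuristic (hypothetical names; the authoritative logic is in the jbd2_journal_commit_transaction() and jbd2_journal_stop() hunks below):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical user-space model of the batching decision. */
struct model_journal {
	uint64_t avg_commit_ns;		/* running average commit time */
};

/* Update the average the way the commit path does: weight the
 * existing average 3:1 so one anomalous commit only moves the
 * estimate by a quarter of the difference. */
static void model_update_avg(struct model_journal *j, uint64_t commit_ns)
{
	if (j->avg_commit_ns)
		j->avg_commit_ns = (commit_ns + j->avg_commit_ns * 3) / 4;
	else
		j->avg_commit_ns = commit_ns;
}

/* A sync handle sleeps and lets other threads join the transaction
 * only while the transaction is younger than one average commit;
 * otherwise it commits immediately. */
static bool model_should_wait(const struct model_journal *j,
			      uint64_t trans_running_ns)
{
	return trans_running_ns < j->avg_commit_ns;
}

On storage that commits in, say, 200us, a waiting thread now sleeps on the order of 200us rather than a full jiffy.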
Signed-off-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 14 +++++++++++++ fs/jbd2/transaction.c | 58 ++++++++++++++++++++++++++++++++++++++------------- include/linux/jbd2.h | 15 +++++++++++++ 3 files changed, 73 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6393fd0d804e..f22d1828ea85 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -995,6 +998,17 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the existing average higher than the latest commit time so + * we don't react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); if (journal->j_commit_callback) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 39b7805a599a..13dcbc990f41 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/hrtimer.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle) /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... + * + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. 
This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. * - * But don't do this if this process was the most recent one to - * perform a synchronous write. We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f36645745489..ab8cef130c28 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -637,6 +637,11 @@ struct transaction_s */ unsigned long t_expires; + /* + * When this transaction started, in nanoseconds [no locking] + */ + ktime_t t_start_time; + /* * How many handles used this transaction? [t_handle_lock] */ @@ -939,8 +944,18 @@ struct journal_s struct buffer_head **j_wbuf; int j_wbufsize; + /* + * this is the pid of the last person to run a synchronous operation + * through the journal + */ pid_t j_last_sync_writer; + /* + * the average amount of time in nanoseconds it takes to commit a + * transaction to disk. [j_state_lock] + */ + u64 j_average_commit_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); -- cgit v1.2.3 From 30773840c19cea60dcef39545960d541b1ac1cf8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:27:38 -0500 Subject: ext4: add fsync batch tuning knobs Add new mount options, min_batch_time and max_batch_time, which control how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. 
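As a sketch of how the two knobs bound the sleep (a hypothetical helper; the real clamp is the max_t()/min_t() pair this patch adds to jbd2_journal_stop(), shown further down):

#include <stdint.h>

/* j_min_batch_time and j_max_batch_time are in microseconds; the
 * averaged commit time is in nanoseconds, hence the factor of 1000. */
static uint64_t clamp_batch_ns(uint64_t avg_commit_ns,
			       uint32_t min_batch_us,
			       uint32_t max_batch_us)
{
	uint64_t ns = avg_commit_ns;

	if (ns < 1000ull * min_batch_us)
		ns = 1000ull * min_batch_us;
	if (ns > 1000ull * max_batch_us)
		ns = 1000ull * max_batch_us;
	return ns;
}

With the defaults (min_batch_time=0, max_batch_time=15000), a disk averaging 2ms per commit batches for about 2ms, while a disk averaging 40ms is capped at 15ms.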
Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 29 +++++++++++++++++++++++ fs/ext4/ext4.h | 7 ++++++ fs/ext4/ext4_sb.h | 2 ++ fs/ext4/super.c | 47 ++++++++++++++++++++++++++++++++------ fs/jbd2/journal.c | 2 ++ fs/jbd2/transaction.c | 4 +++- include/linux/jbd2.h | 8 +++++++ 7 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index f75ab101c00a..e3fcbea3ec8c 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time. nodelalloc Disable delayed allocation. Blocks are allocation when data is copied from user to page cache. +max_batch_time=usec Maximum amount of time ext4 should wait for + additional filesystem operations to be batch + together with a synchronous write operation. + Since a synchronous write operation is going to + force a commit and then a wait for the I/O + complete, it doesn't cost much, and can be a + huge throughput win, we wait for a small amount + of time to see if any other transactions can + piggyback on the synchronous write. The + algorithm used is designed to automatically tune + for the speed of the disk, by measuring the + amount of time (on average) that it takes to + finish committing a transaction. Call this time + the "commit time". If the time that the + transactoin has been running is less than the + commit time, ext4 will try sleeping for the + commit time to see if other operations will join + the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us + (15ms). This optimization can be turned off + entirely by setting max_batch_time to 0. + +min_batch_time=usec This parameter sets the commit time (as + described above) to be at least min_batch_time. + It defaults to zero microseconds. Increasing + this parameter may improve the throughput of + multi-threaded, synchronous workloads on very + fast disks, at the cost of increasing latency. 
+ Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac8551e0b70a..9ba9fd6d14da 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -328,6 +328,7 @@ struct ext4_mount_options { uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[MAXQUOTAS]; @@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + /* * Structure of a directory entry */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 3db800f399a6..039b6ea1a042 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -74,6 +74,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_JBD2_DEBUG struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc27d4c613c0..da377f9521bb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_max_batch_time); + } + /* * We're changing the default of barrier mount option, so * let's always display its mount state so it's clear what its @@ -874,7 +883,8 @@ enum { Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -913,6 +923,8 @@ static const match_table_t tokens = { {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, @@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb, option = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * option; break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; case Opt_data_journal: data_opt = EXT4_MOUNT_JOURNAL_DATA; goto datacheck; @@ -1979,6 +2007,9 @@ static 
int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); @@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; spin_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) @@ -3178,6 +3209,8 @@ restore_opts: sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 74d87290381c..fd1d7557a098 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -964,6 +964,8 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ /* The journal is marked for error until we succeed with recovery! 
*/ journal->j_flags = JBD2_ABORT; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 13dcbc990f41..48c21bac5a56 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle) trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); commit_time = min_t(u64, commit_time, - 1000*jiffies_to_usecs(1)); + 1000*journal->j_max_batch_time); if (trans_time < commit_time) { ktime_t expires = ktime_add_ns(ktime_get(), diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab8cef130c28..a3cd647ea1bc 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -956,6 +956,14 @@ struct journal_s */ u64 j_average_commit_time; + /* + * minimum and maximum times that we should wait for + * additional filesystem operations to get batched into a + * synchronous handle in microseconds + */ + u32 j_min_batch_time; + u32 j_max_batch_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); -- cgit v1.2.3 From 1a0d3786dd57dbd74f340322054c3d618b999dcf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Nov 2008 00:09:22 -0500 Subject: jbd2: Remove a large array of bh's from the stack of the checkpoint routine jbd2_log_do_checkpoint() is one of the kernel's largest stack users. Move the array of buffer heads from the stack of jbd2_log_do_checkpoint() to the in-core journal structure. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 22 +++++++++------------- fs/jbd2/journal.c | 2 ++ include/linux/jbd2.h | 10 ++++++++++ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9497718fe920..adc08ec875ed 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -249,16 +249,14 @@ restart: return ret; } -#define NR_BATCH 64 - static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +__flush_batch(journal_t *journal, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, bhs); + ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; + struct buffer_head *bh = journal->j_chkpt_bhs[i]; clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); @@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - transaction_t *transaction) + int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); - bhs[*batch_count] = bh; + journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; - if (*batch_count == NR_BATCH) { + if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); + __flush_batch(journal, batch_count); ret = 1; } } @@ -388,7 +385,6 @@ restart: if (journal->j_checkpoint_transactions == transaction && transaction->t_tid == this_tid) { int batch_count = 0; - struct buffer_head 
*bhs[NR_BATCH]; struct journal_head *jh; int retry = 0, err; @@ -402,7 +398,7 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, &batch_count, + retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) result = retry; @@ -419,7 +415,7 @@ restart: spin_unlock(&journal->j_list_lock); retry = 1; } - __flush_batch(journal, bhs, &batch_count); + __flush_batch(journal, &batch_count); } if (retry) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fd1d7557a098..34ef98057202 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1477,7 +1477,9 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a3cd647ea1bc..004c9a8d63ed 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -687,6 +687,8 @@ jbd2_time_diff(unsigned long start, unsigned long end) return end + (MAX_JIFFY_OFFSET - start); } +#define JBD2_NR_BATCH 64 + /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. @@ -830,6 +832,14 @@ struct journal_s /* Semaphore for locking against concurrent checkpoints */ struct mutex j_checkpoint_mutex; + /* + * List of buffer heads used by the checkpoint routine. This + * was moved from jbd2_log_do_checkpoint() to reduce stack + * usage. Access to this array is controlled by the + * j_checkpoint_mutex. [j_checkpoint_mutex] + */ + struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + /* * Journal head: identifies the first unused block in the journal. * [j_state_lock] -- cgit v1.2.3 From fb68407b0d9efba962c03f55009c797e22f024bc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Nov 2008 17:50:21 -0500 Subject: jbd2: Call journal commit callback without holding j_list_lock Avoid freeing the transaction in __jbd2_journal_drop_transaction() so the journal commit callback can run without holding j_list_lock, to avoid lock contention on this spinlock. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 13 ++++++++----- include/linux/jbd2.h | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index adc08ec875ed..17159cacbd9e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -682,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) safely remove this transaction from the log */ __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... 
*/ @@ -756,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_running_transaction != transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f22d1828ea85..0ad84162c425 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -363,7 +363,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i; + int i, to_free = 0; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1011,12 +1011,10 @@ restart_loop: journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); - if (journal->j_commit_callback) - journal->j_commit_callback(journal, commit_transaction); - if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; @@ -1035,11 +1033,16 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, journal->j_commit_sequence, + journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 004c9a8d63ed..9d82084a1605 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1179,8 +1179,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); void __jbd2_log_wait_for_space(journal_t *journal); -extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); -extern int jbd2_cleanup_journal_tail(journal_t *); +extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); +extern int jbd2_cleanup_journal_tail(journal_t *); /* Debugging code only: */ -- cgit v1.2.3 From 69423d99fc182a81f3c5db3eb5c140acc6fc64be Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Dec 2008 13:37:21 +0000 Subject: [MTD] update internal API to support 64-bit device size MTD internal API presently uses 32-bit values to represent device size. This patch updates them to 64-bits but leaves the external API unchanged. Extending the external API is a separate issue for several reasons. First, no one needs it at the moment. Secondly, whether the implementation is done with IOCTLs, sysfs or both is still debated. Thirdly external API changes require the internal API to be accepted first. Note that although the MTD API will be able to support 64-bit device sizes, existing drivers do not and are not required to do so, although NAND base has been updated. 
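As background for the exceptions listed in the next paragraph: on 32-bit kernels the C '/' and '%' operators on 64-bit operands pull in libgcc helpers, so divisions by the erase block size go through do_div() or, when the erase size is a power of two, a cached shift. The sketch below mirrors the mtd_div_by_eb()/mtd_mod_by_eb() helpers used in the mtdpart.c hunk and the erasesize_shift caching added to add_mtd_device() in the mtdcore.c hunk; the struct and names are illustrative stand-ins, not the kernel's exact code.

#include <stdint.h>

struct mtd_like {
	uint64_t size;
	uint32_t erasesize;
	unsigned int erasesize_shift;	/* nonzero iff erasesize is a power of two */
};

static uint32_t div_by_eb(uint64_t sz, const struct mtd_like *mtd)
{
	if (mtd->erasesize_shift)
		return (uint32_t)(sz >> mtd->erasesize_shift);
	/* in-kernel this is do_div(sz, mtd->erasesize) */
	return (uint32_t)(sz / mtd->erasesize);
}

static uint32_t mod_by_eb(uint64_t sz, const struct mtd_like *mtd)
{
	if (mtd->erasesize_shift)
		return (uint32_t)(sz & ((1u << mtd->erasesize_shift) - 1));
	return (uint32_t)(sz % mtd->erasesize);
}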
In general, changing from 32-bit to 64-bit values causes little or no change to the majority of the code with the following exceptions: - printk message formats - division and modulus of 64-bit values - NAND base support - 32-bit local variables used by mtdpart and mtdconcat - naughtily assuming one structure maps to another in MEMERASE ioctl Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0001.c | 12 ++++---- drivers/mtd/chips/cfi_cmdset_0002.c | 8 +++--- drivers/mtd/chips/cfi_cmdset_0020.c | 14 ++++----- drivers/mtd/chips/fwh_lock.h | 4 +-- drivers/mtd/inftlcore.c | 2 +- drivers/mtd/inftlmount.c | 4 +-- drivers/mtd/maps/amd76xrom.c | 4 +-- drivers/mtd/maps/ck804xrom.c | 4 +-- drivers/mtd/maps/esb2rom.c | 4 +-- drivers/mtd/maps/ichxrom.c | 4 +-- drivers/mtd/maps/nettel.c | 2 +- drivers/mtd/maps/scb2_flash.c | 8 ++++-- drivers/mtd/mtdchar.c | 6 +++- drivers/mtd/mtdconcat.c | 33 ++++++++++++--------- drivers/mtd/mtdcore.c | 16 ++++++++++- drivers/mtd/mtdoops.c | 9 ++++-- drivers/mtd/mtdpart.c | 34 +++++++++++----------- drivers/mtd/nand/nand_base.c | 24 ++++++++++------ drivers/mtd/nand/nand_bbt.c | 31 ++++++++++---------- drivers/mtd/nftlcore.c | 2 +- drivers/mtd/nftlmount.c | 4 +-- drivers/mtd/onenand/onenand_base.c | 8 +++--- drivers/mtd/rfd_ftl.c | 23 ++++++++------- drivers/mtd/ssfdc.c | 7 +++-- drivers/mtd/ubi/build.c | 2 +- drivers/mtd/ubi/gluebi.c | 17 +++++------ fs/jffs2/erase.c | 5 ++-- include/linux/mtd/mtd.h | 57 +++++++++++++++++++++++++++++++------ include/linux/mtd/nand.h | 2 +- include/linux/mtd/partitions.h | 4 +-- 30 files changed, 216 insertions(+), 138 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index c93a8be5d5f1..f5ab6fa1057b 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -58,8 +58,8 @@ static int cfi_intelext_write_buffers(struct mtd_info *, loff_t, size_t, size_t static int cfi_intelext_writev(struct mtd_info *, const struct kvec *, unsigned long, loff_t, size_t *); static int cfi_intelext_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_intelext_sync (struct mtd_info *); -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); #ifdef CONFIG_MTD_OTP static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); @@ -558,8 +558,8 @@ static struct mtd_info *cfi_intelext_setup(struct mtd_info *mtd) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "erase region %d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "erase region %d: offset=0x%llx,size=0x%x,blocks=%d\n", + i,(unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -2058,7 +2058,7 @@ out: put_chip(map, chip, adr); return ret; } -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2082,7 +2082,7 @@ static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) 
return ret; } -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index d74ec46aa032..f9c435a42670 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -71,8 +71,8 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr); #include "fwh_lock.h" -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static struct mtd_chip_driver cfi_amdstd_chipdrv = { .probe = NULL, /* Not usable directly */ @@ -1774,12 +1774,12 @@ out_unlock: return ret; } -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_lock, ofs, len, NULL); } -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_unlock, ofs, len, NULL); } diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index d4714dd9f7ab..6c740f346f91 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -42,8 +42,8 @@ static int cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen); static int cfi_staa_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_staa_sync (struct mtd_info *); -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static int cfi_staa_suspend (struct mtd_info *); static void cfi_staa_resume (struct mtd_info *); @@ -221,8 +221,8 @@ static struct mtd_info *cfi_staa_setup(struct map_info *map) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "%d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "%d: offset=0x%llx,size=0x%x,blocks=%d\n", + i, (unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -964,7 +964,7 @@ static int cfi_staa_erase_varsize(struct mtd_info *mtd, adr += regions[i].erasesize; len -= regions[i].erasesize; - if (adr % (1<< cfi->chipshift) == ((regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) + if (adr % (1<< cfi->chipshift) == (((unsigned long)regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) i++; if (adr >> cfi->chipshift) { @@ -1135,7 +1135,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -1284,7 +1284,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } 
-static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; diff --git a/drivers/mtd/chips/fwh_lock.h b/drivers/mtd/chips/fwh_lock.h index ab44f2b996f8..57e0e4e921f9 100644 --- a/drivers/mtd/chips/fwh_lock.h +++ b/drivers/mtd/chips/fwh_lock.h @@ -77,7 +77,7 @@ static int fwh_xxlock_oneblock(struct map_info *map, struct flchip *chip, } -static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -88,7 +88,7 @@ static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) } -static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 50ce13887f63..73f05227dc8c 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -50,7 +50,7 @@ static void inftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct INFTLrecord *inftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 9113628ed1ef..f751dd97c549 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -63,7 +63,7 @@ static int find_boot_record(struct INFTLrecord *inftl) * otherwise. */ inftl->EraseSize = inftl->mbd.mtd->erasesize; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; inftl->MediaUnit = BLOCK_NIL; @@ -187,7 +187,7 @@ static int find_boot_record(struct INFTLrecord *inftl) mh->BlockMultiplierBits); inftl->EraseSize = inftl->mbd.mtd->erasesize << mh->BlockMultiplierBits; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; block >>= mh->BlockMultiplierBits; } diff --git a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index d1eec7d3243f..237733d094c4 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -232,8 +232,8 @@ static int __devinit amd76xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/ck804xrom.c b/drivers/mtd/maps/ck804xrom.c index 1a6feb4474de..5f7a245ed132 100644 --- a/drivers/mtd/maps/ck804xrom.c +++ b/drivers/mtd/maps/ck804xrom.c @@ -263,8 +263,8 @@ static int __devinit ck804xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). 
fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/esb2rom.c b/drivers/mtd/maps/esb2rom.c index bbbcdd4c8d13..11a2f57df9cf 100644 --- a/drivers/mtd/maps/esb2rom.c +++ b/drivers/mtd/maps/esb2rom.c @@ -324,8 +324,8 @@ static int __devinit esb2rom_init_one(struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/ichxrom.c b/drivers/mtd/maps/ichxrom.c index aeb6c916e23f..c32bc28920b3 100644 --- a/drivers/mtd/maps/ichxrom.c +++ b/drivers/mtd/maps/ichxrom.c @@ -258,8 +258,8 @@ static int __devinit ichxrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/nettel.c b/drivers/mtd/maps/nettel.c index 965e6c6d6ab0..a8723a6b7457 100644 --- a/drivers/mtd/maps/nettel.c +++ b/drivers/mtd/maps/nettel.c @@ -226,7 +226,7 @@ static int __init nettel_init(void) if ((amd_mtd = do_map_probe("jedec_probe", &nettel_amd_map))) { printk(KERN_NOTICE "SNAPGEAR: AMD flash device size = %dK\n", - amd_mtd->size>>10); + (int)(amd_mtd->size>>10)); amd_mtd->owner = THIS_MODULE; diff --git a/drivers/mtd/maps/scb2_flash.c b/drivers/mtd/maps/scb2_flash.c index 21169e6d646c..7e329f09a548 100644 --- a/drivers/mtd/maps/scb2_flash.c +++ b/drivers/mtd/maps/scb2_flash.c @@ -118,7 +118,8 @@ scb2_fixup_mtd(struct mtd_info *mtd) struct mtd_erase_region_info *region = &mtd->eraseregions[i]; if (region->numblocks * region->erasesize > mtd->size) { - region->numblocks = (mtd->size / region->erasesize); + region->numblocks = ((unsigned long)mtd->size / + region->erasesize); done = 1; } else { region->numblocks = 0; @@ -187,8 +188,9 @@ scb2_flash_probe(struct pci_dev *dev, const struct pci_device_id *ent) return -ENODEV; } - printk(KERN_NOTICE MODNAME ": chip size 0x%x at offset 0x%x\n", - scb2_mtd->size, SCB2_WINDOW - scb2_mtd->size); + printk(KERN_NOTICE MODNAME ": chip size 0x%llx at offset 0x%llx\n", + (unsigned long long)scb2_mtd->size, + (unsigned long long)(SCB2_WINDOW - scb2_mtd->size)); add_mtd_device(scb2_mtd); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index bcffeda2df3d..e9ec59e9a566 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -450,16 +450,20 @@ static int mtd_ioctl(struct inode *inode, struct file *file, if (!erase) ret = -ENOMEM; else { + struct erase_info_user einfo; + wait_queue_head_t waitq; DECLARE_WAITQUEUE(wait, current); init_waitqueue_head(&waitq); - if (copy_from_user(&erase->addr, argp, + if (copy_from_user(&einfo, argp, sizeof(struct erase_info_user))) { kfree(erase); return -EFAULT; } + erase->addr = einfo.start; + erase->len = einfo.length; erase->mtd = mtd; erase->callback = mtdchar_erase_callback; erase->priv = (unsigned long)&waitq; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 789842d0e6f2..ff8c14bcac68 100644 
--- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -197,7 +197,7 @@ concat_writev(struct mtd_info *mtd, const struct kvec *vecs, continue; } - size = min(total_len, (size_t)(subdev->size - to)); + size = min_t(uint64_t, total_len, subdev->size - to); wsize = size; /* store for future use */ entry_high = entry_low; @@ -385,7 +385,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) struct mtd_concat *concat = CONCAT(mtd); struct mtd_info *subdev; int i, err; - u_int32_t length, offset = 0; + uint64_t length, offset = 0; struct erase_info *erase; if (!(mtd->flags & MTD_WRITEABLE)) @@ -518,7 +518,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) return 0; } -static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = -EINVAL; @@ -528,7 +528,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -556,7 +556,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return err; } -static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = 0; @@ -566,7 +566,7 @@ static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -842,12 +842,14 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.erasesize = curr_erasesize; concat->mtd.numeraseregions = 0; } else { + uint64_t tmp64; + /* * erase block size varies across the subdevices: allocate * space to store the data describing the variable erase regions */ struct mtd_erase_region_info *erase_region_p; - u_int32_t begin, position; + uint64_t begin, position; concat->mtd.erasesize = max_erasesize; concat->mtd.numeraseregions = num_erase_region; @@ -879,8 +881,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = subdev[i]->erasesize; @@ -897,9 +900,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = @@ -909,14 +912,16 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c } position += subdev[i]->eraseregions[j]. 
- numblocks * curr_erasesize; + numblocks * (uint64_t)curr_erasesize; } } } /* Now write the final entry */ erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; } return &concat->mtd; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index a9d246949820..76fe0a1e7a5e 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -57,6 +57,19 @@ int add_mtd_device(struct mtd_info *mtd) mtd->index = i; mtd->usecount = 0; + if (is_power_of_2(mtd->erasesize)) + mtd->erasesize_shift = ffs(mtd->erasesize) - 1; + else + mtd->erasesize_shift = 0; + + if (is_power_of_2(mtd->writesize)) + mtd->writesize_shift = ffs(mtd->writesize) - 1; + else + mtd->writesize_shift = 0; + + mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; + mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; + /* Some chips always power up locked. Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK) && mtd->unlock) { @@ -344,7 +357,8 @@ static inline int mtd_proc_info (char *buf, int i) if (!this) return 0; - return sprintf(buf, "mtd%d: %8.8x %8.8x \"%s\"\n", i, this->size, + return sprintf(buf, "mtd%d: %8.8llx %8.8x \"%s\"\n", i, + (unsigned long long)this->size, this->erasesize, this->name); } diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index aebb3b27edbd..1a6b3beabe8d 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -80,9 +80,9 @@ static int mtdoops_erase_block(struct mtd_info *mtd, int offset) if (ret) { set_current_state(TASK_RUNNING); remove_wait_queue(&wait_q, &wait); - printk (KERN_WARNING "mtdoops: erase of region [0x%x, 0x%x] " + printk (KERN_WARNING "mtdoops: erase of region [0x%llx, 0x%llx] " "on \"%s\" failed\n", - erase.addr, erase.len, mtd->name); + (unsigned long long)erase.addr, (unsigned long long)erase.len, mtd->name); return ret; } @@ -289,7 +289,10 @@ static void mtdoops_notify_add(struct mtd_info *mtd) } cxt->mtd = mtd; - cxt->oops_pages = mtd->size / OOPS_PAGE_SIZE; + if (mtd->size > INT_MAX) + cxt->oops_pages = INT_MAX / OOPS_PAGE_SIZE; + else + cxt->oops_pages = (int)mtd->size / OOPS_PAGE_SIZE; find_next_position(cxt); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 3728913fa5fa..144e6b613a77 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -26,7 +26,7 @@ static LIST_HEAD(mtd_partitions); struct mtd_part { struct mtd_info mtd; struct mtd_info *master; - u_int32_t offset; + uint64_t offset; int index; struct list_head list; int registered; @@ -235,7 +235,7 @@ void mtd_erase_callback(struct erase_info *instr) } EXPORT_SYMBOL_GPL(mtd_erase_callback); -static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -243,7 +243,7 @@ static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return part->master->lock(part->master, ofs + part->offset, len); } -static int part_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -317,7 +317,7 @@ EXPORT_SYMBOL(del_mtd_partitions); static struct mtd_part *add_one_partition(struct mtd_info *master, const struct mtd_partition *part, int partno, - u_int32_t cur_offset) + uint64_t 
cur_offset) { struct mtd_part *slave; @@ -395,19 +395,19 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->offset = cur_offset; if (slave->offset == MTDPART_OFS_NXTBLK) { slave->offset = cur_offset; - if ((cur_offset % master->erasesize) != 0) { + if (mtd_mod_by_eb(cur_offset, master) != 0) { /* Round up to next erasesize */ - slave->offset = ((cur_offset / master->erasesize) + 1) * master->erasesize; + slave->offset = (mtd_div_by_eb(cur_offset, master) + 1) * master->erasesize; printk(KERN_NOTICE "Moving partition %d: " - "0x%08x -> 0x%08x\n", partno, - cur_offset, slave->offset); + "0x%012llx -> 0x%012llx\n", partno, + (unsigned long long)cur_offset, (unsigned long long)slave->offset); } } if (slave->mtd.size == MTDPART_SIZ_FULL) slave->mtd.size = master->size - slave->offset; - printk(KERN_NOTICE "0x%08x-0x%08x : \"%s\"\n", slave->offset, - slave->offset + slave->mtd.size, slave->mtd.name); + printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", (unsigned long long)slave->offset, + (unsigned long long)(slave->offset + slave->mtd.size), slave->mtd.name); /* let's do some sanity checks */ if (slave->offset >= master->size) { @@ -420,13 +420,13 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if (slave->offset + slave->mtd.size > master->size) { slave->mtd.size = master->size - slave->offset; - printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#x\n", - part->name, master->name, slave->mtd.size); + printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", + part->name, master->name, (unsigned long long)slave->mtd.size); } if (master->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = master->numeraseregions; - u32 end = slave->offset + slave->mtd.size; + u64 end = slave->offset + slave->mtd.size; struct mtd_erase_region_info *regions = master->eraseregions; /* Find the first erase regions which is part of this @@ -449,7 +449,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->offset % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->offset, &slave->mtd)) { /* Doesn't start on a boundary of major erase size */ /* FIXME: Let it be writable if it is on a boundary of * _minor_ erase size though */ @@ -458,7 +458,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, part->name); } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->mtd.size % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->mtd.size, &slave->mtd)) { slave->mtd.flags &= ~MTD_WRITEABLE; printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase block -- force read-only\n", part->name); @@ -466,7 +466,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.ecclayout = master->ecclayout; if (master->block_isbad) { - uint32_t offs = 0; + uint64_t offs = 0; while (offs < slave->mtd.size) { if (master->block_isbad(master, @@ -501,7 +501,7 @@ int add_mtd_partitions(struct mtd_info *master, int nbparts) { struct mtd_part *slave; - u_int32_t cur_offset = 0; + uint64_t cur_offset = 0; int i; printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name); diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0a9c9cd33f96..ff2d33e4d6d6 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2014,13 +2014,14 @@ static int nand_erase(struct mtd_info *mtd, struct 
erase_info *instr) int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, int allowbbt) { - int page, len, status, pages_per_block, ret, chipnr; + int page, status, pages_per_block, ret, chipnr; struct nand_chip *chip = mtd->priv; - int rewrite_bbt[NAND_MAX_CHIPS]={0}; + loff_t rewrite_bbt[NAND_MAX_CHIPS]={0}; unsigned int bbt_masked_page = 0xffffffff; + loff_t len; - DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%08x, len = %i\n", - (unsigned int)instr->addr, (unsigned int)instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%012llx, len = %llu\n", + (unsigned long long)instr->addr, (unsigned long long)instr->len); /* Start address must align on block boundary */ if (instr->addr & ((1 << chip->phys_erase_shift) - 1)) { @@ -2116,7 +2117,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, DEBUG(MTD_DEBUG_LEVEL0, "nand_erase: " "Failed erase, page 0x%08x\n", page); instr->state = MTD_ERASE_FAILED; - instr->fail_addr = (page << chip->page_shift); + instr->fail_addr = + ((loff_t)page << chip->page_shift); goto erase_exit; } @@ -2126,7 +2128,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, */ if (bbt_masked_page != 0xffffffff && (page & BBT_PAGE_MASK) == bbt_masked_page) - rewrite_bbt[chipnr] = (page << chip->page_shift); + rewrite_bbt[chipnr] = + ((loff_t)page << chip->page_shift); /* Increment page address and decrement length */ len -= (1 << chip->phys_erase_shift); @@ -2173,7 +2176,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, continue; /* update the BBT for chip */ DEBUG(MTD_DEBUG_LEVEL0, "nand_erase_nand: nand_update_bbt " - "(%d:0x%0x 0x%0x)\n", chipnr, rewrite_bbt[chipnr], + "(%d:0x%0llx 0x%0x)\n", chipnr, rewrite_bbt[chipnr], chip->bbt_td->pages[chipnr]); nand_update_bbt(mtd, rewrite_bbt[chipnr]); } @@ -2365,7 +2368,7 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, if (!mtd->name) mtd->name = type->name; - chip->chipsize = type->chipsize << 20; + chip->chipsize = (uint64_t)type->chipsize << 20; /* Newer devices have all the information in additional id bytes */ if (!type->pagesize) { @@ -2423,7 +2426,10 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, chip->bbt_erase_shift = chip->phys_erase_shift = ffs(mtd->erasesize) - 1; - chip->chip_shift = ffs(chip->chipsize) - 1; + if (chip->chipsize & 0xffffffff) + chip->chip_shift = ffs((unsigned)chip->chipsize) - 1; + else + chip->chip_shift = ffs((unsigned)(chip->chipsize >> 32)) + 32 - 1; /* Set the bad block position */ chip->badblockpos = mtd->writesize > 512 ? 
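
A quick stand-alone illustration of why the nand_base.c hunks above cast to loff_t before shifting: a 32-bit page number shifted left by page_shift overflows before it is ever widened, so the cast must precede the shift. This sketch is not part of any patch in this series; the constants are hypothetical (a 2KiB-page device) and long long stands in for the kernel's loff_t:

#include <stdio.h>

int main(void)
{
	int page_shift = 11;		/* hypothetical 2KiB pages */
	int page = 3 << 20;		/* byte offset should be 6GiB */

	/* Old idiom: the shift is performed in 32-bit int and overflows
	 * (formally undefined behaviour; in practice the value wraps)
	 * before the assignment widens it. */
	long long bad = page << page_shift;

	/* Patched idiom: widen first, then shift. */
	long long good = (long long)page << page_shift;

	printf("bad  = 0x%llx\ngood = 0x%llx\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}

The chip_shift hunk is the other face of the same problem: once chipsize is a uint64_t, ffs() cannot be applied to it whole, so the shift is derived from whichever 32-bit half is non-zero, adding 32 when only the high half is set.
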
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 0b1c48595f12..55c23e5cd210 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -171,16 +171,16 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num, if (tmp == msk) continue; if (reserved_block_code && (tmp == reserved_block_code)) { - printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); this->bbt[offs + (act >> 3)] |= 0x2 << (act & 0x06); mtd->ecc_stats.bbtblocks++; continue; } /* Leave it for now, if its matured we can move this * message to MTD_DEBUG_LEVEL0 */ - printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); /* Factory marked bad or worn out ? */ if (tmp == 0) this->bbt[offs + (act >> 3)] |= 0x3 << (act & 0x06); @@ -284,7 +284,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the primary version, if available */ if (td->options & NAND_BBT_VERSION) { - scan_read_raw(mtd, buf, td->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)td->pages[0] << this->page_shift, mtd->writesize); td->version[0] = buf[mtd->writesize + td->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -293,7 +293,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the mirror version, if available */ if (md && (md->options & NAND_BBT_VERSION)) { - scan_read_raw(mtd, buf, md->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)md->pages[0] << this->page_shift, mtd->writesize); md->version[0] = buf[mtd->writesize + md->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -411,7 +411,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, numblocks = this->chipsize >> (this->bbt_erase_shift - 1); startblock = chip * numblocks; numblocks += startblock; - from = startblock << (this->bbt_erase_shift - 1); + from = (loff_t)startblock << (this->bbt_erase_shift - 1); } for (i = startblock; i < numblocks;) { @@ -428,8 +428,8 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, if (ret) { this->bbt[i >> 3] |= 0x03 << (i & 0x6); - printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n", - i >> 1, (unsigned int)from); + printk(KERN_WARNING "Bad eraseblock %d at 0x%012llx\n", + i >> 1, (unsigned long long)from); mtd->ecc_stats.badblocks++; } @@ -495,7 +495,7 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr for (block = 0; block < td->maxblocks; block++) { int actblock = startblock + dir * block; - loff_t offs = actblock << this->bbt_erase_shift; + loff_t offs = (loff_t)actblock << this->bbt_erase_shift; /* Read first page */ scan_read_raw(mtd, buf, offs, mtd->writesize); @@ -719,7 +719,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, memset(&einfo, 0, sizeof(einfo)); einfo.mtd = mtd; - einfo.addr = (unsigned long)to; + einfo.addr = to; einfo.len = 1 << this->bbt_erase_shift; res = nand_erase_nand(mtd, &einfo, 1); if (res < 0) @@ -729,8 +729,8 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, if (res < 0) goto outerr; - printk(KERN_DEBUG "Bad block table written to 0x%08x, version " - "0x%02X\n", (unsigned int)to, td->version[chip]); + 
printk(KERN_DEBUG "Bad block table written to 0x%012llx, version " + "0x%02X\n", (unsigned long long)to, td->version[chip]); /* Mark it as used */ td->pages[chip] = page; @@ -910,7 +910,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) newval = oldval | (0x2 << (block & 0x06)); this->bbt[(block >> 3)] = newval; if ((oldval != newval) && td->reserved_block_code) - nand_update_bbt(mtd, block << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)block << (this->bbt_erase_shift - 1)); continue; } update = 0; @@ -931,7 +931,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) new ones have been marked, then we need to update the stored bbts. This should only happen once. */ if (update && td->reserved_block_code) - nand_update_bbt(mtd, (block - 2) << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)(block - 2) << (this->bbt_erase_shift - 1)); } } @@ -1027,7 +1027,6 @@ int nand_update_bbt(struct mtd_info *mtd, loff_t offs) if (!this->bbt || !td) return -EINVAL; - len = mtd->size >> (this->bbt_erase_shift + 2); /* Allocate a temporary buffer for one eraseblock incl. oob */ len = (1 << this->bbt_erase_shift); len += (len >> this->page_shift) * mtd->oobsize; diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 320b929abe79..d1c4546513f7 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -39,7 +39,7 @@ static void nftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct NFTLrecord *nftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index ccc4f209fbb5..8b22b1836e9f 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -51,7 +51,7 @@ static int find_boot_record(struct NFTLrecord *nftl) the mtd device accordingly. We could even get rid of nftl->EraseSize if there were any point in doing so. */ nftl->EraseSize = nftl->mbd.mtd->erasesize; - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; nftl->MediaUnit = BLOCK_NIL; nftl->SpareMediaUnit = BLOCK_NIL; @@ -168,7 +168,7 @@ device is already correct. 
printk(KERN_NOTICE "WARNING: Support for NFTL with UnitSizeFactor 0x%02x is experimental\n", mh->UnitSizeFactor); nftl->EraseSize = nftl->mbd.mtd->erasesize << (0xff - mh->UnitSizeFactor); - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; } #endif nftl->nb_boot_blocks = le16_to_cpu(mh->FirstPhysicalEUN); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 90ed319f26e6..529af271db17 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1772,7 +1772,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) int len; int ret = 0; - DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%08x, len = %i\n", (unsigned int) instr->addr, (unsigned int) instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%012llx, len = %llu\n", (unsigned long long) instr->addr, (unsigned long long) instr->len); block_size = (1 << this->erase_shift); @@ -1810,7 +1810,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) /* Check if we have a bad block, we do not erase bad blocks */ if (onenand_block_isbad_nolock(mtd, addr, 0)) { - printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%08x\n", (unsigned int) addr); + printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%012llx\n", (unsigned long long) addr); instr->state = MTD_ERASE_FAILED; goto erase_exit; } @@ -2029,7 +2029,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int * * Lock one or more blocks */ -static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2047,7 +2047,7 @@ static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) * * Unlock one or more blocks */ -static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index e538c0a72abb..490b4742ce3a 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -156,7 +156,7 @@ static int scan_header(struct partition *part) size_t retlen; sectors_per_block = part->block_size / SECTOR_SIZE; - part->total_blocks = part->mbd.mtd->size / part->block_size; + part->total_blocks = (u32)part->mbd.mtd->size / part->block_size; if (part->total_blocks < 2) return -ENOENT; @@ -276,16 +276,17 @@ static void erase_callback(struct erase_info *erase) part = (struct partition*)erase->priv; - i = erase->addr / part->block_size; - if (i >= part->total_blocks || part->blocks[i].offset != erase->addr) { - printk(KERN_ERR PREFIX "erase callback for unknown offset %x " - "on '%s'\n", erase->addr, part->mbd.mtd->name); + i = (u32)erase->addr / part->block_size; + if (i >= part->total_blocks || part->blocks[i].offset != erase->addr || + erase->addr > UINT_MAX) { + printk(KERN_ERR PREFIX "erase callback for unknown offset %llx " + "on '%s'\n", (unsigned long long)erase->addr, part->mbd.mtd->name); return; } if (erase->state != MTD_ERASE_DONE) { - printk(KERN_WARNING PREFIX "erase failed at 0x%x on '%s', " - "state %d\n", erase->addr, + printk(KERN_WARNING PREFIX "erase failed at 0x%llx on '%s', " + "state %d\n", (unsigned long long)erase->addr, part->mbd.mtd->name, erase->state); part->blocks[i].state = BLOCK_FAILED; @@ -345,9 +346,9 @@ static int erase_block(struct partition *part, int 
block) rc = part->mbd.mtd->erase(part->mbd.mtd, erase); if (rc) { - printk(KERN_ERR PREFIX "erase of region %x,%x on '%s' " - "failed\n", erase->addr, erase->len, - part->mbd.mtd->name); + printk(KERN_ERR PREFIX "erase of region %llx,%llx on '%s' " + "failed\n", (unsigned long long)erase->addr, + (unsigned long long)erase->len, part->mbd.mtd->name); kfree(erase); } @@ -763,7 +764,7 @@ static void rfd_ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) { struct partition *part; - if (mtd->type != MTD_NORFLASH) + if (mtd->type != MTD_NORFLASH || mtd->size > UINT_MAX) return; part = kzalloc(sizeof(struct partition), GFP_KERNEL); diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 33a5d6ed6f18..3f67e00d98e0 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -294,7 +294,8 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) int cis_sector; /* Check for small page NAND flash */ - if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE) + if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE || + mtd->size > UINT_MAX) return; /* Check for SSDFC format by reading CIS/IDI sector */ @@ -316,7 +317,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->cis_block = cis_sector / (mtd->erasesize >> SECTOR_SHIFT); ssfdc->erase_size = mtd->erasesize; - ssfdc->map_len = mtd->size / mtd->erasesize; + ssfdc->map_len = (u32)mtd->size / mtd->erasesize; DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: cis_block=%d,erase_size=%d,map_len=%d,n_zones=%d\n", @@ -327,7 +328,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->heads = 16; ssfdc->sectors = 32; get_chs(mtd->size, NULL, &ssfdc->heads, &ssfdc->sectors); - ssfdc->cylinders = (unsigned short)((mtd->size >> SECTOR_SHIFT) / + ssfdc->cylinders = (unsigned short)(((u32)mtd->size >> SECTOR_SHIFT) / ((long)ssfdc->sectors * (long)ssfdc->heads)); DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: using C:%d H:%d S:%d == %ld sects\n", diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index c7630a228310..634e2e86525f 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -561,7 +561,7 @@ static int io_init(struct ubi_device *ubi) */ ubi->peb_size = ubi->mtd->erasesize; - ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize; + ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c index 605812bb0b1a..6dd4f5e77f82 100644 --- a/drivers/mtd/ubi/gluebi.c +++ b/drivers/mtd/ubi/gluebi.c @@ -215,7 +215,8 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) struct ubi_volume *vol; struct ubi_device *ubi; - dbg_gen("erase %u bytes at offset %u", instr->len, instr->addr); + dbg_gen("erase %llu bytes at offset %llu", (unsigned long long)instr->len, + (unsigned long long)instr->addr); if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) return -EINVAL; @@ -223,11 +224,11 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) if (instr->len < 0 || instr->addr + instr->len > mtd->size) return -EINVAL; - if (instr->addr % mtd->writesize || instr->len % mtd->writesize) + if (mtd_mod_by_ws(instr->addr, mtd) || mtd_mod_by_ws(instr->len, mtd)) return -EINVAL; - lnum = instr->addr / mtd->erasesize; - count = instr->len / mtd->erasesize; + lnum = mtd_div_by_eb(instr->addr, mtd); + count = mtd_div_by_eb(instr->len, mtd); vol = container_of(mtd, 
struct ubi_volume, gluebi_mtd); ubi = vol->ubi; @@ -255,7 +256,7 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) out_err: instr->state = MTD_ERASE_FAILED; - instr->fail_addr = lnum * mtd->erasesize; + instr->fail_addr = (long long)lnum * mtd->erasesize; return err; } @@ -294,7 +295,7 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) * bytes. */ if (vol->vol_type == UBI_DYNAMIC_VOLUME) - mtd->size = vol->usable_leb_size * vol->reserved_pebs; + mtd->size = (long long)vol->usable_leb_size * vol->reserved_pebs; else mtd->size = vol->used_bytes; @@ -304,8 +305,8 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) return -ENFILE; } - dbg_gen("added mtd%d (\"%s\"), size %u, EB size %u", - mtd->index, mtd->name, mtd->size, mtd->erasesize); + dbg_gen("added mtd%d (\"%s\"), size %llu, EB size %u", + mtd->index, mtd->name, (unsigned long long)mtd->size, mtd->erasesize); return 0; } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910af..c32b4a1ad6cf 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { @@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { jffs2_erase_succeeded(priv->c, priv->jeb); diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eae26bb6430a..95e585ecc297 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -15,6 +15,8 @@ #include #include +#include + #define MTD_CHAR_MAJOR 90 #define MTD_BLOCK_MAJOR 31 #define MAX_MTD_DEVICES 32 @@ -25,16 +27,16 @@ #define MTD_ERASE_DONE 0x08 #define MTD_ERASE_FAILED 0x10 -#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff +#define MTD_FAIL_ADDR_UNKNOWN -1LL /* If the erase fails, fail_addr might indicate exactly which block failed. If fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not specific to any particular block. 
*/ struct erase_info { struct mtd_info *mtd; - u_int32_t addr; - u_int32_t len; - u_int32_t fail_addr; + uint64_t addr; + uint64_t len; + uint64_t fail_addr; u_long time; u_long retries; u_int dev; @@ -46,7 +48,7 @@ struct erase_info { }; struct mtd_erase_region_info { - u_int32_t offset; /* At which this region starts, from the beginning of the MTD */ + uint64_t offset; /* At which this region starts, from the beginning of the MTD */ u_int32_t erasesize; /* For this region */ u_int32_t numblocks; /* Number of blocks of erasesize in this region */ unsigned long *lockmap; /* If keeping bitmap of locks */ @@ -101,7 +103,7 @@ struct mtd_oob_ops { struct mtd_info { u_char type; u_int32_t flags; - u_int32_t size; // Total size of the MTD + uint64_t size; // Total size of the MTD /* "Major" erase size for the device. Naïve users may take this * to be the only erase size available, or may use the more detailed @@ -120,6 +122,16 @@ struct mtd_info { u_int32_t oobsize; // Amount of OOB data per block (e.g. 16) u_int32_t oobavail; // Available OOB bytes per block + /* + * If erasesize is a power of 2 then the shift is stored in + * erasesize_shift otherwise erasesize_shift is zero. Ditto writesize. + */ + unsigned int erasesize_shift; + unsigned int writesize_shift; + /* Masks based on erasesize_shift and writesize_shift */ + unsigned int erasesize_mask; + unsigned int writesize_mask; + // Kernel-only stuff starts here. const char *name; int index; @@ -190,8 +202,8 @@ struct mtd_info { void (*sync) (struct mtd_info *mtd); /* Chip-supported device locking */ - int (*lock) (struct mtd_info *mtd, loff_t ofs, size_t len); - int (*unlock) (struct mtd_info *mtd, loff_t ofs, size_t len); + int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); + int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); /* Power Management functions */ int (*suspend) (struct mtd_info *mtd); @@ -221,6 +233,35 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; +static inline u_int32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz >> mtd->erasesize_shift; + do_div(sz, mtd->erasesize); + return sz; +} + +static inline u_int32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz & mtd->erasesize_mask; + return do_div(sz, mtd->erasesize); +} + +static inline u_int32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz >> mtd->writesize_shift; + do_div(sz, mtd->writesize); + return sz; +} + +static inline u_int32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz & mtd->writesize_mask; + return do_div(sz, mtd->writesize); +} /* Kernel-side ioctl definitions */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 733d3f3b4eb8..c0677b8082be 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -399,7 +399,7 @@ struct nand_chip { int bbt_erase_shift; int chip_shift; int numchips; - unsigned long chipsize; + uint64_t chipsize; int pagemask; int pagebuf; int subpagesize; diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index c92b4d439609..164c7d78687d 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -36,8 +36,8 @@ struct mtd_partition { char *name; /* identifier string */ - u_int32_t size; /* partition size */ - u_int32_t offset; /* offset within the master MTD space */ + uint64_t size; /* partition size */ + uint64_t offset; /* offset within 
the master MTD space */ u_int32_t mask_flags; /* master MTD flags to mask out for this partition */ struct nand_ecclayout *ecclayout; /* out of band layout for this partition (NAND only)*/ struct mtd_info **mtdp; /* pointer to store the MTD object */ -- cgit v1.2.3 From 3854be7712f7b4bdcaed14664fc7c7124b3fef0d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 10 Dec 2008 14:06:42 +0000 Subject: [MTD] Remove strange u_int32_t types from FTL Signed-off-by: David Woodhouse --- drivers/mtd/ftl.c | 100 ++++++++++++++++++++++++------------------------ include/linux/mtd/ftl.h | 38 +++++++++--------- 2 files changed, 69 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 9bf581c4f740..a790c062af1f 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -109,25 +109,25 @@ module_param(shuffle_freq, int, 0); /* Each memory region corresponds to a minor device */ typedef struct partition_t { struct mtd_blktrans_dev mbd; - u_int32_t state; - u_int32_t *VirtualBlockMap; - u_int32_t *VirtualPageMap; - u_int32_t FreeTotal; + uint32_t state; + uint32_t *VirtualBlockMap; + uint32_t *VirtualPageMap; + uint32_t FreeTotal; struct eun_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int32_t Free; - u_int32_t Deleted; + uint32_t Offset; + uint32_t EraseCount; + uint32_t Free; + uint32_t Deleted; } *EUNInfo; struct xfer_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int16_t state; + uint32_t Offset; + uint32_t EraseCount; + uint16_t state; } *XferInfo; - u_int16_t bam_index; - u_int32_t *bam_cache; - u_int16_t DataUnits; - u_int32_t BlocksPerUnit; + uint16_t bam_index; + uint32_t *bam_cache; + uint16_t DataUnits; + uint32_t BlocksPerUnit; erase_unit_header_t header; } partition_t; @@ -199,8 +199,8 @@ static int scan_header(partition_t *part) static int build_maps(partition_t *part) { erase_unit_header_t header; - u_int16_t xvalid, xtrans, i; - u_int blocks, j; + uint16_t xvalid, xtrans, i; + unsigned blocks, j; int hdr_ok, ret = -1; ssize_t retval; loff_t offset; @@ -269,14 +269,14 @@ static int build_maps(partition_t *part) /* Set up virtual page map */ blocks = le32_to_cpu(header.FormattedSize) >> header.BlockSize; - part->VirtualBlockMap = vmalloc(blocks * sizeof(u_int32_t)); + part->VirtualBlockMap = vmalloc(blocks * sizeof(uint32_t)); if (!part->VirtualBlockMap) goto out_XferInfo; - memset(part->VirtualBlockMap, 0xff, blocks * sizeof(u_int32_t)); + memset(part->VirtualBlockMap, 0xff, blocks * sizeof(uint32_t)); part->BlocksPerUnit = (1 << header.EraseUnitSize) >> header.BlockSize; - part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(u_int32_t), + part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(uint32_t), GFP_KERNEL); if (!part->bam_cache) goto out_VirtualBlockMap; @@ -290,7 +290,7 @@ static int build_maps(partition_t *part) offset = part->EUNInfo[i].Offset + le32_to_cpu(header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), &retval, + part->BlocksPerUnit * sizeof(uint32_t), &retval, (unsigned char *)part->bam_cache); if (ret) @@ -332,7 +332,7 @@ out: ======================================================================*/ static int erase_xfer(partition_t *part, - u_int16_t xfernum) + uint16_t xfernum) { int ret; struct xfer_info_t *xfer; @@ -408,7 +408,7 @@ static int prepare_xfer(partition_t *part, int i) erase_unit_header_t header; struct xfer_info_t *xfer; int nbam, ret; - u_int32_t ctl; + uint32_t ctl; ssize_t retlen; loff_t offset; @@ -430,15 
+430,15 @@ static int prepare_xfer(partition_t *part, int i) } /* Write the BAM stub */ - nbam = (part->BlocksPerUnit * sizeof(u_int32_t) + + nbam = (part->BlocksPerUnit * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset) + SECTOR_SIZE - 1) / SECTOR_SIZE; offset = xfer->Offset + le32_to_cpu(part->header.BAMOffset); ctl = cpu_to_le32(BLOCK_CONTROL); - for (i = 0; i < nbam; i++, offset += sizeof(u_int32_t)) { + for (i = 0; i < nbam; i++, offset += sizeof(uint32_t)) { - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&ctl); if (ret) @@ -461,18 +461,18 @@ static int prepare_xfer(partition_t *part, int i) ======================================================================*/ -static int copy_erase_unit(partition_t *part, u_int16_t srcunit, - u_int16_t xferunit) +static int copy_erase_unit(partition_t *part, uint16_t srcunit, + uint16_t xferunit) { u_char buf[SECTOR_SIZE]; struct eun_info_t *eun; struct xfer_info_t *xfer; - u_int32_t src, dest, free, i; - u_int16_t unit; + uint32_t src, dest, free, i; + uint16_t unit; int ret; ssize_t retlen; loff_t offset; - u_int16_t srcunitswap = cpu_to_le16(srcunit); + uint16_t srcunitswap = cpu_to_le16(srcunit); eun = &part->EUNInfo[srcunit]; xfer = &part->XferInfo[xferunit]; @@ -486,7 +486,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = eun->Offset + le32_to_cpu(part->header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); /* mark the cache bad, in case we get an error later */ @@ -503,7 +503,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = xfer->Offset + 20; /* Bad! */ unit = cpu_to_le16(0x7fff); - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint16_t), &retlen, (u_char *) &unit); if (ret) { @@ -560,7 +560,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, /* All clear? 
Then update the LogicalEUN again */ - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(uint16_t), &retlen, (u_char *)&srcunitswap); if (ret) { @@ -605,8 +605,8 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, static int reclaim_block(partition_t *part) { - u_int16_t i, eun, xfer; - u_int32_t best; + uint16_t i, eun, xfer; + uint32_t best; int queued, ret; DEBUG(0, "ftl_cs: reclaiming space...\n"); @@ -723,10 +723,10 @@ static void dump_lists(partition_t *part) } #endif -static u_int32_t find_free(partition_t *part) +static uint32_t find_free(partition_t *part) { - u_int16_t stop, eun; - u_int32_t blk; + uint16_t stop, eun; + uint32_t blk; size_t retlen; int ret; @@ -749,7 +749,7 @@ static u_int32_t find_free(partition_t *part) ret = part->mbd.mtd->read(part->mbd.mtd, part->EUNInfo[eun].Offset + le32_to_cpu(part->header.BAMOffset), - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); if (ret) { @@ -786,7 +786,7 @@ static u_int32_t find_free(partition_t *part) static int ftl_read(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t log_addr, bsize; + uint32_t log_addr, bsize; u_long i; int ret; size_t offset, retlen; @@ -829,14 +829,14 @@ static int ftl_read(partition_t *part, caddr_t buffer, ======================================================================*/ -static int set_bam_entry(partition_t *part, u_int32_t log_addr, - u_int32_t virt_addr) +static int set_bam_entry(partition_t *part, uint32_t log_addr, + uint32_t virt_addr) { - u_int32_t bsize, blk, le_virt_addr; + uint32_t bsize, blk, le_virt_addr; #ifdef PSYCHO_DEBUG - u_int32_t old_addr; + uint32_t old_addr; #endif - u_int16_t eun; + uint16_t eun; int ret; size_t retlen, offset; @@ -845,11 +845,11 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, bsize = 1 << part->header.EraseUnitSize; eun = log_addr / bsize; blk = (log_addr % bsize) / SECTOR_SIZE; - offset = (part->EUNInfo[eun].Offset + blk * sizeof(u_int32_t) + + offset = (part->EUNInfo[eun].Offset + blk * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset)); #ifdef PSYCHO_DEBUG - ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&old_addr); if (ret) { printk(KERN_WARNING"ftl: Error reading old_addr in set_bam_entry: %d\n",ret); @@ -886,7 +886,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, #endif part->bam_cache[blk] = le_virt_addr; } - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&le_virt_addr); if (ret) { @@ -900,7 +900,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, static int ftl_write(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t bsize, log_addr, virt_addr, old_addr, blk; + uint32_t bsize, log_addr, virt_addr, old_addr, blk; u_long i; int ret; size_t retlen, offset; diff --git a/include/linux/mtd/ftl.h b/include/linux/mtd/ftl.h index 0be442f881dd..0555f7a0b9ed 100644 --- a/include/linux/mtd/ftl.h +++ b/include/linux/mtd/ftl.h @@ -32,25 +32,25 @@ #define _LINUX_FTL_H typedef struct erase_unit_header_t { - u_int8_t LinkTargetTuple[5]; - u_int8_t DataOrgTuple[10]; - u_int8_t NumTransferUnits; - u_int32_t EraseCount; - u_int16_t LogicalEUN; - 
u_int8_t BlockSize; - u_int8_t EraseUnitSize; - u_int16_t FirstPhysicalEUN; - u_int16_t NumEraseUnits; - u_int32_t FormattedSize; - u_int32_t FirstVMAddress; - u_int16_t NumVMPages; - u_int8_t Flags; - u_int8_t Code; - u_int32_t SerialNumber; - u_int32_t AltEUHOffset; - u_int32_t BAMOffset; - u_int8_t Reserved[12]; - u_int8_t EndTuple[2]; + uint8_t LinkTargetTuple[5]; + uint8_t DataOrgTuple[10]; + uint8_t NumTransferUnits; + uint32_t EraseCount; + uint16_t LogicalEUN; + uint8_t BlockSize; + uint8_t EraseUnitSize; + uint16_t FirstPhysicalEUN; + uint16_t NumEraseUnits; + uint32_t FormattedSize; + uint32_t FirstVMAddress; + uint16_t NumVMPages; + uint8_t Flags; + uint8_t Code; + uint32_t SerialNumber; + uint32_t AltEUHOffset; + uint32_t BAMOffset; + uint8_t Reserved[12]; + uint8_t EndTuple[2]; } erase_unit_header_t; /* Flags in erase_unit_header_t */ -- cgit v1.2.3 From 26cdb67c74aedc22367e6d0271f7f955220cca65 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 10 Dec 2008 14:08:12 +0000 Subject: [MTD] Remove more strange u_intxx_t types Signed-off-by: David Woodhouse --- drivers/mtd/mtdconcat.c | 2 +- include/linux/mtd/mtd.h | 26 +++++++++++++------------- include/linux/mtd/partitions.h | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index ff8c14bcac68..c26dd528d094 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -696,7 +696,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c int i; size_t size; struct mtd_concat *concat; - u_int32_t max_erasesize, curr_erasesize; + uint32_t max_erasesize, curr_erasesize; int num_erase_region; printk(KERN_NOTICE "Concatenating MTD devices:\n"); diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 95e585ecc297..adef674855f3 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -39,8 +39,8 @@ struct erase_info { uint64_t fail_addr; u_long time; u_long retries; - u_int dev; - u_int cell; + unsigned dev; + unsigned cell; void (*callback) (struct erase_info *self); u_long priv; u_char state; @@ -49,8 +49,8 @@ struct erase_info { struct mtd_erase_region_info { uint64_t offset; /* At which this region starts, from the beginning of the MTD */ - u_int32_t erasesize; /* For this region */ - u_int32_t numblocks; /* Number of blocks of erasesize in this region */ + uint32_t erasesize; /* For this region */ + uint32_t numblocks; /* Number of blocks of erasesize in this region */ unsigned long *lockmap; /* If keeping bitmap of locks */ }; @@ -102,14 +102,14 @@ struct mtd_oob_ops { struct mtd_info { u_char type; - u_int32_t flags; + uint32_t flags; uint64_t size; // Total size of the MTD /* "Major" erase size for the device. Naïve users may take this * to be the only erase size available, or may use the more detailed * information below if they desire */ - u_int32_t erasesize; + uint32_t erasesize; /* Minimal writable flash unit size. In case of NOR flash it is 1 (even * though individual bits can be cleared), in case of NAND flash it is * one NAND page (or half, or one-fourths of it), in case of ECC-ed NOR @@ -117,10 +117,10 @@ struct mtd_info { * Any driver registering a struct mtd_info must ensure a writesize of * 1 or larger. */ - u_int32_t writesize; + uint32_t writesize; - u_int32_t oobsize; // Amount of OOB data per block (e.g. 16) - u_int32_t oobavail; // Available OOB bytes per block + uint32_t oobsize; // Amount of OOB data per block (e.g. 
16) + uint32_t oobavail; // Available OOB bytes per block /* * If erasesize is a power of 2 then the shift is stored in @@ -233,7 +233,7 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; -static inline u_int32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) +static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) { if (mtd->erasesize_shift) return sz >> mtd->erasesize_shift; @@ -241,14 +241,14 @@ static inline u_int32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) return sz; } -static inline u_int32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd) +static inline uint32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd) { if (mtd->erasesize_shift) return sz & mtd->erasesize_mask; return do_div(sz, mtd->erasesize); } -static inline u_int32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) +static inline uint32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) { if (mtd->writesize_shift) return sz >> mtd->writesize_shift; @@ -256,7 +256,7 @@ static inline u_int32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) return sz; } -static inline u_int32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) +static inline uint32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) { if (mtd->writesize_shift) return sz & mtd->writesize_mask; diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 164c7d78687d..a45dd831b3f8 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -38,7 +38,7 @@ struct mtd_partition { char *name; /* identifier string */ uint64_t size; /* partition size */ uint64_t offset; /* offset within the master MTD space */ - u_int32_t mask_flags; /* master MTD flags to mask out for this partition */ + uint32_t mask_flags; /* master MTD flags to mask out for this partition */ struct nand_ecclayout *ecclayout; /* out of band layout for this partition (NAND only)*/ struct mtd_info **mtdp; /* pointer to store the MTD object */ }; -- cgit v1.2.3 From d3af0f048c114dd53713d5920c54f6d5b6b12139 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Dec 2008 14:23:38 -0800 Subject: [MTD] [NAND] remove excess kernel-doc notation Delete extra kernel-doc notation for struct fields and function parameters that don't exist: Warning(include/linux/mtd/nand.h:428): Excess struct/union/enum/typedef member 'wq' description in 'nand_chip' Warning(include/linux/mtd/nand.h:428): Excess struct/union/enum/typedef member 'datbuf' description in 'nand_chip' Warning(include/linux/mtd/nand.h:428): Excess struct/union/enum/typedef member 'oobbuf' description in 'nand_chip' Warning(include/linux/mtd/nand.h:428): Excess struct/union/enum/typedef member 'oobdirty' description in 'nand_chip' Warning(include/linux/mtd/nand.h:428): Excess struct/union/enum/typedef member 'data_poi' description in 'nand_chip' Warning(drivers/mtd/nand/nand_base.c:2527): Excess function parameter 'maxchips' description in 'nand_scan_tail' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 1 - include/linux/mtd/nand.h | 5 ----- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index ff2d33e4d6d6..0c3afccde8a2 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2523,7 +2523,6 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips) /** * nand_scan_tail - [NAND Interface] Scan for the NAND device * @mtd: MTD device structure - * @maxchips: Number of chips to 
scan for * * This is the second phase of the normal nand_scan() function. It * fills out all the uninitialized function pointers with the defaults diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c0677b8082be..db5b63da2a7e 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -335,17 +335,12 @@ struct nand_buffers { * @erase_cmd: [INTERN] erase command write function, selectable due to AND support * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR) - * @wq: [INTERN] wait queue to sleep on if a NAND operation is in progress * @state: [INTERN] the current state of the NAND device * @oob_poi: poison value buffer * @page_shift: [INTERN] number of address bits in a page (column address bits) * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry * @chip_shift: [INTERN] number of address bits in one chip - * @datbuf: [INTERN] internal buffer for one page + oob - * @oobbuf: [INTERN] oob buffer for one eraseblock - * @oobdirty: [INTERN] indicates that oob_buf must be reinitialized - * @data_poi: [INTERN] pointer to a data buffer * @options: [BOARDSPECIFIC] various chip options. They can partly be set to inform nand_scan about * special functionality. See the defines for further explanation * @badblockpos: [INTERN] position of the bad block marker in the oob area -- cgit v1.2.3 From 3f4b0ef7f2899c91b1d6958779f084b44dd59d32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 26 Oct 2008 20:52:15 +0100 Subject: ACPI hibernate: Add a mechanism to save/restore ACPI NVS memory According to the ACPI Specification 3.0b, Section 15.3.2, "OSPM will call the _PTS control method some time before entering a sleeping state, to allow the platform's AML code to update this memory image before entering the sleeping state. After the system awakes from an S4 state, OSPM will restore this memory area and call the _WAK control method to enable the BIOS to reclaim its memory image." For this reason, implement a mechanism allowing us to save the NVS memory during hibernation and to restore it during the subsequent resume. Based on a patch by Zhang Rui. Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Cc: Zhang Rui Signed-off-by: Len Brown --- drivers/acpi/sleep/main.c | 53 +++++++++++++++++--- include/linux/suspend.h | 13 +++++ kernel/power/swsusp.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 28a691cc625e..45a8015e4217 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -394,9 +394,25 @@ void __init acpi_no_s4_hw_signature(void) static int acpi_hibernation_begin(void) { - acpi_target_sleep_state = ACPI_STATE_S4; - acpi_sleep_tts_switch(acpi_target_sleep_state); - return 0; + int error; + + error = hibernate_nvs_alloc(); + if (!error) { + acpi_target_sleep_state = ACPI_STATE_S4; + acpi_sleep_tts_switch(acpi_target_sleep_state); + } + + return error; +} + +static int acpi_hibernation_pre_snapshot(void) +{ + int error = acpi_pm_prepare(); + + if (!error) + hibernate_nvs_save(); + + return error; } static int acpi_hibernation_enter(void) @@ -417,6 +433,12 @@ static int acpi_hibernation_enter(void) return ACPI_SUCCESS(status) ? 
0 : -EFAULT; } +static void acpi_hibernation_finish(void) +{ + hibernate_nvs_free(); + acpi_pm_finish(); +} + static void acpi_hibernation_leave(void) { /* @@ -432,6 +454,8 @@ static void acpi_hibernation_leave(void) "cannot resume!\n"); panic("ACPI S4 hardware signature mismatch"); } + /* Restore the NVS memory area */ + hibernate_nvs_restore(); } static void acpi_pm_enable_gpes(void) @@ -442,8 +466,8 @@ static void acpi_pm_enable_gpes(void) static struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_prepare, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_prepare, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -469,8 +493,21 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); + if (!error) { + error = hibernate_nvs_alloc(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + } + return error; +} + +static int acpi_hibernation_pre_snapshot_old(void) +{ + int error = acpi_pm_disable_gpes(); + if (!error) - acpi_target_sleep_state = ACPI_STATE_S4; + hibernate_nvs_save(); + return error; } @@ -481,8 +518,8 @@ static int acpi_hibernation_begin_old(void) static struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_disable_gpes, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot_old, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2ce8207686e2..2b409c44db83 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -232,6 +232,11 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct platform_hibernation_ops *ops); extern int hibernate(void); +extern int hibernate_nvs_register(unsigned long start, unsigned long size); +extern int hibernate_nvs_alloc(void); +extern void hibernate_nvs_free(void); +extern void hibernate_nvs_save(void); +extern void hibernate_nvs_restore(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} @@ -239,6 +244,14 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } +static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +static inline int hibernate_nvs_alloc(void) { return 0; } +static inline void hibernate_nvs_free(void) {} +static inline void hibernate_nvs_save(void) {} +static inline void hibernate_nvs_restore(void) {} #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_PM_SLEEP diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d89..a92c91451559 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. 
+ */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} -- cgit v1.2.3 From ba84ed9546e91348fdf3ff2bff859b0ee53b407a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 26 Oct 2008 20:56:30 +0100 Subject: ACPI hibernate: Introduce new kernel parameter acpi_sleep=s4_nonvs On some machines it may be necessary to disable the saving/restoring of the ACPI NVS memory region during hibernation/resume. For this purpose, introduce new ACPI kernel command line option acpi_sleep=s4_nonvs. Based on a patch by Zhang Rui. Signed-off-by: Rafael J. 
Wysocki Acked-by: Nigel Cunningham Acked-by: Pavel Machek Signed-off-by: Len Brown --- Documentation/kernel-parameters.txt | 5 ++++- arch/x86/kernel/acpi/sleep.c | 2 ++ drivers/acpi/sleep/main.c | 18 ++++++++++++++++-- include/linux/acpi.h | 1 + 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201ed..1d089eeff3cf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -149,7 +149,8 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering } + Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, + old_ordering, s4_nonvs } See Documentation/power/video.txt for s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. @@ -159,6 +160,8 @@ and is between 256 and 4096 characters. It is defined in the file control method, wrt putting devices into low power states, to be enforced (the ACPI 2.0 ordering of _PTS is used by default). + s4_nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during hibernation. acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 806b4e9051b4..707c1f6f95fa 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -159,6 +159,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "s4_nonvs", 8) == 0) + acpi_s4_no_nvs(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 45a8015e4217..bef41fd4c877 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -101,6 +101,19 @@ void __init acpi_old_suspend_ordering(void) * cases. */ static bool set_sci_en_on_resume; +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * and to restore them during the subsequent resume. However, it is not certain + * if this mechanism is going to work on all machines, so we allow the user to + * disable this mechanism using the 'acpi_sleep=s4_nonvs' kernel command line + * option. + */ +static bool s4_no_nvs; + +void __init acpi_s4_no_nvs(void) +{ + s4_no_nvs = true; +} /** * acpi_pm_disable_gpes - Disable the GPEs. @@ -396,7 +409,7 @@ static int acpi_hibernation_begin(void) { int error; - error = hibernate_nvs_alloc(); + error = s4_no_nvs ? 
0 : hibernate_nvs_alloc(); if (!error) { acpi_target_sleep_state = ACPI_STATE_S4; acpi_sleep_tts_switch(acpi_target_sleep_state); @@ -494,7 +507,8 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); if (!error) { - error = hibernate_nvs_alloc(); + if (!s4_no_nvs) + error = hibernate_nvs_alloc(); if (!error) acpi_target_sleep_state = ACPI_STATE_S4; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fba8051fb297..dfa0a5356c53 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -270,6 +270,7 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); +void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ -- cgit v1.2.3 From 160bbab3000dafccbe43688e48208cecf4deb879 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 23 Dec 2008 10:00:14 +0000 Subject: [MTD] struct device - replace bus_id with dev_name(), dev_set_name() Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman Signed-off-by: David Woodhouse --- drivers/mtd/devices/m25p80.c | 20 ++++++++++---------- drivers/mtd/devices/mtd_dataflash.c | 32 ++++++++++++++++---------------- drivers/mtd/maps/integrator-flash.c | 2 +- drivers/mtd/maps/ixp2000.c | 4 ++-- drivers/mtd/maps/ixp4xx.c | 2 +- drivers/mtd/maps/omap_nor.c | 2 +- drivers/mtd/maps/physmap.c | 6 +++--- drivers/mtd/maps/physmap_of.c | 4 ++-- drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/nand/fsl_upm.c | 2 +- drivers/mtd/nand/plat_nand.c | 2 +- drivers/mtd/nand/tmio_nand.c | 2 +- drivers/mtd/onenand/generic.c | 2 +- drivers/mtd/onenand/omap2.c | 2 +- drivers/mtd/ubi/build.c | 2 +- drivers/mtd/ubi/vmt.c | 4 ++-- include/linux/mtd/concat.h | 2 +- 17 files changed, 46 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 9be0229c3d30..7c3fc766dcf1 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -171,8 +171,8 @@ static int wait_till_ready(struct m25p *flash) static int erase_chip(struct m25p *flash) { DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %lldKiB\n", - flash->spi->dev.bus_id, __func__, - (long long)(flash->mtd.size >> 10)); + dev_name(&flash->spi->dev), __func__, + (long long)(flash->mtd.size >> 10)); /* Wait until finished previous write command. */ if (wait_till_ready(flash)) @@ -198,7 +198,7 @@ static int erase_chip(struct m25p *flash) static int erase_sector(struct m25p *flash, u32 offset) { DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %dKiB at 0x%08x\n", - flash->spi->dev.bus_id, __func__, + dev_name(&flash->spi->dev), __func__, flash->mtd.erasesize / 1024, offset); /* Wait until finished previous write command. 
*/ @@ -236,8 +236,8 @@ static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr) uint32_t rem; DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%llx, len %lld\n", - flash->spi->dev.bus_id, __func__, "at", - (long long)instr->addr, (long long)instr->len); + dev_name(&flash->spi->dev), __func__, "at", + (long long)instr->addr, (long long)instr->len); /* sanity checks */ if (instr->addr + instr->len > flash->mtd.size) @@ -296,7 +296,7 @@ static int m25p80_read(struct mtd_info *mtd, loff_t from, size_t len, struct spi_message m; DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %zd\n", - flash->spi->dev.bus_id, __func__, "from", + dev_name(&flash->spi->dev), __func__, "from", (u32)from, len); /* sanity checks */ @@ -368,7 +368,7 @@ static int m25p80_write(struct mtd_info *mtd, loff_t to, size_t len, struct spi_message m; DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %zd\n", - flash->spi->dev.bus_id, __func__, "to", + dev_name(&flash->spi->dev), __func__, "to", (u32)to, len); if (retlen) @@ -564,7 +564,7 @@ static struct flash_info *__devinit jedec_probe(struct spi_device *spi) tmp = spi_write_then_read(spi, &code, 1, id, 5); if (tmp < 0) { DEBUG(MTD_DEBUG_LEVEL0, "%s: error %d reading JEDEC ID\n", - spi->dev.bus_id, tmp); + dev_name(&spi->dev), tmp); return NULL; } jedec = id[0]; @@ -618,7 +618,7 @@ static int __devinit m25p_probe(struct spi_device *spi) /* unrecognized chip? */ if (i == ARRAY_SIZE(m25p_data)) { DEBUG(MTD_DEBUG_LEVEL0, "%s: unrecognized id %s\n", - spi->dev.bus_id, data->type); + dev_name(&spi->dev), data->type); info = NULL; /* recognized; is that chip really what's there? */ @@ -659,7 +659,7 @@ static int __devinit m25p_probe(struct spi_device *spi) if (data && data->name) flash->mtd.name = data->name; else - flash->mtd.name = spi->dev.bus_id; + flash->mtd.name = dev_name(&spi->dev); flash->mtd.type = MTD_NORFLASH; flash->mtd.writesize = 1; diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 68068975940b..d44f741ae229 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -129,7 +129,7 @@ static int dataflash_waitready(struct spi_device *spi) status = dataflash_status(spi); if (status < 0) { DEBUG(MTD_DEBUG_LEVEL1, "%s: status %d?\n", - spi->dev.bus_id, status); + dev_name(&spi->dev), status); status = 0; } @@ -156,8 +156,8 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) uint32_t rem; DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%llx len 0x%llx\n", - spi->dev.bus_id, (long long)instr->addr, - (long long)instr->len); + dev_name(&spi->dev), (long long)instr->addr, + (long long)instr->len); /* Sanity checks */ if (instr->addr + instr->len > mtd->size) @@ -203,7 +203,7 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) if (status < 0) { printk(KERN_ERR "%s: erase %x, err %d\n", - spi->dev.bus_id, pageaddr, status); + dev_name(&spi->dev), pageaddr, status); /* REVISIT: can retry instr->retries times; or * giveup and instr->fail_addr = instr->addr; */ @@ -245,7 +245,7 @@ static int dataflash_read(struct mtd_info *mtd, loff_t from, size_t len, int status; DEBUG(MTD_DEBUG_LEVEL2, "%s: read 0x%x..0x%x\n", - priv->spi->dev.bus_id, (unsigned)from, (unsigned)(from + len)); + dev_name(&priv->spi->dev), (unsigned)from, (unsigned)(from + len)); *retlen = 0; @@ -294,7 +294,7 @@ static int dataflash_read(struct mtd_info *mtd, loff_t from, size_t len, status = 0; } else DEBUG(MTD_DEBUG_LEVEL1, "%s: read %x..%x --> %d\n", - priv->spi->dev.bus_id, + 
dev_name(&priv->spi->dev), (unsigned)from, (unsigned)(from + len), status); return status; @@ -321,7 +321,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, uint8_t *command; DEBUG(MTD_DEBUG_LEVEL2, "%s: write 0x%x..0x%x\n", - spi->dev.bus_id, (unsigned)to, (unsigned)(to + len)); + dev_name(&spi->dev), (unsigned)to, (unsigned)(to + len)); *retlen = 0; @@ -380,7 +380,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, status = spi_sync(spi, &msg); if (status < 0) DEBUG(MTD_DEBUG_LEVEL1, "%s: xfer %u -> %d \n", - spi->dev.bus_id, addr, status); + dev_name(&spi->dev), addr, status); (void) dataflash_waitready(priv->spi); } @@ -402,7 +402,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, spi_transfer_del(x + 1); if (status < 0) DEBUG(MTD_DEBUG_LEVEL1, "%s: pgm %u/%u -> %d \n", - spi->dev.bus_id, addr, writelen, status); + dev_name(&spi->dev), addr, writelen, status); (void) dataflash_waitready(priv->spi); @@ -422,14 +422,14 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, status = spi_sync(spi, &msg); if (status < 0) DEBUG(MTD_DEBUG_LEVEL1, "%s: compare %u -> %d \n", - spi->dev.bus_id, addr, status); + dev_name(&spi->dev), addr, status); status = dataflash_waitready(priv->spi); /* Check result of the compare operation */ if (status & (1 << 6)) { printk(KERN_ERR "%s: compare page %u, err %d\n", - spi->dev.bus_id, pageaddr, status); + dev_name(&spi->dev), pageaddr, status); remaining = 0; status = -EIO; break; @@ -785,7 +785,7 @@ static struct flash_info *__devinit jedec_probe(struct spi_device *spi) tmp = spi_write_then_read(spi, &code, 1, id, 3); if (tmp < 0) { DEBUG(MTD_DEBUG_LEVEL0, "%s: error %d reading JEDEC ID\n", - spi->dev.bus_id, tmp); + dev_name(&spi->dev), tmp); return ERR_PTR(tmp); } if (id[0] != 0x1f) @@ -875,7 +875,7 @@ static int __devinit dataflash_probe(struct spi_device *spi) status = dataflash_status(spi); if (status <= 0 || status == 0xff) { DEBUG(MTD_DEBUG_LEVEL1, "%s: status error %d\n", - spi->dev.bus_id, status); + dev_name(&spi->dev), status); if (status == 0 || status == 0xff) status = -ENODEV; return status; @@ -911,13 +911,13 @@ static int __devinit dataflash_probe(struct spi_device *spi) /* obsolete AT45DB1282 not (yet?) 
supported */ default: DEBUG(MTD_DEBUG_LEVEL1, "%s: unsupported device (%x)\n", - spi->dev.bus_id, status & 0x3c); + dev_name(&spi->dev), status & 0x3c); status = -ENODEV; } if (status < 0) DEBUG(MTD_DEBUG_LEVEL1, "%s: add_dataflash --> %d\n", - spi->dev.bus_id, status); + dev_name(&spi->dev), status); return status; } @@ -927,7 +927,7 @@ static int __devexit dataflash_remove(struct spi_device *spi) struct dataflash *flash = dev_get_drvdata(&spi->dev); int status; - DEBUG(MTD_DEBUG_LEVEL1, "%s: remove\n", spi->dev.bus_id); + DEBUG(MTD_DEBUG_LEVEL1, "%s: remove\n", dev_name(&spi->dev)); if (mtd_has_partitions() && flash->partitioned) status = del_mtd_partitions(&flash->mtd); diff --git a/drivers/mtd/maps/integrator-flash.c b/drivers/mtd/maps/integrator-flash.c index 7100ee3c7b01..d2ec262666c7 100644 --- a/drivers/mtd/maps/integrator-flash.c +++ b/drivers/mtd/maps/integrator-flash.c @@ -105,7 +105,7 @@ static int armflash_probe(struct platform_device *dev) info->map.bankwidth = plat->width; info->map.phys = res->start; info->map.virt = base; - info->map.name = dev->dev.bus_id; + info->map.name = dev_name(&dev->dev); info->map.set_vpp = armflash_set_vpp; simple_map_init(&info->map); diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c index dcdb1f17577d..d76880d91bdb 100644 --- a/drivers/mtd/maps/ixp2000.c +++ b/drivers/mtd/maps/ixp2000.c @@ -188,7 +188,7 @@ static int ixp2000_flash_probe(struct platform_device *dev) */ info->map.map_priv_2 = (unsigned long) ixp_data->bank_setup; - info->map.name = dev->dev.bus_id; + info->map.name = dev_name(&dev->dev); info->map.read = ixp2000_flash_read8; info->map.write = ixp2000_flash_write8; info->map.copy_from = ixp2000_flash_copy_from; @@ -196,7 +196,7 @@ static int ixp2000_flash_probe(struct platform_device *dev) info->res = request_mem_region(dev->resource->start, dev->resource->end - dev->resource->start + 1, - dev->dev.bus_id); + dev_name(&dev->dev)); if (!info->res) { dev_err(&dev->dev, "Could not reserve memory region\n"); err = -ENOMEM; diff --git a/drivers/mtd/maps/ixp4xx.c b/drivers/mtd/maps/ixp4xx.c index 9c7a5fbd4e51..4d0be2f1503f 100644 --- a/drivers/mtd/maps/ixp4xx.c +++ b/drivers/mtd/maps/ixp4xx.c @@ -218,7 +218,7 @@ static int ixp4xx_flash_probe(struct platform_device *dev) * handle that. 
*/ info->map.bankwidth = 2; - info->map.name = dev->dev.bus_id; + info->map.name = dev_name(&dev->dev); info->map.read = ixp4xx_read16, info->map.write = ixp4xx_probe_write16, info->map.copy_from = ixp4xx_copy_from, diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c index 05f276af15da..7e50e9b1b781 100644 --- a/drivers/mtd/maps/omap_nor.c +++ b/drivers/mtd/maps/omap_nor.c @@ -101,7 +101,7 @@ static int __init omapflash_probe(struct platform_device *pdev) err = -ENOMEM; goto out_release_mem_region; } - info->map.name = pdev->dev.bus_id; + info->map.name = dev_name(&pdev->dev); info->map.phys = res->start; info->map.size = size; info->map.bankwidth = pdata->width; diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c index 58207b3b9411..d3a2acc7e9be 100644 --- a/drivers/mtd/maps/physmap.c +++ b/drivers/mtd/maps/physmap.c @@ -106,13 +106,13 @@ static int physmap_flash_probe(struct platform_device *dev) if (!devm_request_mem_region(&dev->dev, dev->resource[i].start, dev->resource[i].end - dev->resource[i].start + 1, - dev->dev.bus_id)) { + dev_name(&dev->dev))) { dev_err(&dev->dev, "Could not reserve memory region\n"); err = -ENOMEM; goto err_out; } - info->map[i].name = dev->dev.bus_id; + info->map[i].name = dev_name(&dev->dev); info->map[i].phys = dev->resource[i].start; info->map[i].size = dev->resource[i].end - dev->resource[i].start + 1; info->map[i].bankwidth = physmap_data->width; @@ -148,7 +148,7 @@ static int physmap_flash_probe(struct platform_device *dev) * We detected multiple devices. Concatenate them together. */ #ifdef CONFIG_MTD_CONCAT - info->cmtd = mtd_concat_create(info->mtd, devices_found, dev->dev.bus_id); + info->cmtd = mtd_concat_create(info->mtd, devices_found, dev_name(&dev->dev)); if (info->cmtd == NULL) err = -ENXIO; #else diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c index 5fcfec034a94..fbf0ca939d72 100644 --- a/drivers/mtd/maps/physmap_of.c +++ b/drivers/mtd/maps/physmap_of.c @@ -183,7 +183,7 @@ static int __devinit of_flash_probe(struct of_device *dev, err = -EBUSY; info->res = request_mem_region(res.start, res.end - res.start + 1, - dev->dev.bus_id); + dev_name(&dev->dev)); if (!info->res) goto err_out; @@ -194,7 +194,7 @@ static int __devinit of_flash_probe(struct of_device *dev, goto err_out; } - info->map.name = dev->dev.bus_id; + info->map.name = dev_name(&dev->dev); info->map.phys = res.start; info->map.size = res.end - res.start + 1; info->map.bankwidth = *width; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index c26dd528d094..3dbb1b38db66 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -691,7 +691,7 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs) */ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to concatenate */ int num_devs, /* number of subdevices */ - char *name) + const char *name) { /* name for the new device */ int i; size_t size; diff --git a/drivers/mtd/nand/fsl_upm.c b/drivers/mtd/nand/fsl_upm.c index a83192f80eba..7815a404a632 100644 --- a/drivers/mtd/nand/fsl_upm.c +++ b/drivers/mtd/nand/fsl_upm.c @@ -222,7 +222,7 @@ static int __devinit fun_probe(struct of_device *ofdev, fun->rnb_gpio = of_get_gpio(ofdev->node, 0); if (fun->rnb_gpio >= 0) { - ret = gpio_request(fun->rnb_gpio, ofdev->dev.bus_id); + ret = gpio_request(fun->rnb_gpio, dev_name(&ofdev->dev)); if (ret) { dev_err(&ofdev->dev, "can't request RNB gpio\n"); goto err2; diff --git a/drivers/mtd/nand/plat_nand.c 
b/drivers/mtd/nand/plat_nand.c index f674c5427b17..75f9f4874ecf 100644 --- a/drivers/mtd/nand/plat_nand.c +++ b/drivers/mtd/nand/plat_nand.c @@ -54,7 +54,7 @@ static int __init plat_nand_probe(struct platform_device *pdev) data->chip.priv = &data; data->mtd.priv = &data->chip; data->mtd.owner = THIS_MODULE; - data->mtd.name = pdev->dev.bus_id; + data->mtd.name = dev_name(&pdev->dev); data->chip.IO_ADDR_R = data->io_base; data->chip.IO_ADDR_W = data->io_base; diff --git a/drivers/mtd/nand/tmio_nand.c b/drivers/mtd/nand/tmio_nand.c index edb1e322113d..daa6a4c3b8ce 100644 --- a/drivers/mtd/nand/tmio_nand.c +++ b/drivers/mtd/nand/tmio_nand.c @@ -433,7 +433,7 @@ static int tmio_probe(struct platform_device *dev) nand_chip->chip_delay = 15; retval = request_irq(irq, &tmio_irq, - IRQF_DISABLED, dev->dev.bus_id, tmio); + IRQF_DISABLED, dev_name(&dev->dev), tmio); if (retval) { dev_err(&dev->dev, "request_irq error %d\n", retval); goto err_irq; diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c index ad81ab8e95e2..5b69e7773c6c 100644 --- a/drivers/mtd/onenand/generic.c +++ b/drivers/mtd/onenand/generic.c @@ -63,7 +63,7 @@ static int __devinit generic_onenand_probe(struct device *dev) info->onenand.mmcontrol = pdata->mmcontrol; info->onenand.irq = platform_get_irq(pdev, 0); - info->mtd.name = pdev->dev.bus_id; + info->mtd.name = dev_name(&pdev->dev); info->mtd.priv = &info->onenand; info->mtd.owner = THIS_MODULE; diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index a7e4d985f5ef..710edee790b5 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -668,7 +668,7 @@ static int __devinit omap2_onenand_probe(struct platform_device *pdev) c->onenand.base); c->pdev = pdev; - c->mtd.name = pdev->dev.bus_id; + c->mtd.name = dev_name(&pdev->dev); c->mtd.priv = &c->onenand; c->mtd.owner = THIS_MODULE; diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 634e2e86525f..84a134ead7cc 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -280,7 +280,7 @@ static int ubi_sysfs_init(struct ubi_device *ubi) ubi->dev.release = dev_release; ubi->dev.devt = ubi->cdev.dev; ubi->dev.class = ubi_class; - sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num); + dev_set_name(&ubi->dev, UBI_NAME_STR"%d", ubi->ubi_num); err = device_register(&ubi->dev); if (err) return err; diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 3531ca9a1e24..22e1d7398fce 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -329,7 +329,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) vol->dev.devt = dev; vol->dev.class = ubi_class; - sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); + dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id); err = device_register(&vol->dev); if (err) { ubi_err("cannot register device"); @@ -678,7 +678,7 @@ int ubi_add_volume(struct ubi_device *ubi, struct ubi_volume *vol) vol->dev.parent = &ubi->dev; vol->dev.devt = dev; vol->dev.class = ubi_class; - sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); + dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id); err = device_register(&vol->dev); if (err) goto out_gluebi; diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index c02f3d264ecf..e80c674daeb3 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -13,7 +13,7 @@ struct mtd_info *mtd_concat_create( struct mtd_info *subdev[], /* subdevices to concatenate */ int num_devs, 
/* number of subdevices */ - char *name); /* name for the new device */ + const char *name); /* name for the new device */ void mtd_concat_destroy(struct mtd_info *mtd); -- cgit v1.2.3 From f748bafa3ca1fb056e63afdeecacc1c68d8104df Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 8 Dec 2008 21:30:31 -0700 Subject: ACPI: PCI: move struct acpi_prt_entry declaration out of public header file The struct acpi_prt_entry is used only in pci_irq.c, so there's no need for the declaration to be public. This patch moves it into pci_irq.c. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_irq.c | 16 ++++++++++++++++ include/linux/acpi.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 9302f4bb89e2..ea003bab7ecd 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -41,6 +41,22 @@ #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_irq"); +struct acpi_prt_entry { + struct list_head node; + struct acpi_pci_id id; + u8 pin; + struct { + acpi_handle handle; + u32 index; + } link; + u32 irq; +}; + +struct acpi_prt_list { + int count; + struct list_head entries; +}; + static struct acpi_prt_list acpi_prt; static DEFINE_SPINLOCK(acpi_prt_lock); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fba8051fb297..813f937b3ab4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -131,22 +131,6 @@ extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity); */ void acpi_unregister_gsi (u32 gsi); -struct acpi_prt_entry { - struct list_head node; - struct acpi_pci_id id; - u8 pin; - struct { - acpi_handle handle; - u32 index; - } link; - u32 irq; -}; - -struct acpi_prt_list { - int count; - struct list_head entries; -}; - struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); -- cgit v1.2.3 From ea7e96e0f2277107d9ea14c3f16c86ba82b2e560 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Tue, 16 Dec 2008 16:28:17 +0800 Subject: ACPI: remove private acpica headers from driver files External driver files should not include any private acpica headers. 
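For illustration only (not part of this patch), a driver that needs ACPI services should be able to get by with the public interfaces alone; the include list below is a hedged sketch, not a prescription:

	/* sketch: stick to the public ACPI headers */
	#include <linux/acpi.h>
	#include <acpi/acpi_bus.h>
	#include <acpi/acpi_drivers.h>
	/* and never "acpi/acglobal.h" or other ACPICA-internal headers */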
Signed-off-by: Lin Ming Signed-off-by: Len Brown --- arch/ia64/include/asm/acpi-ext.h | 1 - arch/ia64/include/asm/sn/acpi.h | 2 -- arch/ia64/sn/kernel/io_acpi_init.c | 1 - arch/ia64/sn/kernel/io_common.c | 1 - drivers/acpi/cm_sbs.c | 3 --- drivers/acpi/debug.c | 1 - drivers/acpi/ec.c | 1 - drivers/acpi/numa.c | 1 - drivers/acpi/sbshc.c | 1 - drivers/acpi/scan.c | 1 - drivers/acpi/sleep/wakeup.c | 1 - drivers/ata/libata-acpi.c | 6 ------ drivers/ata/pata_acpi.c | 6 ------ drivers/char/tpm/tpm_bios.c | 2 -- drivers/ide/ide-acpi.c | 6 ------ drivers/misc/tc1100-wmi.c | 1 - drivers/misc/thinkpad_acpi.c | 1 - drivers/pci/hotplug/acpi_pcihp.c | 1 - drivers/pci/hotplug/pciehp.h | 1 - drivers/pci/pci-acpi.c | 2 -- drivers/pnp/pnpacpi/core.c | 1 - include/linux/pci_hotplug.h | 1 - 22 files changed, 42 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/acpi-ext.h b/arch/ia64/include/asm/acpi-ext.h index 734d137dda6e..7f8362b379eb 100644 --- a/arch/ia64/include/asm/acpi-ext.h +++ b/arch/ia64/include/asm/acpi-ext.h @@ -14,7 +14,6 @@ #define _ASM_IA64_ACPI_EXT_H #include -#include extern acpi_status hp_acpi_csr_space (acpi_handle, u64 *base, u64 *length); diff --git a/arch/ia64/include/asm/sn/acpi.h b/arch/ia64/include/asm/sn/acpi.h index 9ce2801cbd57..fd480db25565 100644 --- a/arch/ia64/include/asm/sn/acpi.h +++ b/arch/ia64/include/asm/sn/acpi.h @@ -9,8 +9,6 @@ #ifndef _ASM_IA64_SN_ACPI_H #define _ASM_IA64_SN_ACPI_H -#include "acpi/acglobal.h" - extern int sn_acpi_rev; #define SN_ACPI_BASE_SUPPORT() (sn_acpi_rev >= 0x20101) diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index 4c8bc8eee5ad..c5a214026a77 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -13,7 +13,6 @@ #include #include "xtalk/hubdev.h" #include -#include /* diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index e1917ede0d60..0d4ffa4da1da 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -26,7 +26,6 @@ #include #include #include -#include "acpi/acglobal.h" extern void sn_init_cpei_timer(void); extern void register_sn_procfs(void); diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c index 307963bd1043..332fe4b21708 100644 --- a/drivers/acpi/cm_sbs.c +++ b/drivers/acpi/cm_sbs.c @@ -27,9 +27,6 @@ #include #include #include -#include -#include -#include ACPI_MODULE_NAME("cm_sbs"); #define ACPI_AC_CLASS "ac_adapter" diff --git a/drivers/acpi/debug.c b/drivers/acpi/debug.c index c48396892008..20223cbd0d1c 100644 --- a/drivers/acpi/debug.c +++ b/drivers/acpi/debug.c @@ -9,7 +9,6 @@ #include #include #include -#include #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("debug"); diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 2cbc2c9c07ac..3105e0410e9b 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -42,7 +42,6 @@ #include #include #include -#include #define ACPI_EC_CLASS "embedded_controller" #define ACPI_EC_DEVICE_NAME "Embedded Controller" diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 25ceae9191ef..c5e292aab0e3 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -29,7 +29,6 @@ #include #include #include -#include #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index e53e590252c0..0619734895b2 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git 
a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 39b7233c3485..c54d7b6c4066 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -10,7 +10,6 @@ #include #include -#include /* for acpi_ex_eisa_id_to_string() */ #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("scan"); diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c index db325c28a1fb..2d34806d45dd 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/sleep/wakeup.c @@ -8,7 +8,6 @@ #include #include #include -#include #include "sleep.h" #define _COMPONENT ACPI_SYSTEM_COMPONENT diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index c012307d0ba6..246987f0b88c 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -19,12 +19,6 @@ #include "libata.h" #include -#include -#include -#include -#include -#include -#include enum { ATA_ACPI_FILTER_SETXFER = 1 << 0, diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c index e2e332d8ff95..8b77a9802df1 100644 --- a/drivers/ata/pata_acpi.c +++ b/drivers/ata/pata_acpi.c @@ -13,12 +13,6 @@ #include #include #include -#include -#include -#include -#include -#include -#include #include #include diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c index 68f052b42ed7..ed306eb1057f 100644 --- a/drivers/char/tpm/tpm_bios.c +++ b/drivers/char/tpm/tpm_bios.c @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include "tpm.h" #define TCG_EVENT_NAME_LEN_MAX 255 diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 244a8a052ce8..9e8d52a4f306 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -18,12 +18,6 @@ #include #include -#include -#include -#include -#include -#include -#include #define REGS_PER_GTF 7 struct taskfile_array { diff --git a/drivers/misc/tc1100-wmi.c b/drivers/misc/tc1100-wmi.c index f25e4c974dcf..b4a4aa9ee482 100644 --- a/drivers/misc/tc1100-wmi.c +++ b/drivers/misc/tc1100-wmi.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/thinkpad_acpi.c b/drivers/misc/thinkpad_acpi.c index 899766e16fa8..3478453eba7a 100644 --- a/drivers/misc/thinkpad_acpi.c +++ b/drivers/misc/thinkpad_acpi.c @@ -76,7 +76,6 @@ #include #include -#include #include diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index e17ef54f0efc..2c981cbb0719 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -33,7 +33,6 @@ #include #include #include -#include #define MY_NAME "acpi_pcihp" diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index b2801a7ee37f..7072952ea1d2 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -217,7 +217,6 @@ struct hpc_ops { #ifdef CONFIG_ACPI #include #include -#include #include static inline int pciehp_get_hp_hw_control_from_firmware(struct pci_dev *dev) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index ae5ec76dca77..9d976d51d406 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include #include diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 383e47c392a4..2834846a185d 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -23,7 +23,6 @@ #include #include #include -#include #include "../base.h" #include "pnpacpi.h" diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index a00bd1a0f156..c2d1a7d1886a 100644 --- 
a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -223,7 +223,6 @@ struct hotplug_params { #ifdef CONFIG_ACPI #include #include -#include extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, struct hotplug_params *hpp); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags); -- cgit v1.2.3 From 56c451f4b583ccdf80c9e676179c9cb49de86745 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 18 Dec 2008 14:49:37 +0900 Subject: [SCSI] block: fix the partial mappings with struct rq_map_data This fixes bio_copy_user_iov to properly handle the partial mappings with struct rq_map_data (which only sg uses for now but st and osst will shortly). It adds the offset member to struct rq_map_data and changes blk_rq_map_user to update it so that bio_copy_user_iov can add an appropriate page frame via bio_add_pc_page(). Signed-off-by: FUJITA Tomonori Acked-by: Jens Axboe Signed-off-by: James Bottomley --- block/blk-map.c | 3 +++ drivers/scsi/sg.c | 1 + fs/bio.c | 12 +++++++++--- include/linux/blkdev.h | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index 2990447f45e9..c7e55b23a2bc 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -150,6 +150,9 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, bio = rq->bio; bytes_read += ret; ubuf += ret; + + if (map_data) + map_data->offset += ret; } if (!bio_flagged(bio, BIO_USER_MAPPED)) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 5103855242ae..7d0b3d9ee43b 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1669,6 +1669,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) md->pages = req_schp->pages; md->page_order = req_schp->page_order; md->nr_entries = req_schp->k_use_sg; + md->offset = 0; } if (iov_count) diff --git a/fs/bio.c b/fs/bio.c index 356e7423b923..13be075806b6 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; for (i = 0; i < iov_count; i++) { unsigned long uaddr; @@ -814,12 +815,16 @@ struct bio *bio_copy_user_iov(struct request_queue *q, bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; - i = 0; - if (map_data) + + if (map_data) { nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } while (len) { unsigned int bytes = PAGE_SIZE; + bytes -= offset; + if (bytes > len) bytes = len; @@ -841,10 +846,11 @@ struct bio *bio_copy_user_iov(struct request_queue *q, } } - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) break; len -= bytes; + offset = 0; } if (ret) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7035cec583b6..811e5342c452 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -690,6 +690,7 @@ struct rq_map_data { struct page **pages; int page_order; int nr_entries; + unsigned long offset; }; struct req_iterator { -- cgit v1.2.3 From 97ae77a1cd332c7b011d71315c8faabce6840c72 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 18 Dec 2008 14:49:38 +0900 Subject: [SCSI] block: make blk_rq_map_user take a NULL user-space buffer for WRITE The commit 818827669d85b84241696ffef2de485db46b0b5e (block: make blk_rq_map_user take a NULL user-space buffer) extended blk_rq_map_user to accept a NULL user-space buffer with a READ command. 
It was necessary to convert sg to use the block layer mapping API. This patch extends blk_rq_map_user again for a WRITE command. It is necessary to convert st and osst drivers to use the block layer mapping API. Signed-off-by: FUJITA Tomonori Acked-by: Jens Axboe Signed-off-by: James Bottomley --- block/blk-map.c | 16 +++++++--------- drivers/scsi/sg.c | 1 + fs/bio.c | 2 +- include/linux/blkdev.h | 1 + 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index c7e55b23a2bc..f103729b462f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -42,7 +42,7 @@ static int __blk_rq_unmap_user(struct bio *bio) static int __blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, - unsigned int len, int null_mapped, gfp_t gfp_mask) + unsigned int len, gfp_t gfp_mask) { unsigned long uaddr; struct bio *bio, *orig_bio; @@ -63,7 +63,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); - if (null_mapped) + if (map_data && map_data->null_mapped) bio->bi_flags |= (1 << BIO_NULL_MAPPED); orig_bio = bio; @@ -114,17 +114,15 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, { unsigned long bytes_read = 0; struct bio *bio = NULL; - int ret, null_mapped = 0; + int ret; if (len > (q->max_hw_sectors << 9)) return -EINVAL; if (!len) return -EINVAL; - if (!ubuf) { - if (!map_data || rq_data_dir(rq) != READ) - return -EINVAL; - null_mapped = 1; - } + + if (!ubuf && (!map_data || !map_data->null_mapped)) + return -EINVAL; while (bytes_read != len) { unsigned long map_len, end, start; @@ -143,7 +141,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, map_len -= PAGE_SIZE; ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, - null_mapped, gfp_mask); + gfp_mask); if (ret < 0) goto unmap_rq; if (!bio) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 7d0b3d9ee43b..8f0bd3f7a59f 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1670,6 +1670,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) md->page_order = req_schp->page_order; md->nr_entries = req_schp->k_use_sg; md->offset = 0; + md->null_mapped = hp->dxferp ? 0 : 1; } if (iov_count) diff --git a/fs/bio.c b/fs/bio.c index 13be075806b6..062299acbccd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -859,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (!write_to_vm) { + if (!write_to_vm && (!map_data || !map_data->null_mapped)) { ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 811e5342c452..044467ef7b11 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -691,6 +691,7 @@ struct rq_map_data { int page_order; int nr_entries; unsigned long offset; + int null_mapped; }; struct req_iterator { -- cgit v1.2.3 From 87d8fe1ee6b8d2f95076142d58c440dba4e7bdc2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 09:47:09 -0500 Subject: add releasepage hooks to block devices which can be used by file systems Implement blkdev_releasepage() to release the buffer_heads and pages after we release private data belonging to a mounted filesystem.
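As an illustrative sketch (the myfs_* names are hypothetical, not taken from this patch), a filesystem that holds private references to a device page's buffers could wire up the new hook roughly like this:

	/* sketch: let the block device ask the filesystem to free a page */
	static int myfs_bdev_try_to_free_page(struct super_block *sb,
					      struct page *page, gfp_t wait)
	{
		/* drop fs-private references to the page's buffers here,
		 * then fall back to the generic helper */
		return try_to_free_buffers(page);
	}

	static const struct super_operations myfs_sops = {
		/* ... other super_operations ... */
		.bdev_try_to_free_page	= myfs_bdev_try_to_free_page,
	};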
Cc: Toshiyuki Okajima Cc: linux-fsdevel@vger.kernel.org Signed-off-by: "Theodore Ts'o" --- fs/block_dev.c | 15 +++++++++++++++ fs/super.c | 2 ++ include/linux/fs.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 349a26c10001..1dd07e66e98a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1220,6 +1220,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, @@ -1227,6 +1241,7 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = generic_writepages, + .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, }; diff --git a/fs/super.c b/fs/super.c index ddba069d7a99..d5fd4498548a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,6 +800,7 @@ int get_sb_bdev(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; } return simple_set_mnt(mnt, s); @@ -819,6 +820,7 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; + bdev->bd_super = 0; generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_exclusive(bdev, mode); diff --git a/include/linux/fs.h b/include/linux/fs.h index f2a3010140e3..0f54ae0f0ccd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -565,6 +565,7 @@ struct address_space { struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ struct inode * bd_inode; /* will die */ + struct super_block * bd_super; int bd_openers; struct mutex bd_mutex; /* open/close mutex */ struct semaphore bd_mount_sem; @@ -1385,6 +1386,7 @@ struct super_operations { ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); }; /* -- cgit v1.2.3 From c31910672376dfb8d020e32afa7249763bcd924a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 11:14:25 -0500 Subject: ext4: Remove code to create the journal inode This code has been obsolete for quite some time, since the supported method for adding a journal inode is to use tune2fs (or to create a new filesystem with a journal via mke2fs or mkfs.ext4).
Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 4 --- fs/ext4/super.c | 68 +++-------------------------------- fs/jbd2/journal.c | 72 -------------------------------------- include/linux/jbd2.h | 1 - 4 files changed, 4 insertions(+), 141 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index e3fcbea3ec8c..9ec29d86ff8b 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -149,10 +149,6 @@ journal_async_commit Commit block can be written to disk without waiting journal=update Update the ext4 file system's journal to the current format. -journal=inum When a journal already exists, this option is ignored. - Otherwise, it specifies the number of the inode which - will represent the ext4 file system's journal file. - journal_dev=devnum When the external journal device's major/minor numbers have changed, this option allows the user to specify the new journal location. The journal device is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5ab520724da..8036392b2121 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, @@ -1006,7 +1004,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -1048,7 +1046,6 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1102,7 +1099,7 @@ static ext4_fsblk_t get_sb_block(void **data) } static int parse_options(char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, + unsigned long *journal_devnum, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1226,16 +1223,6 @@ static int parse_options(char *options, struct super_block *sb, } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; case Opt_journal_dev: if (is_remount) { printk(KERN_ERR "EXT4-fs: cannot specify " @@ -2035,7 +2022,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; @@ -2155,8 +2141,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) 
set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2460,9 +2445,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } } - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { printk(KERN_ERR "EXT4-fs: required journal recovery " @@ -2926,48 +2908,6 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; - } - - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - - return 0; -} - static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { @@ -3209,7 +3149,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 34ef98057202..b10d7283ba5b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -66,7 +66,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); -EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_abort); @@ -1162,77 +1161,6 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -/** - * int jbd2_journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int jbd2_journal_create(journal_t *journal) -{ - unsigned long long blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! 
- */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = jbd2_journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JBD2_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - /** * void jbd2_journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9d82084a1605..adef1c9940d3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1104,7 +1104,6 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); -extern int jbd2_journal_create (journal_t *); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); -- cgit v1.2.3 From 922ab535bbe73975ce62f71ab9bf8ec9bce71c29 Mon Sep 17 00:00:00 2001 From: Alexey Korolev Date: Tue, 16 Dec 2008 18:13:58 +0000 Subject: [MTD] LPDDR QINFO records definitions This patch contains declarations of the structures and macro definitions necessary for operations with QINFO.
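As a hedged usage sketch (not code from this patch; it assumes a struct map_info *map whose fldrv_priv already points at a populated lpddr_private), a chip driver might derive device geometry from these records like so:

	/* sketch: compute geometry from a populated struct qinfo_chip */
	struct lpddr_private *lpddr = map->fldrv_priv;
	unsigned long devsize = 1UL << lpddr->qinfo->DevSizeShift;
	unsigned long erasesize = 1UL << lpddr->qinfo->UniformBlockSizeShift;
	int nblocks = lpddr->qinfo->TotalBlocksNum;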
Signed-off-by: Alexey Korolev Acked-by: Jared Hulbert Signed-off-by: David Woodhouse --- include/linux/mtd/qinfo.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 include/linux/mtd/qinfo.h (limited to 'include/linux') diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h new file mode 100644 index 000000000000..7b3d487d8b3f --- /dev/null +++ b/include/linux/mtd/qinfo.h @@ -0,0 +1,91 @@ +#ifndef __LINUX_MTD_QINFO_H +#define __LINUX_MTD_QINFO_H + +#include +#include +#include +#include +#include +#include +#include + +/* lpddr_private describes lpddr flash chip in memory map + * @ManufactId - Chip Manufacture ID + * @DevId - Chip Device ID + * @qinfo - pointer to qinfo records describing the chip + * @numchips - number of chips including virtual RWW partitions + * @chipshift - Chip/partition size 2^chipshift + * @chips - per-chip data structure + */ +struct lpddr_private { + uint16_t ManufactId; + uint16_t DevId; + struct qinfo_chip *qinfo; + int numchips; + unsigned long chipshift; + struct flchip chips[0]; +}; + +/* qinfo_query_info structure contains request information for + * each qinfo record + * @major - major number of qinfo record + * @minor - minor number of qinfo record + * @id_str - descriptive string to access the record + * @desc - detailed description for the qinfo record + */ +struct qinfo_query_info { + uint8_t major; + uint8_t minor; + char *id_str; + char *desc; +}; + +/* + * qinfo_chip structure contains necessary qinfo records data + * @DevSizeShift - Device size 2^n bytes + * @BufSizeShift - Program buffer size 2^n bytes + * @TotalBlocksNum - Total number of blocks + * @UniformBlockSizeShift - Uniform block size 2^UniformBlockSizeShift bytes + * @HWPartsNum - Number of hardware partitions + * @SuspEraseSupp - Suspend erase supported + * @SingleWordProgTime - Single word program 2^SingleWordProgTime u-sec + * @ProgBufferTime - Program buffer write 2^ProgBufferTime u-sec + * @BlockEraseTime - Block erase 2^BlockEraseTime m-sec + */ +struct qinfo_chip { + /* General device info */ + uint16_t DevSizeShift; + uint16_t BufSizeShift; + /* Erase block information */ + uint16_t TotalBlocksNum; + uint16_t UniformBlockSizeShift; + /* Partition information */ + uint16_t HWPartsNum; + /* Optional features */ + uint16_t SuspEraseSupp; + /* Operation typical time */ + uint16_t SingleWordProgTime; + uint16_t ProgBufferTime; + uint16_t BlockEraseTime; +}; + +/* defines for fixup usage */ +#define LPDDR_MFR_ANY 0xffff +#define LPDDR_ID_ANY 0xffff +#define NUMONYX_MFGR_ID 0x0089 +#define R18_DEVICE_ID_1G 0x893c + +static inline map_word lpddr_build_cmd(u_long cmd, struct map_info *map) +{ + map_word val = { {0} }; + val.x[0] = cmd; + return val; +} + +#define CMD(x) lpddr_build_cmd(x, map) +#define CMDVAL(cmd) cmd.x[0] + +struct mtd_info *lpddr_cmdset(struct map_info *); + +#endif + -- cgit v1.2.3 From eb3db27507f74b99241abfa11824d8b6d92b84ef Mon Sep 17 00:00:00 2001 From: Alexey Korolev Date: Tue, 16 Dec 2008 18:15:33 +0000 Subject: [MTD] LPDDR PFOW definition LPDDR chips use the PFOW window for sending commands, reading status and requesting capabilities. This pfow.h contains definitions for the PFOW window fields, possible commands, error flags and some common macro functions to avoid code duplication.
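As a rough usage sketch (assuming the helpers defined in the header below; block_adr is a hypothetical block-aligned address and the ready-wait loop is elided), a block erase issued through the PFOW window could look like:

	/* sketch: kick off a block erase, then decode any DSR error bits */
	map_word dsr;

	send_pfow_command(map, LPDDR_BLOCK_ERASE, block_adr, 0, NULL);
	/* ... poll PFOW_DSR until DSR_READY_STATUS is set ... */
	dsr = map_read(map, map->pfow_base + PFOW_DSR);
	if (CMDVAL(dsr) & DSR_ERR)
		print_drs_error(CMDVAL(dsr));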
Signed-off-by: Alexey Korolev Acked-by: Jared Hulbert Signed-off-by: David Woodhouse --- include/linux/mtd/pfow.h | 159 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 include/linux/mtd/pfow.h (limited to 'include/linux') diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h new file mode 100644 index 000000000000..b730d4f84655 --- /dev/null +++ b/include/linux/mtd/pfow.h @@ -0,0 +1,159 @@ +/* Primary function overlay window definitions + * and service functions used by LPDDR chips + */ +#ifndef __LINUX_MTD_PFOW_H +#define __LINUX_MTD_PFOW_H + +#include + +/* PFOW registers addressing */ +/* Address of symbol "P" */ +#define PFOW_QUERY_STRING_P 0x0000 +/* Address of symbol "F" */ +#define PFOW_QUERY_STRING_F 0x0002 +/* Address of symbol "O" */ +#define PFOW_QUERY_STRING_O 0x0004 +/* Address of symbol "W" */ +#define PFOW_QUERY_STRING_W 0x0006 +/* Identification info for LPDDR chip */ +#define PFOW_MANUFACTURER_ID 0x0020 +#define PFOW_DEVICE_ID 0x0022 +/* Address in PFOW where prog buffer can be found */ +#define PFOW_PROGRAM_BUFFER_OFFSET 0x0040 +/* Size of program buffer in words */ +#define PFOW_PROGRAM_BUFFER_SIZE 0x0042 +/* Address command code register */ +#define PFOW_COMMAND_CODE 0x0080 +/* command data register */ +#define PFOW_COMMAND_DATA 0x0084 +/* command address register lower address bits */ +#define PFOW_COMMAND_ADDRESS_L 0x0088 +/* command address register upper address bits */ +#define PFOW_COMMAND_ADDRESS_H 0x008a +/* number of bytes to be programmed lower address bits */ +#define PFOW_DATA_COUNT_L 0x0090 +/* number of bytes to be programmed higher address bits */ +#define PFOW_DATA_COUNT_H 0x0092 +/* command execution register, the only possible value is 0x01 */ +#define PFOW_COMMAND_EXECUTE 0x00c0 +/* 0x01 should be written at this address to clear buffer */ +#define PFOW_CLEAR_PROGRAM_BUFFER 0x00c4 +/* device program/erase suspend register */ +#define PFOW_PROGRAM_ERASE_SUSPEND 0x00c8 +/* device status register */ +#define PFOW_DSR 0x00cc + +/* LPDDR memory device command codes */ +/* They are possible values of PFOW command code register */ +#define LPDDR_WORD_PROGRAM 0x0041 +#define LPDDR_BUFF_PROGRAM 0x00E9 +#define LPDDR_BLOCK_ERASE 0x0020 +#define LPDDR_LOCK_BLOCK 0x0061 +#define LPDDR_UNLOCK_BLOCK 0x0062 +#define LPDDR_READ_BLOCK_LOCK_STATUS 0x0065 +#define LPDDR_INFO_QUERY 0x0098 +#define LPDDR_READ_OTP 0x0097 +#define LPDDR_PROG_OTP 0x00C0 +#define LPDDR_RESUME 0x00D0 + +/* Defines possible value of PFOW command execution register */ +#define LPDDR_START_EXECUTION 0x0001 + +/* Defines possible value of PFOW program/erase suspend register */ +#define LPDDR_SUSPEND 0x0001 + +/* Possible values of PFOW device status register */ +/* access R - read; RC read & clearable */ +#define DSR_DPS (1<<1) /* RC; device protect status + * 0 - not protected 1 - locked */ +#define DSR_PSS (1<<2) /* R; program suspend status; + * 0-prog in progress/completed, + * 1- prog suspended */ +#define DSR_VPPS (1<<3) /* RC; 0-Vpp OK, * 1-Vpp low */ +#define DSR_PROGRAM_STATUS (1<<4) /* RC; 0-successful, 1-error */ +#define DSR_ERASE_STATUS (1<<5) /* RC; erase or blank check status; + * 0-success erase/blank check, + * 1 blank check error */ +#define DSR_ESS (1<<6) /* R; erase suspend status; + * 0-erase in progress/complete, + * 1 erase suspended */ +#define DSR_READY_STATUS (1<<7) /* R; Device status + * 0-busy, + * 1-ready */ +#define DSR_RPS (0x3<<8) /* RC; region program status + * 00 - Success, + * 01-re-program 
attempt in region with + * object mode data, + * 10-object mode program w attempt in + * region with control mode data + * 11-attempt to program invalid half + * with 0x41 command */ +#define DSR_AOS (1<<12) /* RC; 1- AO related failure */ +#define DSR_AVAILABLE (1<<15) /* R; Device availability + * 1 - Device available + * 0 - not available */ + +/* The superset of all possible error bits in DSR */ +#define DSR_ERR 0x133A + +static inline void send_pfow_command(struct map_info *map, + unsigned long cmd_code, unsigned long adr, + unsigned long len, map_word *datum) +{ + int bits_per_chip = map_bankwidth(map) * 8; + int chipnum; + struct lpddr_private *lpddr = map->fldrv_priv; + chipnum = adr >> lpddr->chipshift; + + map_write(map, CMD(cmd_code), map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(adr & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_COMMAND_ADDRESS_L); + map_write(map, CMD(adr>>bits_per_chip), + map->pfow_base + PFOW_COMMAND_ADDRESS_H); + if (len) { + map_write(map, CMD(len & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_DATA_COUNT_L); + map_write(map, CMD(len>>bits_per_chip), + map->pfow_base + PFOW_DATA_COUNT_H); + } + if (datum) + map_write(map, *datum, map->pfow_base + PFOW_COMMAND_DATA); + + /* Command execution start */ + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); +} + +static inline void print_drs_error(unsigned dsr) +{ + int prog_status = (dsr & DSR_RPS) >> 8; + + if (!(dsr & DSR_AVAILABLE)) + printk(KERN_NOTICE"DSR.15: (0) Device not Available\n"); + if (prog_status & 0x03) + printk(KERN_NOTICE"DSR.9,8: (11) Attempt to program invalid " + "half with 41h command\n"); + else if (prog_status & 0x02) + printk(KERN_NOTICE"DSR.9,8: (10) Object Mode Program attempt " + "in region with Control Mode data\n"); + else if (prog_status & 0x01) + printk(KERN_NOTICE"DSR.9,8: (01) Program attempt in region " + "with Object Mode data\n"); + if (!(dsr & DSR_READY_STATUS)) + printk(KERN_NOTICE"DSR.7: (0) Device is Busy\n"); + if (dsr & DSR_ESS) + printk(KERN_NOTICE"DSR.6: (1) Erase Suspended\n"); + if (dsr & DSR_ERASE_STATUS) + printk(KERN_NOTICE"DSR.5: (1) Erase/Blank check error\n"); + if (dsr & DSR_PROGRAM_STATUS) + printk(KERN_NOTICE"DSR.4: (1) Program Error\n"); + if (dsr & DSR_VPPS) + printk(KERN_NOTICE"DSR.3: (1) Vpp low detect, operation " + "aborted\n"); + if (dsr & DSR_PSS) + printk(KERN_NOTICE"DSR.2: (1) Program suspended\n"); + if (dsr & DSR_DPS) + printk(KERN_NOTICE"DSR.1: (1) Aborted Erase/Program attempt " + "on locked block\n"); +} +#endif /* __LINUX_MTD_PFOW_H */ -- cgit v1.2.3 From d13e51e747fee301b404dffcf4a7e1bdc558969b Mon Sep 17 00:00:00 2001 From: Alexey Korolev Date: Tue, 16 Dec 2008 18:21:10 +0000 Subject: [MTD] LPDDR added new pfow_base parameter We need to supply an additional parameter to the mapping driver to tell LPDDR drivers where the PFOW window is in the chip mapping. This requires extending the map_info structure. Signed-off-by: Alexey Korolev Acked-by: Jared Hulbert Signed-off-by: David Woodhouse --- include/linux/mtd/map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index aa30244492c6..b981b8772217 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -223,6 +223,7 @@ struct map_info { must leave it enabled. 
*/ void (*set_vpp)(struct map_info *, int); + unsigned long pfow_base; unsigned long map_priv_1; unsigned long map_priv_2; void *fldrv_priv; -- cgit v1.2.3 From d81408304b06a71c28417445202af9cd6673168d Mon Sep 17 00:00:00 2001 From: Alexey Korolev Date: Tue, 16 Dec 2008 18:22:39 +0000 Subject: [MTD] LPDDR extended physmap driver to support LPDDR flash Physmap is a generic map driver for different platforms and flash types. We added support for LPDDR to physmap. All changes here are related to the introduction of the new pfow_base parameter. This parameter is valid for LPDDR chips only. Signed-off-by: Alexey Korolev Acked-by: Jared Hulbert Signed-off-by: David Woodhouse --- drivers/mtd/maps/Kconfig | 4 ++-- drivers/mtd/maps/physmap.c | 8 +++++++- include/linux/mtd/physmap.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 3788a548336c..0225cbbf22de 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -10,8 +10,8 @@ config MTD_COMPLEX_MAPPINGS paged mappings of flash chips. config MTD_PHYSMAP - tristate "CFI Flash device in physical memory map" - depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM + tristate "Flash device in physical memory map" + depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM || MTD_LPDDR help This provides a 'mapping' driver which allows the NOR Flash and ROM driver code to communicate with chips which are mapped diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c index d3a2acc7e9be..87743661d48e 100644 --- a/drivers/mtd/maps/physmap.c +++ b/drivers/mtd/maps/physmap.c @@ -68,7 +68,12 @@ static int physmap_flash_remove(struct platform_device *dev) return 0; } -static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL }; +static const char *rom_probe_types[] = { + "cfi_probe", + "jedec_probe", + "qinfo_probe", + "map_rom", + NULL }; #ifdef CONFIG_MTD_PARTITIONS static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL }; #endif @@ -117,6 +122,7 @@ static int physmap_flash_probe(struct platform_device *dev) info->map[i].size = dev->resource[i].end - dev->resource[i].start + 1; info->map[i].bankwidth = physmap_data->width; info->map[i].set_vpp = physmap_data->set_vpp; + info->map[i].pfow_base = physmap_data->pfow_base; info->map[i].virt = devm_ioremap(&dev->dev, info->map[i].phys, info->map[i].size); diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h index c8e63a5ee72e..76f7cabf07d3 100644 --- a/include/linux/mtd/physmap.h +++ b/include/linux/mtd/physmap.h @@ -24,6 +24,7 @@ struct physmap_flash_data { unsigned int width; void (*set_vpp)(struct map_info *, int); unsigned int nr_parts; + unsigned int pfow_base; struct mtd_partition *parts; }; -- cgit v1.2.3 From 07f2211e4fbce6990722d78c4f04225da9c0e9cf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Jan 2009 17:14:31 -0700 Subject: dmaengine: remove dependency on async_tx async_tx.ko is a consumer of dma channels. A circular dependency arises if modules in drivers/dma rely on common code in async_tx.ko. It prevents either module from being unloaded. Move dma_wait_for_async_tx and async_tx_run_dependencies to dmaengine.o where they should have been from the beginning. 
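As an illustrative sketch (the mydrv_* names are hypothetical), a DMA driver's descriptor cleanup path now calls the core helper directly, mirroring what the iop-adma and mv_xor hunks below do:

	/* sketch: cleanup kicks off dependent operations via the dma core */
	static void mydrv_clean_completed_desc(struct mydrv_desc *desc)
	{
		/* ... reclaim the descriptor slot ... */
		dma_run_dependencies(&desc->async_tx); /* was async_tx_run_dependencies() */
	}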
Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 75 ----------------------------------------- drivers/dma/Kconfig | 2 -- drivers/dma/dmaengine.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/dma/iop-adma.c | 3 +- drivers/dma/mv_xor.c | 3 +- include/linux/async_tx.h | 15 --------- include/linux/dmaengine.h | 9 +++++ 7 files changed, 95 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index dcbf1be149f3..8cfac182165d 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -72,81 +72,6 @@ void async_tx_issue_pending_all(void) } EXPORT_SYMBOL_GPL(async_tx_issue_pending_all); -/* dma_wait_for_async_tx - spin wait for a transcation to complete - * @tx: transaction to wait on - */ -enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - enum dma_status status; - struct dma_async_tx_descriptor *iter; - struct dma_async_tx_descriptor *parent; - - if (!tx) - return DMA_SUCCESS; - - /* poll through the dependency chain, return when tx is complete */ - do { - iter = tx; - - /* find the root of the unsubmitted dependency chain */ - do { - parent = iter->parent; - if (!parent) - break; - else - iter = parent; - } while (parent); - - /* there is a small window for ->parent == NULL and - * ->cookie == -EBUSY - */ - while (iter->cookie == -EBUSY) - cpu_relax(); - - status = dma_sync_wait(iter->chan, iter->cookie); - } while (status == DMA_IN_PROGRESS || (iter != tx)); - - return status; -} -EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); - -/* async_tx_run_dependencies - helper routine for dma drivers to process - * (start) dependent operations on their target channel - * @tx: transaction with dependencies - */ -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) -{ - struct dma_async_tx_descriptor *dep = tx->next; - struct dma_async_tx_descriptor *dep_next; - struct dma_chan *chan; - - if (!dep) - return; - - chan = dep->chan; - - /* keep submitting up until a channel switch is detected - * in that case we will be called again as a result of - * processing the interrupt from async_tx_channel_switch - */ - for (; dep; dep = dep_next) { - spin_lock_bh(&dep->lock); - dep->parent = NULL; - dep_next = dep->next; - if (dep_next && dep_next->chan == chan) - dep->next = NULL; /* ->next will be submitted */ - else - dep_next = NULL; /* submit current dep and terminate */ - spin_unlock_bh(&dep->lock); - - dep->tx_submit(dep); - } - - chan->device->device_issue_pending(chan); -} -EXPORT_SYMBOL_GPL(async_tx_run_dependencies); - static void free_dma_chan_ref(struct rcu_head *rcu) { diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 904e57558bb5..e34b06420816 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -33,7 +33,6 @@ config INTEL_IOATDMA config INTEL_IOP_ADMA tristate "Intel IOP ADMA support" depends on ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX - select ASYNC_CORE select DMA_ENGINE help Enable support for the Intel(R) IOP Series RAID engines. @@ -59,7 +58,6 @@ config FSL_DMA config MV_XOR bool "Marvell XOR engine support" depends on PLAT_ORION - select ASYNC_CORE select DMA_ENGINE ---help--- Enable support for the Marvell XOR engine. 
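For context before the relocated implementation below, a minimal usage sketch (editorial, not part of the patch; copy_page_sync is a hypothetical wrapper and assumes the 2.6.28-era async_memcpy() signature):

#include <linux/async_tx.h>

static int copy_page_sync(struct page *dst, struct page *src)
{
	struct dma_async_tx_descriptor *tx;

	/* offload a page copy; a NULL return means the copy was already
	 * performed synchronously by the cpu fallback path */
	tx = async_memcpy(dst, src, 0, 0, PAGE_SIZE, ASYNC_TX_ACK,
			  NULL, NULL, NULL);

	/* dma_wait_for_async_tx() tolerates tx == NULL */
	return dma_wait_for_async_tx(tx) == DMA_SUCCESS ? 0 : -EIO;
}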
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 657996517374..b9008932a8f3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -626,6 +626,90 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, } EXPORT_SYMBOL(dma_async_tx_descriptor_init); +/* dma_wait_for_async_tx - spin wait for a transaction to complete + * @tx: in-flight transaction to wait on + * + * This routine assumes that tx was obtained from a call to async_memcpy, + * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped + * and submitted). Walking the parent chain is only meant to cover for DMA + * drivers that do not implement the DMA_INTERRUPT capability and may race with + * the driver's descriptor cleanup routine. + */ +enum dma_status +dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + enum dma_status status; + struct dma_async_tx_descriptor *iter; + struct dma_async_tx_descriptor *parent; + + if (!tx) + return DMA_SUCCESS; + + WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" + " %s\n", __func__, dev_name(&tx->chan->dev)); + + /* poll through the dependency chain, return when tx is complete */ + do { + iter = tx; + + /* find the root of the unsubmitted dependency chain */ + do { + parent = iter->parent; + if (!parent) + break; + else + iter = parent; + } while (parent); + + /* there is a small window for ->parent == NULL and + * ->cookie == -EBUSY + */ + while (iter->cookie == -EBUSY) + cpu_relax(); + + status = dma_sync_wait(iter->chan, iter->cookie); + } while (status == DMA_IN_PROGRESS || (iter != tx)); + + return status; +} +EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); + +/* dma_run_dependencies - helper routine for dma drivers to process + * (start) dependent operations on their target channel + * @tx: transaction with dependencies + */ +void dma_run_dependencies(struct dma_async_tx_descriptor *tx) +{ + struct dma_async_tx_descriptor *dep = tx->next; + struct dma_async_tx_descriptor *dep_next; + struct dma_chan *chan; + + if (!dep) + return; + + chan = dep->chan; + + /* keep submitting up until a channel switch is detected + * in that case we will be called again as a result of + * processing the interrupt from async_tx_channel_switch + */ + for (; dep; dep = dep_next) { + spin_lock_bh(&dep->lock); + dep->parent = NULL; + dep_next = dep->next; + if (dep_next && dep_next->chan == chan) + dep->next = NULL; /* ->next will be submitted */ + else + dep_next = NULL; /* submit current dep and terminate */ + spin_unlock_bh(&dep->lock); + + dep->tx_submit(dep); + } + + chan->device->device_issue_pending(chan); +} +EXPORT_SYMBOL_GPL(dma_run_dependencies); + static int __init dma_bus_init(void) { mutex_init(&dma_list_mutex); diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 6be317262200..be9ea9f88805 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -116,7 +115,7 @@ iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, } /* run dependent operations */ - async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index bcda17426411..3f46df3390c7 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -340,7 +339,7 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, } /* run dependent operations */ - 
async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 0f50d4cc4360..1c816775f135 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -60,8 +60,6 @@ enum async_tx_flags { #ifdef CONFIG_DMA_ENGINE void async_tx_issue_pending_all(void); -enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx); #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include <asm/async_tx.h> #else @@ -77,19 +75,6 @@ static inline void async_tx_issue_pending_all(void) do { } while (0); } -static inline enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - return DMA_SUCCESS; -} - -static inline void -async_tx_run_dependencies(struct dma_async_tx_descriptor *tx, - struct dma_chan *host_chan) -{ - do { } while (0); -} - static inline struct dma_chan * async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, enum dma_transaction_type tx_type, struct page **dst, int dst_count, diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index adb0b084eb5a..e4ec7e7b8056 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -475,11 +475,20 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); +#ifdef CONFIG_DMA_ENGINE +enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); +#else +static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + return DMA_SUCCESS; +} +#endif /* --- DMA device --- */ int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +void dma_run_dependencies(struct dma_async_tx_descriptor *tx); /* --- Helper iov-locking functions --- */ -- cgit v1.2.3 From b3881f74b31b7d47d0f1c4d89ac3e7f0b9c05e3e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:46:26 -0500 Subject: ext4: Add mount option to set kjournald's I/O priority Signed-off-by: "Theodore Ts'o" Cc: Jens Axboe --- Documentation/filesystems/ext4.txt | 7 +++++++ fs/ext4/super.c | 29 +++++++++++++++++++++++++---- fs/ioprio.c | 3 ++- include/linux/ioprio.h | 2 ++ 4 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 9ec29d86ff8b..8938949b201e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -308,6 +308,13 @@ min_batch_time=usec This parameter sets the commit time (as multi-threaded, synchronous workloads on very fast disks, at the cost of increasing latency. +journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the + highest priority) which should be used for I/O + operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is + a slightly higher priority than the default I/O + priority.
+ Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8036392b2121..8ff8709828fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1013,7 +1013,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks + Opt_inode_readahead_blks, Opt_journal_ioprio }; static const match_table_t tokens = { @@ -1074,6 +1074,7 @@ static const match_table_t tokens = { {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_err, NULL}, }; @@ -1098,8 +1099,11 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, + unsigned int *journal_ioprio, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1492,6 +1496,14 @@ set_qf_format: return 0; sbi->s_inode_readahead_blks = option; break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -2035,6 +2047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int features; __u64 blocks_count; int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2141,7 +2154,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2506,6 +2520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: @@ -3127,6 +3142,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; @@ -3145,11 +3161,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) for (i = 0; i < MAXQUOTAS; i++) old_opts.s_qf_names[i] = sbi->s_qf_names[i]; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. 
*/ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -3162,8 +3181,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - if (sbi->s_journal) + if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a2..1a39ac370942 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -27,7 +27,7 @@ #include #include -static int set_task_ioprio(struct task_struct *task, int ioprio) +int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; @@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task_unlock(task); return err; } +EXPORT_SYMBOL_GPL(set_task_ioprio); asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index f98a656b17e5..76dad4808847 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task) */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); +extern int set_task_ioprio(struct task_struct *task, int ioprio); + #endif -- cgit v1.2.3 From 835481d9bcd65720b473db6b38746a74a3964218 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 4 Jan 2009 05:18:06 -0800 Subject: cpumask: convert struct cpufreq_policy to cpumask_var_t Impact: use new cpumask API to reduce memory usage This is part of an effort to reduce structure sizes for machines configured with large NR_CPUS. cpumask_t gets replaced by cpumask_var_t, which is either struct cpumask[1] (small NR_CPUS) or struct cpumask * (large NR_CPUS). 
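For illustration (an editorial sketch, not part of the patch): the conversion imposes an explicit allocate/copy/free discipline, since with CONFIG_CPUMASK_OFFSTACK a cpumask_var_t is a real heap allocation. This mirrors the cpufreq_add_dev() and __cpufreq_remove_dev() changes below; the example_* names are placeholders.

#include <linux/cpumask.h>
#include <linux/cpufreq.h>
#include <linux/gfp.h>

static int example_alloc_masks(struct cpufreq_policy *policy, unsigned int cpu)
{
	if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL))
		return -ENOMEM;
	if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
		free_cpumask_var(policy->cpus);
		return -ENOMEM;
	}
	cpumask_copy(policy->cpus, cpumask_of(cpu));	/* start with one cpu */
	return 0;
}

static void example_free_masks(struct cpufreq_policy *policy)
{
	free_cpumask_var(policy->related_cpus);
	free_cpumask_var(policy->cpus);
}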
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Dave Jones Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 10 +++--- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 8 ++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 ++-- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 14 ++++---- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 18 +++++----- drivers/cpufreq/cpufreq.c | 42 ++++++++++++++++-------- drivers/cpufreq/cpufreq_conservative.c | 2 +- drivers/cpufreq/cpufreq_ondemand.c | 4 +-- include/linux/cpufreq.h | 4 +-- 10 files changed, 62 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 28102ad1a363..0b31939862d6 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -411,7 +411,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpumask_and(&online_policy_cpus, cpu_online_mask, policy->cpus); #else online_policy_cpus = policy->cpus; #endif @@ -626,15 +626,15 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(&policy->cpus, perf->shared_cpu_map); + cpumask_copy(policy->cpus, perf->shared_cpu_map); } - cpumask_copy(&policy->related_cpus, perf->shared_cpu_map); + cpumask_copy(policy->related_cpus, perf->shared_cpu_map); #ifdef CONFIG_SMP dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { + if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - policy->cpus = per_cpu(cpu_core_map, cpu); + cpumask_copy(policy->cpus, cpu_core_mask(cpu)); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index beea4466b063..b585e04cbc9e 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask_nr(i, policy->cpus) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask_nr(i, policy->cpus) + for_each_cpu(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask_nr(i, policy->cpus) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -203,7 +203,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) unsigned int i; #ifdef CONFIG_SMP - policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); + cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); #endif /* Errata workaround */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c3c9adbaa26f..5c28b37dea11 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1199,10 +1199,10 
@@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) set_cpus_allowed_ptr(current, &oldmask); if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); + cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); else - pol->cpus = per_cpu(cpu_core_map, pol->cpu); - data->available_cores = &(pol->cpus); + cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu)); + data->available_cores = pol->cpus; /* Take a crude guess here. * That guess was in microseconds, so multiply with 1000 */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 65cfb5d7f77f..8ecc75b6c7c3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -53,7 +53,7 @@ struct powernow_k8_data { /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ - cpumask_t *available_cores; + struct cpumask *available_cores; }; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index d2cc4991cbaa..f08998278a3a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -492,8 +492,8 @@ static int centrino_target (struct cpufreq_policy *policy, } first_cpu = 1; - for_each_cpu_mask_nr(j, policy->cpus) { - const cpumask_t *mask; + for_each_cpu(j, policy->cpus) { + const struct cpumask *mask; /* cpufreq holds the hotplug lock, so we are safe here */ if (!cpu_online(j)) @@ -504,9 +504,9 @@ static int centrino_target (struct cpufreq_policy *policy, * Make sure we are running on CPU that wants to change freq */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - mask = &policy->cpus; + mask = policy->cpus; else - mask = &cpumask_of_cpu(j); + mask = cpumask_of(j); set_cpus_allowed_ptr(current, mask); preempt_disable(); @@ -538,7 +538,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask_nr(k, policy->cpus) { + for_each_cpu(k, policy->cpus) { if (!cpu_online(k)) continue; freqs.cpu = k; @@ -563,7 +563,7 @@ static int centrino_target (struct cpufreq_policy *policy, preempt_enable(); } - for_each_cpu_mask_nr(k, policy->cpus) { + for_each_cpu(k, policy->cpus) { if (!cpu_online(k)) continue; freqs.cpu = k; @@ -586,7 +586,7 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { if (!cpu_online(j)) continue; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 04d0376b64b0..dedc1e98f168 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -229,7 +229,7 @@ static unsigned int speedstep_detect_chipset (void) return 0; } -static unsigned int _speedstep_get(const cpumask_t *cpus) +static unsigned int _speedstep_get(const struct cpumask *cpus) { unsigned int speed; cpumask_t cpus_allowed; @@ -244,7 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus) static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(&cpumask_of_cpu(cpu)); + return _speedstep_get(cpumask_of(cpu)); } /** @@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy, if 
(cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) return -EINVAL; - freqs.old = _speedstep_get(&policy->cpus); + freqs.old = _speedstep_get(policy->cpus); freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; @@ -279,20 +279,20 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask_nr(i, policy->cpus) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } /* switch to physical CPU where state is to be changed */ - set_cpus_allowed_ptr(current, &policy->cpus); + set_cpus_allowed_ptr(current, policy->cpus); speedstep_set_state(newstate); /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask_nr(i, policy->cpus) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -322,11 +322,11 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP - policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); + cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); #endif cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, &policy->cpus); + set_cpus_allowed_ptr(current, policy->cpus); /* detect low and high frequency and transition latency */ result = speedstep_get_freqs(speedstep_processor, @@ -339,7 +339,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) return result; /* get current speed setting */ - speed = _speedstep_get(&policy->cpus); + speed = _speedstep_get(policy->cpus); if (!speed) return -EIO; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 01dde80597f7..b55cb67435bd 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -584,12 +584,12 @@ out: return i; } -static ssize_t show_cpus(cpumask_t mask, char *buf) +static ssize_t show_cpus(const struct cpumask *mask, char *buf) { ssize_t i = 0; unsigned int cpu; - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { if (i) i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), " "); i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u", cpu); @@ -606,7 +606,7 @@ static ssize_t show_cpus(cpumask_t mask, char *buf) */ static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf) { - if (cpus_empty(policy->related_cpus)) + if (cpumask_empty(policy->related_cpus)) return show_cpus(policy->cpus, buf); return show_cpus(policy->related_cpus, buf); } @@ -806,9 +806,20 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = -ENOMEM; goto nomem_out; } + if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) { + kfree(policy); + ret = -ENOMEM; + goto nomem_out; + } + if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { + free_cpumask_var(policy->cpus); + kfree(policy); + ret = -ENOMEM; + goto nomem_out; + } policy->cpu = cpu; - policy->cpus = cpumask_of_cpu(cpu); + cpumask_copy(policy->cpus, cpumask_of(cpu)); /* Initially set CPU itself as the policy_cpu */ per_cpu(policy_cpu, cpu) = cpu; @@ -843,7 +854,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) } #endif - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { if (cpu == j) continue; @@ -861,7 +872,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) goto err_out_driver_exit; spin_lock_irqsave(&cpufreq_driver_lock, flags); - managed_policy->cpus = policy->cpus; + cpumask_copy(managed_policy->cpus, 
policy->cpus); per_cpu(cpufreq_cpu_data, cpu) = managed_policy; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); @@ -916,14 +927,14 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) } spin_lock_irqsave(&cpufreq_driver_lock, flags); - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { per_cpu(cpufreq_cpu_data, j) = policy; per_cpu(policy_cpu, j) = policy->cpu; } spin_unlock_irqrestore(&cpufreq_driver_lock, flags); /* symlink affected CPUs */ - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { if (j == cpu) continue; if (!cpu_online(j)) @@ -963,7 +974,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) err_out_unregister: spin_lock_irqsave(&cpufreq_driver_lock, flags); - for_each_cpu_mask_nr(j, policy->cpus) + for_each_cpu(j, policy->cpus) per_cpu(cpufreq_cpu_data, j) = NULL; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); @@ -1024,7 +1035,7 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) */ if (unlikely(cpu != data->cpu)) { dprintk("removing link\n"); - cpu_clear(cpu, data->cpus); + cpumask_clear_cpu(cpu, data->cpus); spin_unlock_irqrestore(&cpufreq_driver_lock, flags); sysfs_remove_link(&sys_dev->kobj, "cpufreq"); cpufreq_cpu_put(data); @@ -1045,8 +1056,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) * per_cpu(cpufreq_cpu_data) while holding the lock, and remove * the sysfs links afterwards. */ - if (unlikely(cpus_weight(data->cpus) > 1)) { - for_each_cpu_mask_nr(j, data->cpus) { + if (unlikely(cpumask_weight(data->cpus) > 1)) { + for_each_cpu(j, data->cpus) { if (j == cpu) continue; per_cpu(cpufreq_cpu_data, j) = NULL; @@ -1055,8 +1066,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - if (unlikely(cpus_weight(data->cpus) > 1)) { - for_each_cpu_mask_nr(j, data->cpus) { + if (unlikely(cpumask_weight(data->cpus) > 1)) { + for_each_cpu(j, data->cpus) { if (j == cpu) continue; dprintk("removing link for cpu %u\n", j); @@ -1090,7 +1101,10 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) if (cpufreq_driver->exit) cpufreq_driver->exit(data); + free_cpumask_var(data->related_cpus); + free_cpumask_var(data->cpus); kfree(data); + per_cpu(cpufreq_cpu_data, cpu) = NULL; cpufreq_debug_enable_ratelimit(); return 0; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index e2657837d954..0320962c4ec5 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -498,7 +498,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return rc; } - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 2ab3c12b88af..6a2b036c9389 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -400,7 +400,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) /* Get Absolute Load - in terms of freq */ max_load_freq = 0; - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; @@ -568,7 +568,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, return rc; } - for_each_cpu_mask_nr(j, policy->cpus) { + for_each_cpu(j, policy->cpus) { struct 
cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 484b3abf61bb..384b38d3e8e2 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -80,8 +80,8 @@ struct cpufreq_real_policy { }; struct cpufreq_policy { - cpumask_t cpus; /* CPUs requiring sw coordination */ - cpumask_t related_cpus; /* CPUs with any coordination */ + cpumask_var_t cpus; /* CPUs requiring sw coordination */ + cpumask_var_t related_cpus; /* CPUs with any coordination */ unsigned int shared_type; /* ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu nr of registered CPU */ -- cgit v1.2.3 From 6f49a57aa5a0c6d4e4e27c85f7af6c83325a12d1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:14 -0700 Subject: dmaengine: up-level reference counting to the module level Simply, if a client wants any dmaengine channel then prevent all dmaengine modules from being removed. Once the clients are done, re-enable module removal. Why? Beyond reducing complication: 1/ Tracking reference counts per-transaction in an efficient manner, as is currently done, requires a complicated scheme to avoid cache-line bouncing effects. 2/ Per-transaction ref-counting gives the false impression that a dma-driver can be gracefully removed ahead of its user (net, md, or dma-slave) 3/ None of the in-tree dma-drivers talk to hot-pluggable hardware, but if such an engine were built one day we still would not need to notify clients of remove events. The driver can simply return NULL to a ->prep() request, something that is much easier for a client to handle. Reviewed-by: Andrew Morton Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 4 - drivers/dma/dmaengine.c | 205 +++++++++++++++++++++++++++---------------- drivers/dma/dmatest.c | 2 - drivers/dma/dw_dmac.c | 2 +- drivers/mmc/host/atmel-mci.c | 4 +- include/linux/dmaengine.h | 21 ----- include/net/netdma.h | 4 +- net/ipv4/tcp.c | 1 - 8 files changed, 132 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 8cfac182165d..43fe4cbe71e6 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -198,8 +198,6 @@ dma_channel_add_remove(struct dma_client *client, /* add the channel to the generic management list */ master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL); if (master_ref) { - /* keep a reference until async_tx is unloaded */ - dma_chan_get(chan); init_dma_chan_ref(master_ref, chan); spin_lock_irqsave(&async_tx_lock, flags); list_add_tail_rcu(&master_ref->node, @@ -221,8 +219,6 @@ dma_channel_add_remove(struct dma_client *client, spin_lock_irqsave(&async_tx_lock, flags); list_for_each_entry(ref, &async_tx_master_list, node) if (ref->chan == chan) { - /* permit backing devices to go away */ - dma_chan_put(ref->chan); list_del_rcu(&ref->node); call_rcu(&ref->rcu, free_dma_chan_ref); found = 1; diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index b9008932a8f3..d4d925912c47 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -74,6 +74,7 @@ static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); static LIST_HEAD(dma_client_list); +static long dmaengine_ref_count; /* --- sysfs implementation --- */ @@ -105,19 +106,8 @@ static ssize_t show_bytes_transferred(struct device *dev, struct device_attribut static ssize_t show_in_use(struct device *dev, struct
device_attribute *attr, char *buf) { struct dma_chan *chan = to_dma_chan(dev); - int in_use = 0; - - if (unlikely(chan->slow_ref) && - atomic_read(&chan->refcount.refcount) > 1) - in_use = 1; - else { - if (local_read(&(per_cpu_ptr(chan->local, - get_cpu())->refcount)) > 0) - in_use = 1; - put_cpu(); - } - return sprintf(buf, "%d\n", in_use); + return sprintf(buf, "%d\n", chan->client_count); } static struct device_attribute dma_attrs[] = { @@ -155,6 +145,78 @@ __dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want) return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END); } +static struct module *dma_chan_to_owner(struct dma_chan *chan) +{ + return chan->device->dev->driver->owner; +} + +/** + * balance_ref_count - catch up the channel reference count + * @chan - channel to balance ->client_count versus dmaengine_ref_count + * + * balance_ref_count must be called under dma_list_mutex + */ +static void balance_ref_count(struct dma_chan *chan) +{ + struct module *owner = dma_chan_to_owner(chan); + + while (chan->client_count < dmaengine_ref_count) { + __module_get(owner); + chan->client_count++; + } +} + +/** + * dma_chan_get - try to grab a dma channel's parent driver module + * @chan - channel to grab + * + * Must be called under dma_list_mutex + */ +static int dma_chan_get(struct dma_chan *chan) +{ + int err = -ENODEV; + struct module *owner = dma_chan_to_owner(chan); + + if (chan->client_count) { + __module_get(owner); + err = 0; + } else if (try_module_get(owner)) + err = 0; + + if (err == 0) + chan->client_count++; + + /* allocate upon first client reference */ + if (chan->client_count == 1 && err == 0) { + int desc_cnt = chan->device->device_alloc_chan_resources(chan, NULL); + + if (desc_cnt < 0) { + err = desc_cnt; + chan->client_count = 0; + module_put(owner); + } else + balance_ref_count(chan); + } + + return err; +} + +/** + * dma_chan_put - drop a reference to a dma channel's parent driver module + * @chan - channel to release + * + * Must be called under dma_list_mutex + */ +static void dma_chan_put(struct dma_chan *chan) +{ + if (!chan->client_count) + return; /* this channel failed alloc_chan_resources */ + chan->client_count--; + module_put(dma_chan_to_owner(chan)); + if (chan->client_count == 0) + chan->device->device_free_chan_resources(chan); +} + /** * dma_client_chan_alloc - try to allocate channels to a client * @client: &dma_client @@ -165,7 +227,6 @@ static void dma_client_chan_alloc(struct dma_client *client) { struct dma_device *device; struct dma_chan *chan; - int desc; /* allocated descriptor count */ enum dma_state_client ack; /* Find a channel */ @@ -178,23 +239,16 @@ static void dma_client_chan_alloc(struct dma_client *client) list_for_each_entry(chan, &device->channels, device_node) { if (!dma_chan_satisfies_mask(chan, client->cap_mask)) continue; + if (!chan->client_count) + continue; + ack = client->event_callback(client, chan, + DMA_RESOURCE_AVAILABLE); - desc = chan->device->device_alloc_chan_resources( - chan, client); - if (desc >= 0) { - ack = client->event_callback(client, - chan, - DMA_RESOURCE_AVAILABLE); - - /* we are done once this client rejects - * an available resource - */ - if (ack == DMA_ACK) { - dma_chan_get(chan); - chan->client_count++; - } else if (ack == DMA_NAK) - return; - } + /* we are done once this client rejects + * an available resource + */ + if (ack == DMA_NAK) + return; } } } @@ -224,7 +278,6 @@ EXPORT_SYMBOL(dma_sync_wait); void dma_chan_cleanup(struct kref *kref) { struct dma_chan *chan = container_of(kref, 
struct dma_chan, refcount); - chan->device->device_free_chan_resources(chan); kref_put(&chan->device->refcount, dma_async_device_cleanup); } EXPORT_SYMBOL(dma_chan_cleanup); @@ -232,18 +285,12 @@ EXPORT_SYMBOL(dma_chan_cleanup); static void dma_chan_free_rcu(struct rcu_head *rcu) { struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu); - int bias = 0x7FFFFFFF; - int i; - for_each_possible_cpu(i) - bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount); - atomic_sub(bias, &chan->refcount.refcount); + kref_put(&chan->refcount, dma_chan_cleanup); } static void dma_chan_release(struct dma_chan *chan) { - atomic_add(0x7FFFFFFF, &chan->refcount.refcount); - chan->slow_ref = 1; call_rcu(&chan->rcu, dma_chan_free_rcu); } @@ -262,44 +309,37 @@ static void dma_clients_notify_available(void) mutex_unlock(&dma_list_mutex); } -/** - * dma_chans_notify_available - tell the clients that a channel is going away - * @chan: channel on its way out - */ -static void dma_clients_notify_removed(struct dma_chan *chan) -{ - struct dma_client *client; - enum dma_state_client ack; - - mutex_lock(&dma_list_mutex); - - list_for_each_entry(client, &dma_client_list, global_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); - - /* client was holding resources for this channel so - * free it - */ - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; - } - } - - mutex_unlock(&dma_list_mutex); -} - /** * dma_async_client_register - register a &dma_client * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask' */ void dma_async_client_register(struct dma_client *client) { + struct dma_device *device, *_d; + struct dma_chan *chan; + int err; + /* validate client data */ BUG_ON(dma_has_cap(DMA_SLAVE, client->cap_mask) && !client->slave); mutex_lock(&dma_list_mutex); + dmaengine_ref_count++; + + /* try to grab channels */ + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) + list_for_each_entry(chan, &device->channels, device_node) { + err = dma_chan_get(chan); + if (err == -ENODEV) { + /* module removed before we could use it */ + list_del_init(&device->global_node); + break; + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dev_name(&chan->dev), err); + } + + list_add_tail(&client->global_node, &dma_client_list); mutex_unlock(&dma_list_mutex); } @@ -315,23 +355,17 @@ void dma_async_client_unregister(struct dma_client *client) { struct dma_device *device; struct dma_chan *chan; - enum dma_state_client ack; if (!client) return; mutex_lock(&dma_list_mutex); - /* free all channels the client is holding */ + dmaengine_ref_count--; + BUG_ON(dmaengine_ref_count < 0); + /* drop channel references */ list_for_each_entry(device, &dma_device_list, global_node) - list_for_each_entry(chan, &device->channels, device_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); - - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; - } - } + list_for_each_entry(chan, &device->channels, device_node) + dma_chan_put(chan); list_del(&client->global_node); mutex_unlock(&dma_list_mutex); @@ -423,6 +457,21 @@ int dma_async_device_register(struct dma_device *device) } mutex_lock(&dma_list_mutex); + if (dmaengine_ref_count) + list_for_each_entry(chan, &device->channels, device_node) { + /* if clients are already waiting for channels we need + * to take references on their behalf + */ + if (dma_chan_get(chan) == -ENODEV) { + /* note we can only get here for the first + * channel as the remaining 
channels are + * guaranteed to get a reference + */ + rc = -ENODEV; + mutex_unlock(&dma_list_mutex); + goto err_out; + } + } list_add_tail(&device->global_node, &dma_device_list); mutex_unlock(&dma_list_mutex); @@ -456,7 +505,7 @@ static void dma_async_device_cleanup(struct kref *kref) } /** - * dma_async_device_unregister - unregisters DMA devices + * dma_async_device_unregister - unregister a DMA device * @device: &dma_device */ void dma_async_device_unregister(struct dma_device *device) @@ -468,7 +517,9 @@ void dma_async_device_unregister(struct dma_device *device) mutex_unlock(&dma_list_mutex); list_for_each_entry(chan, &device->channels, device_node) { - dma_clients_notify_removed(chan); + WARN_ONCE(chan->client_count, + "%s called while %d clients hold a reference\n", + __func__, chan->client_count); device_unregister(&chan->dev); dma_chan_release(chan); } diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index ed9636bfb54a..db4050884713 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -215,7 +215,6 @@ static int dmatest_func(void *data) smp_rmb(); chan = thread->chan; - dma_chan_get(chan); while (!kthread_should_stop()) { total_tests++; @@ -293,7 +292,6 @@ static int dmatest_func(void *data) } ret = 0; - dma_chan_put(chan); kfree(thread->dstbuf); err_dstbuf: kfree(thread->srcbuf); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 0778d99aea7c..377dafa37a20 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -773,7 +773,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, dev_vdbg(&chan->dev, "alloc_chan_resources\n"); /* Channels doing slave DMA can only handle one client. */ - if (dwc->dws || client->slave) { + if (dwc->dws || (client && client->slave)) { if (chan->client_count) return -EBUSY; } diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 7a3f2436b011..6c11f4d4c4e9 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -593,10 +593,8 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) /* If we don't have a channel, we can't do DMA */ chan = host->dma.chan; - if (chan) { - dma_chan_get(chan); + if (chan) host->data_chan = chan; - } if (!chan) return -ENODEV; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index e4ec7e7b8056..d18d37d1015d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -165,7 +165,6 @@ struct dma_slave { */ struct dma_chan_percpu { - local_t refcount; /* stats */ unsigned long memcpy_count; unsigned long bytes_transferred; @@ -205,26 +204,6 @@ struct dma_chan { void dma_chan_cleanup(struct kref *kref); -static inline void dma_chan_get(struct dma_chan *chan) -{ - if (unlikely(chan->slow_ref)) - kref_get(&chan->refcount); - else { - local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } -} - -static inline void dma_chan_put(struct dma_chan *chan) -{ - if (unlikely(chan->slow_ref)) - kref_put(&chan->refcount, dma_chan_cleanup); - else { - local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } -} - /* * typedef dma_event_callback - function pointer to a DMA event callback * For each channel added to the system this routine is called for each client. 
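For illustration (an editorial sketch, not part of the patch, using the 2.6.28-era dma_client interface): a client now registers once, which pins the parent module of every matching channel, and unregisters when done, replacing the removed per-call dma_chan_get()/dma_chan_put(). The my_* names are hypothetical.

#include <linux/module.h>
#include <linux/dmaengine.h>

static enum dma_state_client my_event(struct dma_client *client,
				      struct dma_chan *chan,
				      enum dma_state state)
{
	return DMA_ACK;	/* accept every channel we are offered */
}

static struct dma_client my_client = {
	.event_callback = my_event,
};

static int __init my_init(void)
{
	dma_cap_set(DMA_MEMCPY, my_client.cap_mask);
	/* takes a module reference on each registered dmaengine driver */
	dma_async_client_register(&my_client);
	return 0;
}

static void __exit my_exit(void)
{
	/* drops the references, making the drivers unloadable again */
	dma_async_client_unregister(&my_client);
}

module_init(my_init);
module_exit(my_exit);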
diff --git a/include/net/netdma.h b/include/net/netdma.h index f28c6e064e8f..cbe2737f4a61 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -27,11 +27,11 @@ static inline struct dma_chan *get_softnet_dma(void) { struct dma_chan *chan; + rcu_read_lock(); chan = rcu_dereference(__get_cpu_var(softnet_data).net_dma); - if (chan) - dma_chan_get(chan); rcu_read_unlock(); + return chan; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f28acf11fc67..75e0e0a2d8db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1632,7 +1632,6 @@ skip_copy: /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); - dma_chan_put(tp->ucopy.dma_chan); tp->ucopy.dma_chan = NULL; } if (tp->ucopy.pinned_list) { -- cgit v1.2.3 From bec085134e446577a983f17f57d642a88d1af53b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:14 -0700 Subject: dmaengine: centralize channel allocation, introduce dma_find_channel Allowing multiple clients to each define their own channel allocation scheme quickly leads to a pathological situation. For memory-to-memory offload all clients can share a central allocator. This simply moves the existing async_tx allocator to dmaengine with minimal fixups: * async_tx.c:get_chan_ref_by_cap --> dmaengine.c:nth_chan * async_tx.c:async_tx_rebalance --> dmaengine.c:dma_channel_rebalance * split out common code from async_tx.c:__async_tx_find_channel --> dma_find_channel Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 146 ++------------------------------------- drivers/dma/dmaengine.c | 168 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/dmaengine.h | 3 + 3 files changed, 174 insertions(+), 143 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 43fe4cbe71e6..b88bb1f608fc 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -37,26 +37,11 @@ static struct dma_client async_tx_dma = { /* .cap_mask == 0 defaults to all channels */ }; -/** - * dma_cap_mask_all - enable iteration over all operation types - */ -static dma_cap_mask_t dma_cap_mask_all; - -/** - * chan_ref_percpu - tracks channel allocations per core/opertion - */ -struct chan_ref_percpu { - struct dma_chan_ref *ref; -}; - -static int channel_table_initialized; -static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END]; - /** * async_tx_lock - protect modification of async_tx_master_list and serialize * rebalance operations */ -static spinlock_t async_tx_lock; +static DEFINE_SPINLOCK(async_tx_lock); static LIST_HEAD(async_tx_master_list); @@ -89,85 +74,6 @@ init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan) atomic_set(&ref->count, 0); } -/** - * get_chan_ref_by_cap - returns the nth channel of the given capability - * defaults to returning the channel with the desired capability and the - * lowest reference count if the index can not be satisfied - * @cap: capability to match - * @index: nth channel desired, passing -1 has the effect of forcing the - * default return value - */ -static struct dma_chan_ref * -get_chan_ref_by_cap(enum dma_transaction_type cap, int index) -{ - struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - if (dma_has_cap(cap, ref->chan->device->cap_mask)) { - if (!min_ref) - min_ref = ref; - else if (atomic_read(&ref->count) < - atomic_read(&min_ref->count)) - min_ref = ref; - - if (index-- == 0) { - ret_ref = 
ref; - break; - } - } - rcu_read_unlock(); - - if (!ret_ref) - ret_ref = min_ref; - - if (ret_ref) - atomic_inc(&ret_ref->count); - - return ret_ref; -} - -/** - * async_tx_rebalance - redistribute the available channels, optimize - * for cpu isolation in the SMP case, and opertaion isolation in the - * uniprocessor case - */ -static void async_tx_rebalance(void) -{ - int cpu, cap, cpu_idx = 0; - unsigned long flags; - - if (!channel_table_initialized) - return; - - spin_lock_irqsave(&async_tx_lock, flags); - - /* undo the last distribution */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_possible_cpu(cpu) { - struct dma_chan_ref *ref = - per_cpu_ptr(channel_table[cap], cpu)->ref; - if (ref) { - atomic_set(&ref->count, 0); - per_cpu_ptr(channel_table[cap], cpu)->ref = - NULL; - } - } - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_online_cpu(cpu) { - struct dma_chan_ref *new; - if (NR_CPUS > 1) - new = get_chan_ref_by_cap(cap, cpu_idx++); - else - new = get_chan_ref_by_cap(cap, -1); - - per_cpu_ptr(channel_table[cap], cpu)->ref = new; - } - - spin_unlock_irqrestore(&async_tx_lock, flags); -} - static enum dma_state_client dma_channel_add_remove(struct dma_client *client, struct dma_chan *chan, enum dma_state state) @@ -211,8 +117,6 @@ dma_channel_add_remove(struct dma_client *client, " (-ENOMEM)\n"); return 0; } - - async_tx_rebalance(); break; case DMA_RESOURCE_REMOVED: found = 0; @@ -233,8 +137,6 @@ dma_channel_add_remove(struct dma_client *client, ack = DMA_ACK; else break; - - async_tx_rebalance(); break; case DMA_RESOURCE_SUSPEND: case DMA_RESOURCE_RESUME: @@ -248,51 +150,18 @@ dma_channel_add_remove(struct dma_client *client, return ack; } -static int __init -async_tx_init(void) +static int __init async_tx_init(void) { - enum dma_transaction_type cap; - - spin_lock_init(&async_tx_lock); - bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - - /* an interrupt will never be an explicit operation type. - * clearing this bit prevents allocation to a slot in 'channel_table' - */ - clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); - - for_each_dma_cap_mask(cap, dma_cap_mask_all) { - channel_table[cap] = alloc_percpu(struct chan_ref_percpu); - if (!channel_table[cap]) - goto err; - } - - channel_table_initialized = 1; dma_async_client_register(&async_tx_dma); dma_async_client_chan_request(&async_tx_dma); printk(KERN_INFO "async_tx: api initialized (async)\n"); return 0; -err: - printk(KERN_ERR "async_tx: initialization failure\n"); - - while (--cap >= 0) - free_percpu(channel_table[cap]); - - return 1; } static void __exit async_tx_exit(void) { - enum dma_transaction_type cap; - - channel_table_initialized = 0; - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - if (channel_table[cap]) - free_percpu(channel_table[cap]); - dma_async_client_unregister(&async_tx_dma); } @@ -308,16 +177,9 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, { /* see if we can keep the chain on one channel */ if (depend_tx && - dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) + dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - else if (likely(channel_table_initialized)) { - struct dma_chan_ref *ref; - int cpu = get_cpu(); - ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref; - put_cpu(); - return ref ? 
ref->chan : NULL; - } else - return NULL; + return dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index d4d925912c47..87a8cd4791ed 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -294,6 +294,164 @@ static void dma_chan_release(struct dma_chan *chan) call_rcu(&chan->rcu, dma_chan_free_rcu); } +/** + * dma_cap_mask_all - enable iteration over all operation types + */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan - associated channel for this entry + */ +struct dma_chan_tbl_ent { + struct dma_chan *chan; +}; + +/** + * channel_table - percpu lookup table for memory-to-memory offload providers + */ +static struct dma_chan_tbl_ent *channel_table[DMA_TX_TYPE_END]; + +static int __init dma_channel_table_init(void) +{ + enum dma_transaction_type cap; + int err = 0; + + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* 'interrupt' and 'slave' are channel capabilities, but are not + * associated with an operation so they do not need an entry in the + * channel_table + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); + if (!channel_table[cap]) { + err = -ENOMEM; + break; + } + } + + if (err) { + pr_err("dmaengine: initialization failure\n"); + for_each_dma_cap_mask(cap, dma_cap_mask_all) + if (channel_table[cap]) + free_percpu(channel_table[cap]); + } + + return err; +} +subsys_initcall(dma_channel_table_init); + +/** + * dma_find_channel - find a channel to carry out the operation + * @tx_type: transaction type + */ +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) +{ + struct dma_chan *chan; + int cpu; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + cpu = get_cpu(); + chan = per_cpu_ptr(channel_table[tx_type], cpu)->chan; + put_cpu(); + + return chan; +} +EXPORT_SYMBOL(dma_find_channel); + +/** + * nth_chan - returns the nth channel of the given capability + * @cap: capability to match + * @n: nth channel desired + * + * Defaults to returning the channel with the desired capability and the + * lowest reference count when 'n' cannot be satisfied. Must be called + * under dma_list_mutex. + */ +static struct dma_chan *nth_chan(enum dma_transaction_type cap, int n) +{ + struct dma_device *device; + struct dma_chan *chan; + struct dma_chan *ret = NULL; + struct dma_chan *min = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (!dma_has_cap(cap, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + if (!chan->client_count) + continue; + if (!min) + min = chan; + else if (chan->table_count < min->table_count) + min = chan; + + if (n-- == 0) { + ret = chan; + break; /* done */ + } + } + if (ret) + break; /* done */ + } + + if (!ret) + ret = min; + + if (ret) + ret->table_count++; + + return ret; +} + +/** + * dma_channel_rebalance - redistribute the available channels + * + * Optimize for cpu isolation (each cpu gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. Must be called under + * dma_list_mutex. 
+ */ +static void dma_channel_rebalance(void) +{ + struct dma_chan *chan; + struct dma_device *device; + int cpu; + int cap; + int n; + + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) + per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) + list_for_each_entry(chan, &device->channels, device_node) + chan->table_count = 0; + + /* don't populate the channel_table if no clients are available */ + if (!dmaengine_ref_count) + return; + + /* redistribute available channels */ + n = 0; + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + if (num_possible_cpus() > 1) + chan = nth_chan(cap, n++); + else + chan = nth_chan(cap, -1); + + per_cpu_ptr(channel_table[cap], cpu)->chan = chan; + } +} + /** * dma_chans_notify_available - broadcast available channels to the clients */ @@ -339,7 +497,12 @@ void dma_async_client_register(struct dma_client *client) dev_name(&chan->dev), err); } - + /* if this is the first reference and there were channels + * waiting we need to rebalance to get those channels + * incorporated into the channel table + */ + if (dmaengine_ref_count == 1) + dma_channel_rebalance(); list_add_tail(&client->global_node, &dma_client_list); mutex_unlock(&dma_list_mutex); } @@ -473,6 +636,7 @@ int dma_async_device_register(struct dma_device *device) } } list_add_tail(&device->global_node, &dma_device_list); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); dma_clients_notify_available(); @@ -514,6 +678,7 @@ void dma_async_device_unregister(struct dma_device *device) mutex_lock(&dma_list_mutex); list_del(&device->global_node); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); list_for_each_entry(chan, &device->channels, device_node) { @@ -768,3 +933,4 @@ static int __init dma_bus_init(void) } subsys_initcall(dma_bus_init); + diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d18d37d1015d..b466f02e2433 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -182,6 +182,7 @@ struct dma_chan_percpu { * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel + * @table_count: number of appearances in the mem-to-mem allocation table */ struct dma_chan { struct dma_device *device; @@ -198,6 +199,7 @@ struct dma_chan { struct list_head device_node; struct dma_chan_percpu *local; int client_count; + int table_count; }; #define to_dma_chan(p) container_of(p, struct dma_chan, dev) @@ -468,6 +470,7 @@ static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descript int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); /* --- Helper iov-locking functions --- */ -- cgit v1.2.3 From 2ba05622b8b143b0c95968ba59bddfbd6d2f2559 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:14 -0700 Subject: dmaengine: provide a common 'issue_pending_all' implementation async_tx and net_dma each have open-coded versions of issue_pending_all, so provide a common routine in dmaengine. The implementation needs to walk the global device list, so implement rcu to allow dma_issue_pending_all to run lockless. 
Clients protect themselves from channel removal events by holding a dmaengine reference. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 12 ------------ drivers/dma/dmaengine.c | 27 ++++++++++++++++++++++++--- include/linux/async_tx.h | 2 +- include/linux/dmaengine.h | 1 + net/core/dev.c | 9 +-------- 5 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index b88bb1f608fc..2cdf7a0867b7 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -45,18 +45,6 @@ static DEFINE_SPINLOCK(async_tx_lock); static LIST_HEAD(async_tx_master_list); -/* async_tx_issue_pending_all - start all transactions on all channels */ -void async_tx_issue_pending_all(void) -{ - struct dma_chan_ref *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - ref->chan->device->device_issue_pending(ref->chan); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(async_tx_issue_pending_all); - static void free_dma_chan_ref(struct rcu_head *rcu) { diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 87a8cd4791ed..418eca28d472 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -70,6 +70,7 @@ #include #include #include +#include static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); @@ -365,6 +366,26 @@ struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) } EXPORT_SYMBOL(dma_find_channel); +/** + * dma_issue_pending_all - flush all pending operations across all channels + */ +void dma_issue_pending_all(void) +{ + struct dma_device *device; + struct dma_chan *chan; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + rcu_read_lock(); + list_for_each_entry_rcu(device, &dma_device_list, global_node) + list_for_each_entry(chan, &device->channels, device_node) + if (chan->client_count) + device->device_issue_pending(chan); + rcu_read_unlock(); +} +EXPORT_SYMBOL(dma_issue_pending_all); + /** * nth_chan - returns the nth channel of the given capability * @cap: capability to match @@ -490,7 +511,7 @@ void dma_async_client_register(struct dma_client *client) err = dma_chan_get(chan); if (err == -ENODEV) { /* module removed before we could use it */ - list_del_init(&device->global_node); + list_del_rcu(&device->global_node); break; } else if (err) pr_err("dmaengine: failed to get %s: (%d)\n", @@ -635,7 +656,7 @@ int dma_async_device_register(struct dma_device *device) goto err_out; } } - list_add_tail(&device->global_node, &dma_device_list); + list_add_tail_rcu(&device->global_node, &dma_device_list); dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); @@ -677,7 +698,7 @@ void dma_async_device_unregister(struct dma_device *device) struct dma_chan *chan; mutex_lock(&dma_list_mutex); - list_del(&device->global_node); + list_del_rcu(&device->global_node); dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 1c816775f135..45f6297821bd 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -59,7 +59,7 @@ enum async_tx_flags { }; #ifdef CONFIG_DMA_ENGINE -void async_tx_issue_pending_all(void); +#define async_tx_issue_pending_all dma_issue_pending_all #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include #else diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b466f02e2433..57a43adfc39e 100644 --- a/include/linux/dmaengine.h +++ 
b/include/linux/dmaengine.h @@ -471,6 +471,7 @@ int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); +void dma_issue_pending_all(void); /* --- Helper iov-locking functions --- */ diff --git a/net/core/dev.c b/net/core/dev.c index 09c66a449da6..e40b0d57f8ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2635,14 +2635,7 @@ out: * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ - if (!cpus_empty(net_dma.channel_mask)) { - int chan_idx; - for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) { - struct dma_chan *chan = net_dma.channels[chan_idx]; - if (chan) - dma_async_memcpy_issue_pending(chan); - } - } + dma_issue_pending_all(); #endif return; -- cgit v1.2.3 From f67b45999205164958de4ec0658d51fa4bee066d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:15 -0700 Subject: net_dma: convert to dma_find_channel Use the general-purpose channel allocation provided by dmaengine. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- include/linux/netdevice.h | 3 --- include/net/netdma.h | 11 ----------- net/core/dev.c | 40 ---------------------------------------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 5 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 41e1224651cf..bac2c458d9b8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1113,9 +1113,6 @@ struct softnet_data struct sk_buff *completion_queue; struct napi_struct backlog; -#ifdef CONFIG_NET_DMA - struct dma_chan *net_dma; -#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/net/netdma.h b/include/net/netdma.h index cbe2737f4a61..8ba8ce284eeb 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -24,17 +24,6 @@ #include #include -static inline struct dma_chan *get_softnet_dma(void) -{ - struct dma_chan *chan; - - rcu_read_lock(); - chan = rcu_dereference(__get_cpu_var(softnet_data).net_dma); - rcu_read_unlock(); - - return chan; -} - int dma_skb_copy_datagram_iovec(struct dma_chan* chan, struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_pinned_list *pinned_list); diff --git a/net/core/dev.c b/net/core/dev.c index e40b0d57f8ff..bbb07dbe1740 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4827,44 +4827,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, } #ifdef CONFIG_NET_DMA -/** - * net_dma_rebalance - try to maintain one DMA channel per CPU - * @net_dma: DMA client and associated data (lock, channels, channel_mask) - * - * This is called when the number of channels allocated to the net_dma client - * changes. The net_dma client tries to have one DMA channel per CPU. 
- */ - -static void net_dma_rebalance(struct net_dma *net_dma) -{ - unsigned int cpu, i, n, chan_idx; - struct dma_chan *chan; - - if (cpus_empty(net_dma->channel_mask)) { - for_each_online_cpu(cpu) - rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); - return; - } - - i = 0; - cpu = first_cpu(cpu_online_map); - - for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) { - chan = net_dma->channels[chan_idx]; - - n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) - + (i < (num_online_cpus() % - cpus_weight(net_dma->channel_mask)) ? 1 : 0)); - - while(n) { - per_cpu(softnet_data, cpu).net_dma = chan; - cpu = next_cpu(cpu, cpu_online_map); - n--; - } - i++; - } -} - /** * netdev_dma_event - event callback for the net_dma_client * @client: should always be net_dma_client @@ -4894,7 +4856,6 @@ netdev_dma_event(struct dma_client *client, struct dma_chan *chan, ack = DMA_ACK; net_dma->channels[pos] = chan; cpu_set(pos, net_dma->channel_mask); - net_dma_rebalance(net_dma); } break; case DMA_RESOURCE_REMOVED: @@ -4909,7 +4870,6 @@ netdev_dma_event(struct dma_client *client, struct dma_chan *chan, ack = DMA_ACK; cpu_clear(pos, net_dma->channel_mask); net_dma->channels[i] = NULL; - net_dma_rebalance(net_dma); } break; default: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 75e0e0a2d8db..9b275abc8eb9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1317,7 +1317,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if ((available < target) && (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && - __get_cpu_var(softnet_data).net_dma) { + dma_find_channel(DMA_MEMCPY)) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); @@ -1527,7 +1527,7 @@ do_prequeue: if (!(flags & MSG_TRUNC)) { #ifdef CONFIG_NET_DMA if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) { tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 99b7ecbe8893..a6961d75c7ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5005,7 +5005,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, return 0; if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9d839fa9331e..19d7b429a262 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1594,7 +1594,7 @@ process: #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); else diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e8b8337a8310..71cd70951d7d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1640,7 +1640,7 @@ process: #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) ret = tcp_v6_do_rcv(sk, skb); else -- cgit v1.2.3 From 59b5ec21446b9239d706ab237fb261d525b75e81 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:15 -0700 Subject: 
dmaengine: introduce dma_request_channel and private channels This interface is primarily for device-to-memory clients which need to search for dma channels with platform-specific characteristics. The prototype is: struct dma_chan *dma_request_channel(dma_cap_mask_t mask, dma_filter_fn filter_fn, void *filter_param); When the optional 'filter_fn' parameter is set to NULL dma_request_channel simply returns the first channel that satisfies the capability mask. Otherwise, when the mask parameter is insufficient for specifying the necessary channel, the filter_fn routine can be used to disposition the available channels in the system. The filter_fn routine is called once for each free channel in the system. Upon seeing a suitable channel filter_fn returns DMA_ACK which flags that channel to be the return value from dma_request_channel. A channel allocated via this interface is exclusive to the caller, until dma_release_channel() is called. To ensure that all channels are not consumed by the general-purpose allocator the DMA_PRIVATE capability is provided to exclude a dma_device from general-purpose (memory-to-memory) consideration. Reviewed-by: Andrew Morton Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 155 +++++++++++++++++++++++++++++++++++++++++----- include/linux/dmaengine.h | 16 +++++ 2 files changed, 155 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 418eca28d472..7a0594f24a3f 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -134,14 +134,14 @@ static struct class dma_devclass = { /* --- client and device registration --- */ -#define dma_chan_satisfies_mask(chan, mask) \ - __dma_chan_satisfies_mask((chan), &(mask)) +#define dma_device_satisfies_mask(device, mask) \ + __dma_device_satisfies_mask((device), &(mask)) static int -__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want) +__dma_device_satisfies_mask(struct dma_device *device, dma_cap_mask_t *want) { dma_cap_mask_t has; - bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits, + bitmap_and(has.bits, want->bits, device->cap_mask.bits, DMA_TX_TYPE_END); return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END); } @@ -195,7 +195,7 @@ static int dma_chan_get(struct dma_chan *chan) err = desc_cnt; chan->client_count = 0; module_put(owner); - } else + } else if (!dma_has_cap(DMA_PRIVATE, chan->device->cap_mask)) balance_ref_count(chan); } @@ -232,14 +232,16 @@ static void dma_client_chan_alloc(struct dma_client *client) /* Find a channel */ list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; /* Does the client require a specific DMA controller? 
*/ if (client->slave && client->slave->dma_dev && client->slave->dma_dev != device->dev) continue; + if (!dma_device_satisfies_mask(device, client->cap_mask)) + continue; list_for_each_entry(chan, &device->channels, device_node) { - if (!dma_chan_satisfies_mask(chan, client->cap_mask)) - continue; if (!chan->client_count) continue; ack = client->event_callback(client, chan, @@ -320,11 +322,12 @@ static int __init dma_channel_table_init(void) bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - /* 'interrupt' and 'slave' are channel capabilities, but are not - * associated with an operation so they do not need an entry in the - * channel_table + /* 'interrupt', 'private', and 'slave' are channel capabilities, + * but are not associated with an operation so they do not need + * an entry in the channel_table */ clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); for_each_dma_cap_mask(cap, dma_cap_mask_all) { @@ -378,10 +381,13 @@ void dma_issue_pending_all(void) "client called %s without a reference", __func__); rcu_read_lock(); - list_for_each_entry_rcu(device, &dma_device_list, global_node) + list_for_each_entry_rcu(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; list_for_each_entry(chan, &device->channels, device_node) if (chan->client_count) device->device_issue_pending(chan); + } rcu_read_unlock(); } EXPORT_SYMBOL(dma_issue_pending_all); @@ -403,7 +409,8 @@ static struct dma_chan *nth_chan(enum dma_transaction_type cap, int n) struct dma_chan *min = NULL; list_for_each_entry(device, &dma_device_list, global_node) { - if (!dma_has_cap(cap, device->cap_mask)) + if (!dma_has_cap(cap, device->cap_mask) || + dma_has_cap(DMA_PRIVATE, device->cap_mask)) continue; list_for_each_entry(chan, &device->channels, device_node) { if (!chan->client_count) @@ -452,9 +459,12 @@ static void dma_channel_rebalance(void) for_each_possible_cpu(cpu) per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; - list_for_each_entry(device, &dma_device_list, global_node) + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; list_for_each_entry(chan, &device->channels, device_node) chan->table_count = 0; + } /* don't populate the channel_table if no clients are available */ if (!dmaengine_ref_count) @@ -473,6 +483,111 @@ static void dma_channel_rebalance(void) } } +static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_device *dev) +{ + struct dma_chan *chan; + struct dma_chan *ret = NULL; + + if (!__dma_device_satisfies_mask(dev, mask)) { + pr_debug("%s: wrong capabilities\n", __func__); + return NULL; + } + /* devices with multiple channels need special handling as we need to + * ensure that all channels are either private or public. 
+ */ + if (dev->chancnt > 1 && !dma_has_cap(DMA_PRIVATE, dev->cap_mask)) + list_for_each_entry(chan, &dev->channels, device_node) { + /* some channels are already publicly allocated */ + if (chan->client_count) + return NULL; + } + + list_for_each_entry(chan, &dev->channels, device_node) { + if (chan->client_count) { + pr_debug("%s: %s busy\n", + __func__, dev_name(&chan->dev)); + continue; + } + ret = chan; + break; + } + + return ret; +} + +/** + * dma_request_channel - try to allocate an exclusive channel + * @mask: capabilities that the channel must satisfy + * @fn: optional callback to disposition available channels + * @fn_param: opaque parameter to pass to dma_filter_fn + */ +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param) +{ + struct dma_device *device, *_d; + struct dma_chan *chan = NULL; + enum dma_state_client ack; + int err; + + /* Find a channel */ + mutex_lock(&dma_list_mutex); + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + chan = private_candidate(mask, device); + if (!chan) + continue; + + if (fn) + ack = fn(chan, fn_param); + else + ack = DMA_ACK; + + if (ack == DMA_ACK) { + /* Found a suitable channel, try to grab, prep, and + * return it. We first set DMA_PRIVATE to disable + * balance_ref_count as this channel will not be + * published in the general-purpose allocator + */ + dma_cap_set(DMA_PRIVATE, device->cap_mask); + err = dma_chan_get(chan); + + if (err == -ENODEV) { + pr_debug("%s: %s module removed\n", __func__, + dev_name(&chan->dev)); + list_del_rcu(&device->global_node); + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dev_name(&chan->dev), err); + else + break; + } else if (ack == DMA_DUP) { + pr_debug("%s: %s filter said DMA_DUP\n", + __func__, dev_name(&chan->dev)); + } else if (ack == DMA_NAK) { + pr_debug("%s: %s filter said DMA_NAK\n", + __func__, dev_name(&chan->dev)); + break; + } else + WARN_ONCE(1, "filter_fn: unknown response?\n"); + chan = NULL; + } + mutex_unlock(&dma_list_mutex); + + pr_debug("%s: %s (%s)\n", __func__, chan ? "success" : "fail", + chan ? 
dev_name(&chan->dev) : NULL); + + return chan; +} +EXPORT_SYMBOL_GPL(__dma_request_channel); + +void dma_release_channel(struct dma_chan *chan) +{ + mutex_lock(&dma_list_mutex); + WARN_ONCE(chan->client_count != 1, + "chan reference count %d != 1\n", chan->client_count); + dma_chan_put(chan); + mutex_unlock(&dma_list_mutex); +} +EXPORT_SYMBOL_GPL(dma_release_channel); + /** * dma_chans_notify_available - broadcast available channels to the clients */ @@ -506,7 +621,9 @@ void dma_async_client_register(struct dma_client *client) dmaengine_ref_count++; /* try to grab channels */ - list_for_each_entry_safe(device, _d, &dma_device_list, global_node) + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; list_for_each_entry(chan, &device->channels, device_node) { err = dma_chan_get(chan); if (err == -ENODEV) { @@ -517,6 +634,7 @@ void dma_async_client_register(struct dma_client *client) pr_err("dmaengine: failed to get %s: (%d)\n", dev_name(&chan->dev), err); } + } /* if this is the first reference and there were channels * waiting we need to rebalance to get those channels @@ -547,9 +665,12 @@ void dma_async_client_unregister(struct dma_client *client) dmaengine_ref_count--; BUG_ON(dmaengine_ref_count < 0); /* drop channel references */ - list_for_each_entry(device, &dma_device_list, global_node) + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; list_for_each_entry(chan, &device->channels, device_node) dma_chan_put(chan); + } list_del(&client->global_node); mutex_unlock(&dma_list_mutex); @@ -639,9 +760,11 @@ int dma_async_device_register(struct dma_device *device) chan->slow_ref = 0; INIT_RCU_HEAD(&chan->rcu); } + device->chancnt = chancnt; mutex_lock(&dma_list_mutex); - if (dmaengine_ref_count) + /* take references on public channels */ + if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) list_for_each_entry(chan, &device->channels, device_node) { /* if clients are already waiting for channels we need * to take references on their behalf diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 57a43adfc39e..fe40bc020af6 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -89,6 +89,7 @@ enum dma_transaction_type { DMA_MEMSET, DMA_MEMCPY_CRC32C, DMA_INTERRUPT, + DMA_PRIVATE, DMA_SLAVE, }; @@ -223,6 +224,18 @@ struct dma_client; typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client, struct dma_chan *chan, enum dma_state state); +/** + * typedef dma_filter_fn - callback filter for dma_request_channel + * @chan: channel to be reviewed + * @filter_param: opaque parameter passed through dma_request_channel + * + * When this optional parameter is specified in a call to dma_request_channel a + * suitable channel is passed to this routine for further dispositioning before + * being returned. Where 'suitable' indicates a non-busy channel that + * satisfies the given capability mask. 
+ */ +typedef enum dma_state_client (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + /** * struct dma_client - info on the entity making use of DMA services * @event_callback: func ptr to call when something happens @@ -472,6 +485,9 @@ void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); void dma_issue_pending_all(void); +#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); +void dma_release_channel(struct dma_chan *chan); /* --- Helper iov-locking functions --- */ -- cgit v1.2.3 From 33df8ca068123457db56c316946a3c0e4ef787d6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:15 -0700 Subject: dmatest: convert to dma_request_channel Replace the client registration infrastructure with a custom loop to poll for channels. Once dma_request_channel returns NULL stop asking for channels. A userspace side effect of this change is that loading the dmatest module before loading a dma driver will result in no channels being found; previously dmatest would get a callback. To facilitate testing in the built-in case dmatest_init is marked as a late_initcall. Another side effect is that channels under test can not be used for any other purpose. Cc: Haavard Skinnemoen Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 115 +++++++++++++++++----------------------- include/linux/dmaengine.h | 6 +++ 2 files changed, 49 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index db4050884713..1d6e48f9cd02 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -35,7 +35,7 @@ MODULE_PARM_DESC(threads_per_chan, static unsigned int max_channels; module_param(max_channels, uint, S_IRUGO); -MODULE_PARM_DESC(nr_channels, +MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); /* @@ -71,7 +71,7 @@ struct dmatest_chan { /* * These are protected by dma_list_mutex since they're only used by - * the DMA client event callback + * the DMA filter function callback */ static LIST_HEAD(dmatest_channels); static unsigned int nr_channels; @@ -317,21 +317,16 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) kfree(dtc); } -static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) +static int dmatest_add_channel(struct dma_chan *chan) { struct dmatest_chan *dtc; struct dmatest_thread *thread; unsigned int i; - /* Have we already been told about this channel?
*/ - list_for_each_entry(dtc, &dmatest_channels, node) - if (dtc->chan == chan) - return DMA_DUP; - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { pr_warning("dmatest: No memory for %s\n", dev_name(&chan->dev)); - return DMA_NAK; + return -ENOMEM; } dtc->chan = chan; @@ -365,81 +360,57 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; - return DMA_ACK; + return 0; } -static enum dma_state_client dmatest_remove_channel(struct dma_chan *chan) +static enum dma_state_client filter(struct dma_chan *chan, void *param) { - struct dmatest_chan *dtc, *_dtc; - - list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { - if (dtc->chan == chan) { - list_del(&dtc->node); - dmatest_cleanup_channel(dtc); - pr_debug("dmatest: lost channel %s\n", - dev_name(&chan->dev)); - return DMA_ACK; - } - } - - return DMA_DUP; -} - -/* - * Start testing threads as new channels are assigned to us, and kill - * them when the channels go away. - * - * When we unregister the client, all channels are removed so this - * will also take care of cleaning things up when the module is - * unloaded. - */ -static enum dma_state_client -dmatest_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) -{ - enum dma_state_client ack = DMA_NAK; - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - if (!dmatest_match_channel(chan) - || !dmatest_match_device(chan->device)) - ack = DMA_DUP; - else if (max_channels && nr_channels >= max_channels) - ack = DMA_NAK; - else - ack = dmatest_add_channel(chan); - break; - - case DMA_RESOURCE_REMOVED: - ack = dmatest_remove_channel(chan); - break; - - default: - pr_info("dmatest: Unhandled event %u (%s)\n", - state, dev_name(&chan->dev)); - break; - } - - return ack; + if (!dmatest_match_channel(chan) || !dmatest_match_device(chan->device)) + return DMA_DUP; + else + return DMA_ACK; } -static struct dma_client dmatest_client = { - .event_callback = dmatest_event, -}; - static int __init dmatest_init(void) { - dma_cap_set(DMA_MEMCPY, dmatest_client.cap_mask); - dma_async_client_register(&dmatest_client); - dma_async_client_chan_request(&dmatest_client); + dma_cap_mask_t mask; + struct dma_chan *chan; + int err = 0; + + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + for (;;) { + chan = dma_request_channel(mask, filter, NULL); + if (chan) { + err = dmatest_add_channel(chan); + if (err == 0) + continue; + else { + dma_release_channel(chan); + break; /* add_channel failed, punt */ + } + } else + break; /* no more channels available */ + if (max_channels && nr_channels >= max_channels) + break; /* we have all we need */ + } - return 0; + return err; } -module_init(dmatest_init); +/* when compiled-in wait for drivers to load first */ +late_initcall(dmatest_init); static void __exit dmatest_exit(void) { - dma_async_client_unregister(&dmatest_client); + struct dmatest_chan *dtc, *_dtc; + + list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { + list_del(&dtc->node); + dmatest_cleanup_channel(dtc); + pr_debug("dmatest: dropped channel %s\n", + dev_name(&dtc->chan->dev)); + dma_release_channel(dtc->chan); + } } module_exit(dmatest_exit); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index fe40bc020af6..6f2d070ac7f3 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -400,6 +400,12 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_zero(mask) 
__dma_cap_zero(&(mask)) +static inline void __dma_cap_zero(dma_cap_mask_t *dstp) +{ + bitmap_zero(dstp->bits, DMA_TX_TYPE_END); +} + #define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask)) static inline int __dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp) -- cgit v1.2.3 From 74465b4ff9ac1da503025c0a0042e023bfa6505c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:16 -0700 Subject: atmel-mci: convert to dma_request_channel and down-level dma_slave dma_request_channel provides an exclusive channel, so we no longer need to pass slave data through dmaengine. Cc: Haavard Skinnemoen Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- arch/avr32/include/asm/atmel-mci.h | 6 +-- arch/avr32/mach-at32ap/at32ap700x.c | 15 ++---- drivers/dma/dmaengine.c | 8 --- drivers/dma/dw_dmac.c | 25 +++------- drivers/mmc/host/atmel-mci.c | 98 ++++++++++--------------------------- include/linux/dmaengine.h | 38 -------------- include/linux/dw_dmac.h | 31 +++++++++--- 7 files changed, 62 insertions(+), 159 deletions(-) (limited to 'include/linux') diff --git a/arch/avr32/include/asm/atmel-mci.h b/arch/avr32/include/asm/atmel-mci.h index 59f3fadd0b68..e5e54c6f35d9 100644 --- a/arch/avr32/include/asm/atmel-mci.h +++ b/arch/avr32/include/asm/atmel-mci.h @@ -3,7 +3,7 @@ #define ATMEL_MCI_MAX_NR_SLOTS 2 -struct dma_slave; +#include /** * struct mci_slot_pdata - board-specific per-slot configuration @@ -28,11 +28,11 @@ struct mci_slot_pdata { /** * struct mci_platform_data - board-specific MMC/SDcard configuration - * @dma_slave: DMA slave interface to use in data transfers, or NULL. + * @dma_slave: DMA slave interface to use in data transfers. * @slot: Per-slot configuration data. */ struct mci_platform_data { - struct dma_slave *dma_slave; + struct dw_dma_slave dma_slave; struct mci_slot_pdata slot[ATMEL_MCI_MAX_NR_SLOTS]; }; diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index 066252eebf61..414f174e38cd 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1305,7 +1305,7 @@ struct platform_device *__init at32_add_device_mci(unsigned int id, struct mci_platform_data *data) { struct platform_device *pdev; - struct dw_dma_slave *dws; + struct dw_dma_slave *dws = &data->dma_slave; u32 pioa_mask; u32 piob_mask; @@ -1324,22 +1324,13 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data) ARRAY_SIZE(atmel_mci0_resource))) goto fail; - if (data->dma_slave) - dws = kmemdup(to_dw_dma_slave(data->dma_slave), - sizeof(struct dw_dma_slave), GFP_KERNEL); - else - dws = kzalloc(sizeof(struct dw_dma_slave), GFP_KERNEL); - - dws->slave.dev = &pdev->dev; - dws->slave.dma_dev = &dw_dmac0_device.dev; - dws->slave.reg_width = DMA_SLAVE_WIDTH_32BIT; + dws->dma_dev = &dw_dmac0_device.dev; + dws->reg_width = DW_DMA_SLAVE_WIDTH_32BIT; dws->cfg_hi = (DWC_CFGH_SRC_PER(0) | DWC_CFGH_DST_PER(1)); dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); - data->dma_slave = &dws->slave; - if (platform_device_add_data(pdev, data, sizeof(struct mci_platform_data))) goto fail; diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 7a0594f24a3f..90aca505a1df 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -234,10 +234,6 @@ static void dma_client_chan_alloc(struct dma_client *client) list_for_each_entry(device, &dma_device_list, global_node) { if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) continue; - /* Does the client require a specific DMA controller? 
*/ - if (client->slave && client->slave->dma_dev - && client->slave->dma_dev != device->dev) - continue; if (!dma_device_satisfies_mask(device, client->cap_mask)) continue; @@ -613,10 +609,6 @@ void dma_async_client_register(struct dma_client *client) struct dma_chan *chan; int err; - /* validate client data */ - BUG_ON(dma_has_cap(DMA_SLAVE, client->cap_mask) && - !client->slave); - mutex_lock(&dma_list_mutex); dmaengine_ref_count++; diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 377dafa37a20..dbd50804e5d2 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -567,7 +567,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, if (unlikely(!dws || !sg_len)) return NULL; - reg_width = dws->slave.reg_width; + reg_width = dws->reg_width; prev = first = NULL; sg_len = dma_map_sg(chan->dev.parent, sgl, sg_len, direction); @@ -579,7 +579,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_DST_FIX | DWC_CTLL_SRC_INC | DWC_CTLL_FC_M2P); - reg = dws->slave.tx_reg; + reg = dws->tx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -625,7 +625,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_SRC_FIX | DWC_CTLL_FC_P2M); - reg = dws->slave.rx_reg; + reg = dws->rx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -764,7 +764,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); struct dw_desc *desc; - struct dma_slave *slave; struct dw_dma_slave *dws; int i; u32 cfghi; @@ -772,12 +771,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, dev_vdbg(&chan->dev, "alloc_chan_resources\n"); - /* Channels doing slave DMA can only handle one client. */ - if (dwc->dws || (client && client->slave)) { - if (chan->client_count) - return -EBUSY; - } - /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { dev_dbg(&chan->dev, "DMA channel not idle?\n"); @@ -789,23 +782,17 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, cfghi = DWC_CFGH_FIFO_MODE; cfglo = 0; - slave = client->slave; - if (slave) { + dws = dwc->dws; + if (dws) { /* * We need controller-specific data to set up slave * transfers. 
*/ - BUG_ON(!slave->dma_dev || slave->dma_dev != dw->dma.dev); - - dws = container_of(slave, struct dw_dma_slave, slave); + BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); - dwc->dws = dws; cfghi = dws->cfg_hi; cfglo = dws->cfg_lo; - } else { - dwc->dws = NULL; } - channel_writel(dwc, CFG_LO, cfglo); channel_writel(dwc, CFG_HI, cfghi); diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 6c11f4d4c4e9..7a34118507db 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1441,60 +1441,6 @@ static irqreturn_t atmci_detect_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef CONFIG_MMC_ATMELMCI_DMA - -static inline struct atmel_mci * -dma_client_to_atmel_mci(struct dma_client *client) -{ - return container_of(client, struct atmel_mci, dma.client); -} - -static enum dma_state_client atmci_dma_event(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct atmel_mci *host; - enum dma_state_client ret = DMA_NAK; - - host = dma_client_to_atmel_mci(client); - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - spin_lock_bh(&host->lock); - if (!host->dma.chan) { - host->dma.chan = chan; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Using %s for DMA transfers\n", - chan->dev.bus_id); - break; - - case DMA_RESOURCE_REMOVED: - spin_lock_bh(&host->lock); - if (host->dma.chan == chan) { - host->dma.chan = NULL; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Lost %s, falling back to PIO\n", - chan->dev.bus_id); - break; - - default: - break; - } - - - return ret; -} -#endif /* CONFIG_MMC_ATMELMCI_DMA */ - static int __init atmci_init_slot(struct atmel_mci *host, struct mci_slot_pdata *slot_data, unsigned int id, u32 sdc_reg) @@ -1598,6 +1544,18 @@ static void __exit atmci_cleanup_slot(struct atmel_mci_slot *slot, mmc_free_host(slot->mmc); } +#ifdef CONFIG_MMC_ATMELMCI_DMA +static enum dma_state_client filter(struct dma_chan *chan, void *slave) +{ + struct dw_dma_slave *dws = slave; + + if (dws->dma_dev == chan->device->dev) + return DMA_ACK; + else + return DMA_DUP; +} +#endif + static int __init atmci_probe(struct platform_device *pdev) { struct mci_platform_data *pdata; @@ -1650,22 +1608,20 @@ static int __init atmci_probe(struct platform_device *pdev) goto err_request_irq; #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) { - struct dma_slave *slave = pdata->dma_slave; + if (pdata->dma_slave.dma_dev) { + struct dw_dma_slave *dws = &pdata->dma_slave; + dma_cap_mask_t mask; - slave->tx_reg = regs->start + MCI_TDR; - slave->rx_reg = regs->start + MCI_RDR; + dws->tx_reg = regs->start + MCI_TDR; + dws->rx_reg = regs->start + MCI_RDR; /* Try to grab a DMA channel */ - host->dma.client.event_callback = atmci_dma_event; - dma_cap_set(DMA_SLAVE, host->dma.client.cap_mask); - host->dma.client.slave = slave; - - dma_async_client_register(&host->dma.client); - dma_async_client_chan_request(&host->dma.client); - } else { - dev_notice(&pdev->dev, "DMA not available, using PIO\n"); + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + host->dma.chan = dma_request_channel(mask, filter, dws); } + if (!host->dma.chan) + dev_notice(&pdev->dev, "DMA not available, using PIO\n"); #endif /* CONFIG_MMC_ATMELMCI_DMA */ platform_set_drvdata(pdev, host); @@ -1697,8 +1653,8 @@ static int __init atmci_probe(struct platform_device *pdev) err_init_slot: #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) - 
dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(irq, host); err_request_irq: @@ -1729,8 +1685,8 @@ static int __exit atmci_remove(struct platform_device *pdev) clk_disable(host->mck); #ifdef CONFIG_MMC_ATMELMCI_DMA - if (host->dma.client.slave) - dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(platform_get_irq(pdev, 0), host); @@ -1759,7 +1715,7 @@ static void __exit atmci_exit(void) platform_driver_unregister(&atmci_driver); } -module_init(atmci_init); +late_initcall(atmci_init); /* try to load after dma driver when built-in */ module_exit(atmci_exit); MODULE_DESCRIPTION("Atmel Multimedia Card Interface driver"); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6f2d070ac7f3..d63544cf8a1a 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -96,17 +96,6 @@ enum dma_transaction_type { /* last transaction type for creation of the capabilities mask */ #define DMA_TX_TYPE_END (DMA_SLAVE + 1) -/** - * enum dma_slave_width - DMA slave register access width. - * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses - * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses - * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses - */ -enum dma_slave_width { - DMA_SLAVE_WIDTH_8BIT, - DMA_SLAVE_WIDTH_16BIT, - DMA_SLAVE_WIDTH_32BIT, -}; /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, @@ -132,32 +121,6 @@ enum dma_ctrl_flags { */ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; -/** - * struct dma_slave - Information about a DMA slave - * @dev: device acting as DMA slave - * @dma_dev: required DMA master device. If non-NULL, the client can not be - * bound to other masters than this. - * @tx_reg: physical address of data register used for - * memory-to-peripheral transfers - * @rx_reg: physical address of data register used for - * peripheral-to-memory transfers - * @reg_width: peripheral register width - * - * If dma_dev is non-NULL, the client can not be bound to other DMA - * masters than the one corresponding to this device. The DMA master - * driver may use this to determine if there is controller-specific - * data wrapped around this struct. Drivers of platform code that sets - * the dma_dev field must therefore make sure to use an appropriate - * controller-specific dma slave structure wrapping this struct. - */ -struct dma_slave { - struct device *dev; - struct device *dma_dev; - dma_addr_t tx_reg; - dma_addr_t rx_reg; - enum dma_slave_width reg_width; -}; - /** * struct dma_chan_percpu - the per-CPU part of struct dma_chan * @refcount: local_t used for open-coded "bigref" counting @@ -248,7 +211,6 @@ typedef enum dma_state_client (*dma_filter_fn)(struct dma_chan *chan, void *filt struct dma_client { dma_event_callback event_callback; dma_cap_mask_t cap_mask; - struct dma_slave *slave; struct list_head global_node; }; diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index 04d217b442bf..d797dde247f7 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -21,15 +21,35 @@ struct dw_dma_platform_data { unsigned int nr_channels; }; +/** + * enum dw_dma_slave_width - DMA slave register access width. 
+ * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses + * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses + * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses + */ +enum dw_dma_slave_width { + DW_DMA_SLAVE_WIDTH_8BIT, + DW_DMA_SLAVE_WIDTH_16BIT, + DW_DMA_SLAVE_WIDTH_32BIT, +}; + /** * struct dw_dma_slave - Controller-specific information about a slave - * @slave: Generic information about the slave - * @ctl_lo: Platform-specific initializer for the CTL_LO register + * + * @dma_dev: required DMA master device + * @tx_reg: physical address of data register used for + * memory-to-peripheral transfers + * @rx_reg: physical address of data register used for + * peripheral-to-memory transfers + * @reg_width: peripheral register width * @cfg_hi: Platform-specific initializer for the CFG_HI register * @cfg_lo: Platform-specific initializer for the CFG_LO register */ struct dw_dma_slave { - struct dma_slave slave; + struct device *dma_dev; + dma_addr_t tx_reg; + dma_addr_t rx_reg; + enum dw_dma_slave_width reg_width; u32 cfg_hi; u32 cfg_lo; }; @@ -54,9 +74,4 @@ struct dw_dma_slave { #define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ #define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ -static inline struct dw_dma_slave *to_dw_dma_slave(struct dma_slave *slave) -{ - return container_of(slave, struct dw_dma_slave, slave); -} - #endif /* DW_DMAC_H */ -- cgit v1.2.3 From 209b84a88fe81341b4d8d465acc4a67cb7c3feb3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:17 -0700 Subject: dmaengine: replace dma_async_client_register with dmaengine_get Now that clients no longer need to be notified of channel arrival dma_async_client_register can simply increment the dmaengine_ref_count. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 115 +-------------------------------------------- drivers/dma/dmaengine.c | 22 +++------ include/linux/dmaengine.h | 4 +- net/core/dev.c | 3 +- 4 files changed, 11 insertions(+), 133 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 2cdf7a0867b7..f21147f3626a 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -28,120 +28,9 @@ #include #ifdef CONFIG_DMA_ENGINE -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state); - -static struct dma_client async_tx_dma = { - .event_callback = dma_channel_add_remove, - /* .cap_mask == 0 defaults to all channels */ -}; - -/** - * async_tx_lock - protect modification of async_tx_master_list and serialize - * rebalance operations - */ -static DEFINE_SPINLOCK(async_tx_lock); - -static LIST_HEAD(async_tx_master_list); - -static void -free_dma_chan_ref(struct rcu_head *rcu) -{ - struct dma_chan_ref *ref; - ref = container_of(rcu, struct dma_chan_ref, rcu); - kfree(ref); -} - -static void -init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan) -{ - INIT_LIST_HEAD(&ref->node); - INIT_RCU_HEAD(&ref->rcu); - ref->chan = chan; - atomic_set(&ref->count, 0); -} - -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - unsigned long found, flags; - struct dma_chan_ref *master_ref, *ref; - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - found = 0; - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) 
- if (ref->chan == chan) { - found = 1; - break; - } - rcu_read_unlock(); - - pr_debug("async_tx: dma resource available [%s]\n", - found ? "old" : "new"); - - if (!found) - ack = DMA_ACK; - else - break; - - /* add the channel to the generic management list */ - master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL); - if (master_ref) { - init_dma_chan_ref(master_ref, chan); - spin_lock_irqsave(&async_tx_lock, flags); - list_add_tail_rcu(&master_ref->node, - &async_tx_master_list); - spin_unlock_irqrestore(&async_tx_lock, - flags); - } else { - printk(KERN_WARNING "async_tx: unable to create" - " new master entry in response to" - " a DMA_RESOURCE_ADDED event" - " (-ENOMEM)\n"); - return 0; - } - break; - case DMA_RESOURCE_REMOVED: - found = 0; - spin_lock_irqsave(&async_tx_lock, flags); - list_for_each_entry(ref, &async_tx_master_list, node) - if (ref->chan == chan) { - list_del_rcu(&ref->node); - call_rcu(&ref->rcu, free_dma_chan_ref); - found = 1; - break; - } - spin_unlock_irqrestore(&async_tx_lock, flags); - - pr_debug("async_tx: dma resource removed [%s]\n", - found ? "ours" : "not ours"); - - if (found) - ack = DMA_ACK; - else - break; - break; - case DMA_RESOURCE_SUSPEND: - case DMA_RESOURCE_RESUME: - printk(KERN_WARNING "async_tx: does not support dma channel" - " suspend/resume\n"); - break; - default: - BUG(); - } - - return ack; -} - static int __init async_tx_init(void) { - dma_async_client_register(&async_tx_dma); - dma_async_client_chan_request(&async_tx_dma); + dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); @@ -150,7 +39,7 @@ static int __init async_tx_init(void) static void __exit async_tx_exit(void) { - dma_async_client_unregister(&async_tx_dma); + dmaengine_put(); } /** diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 90aca505a1df..3f1849b7f5ef 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -600,10 +600,9 @@ static void dma_clients_notify_available(void) } /** - * dma_async_client_register - register a &dma_client - * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask' + * dmaengine_get - register interest in dma_channels */ -void dma_async_client_register(struct dma_client *client) +void dmaengine_get(void) { struct dma_device *device, *_d; struct dma_chan *chan; @@ -634,25 +633,18 @@ void dma_async_client_register(struct dma_client *client) */ if (dmaengine_ref_count == 1) dma_channel_rebalance(); - list_add_tail(&client->global_node, &dma_client_list); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_register); +EXPORT_SYMBOL(dmaengine_get); /** - * dma_async_client_unregister - unregister a client and free the &dma_client - * @client: &dma_client to free - * - * Force frees any allocated DMA channels, frees the &dma_client memory + * dmaengine_put - let dma drivers be removed when ref_count == 0 */ -void dma_async_client_unregister(struct dma_client *client) +void dmaengine_put(void) { struct dma_device *device; struct dma_chan *chan; - if (!client) - return; - mutex_lock(&dma_list_mutex); dmaengine_ref_count--; BUG_ON(dmaengine_ref_count < 0); @@ -663,11 +655,9 @@ void dma_async_client_unregister(struct dma_client *client) list_for_each_entry(chan, &device->channels, device_node) dma_chan_put(chan); } - - list_del(&client->global_node); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_unregister); +EXPORT_SYMBOL(dmaengine_put); /** * dma_async_client_chan_request - send all available channels to the diff --git a/include/linux/dmaengine.h 
b/include/linux/dmaengine.h index d63544cf8a1a..37d95db156d3 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -318,8 +318,8 @@ struct dma_device { /* --- public DMA engine API --- */ -void dma_async_client_register(struct dma_client *client); -void dma_async_client_unregister(struct dma_client *client); +void dmaengine_get(void); +void dmaengine_put(void); void dma_async_client_chan_request(struct dma_client *client); dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); diff --git a/net/core/dev.c b/net/core/dev.c index bbb07dbe1740..7596fc9403c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4894,8 +4894,7 @@ static int __init netdev_dma_register(void) } spin_lock_init(&net_dma.lock); dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); - dma_async_client_register(&net_dma.client); - dma_async_client_chan_request(&net_dma.client); + dmaengine_get(); return 0; } -- cgit v1.2.3 From aa1e6f1a385eb2b04171ec841f3b760091e4a8ee Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:17 -0700 Subject: dmaengine: kill struct dma_client and supporting infrastructure All users have been converted to either the general-purpose allocator, dma_find_channel, or dma_request_channel. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 74 ++------------------------------- drivers/dma/dw_dmac.c | 3 +- drivers/dma/fsldma.c | 3 +- drivers/dma/ioat_dma.c | 5 +-- drivers/dma/iop-adma.c | 7 ++-- drivers/dma/mv_xor.c | 7 ++-- drivers/mmc/host/atmel-mci.c | 1 - include/linux/dmaengine.h | 50 +--------------------- net/core/dev.c | 99 ++------------------------------------------ 9 files changed, 17 insertions(+), 232 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 3f1849b7f5ef..9fc91f973a9a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -31,15 +31,12 @@ * * LOCKING: * - * The subsystem keeps two global lists, dma_device_list and dma_client_list. - * Both of these are protected by a mutex, dma_list_mutex. + * The subsystem keeps a global list of dma_device structs it is protected by a + * mutex, dma_list_mutex. * * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each client is responsible for keeping track of the channels it uses. See - * the definition of dma_event_callback in dmaengine.h. - * * Each device has a kref, which is initialized to 1 when the device is * registered. A kref_get is done for each device registered. When the * device is released, the corresponding kref_put is done in the release @@ -74,7 +71,6 @@ static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); -static LIST_HEAD(dma_client_list); static long dmaengine_ref_count; /* --- sysfs implementation --- */ @@ -189,7 +185,7 @@ static int dma_chan_get(struct dma_chan *chan) /* allocate upon first client reference */ if (chan->client_count == 1 && err == 0) { - int desc_cnt = chan->device->device_alloc_chan_resources(chan, NULL); + int desc_cnt = chan->device->device_alloc_chan_resources(chan); if (desc_cnt < 0) { err = desc_cnt; @@ -218,40 +214,6 @@ static void dma_chan_put(struct dma_chan *chan) chan->device->device_free_chan_resources(chan); } -/** - * dma_client_chan_alloc - try to allocate channels to a client - * @client: &dma_client - * - * Called with dma_list_mutex held. 
- */ -static void dma_client_chan_alloc(struct dma_client *client) -{ - struct dma_device *device; - struct dma_chan *chan; - enum dma_state_client ack; - - /* Find a channel */ - list_for_each_entry(device, &dma_device_list, global_node) { - if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) - continue; - if (!dma_device_satisfies_mask(device, client->cap_mask)) - continue; - - list_for_each_entry(chan, &device->channels, device_node) { - if (!chan->client_count) - continue; - ack = client->event_callback(client, chan, - DMA_RESOURCE_AVAILABLE); - - /* we are done once this client rejects - * an available resource - */ - if (ack == DMA_NAK) - return; - } - } -} - enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) { enum dma_status status; @@ -584,21 +546,6 @@ void dma_release_channel(struct dma_chan *chan) } EXPORT_SYMBOL_GPL(dma_release_channel); -/** - * dma_chans_notify_available - broadcast available channels to the clients - */ -static void dma_clients_notify_available(void) -{ - struct dma_client *client; - - mutex_lock(&dma_list_mutex); - - list_for_each_entry(client, &dma_client_list, global_node) - dma_client_chan_alloc(client); - - mutex_unlock(&dma_list_mutex); -} - /** * dmaengine_get - register interest in dma_channels */ @@ -659,19 +606,6 @@ void dmaengine_put(void) } EXPORT_SYMBOL(dmaengine_put); -/** - * dma_async_client_chan_request - send all available channels to the - * client that satisfy the capability mask - * @client - requester - */ -void dma_async_client_chan_request(struct dma_client *client) -{ - mutex_lock(&dma_list_mutex); - dma_client_chan_alloc(client); - mutex_unlock(&dma_list_mutex); -} -EXPORT_SYMBOL(dma_async_client_chan_request); - /** * dma_async_device_register - registers DMA devices found * @device: &dma_device @@ -765,8 +699,6 @@ int dma_async_device_register(struct dma_device *device) dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); - dma_clients_notify_available(); - return 0; err_out: diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index dbd50804e5d2..a29dda8f801b 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -758,8 +758,7 @@ static void dwc_issue_pending(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); } -static int dwc_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int dwc_alloc_chan_resources(struct dma_chan *chan) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 0b95dcce447e..46e0128929a0 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -366,8 +366,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( * * Return - The number of descriptors allocated. 
*/ -static int fsl_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int fsl_dma_alloc_chan_resources(struct dma_chan *chan) { struct fsl_dma_chan *fsl_chan = to_fsl_chan(chan); diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 6607fdd00b1c..e42e1aea0f18 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -734,8 +734,7 @@ static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan) * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors * @chan: the channel to be filled out */ -static int ioat_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) { struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); struct ioat_desc_sw *desc; @@ -1381,7 +1380,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (device->common.device_alloc_chan_resources(dma_chan, NULL) < 1) { + if (device->common.device_alloc_chan_resources(dma_chan) < 1) { dev_err(&device->pdev->dev, "selftest cannot allocate chan resource\n"); err = -ENODEV; diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index be9ea9f88805..c74ac9eb009a 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -470,8 +470,7 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan); * greater than 2x the number slots needed to satisfy a device->max_xor * request. * */ -static int iop_adma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int iop_adma_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -865,7 +864,7 @@ static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -963,7 +962,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 3f46df3390c7..fbaa2f6225e2 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -606,8 +606,7 @@ submit_done: } /* returns the number of allocated descriptors */ -static int mv_xor_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int mv_xor_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -957,7 +956,7 @@ static int __devinit mv_xor_memcpy_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if (mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1052,7 +1051,7 @@ mv_xor_xor_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if (mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 7a34118507db..4b567a0408e1 100644 --- a/drivers/mmc/host/atmel-mci.c +++ 
b/drivers/mmc/host/atmel-mci.c @@ -55,7 +55,6 @@ enum atmel_mci_state { struct atmel_mci_dma { #ifdef CONFIG_MMC_ATMELMCI_DMA - struct dma_client client; struct dma_chan *chan; struct dma_async_tx_descriptor *data_desc; #endif diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 37d95db156d3..db050e97d2b4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -28,20 +28,6 @@ #include #include -/** - * enum dma_state - resource PNP/power management state - * @DMA_RESOURCE_SUSPEND: DMA device going into low power state - * @DMA_RESOURCE_RESUME: DMA device returning to full power - * @DMA_RESOURCE_AVAILABLE: DMA device available to the system - * @DMA_RESOURCE_REMOVED: DMA device removed from the system - */ -enum dma_state { - DMA_RESOURCE_SUSPEND, - DMA_RESOURCE_RESUME, - DMA_RESOURCE_AVAILABLE, - DMA_RESOURCE_REMOVED, -}; - /** * enum dma_state_client - state of the channel in the client * @DMA_ACK: client would like to use, or was using this channel @@ -170,23 +156,6 @@ struct dma_chan { void dma_chan_cleanup(struct kref *kref); -/* - * typedef dma_event_callback - function pointer to a DMA event callback - * For each channel added to the system this routine is called for each client. - * If the client would like to use the channel it returns '1' to signal (ack) - * the dmaengine core to take out a reference on the channel and its - * corresponding device. A client must not 'ack' an available channel more - * than once. When a channel is removed all clients are notified. If a client - * is using the channel it must 'ack' the removal. A client must not 'ack' a - * removed channel more than once. - * @client - 'this' pointer for the client context - * @chan - channel to be acted upon - * @state - available or removed - */ -struct dma_client; -typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client, - struct dma_chan *chan, enum dma_state state); - /** * typedef dma_filter_fn - callback filter for dma_request_channel * @chan: channel to be reviewed @@ -199,21 +168,6 @@ typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client, */ typedef enum dma_state_client (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); -/** - * struct dma_client - info on the entity making use of DMA services - * @event_callback: func ptr to call when something happens - * @cap_mask: only return channels that satisfy the requested capabilities - * a value of zero corresponds to any capability - * @slave: data for preparing slave transfer. Must be non-NULL iff the - * DMA_SLAVE capability is requested. 
- * @global_node: list_head for global dma_client_list - */ -struct dma_client { - dma_event_callback event_callback; - dma_cap_mask_t cap_mask; - struct list_head global_node; -}; - typedef void (*dma_async_tx_callback)(void *dma_async_param); /** * struct dma_async_tx_descriptor - async transaction descriptor @@ -285,8 +239,7 @@ struct dma_device { int dev_id; struct device *dev; - int (*device_alloc_chan_resources)(struct dma_chan *chan, - struct dma_client *client); + int (*device_alloc_chan_resources)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( @@ -320,7 +273,6 @@ struct dma_device { void dmaengine_get(void); void dmaengine_put(void); -void dma_async_client_chan_request(struct dma_client *client); dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 7596fc9403c8..ac55d84d6255 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -167,25 +167,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ -#ifdef CONFIG_NET_DMA -struct net_dma { - struct dma_client client; - spinlock_t lock; - cpumask_t channel_mask; - struct dma_chan **channels; -}; - -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state); - -static struct net_dma net_dma = { - .client = { - .event_callback = netdev_dma_event, - }, -}; -#endif - /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -4826,81 +4807,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#ifdef CONFIG_NET_DMA -/** - * netdev_dma_event - event callback for the net_dma_client - * @client: should always be net_dma_client - * @chan: DMA channel for the event - * @state: DMA state to be handled - */ -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) -{ - int i, found = 0, pos = -1; - struct net_dma *net_dma = - container_of(client, struct net_dma, client); - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - spin_lock(&net_dma->lock); - switch (state) { - case DMA_RESOURCE_AVAILABLE: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - break; - } else if (net_dma->channels[i] == NULL && pos < 0) - pos = i; - - if (!found && pos >= 0) { - ack = DMA_ACK; - net_dma->channels[pos] = chan; - cpu_set(pos, net_dma->channel_mask); - } - break; - case DMA_RESOURCE_REMOVED: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - pos = i; - break; - } - - if (found) { - ack = DMA_ACK; - cpu_clear(pos, net_dma->channel_mask); - net_dma->channels[i] = NULL; - } - break; - default: - break; - } - spin_unlock(&net_dma->lock); - - return ack; -} - -/** - * netdev_dma_register - register the networking subsystem as a DMA client - */ -static int __init netdev_dma_register(void) -{ - net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma), - GFP_KERNEL); - if (unlikely(!net_dma.channels)) { - printk(KERN_NOTICE - "netdev_dma: no memory for net_dma.channels\n"); - return -ENOMEM; - } - spin_lock_init(&net_dma.lock); - dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); - dmaengine_get(); - return 0; -} - -#else -static int 
__init netdev_dma_register(void) { return -ENODEV; } -#endif /* CONFIG_NET_DMA */ /** * netdev_increment_features - increment feature set by one @@ -5120,14 +5026,15 @@ static int __init net_dev_init(void) if (register_pernet_device(&default_device_ops)) goto out; - netdev_dma_register(); - open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); + #ifdef CONFIG_NET_DMA + dmaengine_get(); + #endif rc = 0; out: return rc; -- cgit v1.2.3 From f27c580c3628d79b17f38976d842a6d7f3616e2e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:18 -0700 Subject: dmaengine: remove 'bigref' infrastructure Reference counting is done at the module level so clients need not worry that a channel will leave while they are actively using dmaengine. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 87 +++++------------------------------------------ drivers/dma/iop-adma.c | 1 - drivers/dma/mv_xor.c | 1 - include/linux/dmaengine.h | 7 ---- 4 files changed, 9 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 9fc91f973a9a..b245c38dbec3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -34,26 +34,15 @@ * The subsystem keeps a global list of dma_device structs it is protected by a * mutex, dma_list_mutex. * + * A subsystem can get access to a channel by calling dmaengine_get() followed + * by dma_find_channel(), or if it has need for an exclusive channel it can call + * dma_request_channel(). Once a channel is allocated a reference is taken + * against its corresponding driver to disable removal. + * * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each device has a kref, which is initialized to 1 when the device is - * registered. A kref_get is done for each device registered. When the - * device is released, the corresponding kref_put is done in the release - * method. Every time one of the device's channels is allocated to a client, - * a kref_get occurs. When the channel is freed, the corresponding kref_put - * happens. The device's release function does a completion, so - * unregister_device does a remove event, device_unregister, a kref_put - * for the first reference, then waits on the completion for all other - * references to finish. - * - * Each channel has an open-coded implementation of Rusty Russell's "bigref," - * with a kref and a per_cpu local_t. A dma_chan_get is called when a client - * signals that it wants to use a channel, and dma_chan_put is called when - * a channel is removed or a client using it is unregistered. A client can - * take extra references per outstanding transaction, as is the case with - * the NET DMA client. The release function does a kref_put on the device. 
- * -ChrisL, DanW + * See Documentation/dmaengine.txt for more details */ #include @@ -114,18 +103,9 @@ static struct device_attribute dma_attrs[] = { __ATTR_NULL }; -static void dma_async_device_cleanup(struct kref *kref); - -static void dma_dev_release(struct device *dev) -{ - struct dma_chan *chan = to_dma_chan(dev); - kref_put(&chan->device->refcount, dma_async_device_cleanup); -} - static struct class dma_devclass = { .name = "dma", .dev_attrs = dma_attrs, - .dev_release = dma_dev_release, }; /* --- client and device registration --- */ @@ -232,29 +212,6 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) } EXPORT_SYMBOL(dma_sync_wait); -/** - * dma_chan_cleanup - release a DMA channel's resources - * @kref: kernel reference structure that contains the DMA channel device - */ -void dma_chan_cleanup(struct kref *kref) -{ - struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); - kref_put(&chan->device->refcount, dma_async_device_cleanup); -} -EXPORT_SYMBOL(dma_chan_cleanup); - -static void dma_chan_free_rcu(struct rcu_head *rcu) -{ - struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu); - - kref_put(&chan->refcount, dma_chan_cleanup); -} - -static void dma_chan_release(struct dma_chan *chan) -{ - call_rcu(&chan->rcu, dma_chan_free_rcu); -} - /** * dma_cap_mask_all - enable iteration over all operation types */ @@ -641,9 +598,6 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); - init_completion(&device->done); - kref_init(&device->refcount); - mutex_lock(&dma_list_mutex); device->dev_id = id++; mutex_unlock(&dma_list_mutex); @@ -662,19 +616,11 @@ int dma_async_device_register(struct dma_device *device) rc = device_register(&chan->dev); if (rc) { - chancnt--; free_percpu(chan->local); chan->local = NULL; goto err_out; } - - /* One for the channel, one of the class device */ - kref_get(&device->refcount); - kref_get(&device->refcount); - kref_init(&chan->refcount); chan->client_count = 0; - chan->slow_ref = 0; - INIT_RCU_HEAD(&chan->rcu); } device->chancnt = chancnt; @@ -705,30 +651,19 @@ err_out: list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; - kref_put(&device->refcount, dma_async_device_cleanup); device_unregister(&chan->dev); - chancnt--; free_percpu(chan->local); } return rc; } EXPORT_SYMBOL(dma_async_device_register); -/** - * dma_async_device_cleanup - function called when all references are released - * @kref: kernel reference object - */ -static void dma_async_device_cleanup(struct kref *kref) -{ - struct dma_device *device; - - device = container_of(kref, struct dma_device, refcount); - complete(&device->done); -} - /** * dma_async_device_unregister - unregister a DMA device * @device: &dma_device + * + * This routine is called by dma driver exit routines, dmaengine holds module + * references to prevent it being called while channels are in use. 
*/ void dma_async_device_unregister(struct dma_device *device) { @@ -744,11 +679,7 @@ void dma_async_device_unregister(struct dma_device *device) "%s called while %d clients hold a reference\n", __func__, chan->client_count); device_unregister(&chan->dev); - dma_chan_release(chan); } - - kref_put(&device->refcount, dma_async_device_cleanup); - wait_for_completion(&device->done); } EXPORT_SYMBOL(dma_async_device_unregister); diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index c74ac9eb009a..df0e37d70b31 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1253,7 +1253,6 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) spin_lock_init(&iop_chan->lock); INIT_LIST_HEAD(&iop_chan->chain); INIT_LIST_HEAD(&iop_chan->all_slots); - INIT_RCU_HEAD(&iop_chan->common.rcu); iop_chan->common.device = dma_dev; list_add_tail(&iop_chan->common.device_node, &dma_dev->channels); diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index fbaa2f6225e2..d35cbd1ff0b3 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1219,7 +1219,6 @@ static int __devinit mv_xor_probe(struct platform_device *pdev) INIT_LIST_HEAD(&mv_chan->chain); INIT_LIST_HEAD(&mv_chan->completed_slots); INIT_LIST_HEAD(&mv_chan->all_slots); - INIT_RCU_HEAD(&mv_chan->common.rcu); mv_chan->common.device = dma_dev; list_add_tail(&mv_chan->common.device_node, &dma_dev->channels); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index db050e97d2b4..bca2fc758894 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -142,10 +142,6 @@ struct dma_chan { int chan_id; struct device dev; - struct kref refcount; - int slow_ref; - struct rcu_head rcu; - struct list_head device_node; struct dma_chan_percpu *local; int client_count; @@ -233,9 +229,6 @@ struct dma_device { dma_cap_mask_t cap_mask; int max_xor; - struct kref refcount; - struct completion done; - int dev_id; struct device *dev; -- cgit v1.2.3 From 7dd602510128d7a64b11ff3b7d4f30ac8e3946ce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:19 -0700 Subject: dmaengine: kill enum dma_state_client DMA_NAK is now useless. We can just use a bool instead. Reviewed-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 16 +++++----------- drivers/dma/dmatest.c | 6 +++--- drivers/mmc/host/atmel-mci.c | 6 +++--- include/linux/dmaengine.h | 17 +++-------------- 4 files changed, 14 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index b245c38dbec3..cdc8ecfc2c2c 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -440,7 +440,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v { struct dma_device *device, *_d; struct dma_chan *chan = NULL; - enum dma_state_client ack; + bool ack; int err; /* Find a channel */ @@ -453,9 +453,9 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v if (fn) ack = fn(chan, fn_param); else - ack = DMA_ACK; + ack = true; - if (ack == DMA_ACK) { + if (ack) { /* Found a suitable channel, try to grab, prep, and * return it. 
We first set DMA_PRIVATE to disable * balance_ref_count as this channel will not be @@ -473,15 +473,9 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dev_name(&chan->dev), err); else break; - } else if (ack == DMA_DUP) { - pr_debug("%s: %s filter said DMA_DUP\n", - __func__, dev_name(&chan->dev)); - } else if (ack == DMA_NAK) { - pr_debug("%s: %s filter said DMA_NAK\n", - __func__, dev_name(&chan->dev)); - break; } else - WARN_ONCE(1, "filter_fn: unknown response?\n"); + pr_debug("%s: %s filter said false\n", + __func__, dev_name(&chan->dev)); chan = NULL; } mutex_unlock(&dma_list_mutex); diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 1d6e48f9cd02..c77d47c4ec5b 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -363,12 +363,12 @@ static int dmatest_add_channel(struct dma_chan *chan) return 0; } -static enum dma_state_client filter(struct dma_chan *chan, void *param) +static bool filter(struct dma_chan *chan, void *param) { if (!dmatest_match_channel(chan) || !dmatest_match_device(chan->device)) - return DMA_DUP; + return false; else - return DMA_ACK; + return true; } static int __init dmatest_init(void) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 4b567a0408e1..b0042d06eaf7 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1544,14 +1544,14 @@ static void __exit atmci_cleanup_slot(struct atmel_mci_slot *slot, } #ifdef CONFIG_MMC_ATMELMCI_DMA -static enum dma_state_client filter(struct dma_chan *chan, void *slave) +static bool filter(struct dma_chan *chan, void *slave) { struct dw_dma_slave *dws = slave; if (dws->dma_dev == chan->device->dev) - return DMA_ACK; + return true; else - return DMA_DUP; + return false; } #endif diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index bca2fc758894..1419a5094478 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -28,18 +28,6 @@ #include #include -/** - * enum dma_state_client - state of the channel in the client - * @DMA_ACK: client would like to use, or was using this channel - * @DMA_DUP: client has already seen this channel, or is not using this channel - * @DMA_NAK: client does not want to see any more channels - */ -enum dma_state_client { - DMA_ACK, - DMA_DUP, - DMA_NAK, -}; - /** * typedef dma_cookie_t - an opaque DMA cookie * @@ -160,9 +148,10 @@ void dma_chan_cleanup(struct kref *kref); * When this optional parameter is specified in a call to dma_request_channel a * suitable channel is passed to this routine for further dispositioning before * being returned. Where 'suitable' indicates a non-busy channel that - * satisfies the given capability mask. + * satisfies the given capability mask. It returns 'true' to indicate that the + * channel is suitable. */ -typedef enum dma_state_client (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); /** -- cgit v1.2.3 From 41d5e59c1299f27983977bcfe3b360600996051c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:21 -0700 Subject: dmaengine: add a release for dma class devices and dependent infrastructure Resolves: WARNING: at drivers/base/core.c:122 device_release+0x4d/0x52() Device 'dma0chan0' does not have a release() function, it is broken and must be fixed. The dma_chan_dev object is introduced to gear-match sysfs kobject and dmaengine channel lifetimes. 
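In essence, the new object pairs each channel's sysfs device with a small container that can outlive the channel itself. A condensed sketch of the pattern, lifted from the diff below with the locking elided:

struct dma_chan_dev {
	struct dma_chan *chan;	/* cleared when the channel is unregistered */
	struct device device;	/* sysfs node with its own release() */
};

static struct dma_chan *dev_to_dma_chan(struct device *dev)
{
	/* may return NULL once the backing channel is gone */
	return container_of(dev, struct dma_chan_dev, device)->chan;
}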
When a channel is removed, access to the sysfs entries returns -ENODEV until the kobject can be released. The bulk of the change is updates to existing code to handle the extra layer of indirection between a dma_chan and its struct device. Reported-by: Alexander Beregalov Acked-by: Stephen Hemminger Cc: Haavard Skinnemoen Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 106 ++++++++++++++++++++++++++++++++++++---------- drivers/dma/dmatest.c | 14 +++--- drivers/dma/dw_dmac.c | 91 +++++++++++++++++++++------------------ drivers/dma/fsldma.c | 2 +- include/linux/dmaengine.h | 19 +++++++-- 5 files changed, 157 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index cdc8ecfc2c2c..93c4c9ac8997 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -64,36 +64,75 @@ static long dmaengine_ref_count; /* --- sysfs implementation --- */ +/** + * dev_to_dma_chan - convert a device pointer to its sysfs container object + * @dev - device node + * + * Must be called under dma_list_mutex + */ +static struct dma_chan *dev_to_dma_chan(struct device *dev) +{ + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + return chan_dev->chan; +} + static ssize_t show_memcpy_count(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->memcpy_count; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->memcpy_count; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_bytes_transferred(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->bytes_transferred; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->bytes_transferred; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_in_use(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; + int err; - return sprintf(buf, "%d\n", chan->client_count); + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) + err = sprintf(buf, "%d\n", chan->client_count); + else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); + + return err; } static struct device_attribute dma_attrs[] = { @@ -103,9 +142,18 @@ static struct device_attribute dma_attrs[] = { __ATTR_NULL }; +static void chan_dev_release(struct device *dev) +{ + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + kfree(chan_dev); +} + static struct class dma_devclass = { .name = "dma", .dev_attrs = dma_attrs, + .dev_release = chan_dev_release, }; /* --- client and device registration --- */ @@ -420,7 +468,7 @@ static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_devic list_for_each_entry(chan, &dev->channels, device_node) { if (chan->client_count)
{ pr_debug("%s: %s busy\n", - __func__, dev_name(&chan->dev)); + __func__, dma_chan_name(chan)); continue; } ret = chan; @@ -466,22 +514,22 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v if (err == -ENODEV) { pr_debug("%s: %s module removed\n", __func__, - dev_name(&chan->dev)); + dma_chan_name(chan)); list_del_rcu(&device->global_node); } else if (err) pr_err("dmaengine: failed to get %s: (%d)\n", - dev_name(&chan->dev), err); + dma_chan_name(chan), err); else break; } else pr_debug("%s: %s filter said false\n", - __func__, dev_name(&chan->dev)); + __func__, dma_chan_name(chan)); chan = NULL; } mutex_unlock(&dma_list_mutex); pr_debug("%s: %s (%s)\n", __func__, chan ? "success" : "fail", - chan ? dev_name(&chan->dev) : NULL); + chan ? dma_chan_name(chan) : NULL); return chan; } @@ -521,7 +569,7 @@ void dmaengine_get(void) break; } else if (err) pr_err("dmaengine: failed to get %s: (%d)\n", - dev_name(&chan->dev), err); + dma_chan_name(chan), err); } } @@ -601,14 +649,20 @@ int dma_async_device_register(struct dma_device *device) chan->local = alloc_percpu(typeof(*chan->local)); if (chan->local == NULL) continue; + chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); + if (chan->dev == NULL) { + free_percpu(chan->local); + continue; + } chan->chan_id = chancnt++; - chan->dev.class = &dma_devclass; - chan->dev.parent = device->dev; - dev_set_name(&chan->dev, "dma%dchan%d", + chan->dev->device.class = &dma_devclass; + chan->dev->device.parent = device->dev; + chan->dev->chan = chan; + dev_set_name(&chan->dev->device, "dma%dchan%d", device->dev_id, chan->chan_id); - rc = device_register(&chan->dev); + rc = device_register(&chan->dev->device); if (rc) { free_percpu(chan->local); chan->local = NULL; @@ -645,7 +699,10 @@ err_out: list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; - device_unregister(&chan->dev); + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); free_percpu(chan->local); } return rc; @@ -672,7 +729,10 @@ void dma_async_device_unregister(struct dma_device *device) WARN_ONCE(chan->client_count, "%s called while %d clients hold a reference\n", __func__, chan->client_count); - device_unregister(&chan->dev); + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); } } EXPORT_SYMBOL(dma_async_device_unregister); @@ -845,7 +905,7 @@ dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) return DMA_SUCCESS; WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" - " %s\n", __func__, dev_name(&tx->chan->dev)); + " %s\n", __func__, dma_chan_name(tx->chan)); /* poll through the dependency chain, return when tx is complete */ do { diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index c77d47c4ec5b..3603f1ea5b28 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -80,7 +80,7 @@ static bool dmatest_match_channel(struct dma_chan *chan) { if (test_channel[0] == '\0') return true; - return strcmp(dev_name(&chan->dev), test_channel) == 0; + return strcmp(dma_chan_name(chan), test_channel) == 0; } static bool dmatest_match_device(struct dma_device *device) @@ -325,7 +325,7 @@ static int dmatest_add_channel(struct dma_chan *chan) dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dev_name(&chan->dev)); + pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); 
return -ENOMEM; } @@ -336,16 +336,16 @@ static int dmatest_add_channel(struct dma_chan *chan) thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { pr_warning("dmatest: No memory for %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); break; } thread->chan = dtc->chan; smp_wmb(); thread->task = kthread_run(dmatest_func, thread, "%s-test%u", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); if (IS_ERR(thread->task)) { pr_warning("dmatest: Failed to run thread %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); kfree(thread); break; } @@ -355,7 +355,7 @@ static int dmatest_add_channel(struct dma_chan *chan) list_add_tail(&thread->node, &dtc->threads); } - pr_info("dmatest: Started %u threads using %s\n", i, dev_name(&chan->dev)); + pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan)); list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; @@ -408,7 +408,7 @@ static void __exit dmatest_exit(void) list_del(&dtc->node); dmatest_cleanup_channel(dtc); pr_debug("dmatest: dropped channel %s\n", - dev_name(&dtc->chan->dev)); + dma_chan_name(dtc->chan)); dma_release_channel(dtc->chan); } } diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index a29dda8f801b..6b702cc46b3d 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -70,6 +70,15 @@ * the controller, though. */ +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} +static struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + static struct dw_desc *dwc_first_active(struct dw_dma_chan *dwc) { return list_entry(dwc->active_list.next, struct dw_desc, desc_node); @@ -93,12 +102,12 @@ static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc) ret = desc; break; } - dev_dbg(&dwc->chan.dev, "desc %p not ACKed\n", desc); + dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc); i++; } spin_unlock_bh(&dwc->lock); - dev_vdbg(&dwc->chan.dev, "scanned %u descriptors on freelist\n", i); + dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i); return ret; } @@ -108,10 +117,10 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc) struct dw_desc *child; list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), child->txd.phys, sizeof(child->lli), DMA_TO_DEVICE); - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); } @@ -129,11 +138,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc) spin_lock_bh(&dwc->lock); list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dev_vdbg(&dwc->chan.dev, + dev_vdbg(chan2dev(&dwc->chan), "moving child desc %p to freelist\n", child); list_splice_init(&desc->txd.tx_list, &dwc->free_list); - dev_vdbg(&dwc->chan.dev, "moving desc %p to freelist\n", desc); + dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &dwc->free_list); spin_unlock_bh(&dwc->lock); } @@ -163,9 +172,9 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first) /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: Attempted to start non-idle channel\n"); - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", 
channel_readl(dwc, SAR), channel_readl(dwc, DAR), @@ -193,7 +202,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) void *param; struct dma_async_tx_descriptor *txd = &desc->txd; - dev_vdbg(&dwc->chan.dev, "descriptor %u complete\n", txd->cookie); + dev_vdbg(chan2dev(&dwc->chan), "descriptor %u complete\n", txd->cookie); dwc->completed = txd->cookie; callback = txd->callback; @@ -208,11 +217,11 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) * mapped before they were submitted... */ if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.dar, desc->len, - DMA_FROM_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar, + desc->len, DMA_FROM_DEVICE); if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.sar, desc->len, - DMA_TO_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar, + desc->len, DMA_TO_DEVICE); /* * The API requires that no submissions are done from a @@ -228,7 +237,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc) LIST_HEAD(list); if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: XFER bit set, but channel not idle!\n"); /* Try to continue after resetting the channel... */ @@ -273,7 +282,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) return; } - dev_vdbg(&dwc->chan.dev, "scan_descriptors: llp=0x%x\n", llp); + dev_vdbg(chan2dev(&dwc->chan), "scan_descriptors: llp=0x%x\n", llp); list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) { if (desc->lli.llp == llp) @@ -292,7 +301,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, desc); } - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: All descriptors done, but channel not idle!\n"); /* Try to continue after resetting the channel... */ @@ -308,7 +317,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) static void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli) { - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " desc: s0x%x d0x%x l0x%x c0x%x:%x\n", lli->sar, lli->dar, lli->llp, lli->ctlhi, lli->ctllo); @@ -342,9 +351,9 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) * controller flagged an error instead of scribbling over * random memory locations. */ - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), "Bad descriptor submitted for DMA!\n"); - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " cookie: %d\n", bad_desc->txd.cookie); dwc_dump_lli(dwc, &bad_desc->lli); list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) @@ -442,12 +451,12 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx) * for DMA. But this is hard to do in a race-free manner. 
*/ if (list_empty(&dwc->active_list)) { - dev_vdbg(&tx->chan->dev, "tx_submit: started %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n", desc->txd.cookie); dwc_dostart(dwc, desc); list_add_tail(&desc->desc_node, &dwc->active_list); } else { - dev_vdbg(&tx->chan->dev, "tx_submit: queued %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n", desc->txd.cookie); list_add_tail(&desc->desc_node, &dwc->queue); @@ -472,11 +481,11 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, unsigned int dst_width; u32 ctllo; - dev_vdbg(&chan->dev, "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", + dev_vdbg(chan2dev(chan), "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", dest, src, len, flags); if (unlikely(!len)) { - dev_dbg(&chan->dev, "prep_dma_memcpy: length is zero!\n"); + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); return NULL; } @@ -516,7 +525,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, @@ -531,7 +540,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -562,7 +571,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct scatterlist *sg; size_t total_len = 0; - dev_vdbg(&chan->dev, "prep_dma_slave\n"); + dev_vdbg(chan2dev(chan), "prep_dma_slave\n"); if (unlikely(!dws || !sg_len)) return NULL; @@ -570,7 +579,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, reg_width = dws->reg_width; prev = first = NULL; - sg_len = dma_map_sg(chan->dev.parent, sgl, sg_len, direction); + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); switch (direction) { case DMA_TO_DEVICE: @@ -587,7 +596,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -607,7 +616,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -633,7 +642,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -653,7 +662,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -673,7 +682,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -768,11 +777,11 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) u32 cfghi; u32 cfglo; - dev_vdbg(&chan->dev, 
"alloc_chan_resources\n"); + dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_dbg(&chan->dev, "DMA channel not idle?\n"); + dev_dbg(chan2dev(chan), "DMA channel not idle?\n"); return -EIO; } @@ -808,7 +817,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) desc = kzalloc(sizeof(struct dw_desc), GFP_KERNEL); if (!desc) { - dev_info(&chan->dev, + dev_info(chan2dev(chan), "only allocated %d descriptors\n", i); spin_lock_bh(&dwc->lock); break; @@ -818,7 +827,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; INIT_LIST_HEAD(&desc->txd.tx_list); - desc->txd.phys = dma_map_single(chan->dev.parent, &desc->lli, + desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli, sizeof(desc->lli), DMA_TO_DEVICE); dwc_desc_put(dwc, desc); @@ -833,7 +842,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); - dev_dbg(&chan->dev, + dev_dbg(chan2dev(chan), "alloc_chan_resources allocated %d descriptors\n", i); return i; @@ -846,7 +855,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan) struct dw_desc *desc, *_desc; LIST_HEAD(list); - dev_dbg(&chan->dev, "free_chan_resources (descs allocated=%u)\n", + dev_dbg(chan2dev(chan), "free_chan_resources (descs allocated=%u)\n", dwc->descs_allocated); /* ASSERT: channel is idle */ @@ -867,13 +876,13 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); list_for_each_entry_safe(desc, _desc, &list, desc_node) { - dev_vdbg(&chan->dev, " freeing descriptor %p\n", desc); - dma_unmap_single(chan->dev.parent, desc->txd.phys, + dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc); + dma_unmap_single(chan2parent(chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); kfree(desc); } - dev_vdbg(&chan->dev, "free_chan_resources done\n"); + dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } /*----------------------------------------------------------------------*/ diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 46e0128929a0..ca70a21afc68 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -822,7 +822,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, */ WARN_ON(fdev->feature != new_fsl_chan->feature); - new_fsl_chan->dev = &new_fsl_chan->common.dev; + new_fsl_chan->dev = &new_fsl_chan->common.dev->device; new_fsl_chan->reg_base = ioremap(new_fsl_chan->reg.start, new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1419a5094478..d6b6bff355f4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -113,7 +113,7 @@ struct dma_chan_percpu { * @device: ptr to the dma device who supplies this channel, always !%NULL * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs - * @class_dev: class device for sysfs + * @dev: class device for sysfs * @refcount: kref, used in "bigref" slow-mode * @slow_ref: indicates that the DMA channel is free * @rcu: the DMA channel's RCU head @@ -128,7 +128,7 @@ struct dma_chan { /* sysfs */ int chan_id; - struct device dev; + struct dma_chan_dev *dev; struct list_head device_node; struct dma_chan_percpu *local; @@ -136,7 +136,20 @@ struct dma_chan { int table_count; }; -#define to_dma_chan(p) container_of(p, struct dma_chan, dev) +/** + * struct dma_chan_dev - relate sysfs device node to backing channel device + * @chan - 
driver channel device + * @device - sysfs device + */ +struct dma_chan_dev { + struct dma_chan *chan; + struct device device; +}; + +static inline const char *dma_chan_name(struct dma_chan *chan) +{ + return dev_name(&chan->dev->device); +} void dma_chan_cleanup(struct kref *kref); -- cgit v1.2.3 From 864498aaa9fef69ee166da023d12413a7776342d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Jan 2009 11:38:21 -0700 Subject: dmaengine: use idr for registering dma device numbers This brings some predictability to dma device numbers, i.e. an rmmod/insmod cycle may now result in /sys/class/dma/dma0chan0 being restored rather than /sys/class/dma/dma1chan0 appearing. Cc: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 27 +++++++++++++++++++++++++-- include/linux/dmaengine.h | 4 ++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 93c4c9ac8997..dd43410c1019 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -57,10 +57,12 @@ #include #include #include +#include static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); static long dmaengine_ref_count; +static struct idr dma_idr; /* --- sysfs implementation --- */ @@ -147,6 +149,12 @@ static void chan_dev_release(struct device *dev) struct dma_chan_dev *chan_dev; chan_dev = container_of(dev, typeof(*chan_dev), device); + if (atomic_dec_and_test(chan_dev->idr_ref)) { + mutex_lock(&dma_list_mutex); + idr_remove(&dma_idr, chan_dev->dev_id); + mutex_unlock(&dma_list_mutex); + kfree(chan_dev->idr_ref); + } kfree(chan_dev); } @@ -611,9 +619,9 @@ EXPORT_SYMBOL(dmaengine_put); */ int dma_async_device_register(struct dma_device *device) { - static int id; int chancnt = 0, rc; struct dma_chan* chan; + atomic_t *idr_ref; if (!device) return -ENODEV; @@ -640,9 +648,20 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); + idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); + if (!idr_ref) + return -ENOMEM; + atomic_set(idr_ref, 0); + idr_retry: + if (!idr_pre_get(&dma_idr, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&dma_list_mutex); - device->dev_id = id++; + rc = idr_get_new(&dma_idr, NULL, &device->dev_id); mutex_unlock(&dma_list_mutex); + if (rc == -EAGAIN) + goto idr_retry; + else if (rc != 0) + return rc; /* represent channels in sysfs. 
Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { @@ -659,6 +678,9 @@ int dma_async_device_register(struct dma_device *device) chan->dev->device.class = &dma_devclass; chan->dev->device.parent = device->dev; chan->dev->chan = chan; + chan->dev->idr_ref = idr_ref; + chan->dev->dev_id = device->dev_id; + atomic_inc(idr_ref); dev_set_name(&chan->dev->device, "dma%dchan%d", device->dev_id, chan->chan_id); @@ -971,6 +993,7 @@ EXPORT_SYMBOL_GPL(dma_run_dependencies); static int __init dma_bus_init(void) { + idr_init(&dma_idr); mutex_init(&dma_list_mutex); return class_register(&dma_devclass); } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d6b6bff355f4..64dea2ab326c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -140,10 +140,14 @@ struct dma_chan { * struct dma_chan_dev - relate sysfs device node to backing channel device * @chan - driver channel device * @device - sysfs device + * @dev_id - parent dma_device dev_id + * @idr_ref - reference count to gate release of dma_device dev_id */ struct dma_chan_dev { struct dma_chan *chan; struct device device; + int dev_id; + atomic_t *idr_ref; }; static inline const char *dma_chan_name(struct dma_chan *chan) -- cgit v1.2.3 From 96e93eab20337d063c70d537bd7bc70d90e04fa3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 6 Jan 2009 10:49:34 -0800 Subject: gro: Add internal interfaces for VLAN Previously, GRO's only entry points from the outside were napi_gro_receive and napi_gro_frags. These interfaces are for device drivers. This patch rearranges things to provide a new set of interfaces for VLANs. These interfaces are for internal use only. The VLAN code itself can then provide a set of entry points for device drivers. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 6 ++++ net/core/dev.c | 82 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c28bbba3c23d..114091be8872 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1373,8 +1373,14 @@ extern int netif_rx_ni(struct sk_buff *skb); #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); extern void napi_gro_flush(struct napi_struct *napi); +extern int dev_gro_receive(struct napi_struct *napi, + struct sk_buff *skb); extern int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); +extern void napi_reuse_skb(struct napi_struct *napi, + struct sk_buff *skb); +extern struct sk_buff * napi_fraginfo_skb(struct napi_struct *napi, + struct napi_gro_fraginfo *info); extern int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info); extern void netif_nit_deliver(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 382df6c09eec..bab8bcedd62e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2387,7 +2387,7 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2417,11 +2417,14 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) for (p = napi->gro_list; p; p = p->next) { count++; - NAPI_GRO_CB(p)->same_flow = - p->mac_len == mac_len && - !memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len); - NAPI_GRO_CB(p)->flush = 0; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + if (p->mac_len != mac_len || + memcmp(skb_mac_header(p), skb_mac_header(skb), + mac_len)) + NAPI_GRO_CB(p)->same_flow = 0; } pp = ptype->gro_receive(&napi->gro_list, skb); @@ -2463,6 +2466,19 @@ ok: normal: return -1; } +EXPORT_SYMBOL(dev_gro_receive); + +static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + + for (p = napi->gro_list; p; p = p->next) { + NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); +} int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { @@ -2479,11 +2495,26 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_gro_receive); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + skb_shinfo(skb)->nr_frags = 0; + + skb->len -= skb->data_len; + skb->truesize -= skb->data_len; + skb->data_len = 0; + + __skb_pull(skb, skb_headlen(skb)); + skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + + napi->skb = skb; +} +EXPORT_SYMBOL(napi_reuse_skb); + +struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, + struct napi_gro_fraginfo *info) { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; - int err = NET_RX_DROP; napi->skb = NULL; @@ -2503,16 +2534,31 @@ int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) skb->len += info->len; skb->truesize += info->len; - if (!pskb_may_pull(skb, ETH_HLEN)) - goto reuse; - - err = NET_RX_SUCCESS; + if (!pskb_may_pull(skb, ETH_HLEN)) { + napi_reuse_skb(napi, skb); + goto out; + } skb->protocol = eth_type_trans(skb, dev); skb->ip_summed = info->ip_summed; 
skb->csum = info->csum; +out: + return skb; +} +EXPORT_SYMBOL(napi_fraginfo_skb); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + int err = NET_RX_DROP; + + if (!skb) + goto out; + + err = NET_RX_SUCCESS; + switch (__napi_gro_receive(napi, skb)) { case -1: return netif_receive_skb(skb); @@ -2521,17 +2567,7 @@ int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) goto out; } -reuse: - skb_shinfo(skb)->nr_frags = 0; - - skb->len -= skb->data_len; - skb->truesize -= skb->data_len; - skb->data_len = 0; - - __skb_pull(skb, skb_headlen(skb)); - skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); - - napi->skb = skb; + napi_reuse_skb(napi, skb); out: return err; -- cgit v1.2.3 From e1c096e251e52773afeffbbcb74d0a072be47ea3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 6 Jan 2009 10:50:09 -0800 Subject: vlan: Add GRO interfaces This patch adds GRO interfaces for hardware-assisted VLAN reception. With this in place we're now at parity with LRO as far as the interface is concerned. That is, you can now take any LRO driver and convert it over to GRO. As the CB memory clashes with GRO's use of CB, I've removed it entirely by storing dev in skb->dev. This is OK because VLAN is called first thing in netif_receive_skb, and skb->dev is not used between the call to netif_rx and the point where netif_receive_skb runs. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 19 +++++++++ net/8021q/vlan_core.c | 111 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 107 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index a5cb0c3f6dcf..f8ff918c208f 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -115,6 +115,11 @@ extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, u16 vlan_tci, int polling); extern int vlan_hwaccel_do_receive(struct sk_buff *skb); +extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb); +extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, + struct napi_gro_fraginfo *info); #else static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) @@ -140,6 +145,20 @@ static inline int vlan_hwaccel_do_receive(struct sk_buff *skb) { return 0; } + +static inline int vlan_gro_receive(struct napi_struct *napi, + struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb) +{ + return NET_RX_DROP; +} + +static inline int vlan_gro_frags(struct napi_struct *napi, + struct vlan_group *grp, unsigned int vlan_tci, + struct napi_gro_fraginfo *info) +{ + return NET_RX_DROP; +} #endif /** diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index dd86a1dc4cd0..6c1323940263 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -3,46 +3,35 @@ #include #include "vlan.h" -struct vlan_hwaccel_cb { - struct net_device *dev; -}; - -static inline struct vlan_hwaccel_cb *vlan_hwaccel_cb(struct sk_buff *skb) -{ - return (struct vlan_hwaccel_cb *)skb->cb; -} - /* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). 
*/ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, u16 vlan_tci, int polling) { - struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb); - - if (skb_bond_should_drop(skb)) { - dev_kfree_skb_any(skb); - return NET_RX_DROP; - } + if (skb_bond_should_drop(skb)) + goto drop; skb->vlan_tci = vlan_tci; - cb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK); + skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK); + + if (!skb->dev) + goto drop; return (polling ? netif_receive_skb(skb) : netif_rx(skb)); + +drop: + dev_kfree_skb_any(skb); + return NET_RX_DROP; } EXPORT_SYMBOL(__vlan_hwaccel_rx); int vlan_hwaccel_do_receive(struct sk_buff *skb) { - struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb); - struct net_device *dev = cb->dev; + struct net_device *dev = skb->dev; struct net_device_stats *stats; + skb->dev = vlan_dev_info(dev)->real_dev; netif_nit_deliver(skb); - if (dev == NULL) { - kfree_skb(skb); - return -1; - } - skb->dev = dev; skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci); skb->vlan_tci = 0; @@ -80,3 +69,79 @@ u16 vlan_dev_vlan_id(const struct net_device *dev) return vlan_dev_info(dev)->vlan_id; } EXPORT_SYMBOL_GPL(vlan_dev_vlan_id); + +static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb) +{ + struct sk_buff *p; + + if (skb_bond_should_drop(skb)) + goto drop; + + skb->vlan_tci = vlan_tci; + skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK); + + if (!skb->dev) + goto drop; + + for (p = napi->gro_list; p; p = p->next) { + NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); + +drop: + return 2; +} + +int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb) +{ + int err = NET_RX_SUCCESS; + + switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { + case -1: + return netif_receive_skb(skb); + + case 2: + err = NET_RX_DROP; + /* fall through */ + + case 1: + kfree_skb(skb); + break; + } + + return err; +} +EXPORT_SYMBOL(vlan_gro_receive); + +int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + int err = NET_RX_DROP; + + if (!skb) + goto out; + + err = NET_RX_SUCCESS; + + switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { + case -1: + return netif_receive_skb(skb); + + case 2: + err = NET_RX_DROP; + /* fall through */ + + case 1: + napi_reuse_skb(napi, skb); + break; + } + +out: + return err; +} +EXPORT_SYMBOL(vlan_gro_frags); -- cgit v1.2.3 From 1fa17d4ba43d7e5aab5e90777b07da06524f6748 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 6 Jan 2009 11:07:54 -0800 Subject: can: omit unneeded skb_clone() calls The AF_CAN core always delivered cloned sk_buffs to the AF_CAN protocols, although this was _only_ needed by the can-raw protocol. With this (additionally documented) change, the AF_CAN core calls the callback functions of the registered AF_CAN protocols with the original (uncloned) sk_buff pointer and lets the can-raw protocol do the skb_clone() itself, which omits all unneeded skb_clone() calls for other AF_CAN protocols. Signed-off-by: Oliver Hartkopp Signed-off-by: Urs Thuermann Signed-off-by: David S. 
Miller --- include/linux/can/core.h | 2 +- net/can/af_can.c | 15 ++++++++------- net/can/bcm.c | 12 +++++------- net/can/raw.c | 15 ++++++++------- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index f50785ad4781..25085cbadcfc 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -19,7 +19,7 @@ #include #include -#define CAN_VERSION "20081130" +#define CAN_VERSION "20090105" /* increment this number each time you change some user-space interface */ #define CAN_ABI_VERSION "8" diff --git a/net/can/af_can.c b/net/can/af_can.c index 3dadb338addd..fa417ca6cbe6 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -414,6 +414,12 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error frames (CAN_ERR_FLAG bit set in mask). * + * The provided pointer to the sk_buff is guaranteed to be valid as long as + * the callback function is running. The callback function must *not* free + * the given sk_buff while processing its task. When the given sk_buff is + * needed after the end of the callback function it must be cloned inside + * the callback function with skb_clone(). + * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry @@ -569,13 +575,8 @@ EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *r) { - struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); - - if (clone) { - clone->sk = skb->sk; - r->func(clone, r->data); - r->matches++; - } + r->func(skb, r->data); + r->matches++; } static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) diff --git a/net/can/bcm.c b/net/can/bcm.c index 6248ae2502c7..1649c8ab2c2f 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -633,7 +633,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) hrtimer_cancel(&op->timer); if (op->can_id != rxframe->can_id) - goto rx_freeskb; + return; /* save rx timestamp */ op->rx_stamp = skb->tstamp; @@ -645,19 +645,19 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); - goto rx_freeskb; + return; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, &op->last_frames[0], rxframe); - goto rx_freeskb_starttimer; + goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe); - goto rx_freeskb_starttimer; + goto rx_starttimer; } if (op->nframes > 1) { @@ -678,10 +678,8 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) } } -rx_freeskb_starttimer: +rx_starttimer: bcm_rx_starttimer(op); -rx_freeskb: - kfree_skb(skb); } /* diff --git a/net/can/raw.c b/net/can/raw.c index 27aab63df467..0703cba4bf9f 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -99,13 +99,14 @@ static void raw_rcv(struct sk_buff *skb, void *data) struct raw_sock *ro = raw_sk(sk); struct sockaddr_can *addr; - if (!ro->recv_own_msgs) { - /* check the received tx sock reference */ - if (skb->sk == sk) { - kfree_skb(skb); - return; - } - } + /* check the received tx sock reference */ + if (!ro->recv_own_msgs && skb->sk == sk) + return; + + /* clone the given skb to be able to enqueue it into the rcv queue */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; /* * Put the datagram to the queue so that raw_recvmsg() can -- 
cgit v1.2.3 From 14f0ca8eaea42a5b5a69cfcb699665dd2618db5f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 7 Jan 2009 21:50:22 +0100 Subject: oprofile: make new cpu buffer functions part of the api This patch creates the new functions oprofile_write_reserve(), oprofile_add_data(), and oprofile_write_commit(), and makes them part of the oprofile API. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 57 +++++++++++++++++----------------- drivers/oprofile/cpu_buffer.c | 17 +++++++++--- drivers/oprofile/cpu_buffer.h | 8 +----- include/linux/oprofile.h | 18 +++++++++++++ 4 files changed, 57 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index cf310aeb462c..8fdf06e4edf9 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -22,7 +22,6 @@ #include "op_x86_model.h" #include "op_counter.h" -#include "../../../drivers/oprofile/cpu_buffer.h" #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 @@ -61,14 +60,6 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ -/* - * The function interface needs to be fixed, something like add - * data. Should then be added to linux/oprofile.h. - */ -extern -void oprofile_add_data(struct op_entry *entry, struct pt_regs * const regs, - unsigned long pc, int code, int size); - #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 @@ -174,16 +165,16 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_HIGH_VALID_BIT) { rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_add_data(&entry, regs, msr, IBS_FETCH_CODE, - IBS_FETCH_SIZE); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); - op_cpu_buffer_add_data(&entry, low); - op_cpu_buffer_add_data(&entry, high); + oprofile_write_reserve(&entry, regs, msr, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, low); + oprofile_add_data(&entry, high); rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); - op_cpu_buffer_write_commit(&entry); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ high &= ~IBS_FETCH_HIGH_VALID_BIT; @@ -197,26 +188,26 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_LOW_VALID_BIT) { rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_add_data(&entry, regs, msr, IBS_OP_CODE, - IBS_OP_SIZE); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_reserve(&entry, regs, msr, + IBS_OP_CODE, IBS_OP_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); rdmsrl(MSR_AMD64_IBSOPDATA, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - 
op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - op_cpu_buffer_add_data(&entry, (u32)msr); - op_cpu_buffer_add_data(&entry, (u32)(msr >> 32)); - op_cpu_buffer_write_commit(&entry); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ high = 0; diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index b846af632c81..2e03b6d796d3 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -364,10 +364,11 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) /* * Add samples with data to the ring buffer. * - * Use op_cpu_buffer_add_data(&entry, val) to add data and - * op_cpu_buffer_write_commit(&entry) to commit the sample. + * Use oprofile_add_data(&entry, val) to add data and + * oprofile_write_commit(&entry) to commit the sample. */ -void oprofile_add_data(struct op_entry *entry, struct pt_regs * const regs, +void +oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs, unsigned long pc, int code, int size) { struct op_sample *sample; @@ -395,6 +396,16 @@ fail: cpu_buf->sample_lost_overflow++; } +int oprofile_add_data(struct op_entry *entry, unsigned long val) +{ + return op_cpu_buffer_add_data(entry, val); +} + +int oprofile_write_commit(struct op_entry *entry) +{ + return op_cpu_buffer_write_commit(entry); +} + void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index 525cc4d13d8d..63f81c44846a 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -35,13 +35,7 @@ struct op_sample { unsigned long data[0]; }; -struct op_entry { - struct ring_buffer_event *event; - struct op_sample *sample; - unsigned long irq_flags; - unsigned long size; - unsigned long *data; -}; +struct op_entry; struct oprofile_cpu_buffer { unsigned long buffer_size; diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 1ce9fe572e51..1d9518bc4c58 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -164,4 +164,22 @@ void oprofile_put_buff(unsigned long *buf, unsigned int start, unsigned long oprofile_get_cpu_buffer_size(void); void oprofile_cpu_buffer_inc_smpl_lost(void); +/* cpu buffer functions */ + +struct op_sample; + +struct op_entry { + struct ring_buffer_event *event; + struct op_sample *sample; + unsigned long irq_flags; + unsigned long size; + unsigned long *data; +}; + +void oprofile_write_reserve(struct op_entry *entry, + struct pt_regs * const regs, + unsigned long pc, int code, int size); +int oprofile_add_data(struct op_entry *entry, unsigned long val); +int oprofile_write_commit(struct op_entry *entry); + #endif /* OPROFILE_H */ -- cgit v1.2.3 From 5886188dc7ba9a76babcd37452f44079a9a77f71 Mon Sep 17 00:00:00 2001 From: Benjamin Krill Date: Wed, 7 Jan 2009 10:32:38 +0100 Subject: serial: Add driver for the Cell Network Processor serial port NWP device Add support for the nwp serial device which is connected to a DCR bus. 
It uses the of_serial device driver to determine necessary properties from the device tree. The supported device is added as serial port number 85. NWP stands for network processor and it is part of the QPACE - Quantum Chromodynamics Parallel Computing on the Cell Broadband Engine project. The implementation is a lightweight uart implementation with the focus to consume as little resources as possible and it is connected to a DCR bus. Signed-off-by: Benjamin Krill Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- drivers/serial/Kconfig | 19 +- drivers/serial/Makefile | 1 + drivers/serial/nwpserial.c | 475 ++++++++++++++++++++++++++++++++++++++++++++ drivers/serial/of_serial.c | 19 ++ include/linux/nwpserial.h | 18 ++ include/linux/serial_core.h | 3 + 6 files changed, 534 insertions(+), 1 deletion(-) create mode 100644 drivers/serial/nwpserial.c create mode 100644 include/linux/nwpserial.h (limited to 'include/linux') diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index b695ab3142d8..64fcf8c20821 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -1320,13 +1320,30 @@ config SERIAL_NETX_CONSOLE config SERIAL_OF_PLATFORM tristate "Serial port on Open Firmware platform bus" depends on PPC_OF - depends on SERIAL_8250 + depends on SERIAL_8250 || SERIAL_OF_PLATFORM_NWPSERIAL help If you have a PowerPC based system that has serial ports on a platform specific bus, you should enable this option. Currently, only 8250 compatible ports are supported, but others can easily be added. +config SERIAL_OF_PLATFORM_NWPSERIAL + tristate "NWP serial port driver" + depends on PPC_OF && PPC_DCR + select SERIAL_OF_PLATFORM + select SERIAL_CORE_CONSOLE + select SERIAL_CORE + help + This driver supports the cell network processor nwp serial + device. + +config SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE + bool "Console on NWP serial port" + depends on SERIAL_OF_PLATFORM_NWPSERIAL=y + select SERIAL_CORE_CONSOLE + help + Support for Console on the NWP serial ports. + config SERIAL_QE tristate "Freescale QUICC Engine serial port support" depends on QUICC_ENGINE diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile index dfe775ac45b2..8844c0a03929 100644 --- a/drivers/serial/Makefile +++ b/drivers/serial/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o obj-$(CONFIG_SERIAL_NETX) += netx-serial.o obj-$(CONFIG_SERIAL_OF_PLATFORM) += of_serial.o +obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o obj-$(CONFIG_SERIAL_QE) += ucc_uart.o diff --git a/drivers/serial/nwpserial.c b/drivers/serial/nwpserial.c new file mode 100644 index 000000000000..32f3eaf0d262 --- /dev/null +++ b/drivers/serial/nwpserial.c @@ -0,0 +1,475 @@ +/* + * Serial Port driver for a NWP uart device + * + * Copyright (C) 2008 IBM Corp., Benjamin Krill + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NWPSERIAL_NR 2 + +#define NWPSERIAL_STATUS_RXVALID 0x1 +#define NWPSERIAL_STATUS_TXFULL 0x2 + +struct nwpserial_port { + struct uart_port port; + dcr_host_t dcr_host; + unsigned int ier; + unsigned int mcr; +}; + +static DEFINE_MUTEX(nwpserial_mutex); +static struct nwpserial_port nwpserial_ports[NWPSERIAL_NR]; + +static void wait_for_bits(struct nwpserial_port *up, int bits) +{ + unsigned int status, tmout = 10000; + + /* Wait up to 10ms for the character(s) to be sent. */ + do { + status = dcr_read(up->dcr_host, UART_LSR); + + if (--tmout == 0) + break; + udelay(1); + } while ((status & bits) != bits); +} + +#ifdef CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE +static void nwpserial_console_putchar(struct uart_port *port, int c) +{ + struct nwpserial_port *up; + up = container_of(port, struct nwpserial_port, port); + /* check if tx buffer is full */ + wait_for_bits(up, UART_LSR_THRE); + dcr_write(up->dcr_host, UART_TX, c); + up->port.icount.tx++; +} + +static void +nwpserial_console_write(struct console *co, const char *s, unsigned int count) +{ + struct nwpserial_port *up = &nwpserial_ports[co->index]; + unsigned long flags; + int locked = 1; + + if (oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); + + /* save and disable interrupt */ + up->ier = dcr_read(up->dcr_host, UART_IER); + dcr_write(up->dcr_host, UART_IER, up->ier & ~UART_IER_RDI); + + uart_console_write(&up->port, s, count, nwpserial_console_putchar); + + /* wait for transmitter to become emtpy */ + while ((dcr_read(up->dcr_host, UART_LSR) & UART_LSR_THRE) == 0) + cpu_relax(); + + /* restore interrupt state */ + dcr_write(up->dcr_host, UART_IER, up->ier); + + if (locked) + spin_unlock_irqrestore(&up->port.lock, flags); +} + +static struct uart_driver nwpserial_reg; +static struct console nwpserial_console = { + .name = "ttySQ", + .write = nwpserial_console_write, + .device = uart_console_device, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &nwpserial_reg, +}; +#define NWPSERIAL_CONSOLE (&nwpserial_console) +#else +#define NWPSERIAL_CONSOLE NULL +#endif /* CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE */ + +/**************************************************************************/ + +static int nwpserial_request_port(struct uart_port *port) +{ + return 0; +} + +static void nwpserial_release_port(struct uart_port *port) +{ + /* N/A */ +} + +static void nwpserial_config_port(struct uart_port *port, int flags) +{ + port->type = PORT_NWPSERIAL; +} + +static irqreturn_t nwpserial_interrupt(int irq, void *dev_id) +{ + struct nwpserial_port *up = dev_id; + struct tty_struct *tty = up->port.info->port.tty; + irqreturn_t ret; + unsigned int iir; + unsigned char ch; + + spin_lock(&up->port.lock); + + /* check if the uart was the interrupt source. 
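+	 * (a zero IIR value below is taken to mean this UART did not
+	 * raise the interrupt, so the handler returns IRQ_NONE)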
*/ + iir = dcr_read(up->dcr_host, UART_IIR); + if (!iir) { + ret = IRQ_NONE; + goto out; + } + + do { + up->port.icount.rx++; + ch = dcr_read(up->dcr_host, UART_RX); + if (up->port.ignore_status_mask != NWPSERIAL_STATUS_RXVALID) + tty_insert_flip_char(tty, ch, TTY_NORMAL); + } while (dcr_read(up->dcr_host, UART_RX) & UART_LSR_DR); + + tty_flip_buffer_push(tty); + ret = IRQ_HANDLED; + +out: + spin_unlock(&up->port.lock); + return ret; +} + +static int nwpserial_startup(struct uart_port *port) +{ + struct nwpserial_port *up; + int err; + + up = container_of(port, struct nwpserial_port, port); + + /* disable flow control by default */ + up->mcr = dcr_read(up->dcr_host, UART_MCR) & ~UART_MCR_AFE; + dcr_write(up->dcr_host, UART_MCR, up->mcr); + + /* register interrupt handler */ + err = request_irq(up->port.irq, nwpserial_interrupt, + IRQF_SHARED, "nwpserial", up); + if (err) + return err; + + /* enable interrupts */ + up->ier = UART_IER_RDI; + dcr_write(up->dcr_host, UART_IER, up->ier); + + /* enable receiving */ + up->port.ignore_status_mask &= ~NWPSERIAL_STATUS_RXVALID; + + return 0; +} + +static void nwpserial_shutdown(struct uart_port *port) +{ + struct nwpserial_port *up; + up = container_of(port, struct nwpserial_port, port); + + /* disable receiving */ + up->port.ignore_status_mask |= NWPSERIAL_STATUS_RXVALID; + + /* disable interrupts from this port */ + up->ier = 0; + dcr_write(up->dcr_host, UART_IER, up->ier); + + /* free irq */ + free_irq(up->port.irq, port); +} + +static int nwpserial_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + return -EINVAL; +} + +static const char *nwpserial_type(struct uart_port *port) +{ + return port->type == PORT_NWPSERIAL ? "nwpserial" : NULL; +} + +static void nwpserial_set_termios(struct uart_port *port, + struct ktermios *termios, struct ktermios *old) +{ + struct nwpserial_port *up; + up = container_of(port, struct nwpserial_port, port); + + up->port.read_status_mask = NWPSERIAL_STATUS_RXVALID + | NWPSERIAL_STATUS_TXFULL; + + up->port.ignore_status_mask = 0; + /* ignore all characters if CREAD is not set */ + if ((termios->c_cflag & CREAD) == 0) + up->port.ignore_status_mask |= NWPSERIAL_STATUS_RXVALID; + + /* Copy back the old hardware settings */ + if (old) + tty_termios_copy_hw(termios, old); +} + +static void nwpserial_break_ctl(struct uart_port *port, int ctl) +{ + /* N/A */ +} + +static void nwpserial_enable_ms(struct uart_port *port) +{ + /* N/A */ +} + +static void nwpserial_stop_rx(struct uart_port *port) +{ + struct nwpserial_port *up; + up = container_of(port, struct nwpserial_port, port); + /* don't forward any more data (like !CREAD) */ + up->port.ignore_status_mask = NWPSERIAL_STATUS_RXVALID; +} + +static void nwpserial_putchar(struct nwpserial_port *up, unsigned char c) +{ + /* check if tx buffer is full */ + wait_for_bits(up, UART_LSR_THRE); + dcr_write(up->dcr_host, UART_TX, c); + up->port.icount.tx++; +} + +static void nwpserial_start_tx(struct uart_port *port) +{ + struct nwpserial_port *up; + struct circ_buf *xmit; + up = container_of(port, struct nwpserial_port, port); + xmit = &up->port.info->xmit; + + if (port->x_char) { + nwpserial_putchar(up, up->port.x_char); + port->x_char = 0; + } + + while (!(uart_circ_empty(xmit) || uart_tx_stopped(&up->port))) { + nwpserial_putchar(up, xmit->buf[xmit->tail]); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE-1); + } +} + +static unsigned int nwpserial_get_mctrl(struct uart_port *port) +{ + return 0; +} + +static void nwpserial_set_mctrl(struct uart_port *port, 
unsigned int mctrl) +{ + /* N/A */ +} + +static void nwpserial_stop_tx(struct uart_port *port) +{ + /* N/A */ +} + +static unsigned int nwpserial_tx_empty(struct uart_port *port) +{ + struct nwpserial_port *up; + unsigned long flags; + int ret; + up = container_of(port, struct nwpserial_port, port); + + spin_lock_irqsave(&up->port.lock, flags); + ret = dcr_read(up->dcr_host, UART_LSR); + spin_unlock_irqrestore(&up->port.lock, flags); + + return ret & UART_LSR_TEMT ? TIOCSER_TEMT : 0; +} + +static struct uart_ops nwpserial_pops = { + .tx_empty = nwpserial_tx_empty, + .set_mctrl = nwpserial_set_mctrl, + .get_mctrl = nwpserial_get_mctrl, + .stop_tx = nwpserial_stop_tx, + .start_tx = nwpserial_start_tx, + .stop_rx = nwpserial_stop_rx, + .enable_ms = nwpserial_enable_ms, + .break_ctl = nwpserial_break_ctl, + .startup = nwpserial_startup, + .shutdown = nwpserial_shutdown, + .set_termios = nwpserial_set_termios, + .type = nwpserial_type, + .release_port = nwpserial_release_port, + .request_port = nwpserial_request_port, + .config_port = nwpserial_config_port, + .verify_port = nwpserial_verify_port, +}; + +static struct uart_driver nwpserial_reg = { + .owner = THIS_MODULE, + .driver_name = "nwpserial", + .dev_name = "ttySQ", + .major = TTY_MAJOR, + .minor = 68, + .nr = NWPSERIAL_NR, + .cons = NWPSERIAL_CONSOLE, +}; + +int nwpserial_register_port(struct uart_port *port) +{ + struct nwpserial_port *up = NULL; + int ret = -1; + int i; + static int first = 1; + int dcr_len; + int dcr_base; + struct device_node *dn; + + mutex_lock(&nwpserial_mutex); + + dn = to_of_device(port->dev)->node; + if (dn == NULL) + goto out; + + /* get dcr base. */ + dcr_base = dcr_resource_start(dn, 0); + + /* find matching entry */ + for (i = 0; i < NWPSERIAL_NR; i++) + if (nwpserial_ports[i].port.iobase == dcr_base) { + up = &nwpserial_ports[i]; + break; + } + + /* we didn't find a mtching entry, search for a free port */ + if (up == NULL) + for (i = 0; i < NWPSERIAL_NR; i++) + if (nwpserial_ports[i].port.type == PORT_UNKNOWN && + nwpserial_ports[i].port.iobase == 0) { + up = &nwpserial_ports[i]; + break; + } + + if (up == NULL) { + ret = -EBUSY; + goto out; + } + + if (first) + uart_register_driver(&nwpserial_reg); + first = 0; + + up->port.membase = port->membase; + up->port.irq = port->irq; + up->port.uartclk = port->uartclk; + up->port.fifosize = port->fifosize; + up->port.regshift = port->regshift; + up->port.iotype = port->iotype; + up->port.flags = port->flags; + up->port.mapbase = port->mapbase; + up->port.private_data = port->private_data; + + if (port->dev) + up->port.dev = port->dev; + + if (up->port.iobase != dcr_base) { + up->port.ops = &nwpserial_pops; + up->port.fifosize = 16; + + spin_lock_init(&up->port.lock); + + up->port.iobase = dcr_base; + dcr_len = dcr_resource_len(dn, 0); + + up->dcr_host = dcr_map(dn, dcr_base, dcr_len); + if (!DCR_MAP_OK(up->dcr_host)) { + printk(KERN_ERR "Cannot map DCR resources for NWPSERIAL"); + goto out; + } + } + + ret = uart_add_one_port(&nwpserial_reg, &up->port); + if (ret == 0) + ret = up->port.line; + +out: + mutex_unlock(&nwpserial_mutex); + + return ret; +} +EXPORT_SYMBOL(nwpserial_register_port); + +void nwpserial_unregister_port(int line) +{ + struct nwpserial_port *up = &nwpserial_ports[line]; + mutex_lock(&nwpserial_mutex); + uart_remove_one_port(&nwpserial_reg, &up->port); + + up->port.type = PORT_UNKNOWN; + + mutex_unlock(&nwpserial_mutex); +} +EXPORT_SYMBOL(nwpserial_unregister_port); + +#ifdef CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE +static int __init 
nwpserial_console_init(void) +{ + struct nwpserial_port *up = NULL; + struct device_node *dn; + const char *name; + int dcr_base; + int dcr_len; + int i; + + /* search for a free port */ + for (i = 0; i < NWPSERIAL_NR; i++) + if (nwpserial_ports[i].port.type == PORT_UNKNOWN) { + up = &nwpserial_ports[i]; + break; + } + + if (up == NULL) + return -1; + + name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (name == NULL) + return -1; + + dn = of_find_node_by_path(name); + if (!dn) + return -1; + + spin_lock_init(&up->port.lock); + up->port.ops = &nwpserial_pops; + up->port.type = PORT_NWPSERIAL; + up->port.fifosize = 16; + + dcr_base = dcr_resource_start(dn, 0); + dcr_len = dcr_resource_len(dn, 0); + up->port.iobase = dcr_base; + + up->dcr_host = dcr_map(dn, dcr_base, dcr_len); + if (!DCR_MAP_OK(up->dcr_host)) { + printk("Cannot map DCR resources for SERIAL"); + return -1; + } + register_console(&nwpserial_console); + return 0; +} +console_initcall(nwpserial_console_init); +#endif /* CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE */ diff --git a/drivers/serial/of_serial.c b/drivers/serial/of_serial.c index 8fa0ff561e9f..a821e3a3d664 100644 --- a/drivers/serial/of_serial.c +++ b/drivers/serial/of_serial.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -99,9 +100,16 @@ static int __devinit of_platform_serial_probe(struct of_device *ofdev, goto out; switch (port_type) { +#ifdef CONFIG_SERIAL_8250 case PORT_8250 ... PORT_MAX_8250: ret = serial8250_register_port(&port); break; +#endif +#ifdef CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL + case PORT_NWPSERIAL: + ret = nwpserial_register_port(&port); + break; +#endif default: /* need to add code for these */ case PORT_UNKNOWN: @@ -129,9 +137,16 @@ static int of_platform_serial_remove(struct of_device *ofdev) { struct of_serial_info *info = ofdev->dev.driver_data; switch (info->type) { +#ifdef CONFIG_SERIAL_8250 case PORT_8250 ... PORT_MAX_8250: serial8250_unregister_port(info->line); break; +#endif +#ifdef CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL + case PORT_NWPSERIAL: + nwpserial_unregister_port(info->line); + break; +#endif default: /* need to add code for these */ break; @@ -148,6 +163,10 @@ static struct of_device_id __devinitdata of_platform_serial_table[] = { { .type = "serial", .compatible = "ns16450", .data = (void *)PORT_16450, }, { .type = "serial", .compatible = "ns16550", .data = (void *)PORT_16550, }, { .type = "serial", .compatible = "ns16750", .data = (void *)PORT_16750, }, +#ifdef CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL + { .type = "serial", .compatible = "ibm,qpace-nwp-serial", + .data = (void *)PORT_NWPSERIAL, }, +#endif { .type = "serial", .data = (void *)PORT_UNKNOWN, }, { /* end of list */ }, }; diff --git a/include/linux/nwpserial.h b/include/linux/nwpserial.h new file mode 100644 index 000000000000..9acb21572eaf --- /dev/null +++ b/include/linux/nwpserial.h @@ -0,0 +1,18 @@ +/* + * Serial Port driver for a NWP uart device + * + * Copyright (C) 2008 IBM Corp., Benjamin Krill + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+#ifndef _NWPSERIAL_H
+#define _NWPSERIAL_H
+
+int nwpserial_register_port(struct uart_port *port);
+void nwpserial_unregister_port(int line);
+
+#endif /* _NWPSERIAL_H */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index b4199841f1fc..90bbbf0b1161 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -161,6 +161,9 @@
 #define PORT_S3C6400	84
 
+/* NWPSERIAL */
+#define PORT_NWPSERIAL	85
+
 #ifdef __KERNEL__
 #include
--
cgit v1.2.3


From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Make VMAs per MM as for MMU-mode linux

Make VMAs per mm_struct as for MMU-mode linux.  This solves two problems:

 (1) In SYSV SHM where nattch for a segment does not reflect the number of
     shmat's (and forks) done.

 (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an
     exec'ing process when VM_EXECUTABLE is specified, regardless of the fact
     that a VMA might be shared and already have its vm_mm assigned to another
     process or a dead process.

A new struct (vm_region) is introduced to track a mapped region and to
remember the circumstances under which it may be shared, and the
vm_list_struct structure is discarded as it's no longer required.

This patch makes the following additional changes:

 (1) Regions are now allocated with alloc_pages() rather than kmalloc() and
     with no recourse to __GFP_COMP, so the pages are not composite.  Instead,
     each page has a reference on it held by the region.  Anything else that
     is interested in such a page will have to get a reference on it to
     retain it.  When the pages are released due to unmapping, each page is
     passed to put_page() and will be freed when the page usage count reaches
     zero.

 (2) Excess pages are trimmed after an allocation as the allocation must be
     made as a power-of-2 quantity of pages.

 (3) VMAs are added to the parent MM's R/B tree and mmap lists.  As an MM may
     end up with overlapping VMAs within the tree, the VMA struct address is
     appended to the sort key.

 (4) Non-anonymous VMAs are now added to the backing inode's prio list.

 (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of
     the backing region.  The VMA and region structs will be split if
     necessary.

 (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory
     segment instead of all the attachments at that address.  Multiple
     shmat()'s return the same address under NOMMU-mode instead of different
     virtual addresses as under MMU-mode.

 (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode.

 (8) /proc/maps is now the global list of mapped regions, and may list bits
     that aren't actually mapped anywhere.

 (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount
     of RAM currently allocated by mmap to hold mappable regions that can't
     be mapped directly.  These are copies of the backing device or file if
     not anonymous.

These changes make NOMMU mode more similar to MMU mode.  The downside is that
NOMMU mode now requires some extra memory to track things, compared to a
kernel without this patch (VMAs are no longer shared, and there are now
region structs).
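
(To make change (2) concrete, an illustrative trace: a request for 11 pages is
satisfied by an order-4 allocation of 16 contiguous pages; the trim loop then
returns the excess in power-of-2 chunks, first the 4-page block covering pages
12-15 and then the single page 11, so mmap_pages_allocated ends up charged for
exactly the 11 pages the mapping uses.)
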
Signed-off-by: David Howells Tested-by: Mike Frysinger Acked-by: Paul Mundt --- Documentation/nommu-mmap.txt | 18 +- arch/arm/include/asm/mmu.h | 1 - arch/blackfin/include/asm/mmu.h | 1 - arch/blackfin/kernel/ptrace.c | 6 +- arch/blackfin/kernel/traps.c | 11 +- arch/frv/kernel/ptrace.c | 11 +- arch/h8300/include/asm/mmu.h | 1 - arch/m68knommu/include/asm/mmu.h | 1 - arch/sh/include/asm/mmu.h | 1 - fs/binfmt_elf_fdpic.c | 27 +- fs/proc/internal.h | 2 - fs/proc/meminfo.c | 6 + fs/proc/nommu.c | 71 ++- fs/proc/task_nommu.c | 108 +++-- include/asm-frv/mmu.h | 1 - include/asm-m32r/mmu.h | 1 - include/linux/mm.h | 18 +- include/linux/mm_types.h | 18 +- ipc/shm.c | 12 + kernel/fork.c | 4 +- lib/Kconfig.debug | 7 + mm/mmap.c | 10 + mm/nommu.c | 960 +++++++++++++++++++++++++++------------ 23 files changed, 860 insertions(+), 436 deletions(-) (limited to 'include/linux') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 7714f57caad5..02b89dcf38ac 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -109,12 +109,18 @@ and it's also much more restricted in the latter case: FURTHER NOTES ON NO-MMU MMAP ============================ - (*) A request for a private mapping of less than a page in size may not return - a page-aligned buffer. This is because the kernel calls kmalloc() to - allocate the buffer, not get_free_page(). - - (*) A list of all the mappings on the system is visible through /proc/maps in - no-MMU mode. + (*) A request for a private mapping of a file may return a buffer that is not + page-aligned. This is because XIP may take place, and the data may not be + paged aligned in the backing store. + + (*) A request for an anonymous mapping will always be page aligned. If + possible the size of the request should be a power of two otherwise some + of the space may be wasted as the kernel must allocate a power-of-2 + granule but will only discard the excess if appropriately configured as + this has an effect on fragmentation. + + (*) A list of all the private copy and anonymous mappings on the system is + visible through /proc/maps in no-MMU mode. (*) A list of all the mappings in use by a process is visible through /proc//maps in no-MMU mode. diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 53099d4ee421..b561584d04a1 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -24,7 +24,6 @@ typedef struct { * modified for 2.6 by Hyok S. 
Choi */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/blackfin/include/asm/mmu.h b/arch/blackfin/include/asm/mmu.h index 757e43906ed4..dbfd686360e6 100644 --- a/arch/blackfin/include/asm/mmu.h +++ b/arch/blackfin/include/asm/mmu.h @@ -10,7 +10,6 @@ struct sram_list_struct { }; typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; unsigned long stack_start; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index d2d388536630..594e325b40e4 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -160,15 +160,15 @@ put_reg(struct task_struct *task, int regno, unsigned long data) static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; struct sram_list_struct *sraml; /* overflow */ if (start + len < start) return -EIO; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len < vml->vma->vm_end) + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) return 0; for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 17d8e4172896..5b0667da8d05 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ static void decode_address(char *buf, unsigned long address) struct mm_struct *mm; unsigned long flags, offset; unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic(); + struct rb_node *n; #ifdef CONFIG_KALLSYMS unsigned long symsize; @@ -128,9 +130,10 @@ static void decode_address(char *buf, unsigned long address) if (!mm) continue; - vml = mm->context.vmlist; - while (vml) { - struct vm_area_struct *vma = vml->vma; + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + struct vm_area_struct *vma; + + vma = rb_entry(n, struct vm_area_struct, vm_rb); if (address >= vma->vm_start && address < vma->vm_end) { char _tmpbuf[256]; @@ -176,8 +179,6 @@ static void decode_address(char *buf, unsigned long address) goto done; } - - vml = vml->next; } if (!in_atomic) mmput(mm); diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c index 709e9bdc6126..5e7d401d21e7 100644 --- a/arch/frv/kernel/ptrace.c +++ b/arch/frv/kernel/ptrace.c @@ -69,7 +69,8 @@ static inline int put_reg(struct task_struct *task, int regno, } /* - * check that an address falls within the bounds of the target process's memory mappings + * check that an address falls within the bounds of the target process's memory + * mappings */ static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) @@ -79,11 +80,11 @@ static inline int is_user_addr_valid(struct task_struct *child, return -EIO; return 0; #else - struct vm_list_struct *vml; + struct vm_area_struct *vma; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len <= vml->vma->vm_end) - return 0; + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) + return 0; return -EIO; #endif diff --git a/arch/h8300/include/asm/mmu.h b/arch/h8300/include/asm/mmu.h index 2ce06ea46104..31309969df70 100644 --- a/arch/h8300/include/asm/mmu.h +++ b/arch/h8300/include/asm/mmu.h @@ -4,7 +4,6 @@ /* 
Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/m68knommu/include/asm/mmu.h b/arch/m68knommu/include/asm/mmu.h index 5fa6b68353ba..e2da1e6f09fe 100644 --- a/arch/m68knommu/include/asm/mmu.h +++ b/arch/m68knommu/include/asm/mmu.h @@ -4,7 +4,6 @@ /* Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h index fdcb93bc6d11..6c43625bb1a5 100644 --- a/arch/sh/include/asm/mmu.h +++ b/arch/sh/include/asm/mmu.h @@ -9,7 +9,6 @@ typedef struct { mm_context_id_t id; void *vdso; #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif #ifdef CONFIG_BINFMT_ELF_FDPIC diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index aa5b43205e37..22baf1b13493 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1567,11 +1567,9 @@ end_coredump: static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { - struct vm_list_struct *vml; - - for (vml = current->mm->context.vmlist; vml; vml = vml->next) { - struct vm_area_struct *vma = vml->vma; + struct vm_area_struct *vma; + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { if (!maydump(vma, mm_flags)) continue; @@ -1617,9 +1615,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, elf_fpxregset_t *xfpu = NULL; #endif int thread_status_size = 0; -#ifndef CONFIG_MMU - struct vm_list_struct *vml; -#endif elf_addr_t *auxv; unsigned long mm_flags; @@ -1685,13 +1680,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, fill_prstatus(prstatus, current, signr); elf_core_copy_regs(&prstatus->pr_reg, regs); -#ifdef CONFIG_MMU segs = current->mm->map_count; -#else - segs = 0; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - segs++; -#endif #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; #endif @@ -1766,20 +1755,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, mm_flags = current->mm->flags; /* write program headers for segments dump */ - for ( -#ifdef CONFIG_MMU - vma = current->mm->mmap; vma; vma = vma->vm_next -#else - vml = current->mm->context.vmlist; vml; vml = vml->next -#endif - ) { + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; -#ifndef CONFIG_MMU - vma = vml->vma; -#endif - sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61ce..cd53ff838498 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -41,8 +41,6 @@ do { \ (vmi)->used = 0; \ (vmi)->largest_chunk = 0; \ } while(0) - -extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66da..43d23948384a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,6 +73,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" #endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" @@ -115,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freehigh), K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_read(&mmap_pages_allocated)), #endif 
K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d2632947..b446d7ad0b0d 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,33 +33,33 @@ #include "internal.h" /* - * display a single VMA to a sequenced file + * display a single region to a sequenced file */ -int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_region_show(struct seq_file *m, struct vm_region *region) { unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - flags = vma->vm_flags; - file = vma->vm_file; + flags = region->vm_flags; + file = region->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = region->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + region->vm_start, + region->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { @@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } /* - * display a list of all the VMAs the kernel knows about + * display a list of all the REGIONs the kernel knows about * - nommu kernals have a single flat list */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +static int nommu_region_list_show(struct seq_file *m, void *_p) { - struct vm_area_struct *vma; + struct rb_node *p = _p; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - return nommu_vma_show(m, vma); + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); } -static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) { - struct rb_node *_rb; + struct rb_node *p; loff_t pos = *_pos; - void *next = NULL; - down_read(&nommu_vma_sem); + down_read(&nommu_region_sem); - for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { - if (pos == 0) { - next = _rb; - break; - } - pos--; - } - - return next; + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; } -static void nommu_vma_list_stop(struct seq_file *m, void *v) +static void nommu_region_list_stop(struct seq_file *m, void *v) { - up_read(&nommu_vma_sem); + up_read(&nommu_region_sem); } -static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rb_next((struct rb_node *) v); } -static const struct seq_operations proc_nommu_vma_list_seqop = { - .start = nommu_vma_list_start, - .next = nommu_vma_list_next, - .stop = nommu_vma_list_stop, - .show = nommu_vma_list_show +static struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show }; -static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) { - return seq_open(file, &proc_nommu_vma_list_seqop); + return seq_open(file, &proc_nommu_region_list_seqop); } -static const struct file_operations proc_nommu_vma_list_operations = { - .open = proc_nommu_vma_list_open, +static 
const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index d4a8be32b902..ca4a48d0d311 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -15,25 +15,25 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0; down_read(&mm->mmap_sem); - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (!vml->vma) - continue; + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); - bytes += kobjsize(vml); + bytes += kobjsize(vma); if (atomic_read(&mm->mm_count) > 1 || - atomic_read(&vml->vma->vm_usage) > 1 - ) { - sbytes += kobjsize((void *) vml->vma->vm_start); - sbytes += kobjsize(vml->vma); + vma->vm_region || + vma->vm_flags & VM_MAYSHARE) { + sbytes += kobjsize((void *) vma->vm_start); + if (vma->vm_region) + sbytes += kobjsize(vma->vm_region); } else { - bytes += kobjsize((void *) vml->vma->vm_start); - bytes += kobjsize(vml->vma); - slack += kobjsize((void *) vml->vma->vm_start) - - (vml->vma->vm_end - vml->vma->vm_start); + bytes += kobjsize((void *) vma->vm_start); + slack += kobjsize((void *) vma->vm_start) - + (vma->vm_end - vma->vm_start); } } @@ -70,13 +70,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long vsize = 0; down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - if (tbp->vma) - vsize += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_region->vm_end - vma->vm_region->vm_start; } up_read(&mm->mmap_sem); return vsize; @@ -85,16 +86,15 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; int size = kobjsize(mm); down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - size += kobjsize(tbp); - if (tbp->vma) { - size += kobjsize(tbp->vma); - size += kobjsize((void *) tbp->vma->vm_start); - } + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + size += kobjsize((void *) vma->vm_start); } size += (*text = mm->end_code - mm->start_code); @@ -104,21 +104,63 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08lx 
%02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + vma->vm_pgoff << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_vml) +static int show_map(struct seq_file *m, void *_p) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; - return nommu_vma_show(m, vml->vma); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_list_struct *vml; struct mm_struct *mm; + struct rb_node *p; loff_t n = *pos; /* pin the task and mm whilst we play with them */ @@ -134,9 +176,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) } /* start from the Nth VMA */ - for (vml = mm->context.vmlist; vml; vml = vml->next) + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) - return vml; + return p; return NULL; } @@ -152,12 +194,12 @@ static void m_stop(struct seq_file *m, void *_vml) } } -static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; (*pos)++; - return vml ? vml->next : NULL; + return p ? rb_next(p) : NULL; } static const struct seq_operations proc_pid_maps_ops = { diff --git a/include/asm-frv/mmu.h b/include/asm-frv/mmu.h index 22c03714fb14..86ca0e86e7d2 100644 --- a/include/asm-frv/mmu.h +++ b/include/asm-frv/mmu.h @@ -22,7 +22,6 @@ typedef struct { unsigned long dtlb_ptd_mapping; /* [DAMR5] PTD mapping for dtlb cached PGE */ #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif diff --git a/include/asm-m32r/mmu.h b/include/asm-m32r/mmu.h index d9bd724479cf..150cb92bb666 100644 --- a/include/asm-m32r/mmu.h +++ b/include/asm-m32r/mmu.h @@ -4,7 +4,6 @@ #if !defined(CONFIG_MMU) typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a3d28c86443..b91a73fd1bcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -56,19 +56,9 @@ extern unsigned long mmap_min_addr; extern struct kmem_cache *vm_area_cachep; -/* - * This struct defines the per-mm list of VMAs for uClinux. 
If CONFIG_MMU is - * disabled, then there's a single shared list of VMAs maintained by the - * system, and mm's subscribe to these individually - */ -struct vm_list_struct { - struct vm_list_struct *next; - struct vm_area_struct *vma; -}; - #ifndef CONFIG_MMU -extern struct rb_root nommu_vma_tree; -extern struct rw_semaphore nommu_vma_sem; +extern struct rb_root nommu_region_tree; +extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif @@ -1061,6 +1051,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_pages_min(void); extern void mem_init(void); +extern void __init mmap_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -1072,6 +1063,9 @@ extern void setup_per_cpu_pageset(void); static inline void setup_per_cpu_pageset(void) {} #endif +/* nommu.c */ +extern atomic_t mmap_pages_allocated; + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9cfc9b627fdd..1c1e0d3a1714 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,22 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +/* + * A region containing a mapping of a non-memory backed file under NOMMU + * conditions. These are held in a global tree and are pinned by the VMAs that + * map parts of them. + */ +struct vm_region { + struct rb_node vm_rb; /* link in global region tree */ + unsigned long vm_flags; /* VMA vm_flags */ + unsigned long vm_start; /* start address of region */ + unsigned long vm_end; /* region initialised to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ + + atomic_t vm_usage; /* region usage count */ +}; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. 
A VM area is any part of the process virtual memory @@ -152,7 +168,7 @@ struct vm_area_struct { unsigned long vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU - atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ + struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ diff --git a/ipc/shm.c b/ipc/shm.c index b125b560240e..d0ab5527bf45 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -990,6 +990,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) */ vma = find_vma(mm, addr); +#ifdef CONFIG_MMU while (vma) { next = vma->vm_next; @@ -1034,6 +1035,17 @@ asmlinkage long sys_shmdt(char __user *shmaddr) vma = next; } +#else /* CONFIG_MMU */ + /* under NOMMU conditions, the exact address to be destroyed must be + * given */ + retval = -EINVAL; + if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + retval = 0; + } + +#endif + up_write(&mm->mmap_sem); return retval; } diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..0bce4a43bb37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1481,12 +1481,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e75478e9c69..d0a32aab03ff 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -512,6 +512,13 @@ config DEBUG_VIRTUAL If unsure, say N. +config DEBUG_NOMMU_REGIONS + bool "Debug the global anon/private NOMMU mapping region tree" + depends on DEBUG_KERNEL && !MMU + help + This option causes the global tree of anonymous and private mapping + regions to be regularly checked for invalid topology. + config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL diff --git a/mm/mmap.c b/mm/mmap.c index a910c045cfd4..749623196cb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2472,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm) mutex_unlock(&mm_all_locks_mutex); } + +/* + * initialise the VMA slab + */ +void __init mmap_init(void) +{ + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); +} diff --git a/mm/nommu.c b/mm/nommu.c index 23f355bbe262..0d363dfcf10e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -6,7 +6,7 @@ * * See Documentation/nommu-mmap.txt * - * Copyright (c) 2004-2005 David Howells + * Copyright (c) 2004-2008 David Howells * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer @@ -33,6 +33,28 @@ #include #include #include +#include "internal.h" + +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) 
\ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif #include "internal.h" @@ -46,12 +68,15 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int heap_stack_gap = 0; +atomic_t mmap_pages_allocated; + EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); -/* list of shareable VMAs */ -struct rb_root nommu_vma_tree = RB_ROOT; -DECLARE_RWSEM(nommu_vma_sem); +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; @@ -400,129 +425,174 @@ asmlinkage unsigned long sys_brk(unsigned long brk) return mm->brk = brk; } -#ifdef DEBUG -static void show_process_blocks(void) +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) { - struct vm_list_struct *vml; - - printk("Process blocks %d:", current->pid); - - for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { - printk(" %p: %p", vml, vml->vma); - if (vml->vma) - printk(" (%d @%lx #%d)", - kobjsize((void *) vml->vma->vm_start), - vml->vma->vm_start, - atomic_read(&vml->vma->vm_usage)); - printk(vml->next ? " ->" : ".\n"); - } + vm_region_jar = kmem_cache_create("vm_region_jar", + sizeof(struct vm_region), 0, + SLAB_PANIC, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); } -#endif /* DEBUG */ /* - * add a VMA into a process's mm_struct in the appropriate place in the list - * - should be called with mm->mmap_sem held writelocked + * validate the region tree + * - the caller must hold the region lock */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) { - struct vm_list_struct **ppv; + struct vm_region *region, *last; + struct rb_node *p, *lastp; - for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) - if ((*ppv)->vma->vm_start > vml->vma->vm_start) - break; + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + if (unlikely(last->vm_end <= last->vm_start)) + BUG(); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + if (unlikely(region->vm_end <= region->vm_start)) + BUG(); + if (unlikely(region->vm_start < last->vm_end)) + BUG(); - vml->next = *ppv; - *ppv = vml; + lastp = p; + } } +#else +#define validate_nommu_regions() do {} while(0) +#endif /* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * add a region into the global tree */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static void add_nommu_region(struct vm_region *region) { - struct vm_list_struct *loop, *vml; + struct vm_region *pregion; + struct rb_node **p, *parent; - /* search the vm_start ordered list */ - vml = NULL; - for (loop = mm->context.vmlist; loop; loop = loop->next) { - if (loop->vma->vm_start > addr) - break; - vml = loop; + validate_nommu_regions(); + + BUG_ON(region->vm_start & ~PAGE_MASK); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) 
+ p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); } - if (vml && vml->vma->vm_end > addr) - return vml->vma; + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); - return NULL; + validate_nommu_regions(); } -EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions + * delete a region from the global tree */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +static void delete_nommu_region(struct vm_region *region) { - return find_vma(mm, addr); -} + BUG_ON(!nommu_region_tree.rb_node); -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return -ENOMEM; + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); } /* - * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * free a contiguous series of pages */ -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, - unsigned long addr) +static void free_page_series(unsigned long from, unsigned long to) { - struct vm_list_struct *vml; - - /* search the vm_start ordered list */ - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (vml->vma->vm_start == addr) - return vml->vma; - if (vml->vma->vm_start > addr) - break; + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p [%d]", page, page_count(page)); + put_page(page); } - - return NULL; } /* - * find a VMA in the global tree + * release a reference to a region + * - the caller must hold the region semaphore, which this releases + * - the region may not have been added to the tree yet, in which case vm_end + * will equal vm_start */ -static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) { - struct vm_area_struct *vma; - struct rb_node *n = nommu_vma_tree.rb_node; + kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); - while (n) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + BUG_ON(!nommu_region_tree.rb_node); - if (start < vma->vm_start) - n = n->rb_left; - else if (start > vma->vm_start) - n = n->rb_right; - else - return vma; + if (atomic_dec_and_test(®ion->vm_usage)) { + if (region->vm_end > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_end); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); } +} - return NULL; +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); } /* - * add a VMA in the global tree + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked */ -static void add_nommu_vma(struct vm_area_struct *vma) +static void 
add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma; + struct vm_area_struct *pvma, **pp; struct address_space *mapping; - struct rb_node **p = &nommu_vma_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { @@ -533,42 +603,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* add the VMA to the master list */ + /* add the VMA to the tree */ + parent = NULL; + p = &mm->mm_rb.rb_node; while (*p) { parent = *p; pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - if (vma->vm_start < pvma->vm_start) { + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - } - else if (vma->vm_start > pvma->vm_start) { + else if (vma->vm_start > pvma->vm_start) p = &(*p)->rb_right; - } - else { - /* mappings are at the same address - this can only - * happen for shared-mem chardevs and shared file - * mappings backed by ramfs/tmpfs */ - BUG_ON(!(pvma->vm_flags & VM_SHARED)); - - if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) - p = &(*p)->rb_right; - else - BUG(); - } + else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) + p = &(*p)->rb_right; + else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); } rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &nommu_vma_tree); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { + if (pvma->vm_start > vma->vm_start) + break; + if (pvma->vm_start < vma->vm_start) + continue; + if (pvma->vm_end < vma->vm_end) + break; + } + + vma->vm_next = *pp; + *pp = vma; } /* - * delete a VMA from the global list + * delete a VMA from its owning mm_struct and address space */ -static void delete_nommu_vma(struct vm_area_struct *vma) +static void delete_vma_from_mm(struct vm_area_struct *vma) { + struct vm_area_struct **pp; struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + + kenter("%p", vma); + + mm->map_count--; + if (mm->mmap_cache == vma) + mm->mmap_cache = NULL; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -579,8 +669,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* remove from the master list */ - rb_erase(&vma->vm_rb, &nommu_vma_tree); + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { + if (*pp == vma) { + *pp = vma->vm_next; + break; + } + } + + vma->vm_mm = NULL; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + 
struct rb_node *n = mm->mm_rb.rb_node; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start <= addr && vma->vm_end > addr) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + unsigned long end = addr + len; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start == addr && vma->vm_end == end) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; } /* @@ -595,7 +792,7 @@ static int validate_mmap_request(struct file *file, unsigned long pgoff, unsigned long *_capabilities) { - unsigned long capabilities; + unsigned long capabilities, rlen; unsigned long reqprot = prot; int ret; @@ -615,12 +812,12 @@ static int validate_mmap_request(struct file *file, return -EINVAL; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) return -ENOMEM; /* offset overflow? 
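 * (i.e. would pgoff plus the mapping's length in pages wrap around?)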
*/ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; if (file) { @@ -794,9 +991,10 @@ static unsigned long determine_vm_flags(struct file *file, } /* - * set up a shared mapping on a file + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) */ -static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; @@ -814,10 +1012,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) /* * set up a private mapping or an anonymous shared mapping */ -static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len) { + struct page *pages; + unsigned long total, point, n, rlen; void *base; - int ret; + int ret, order; /* invoke the file's mapping function so that it can keep track of * shared mappings on devices or memory @@ -836,23 +1038,46 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * make a private copy of the data and map that instead */ } + rlen = PAGE_ALIGN(len); + /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL|__GFP_COMP); - if (!base) + order = get_order(rlen); + kdebug("alloc order %d for %lx", order, len); + + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) goto enomem; - vma->vm_start = (unsigned long) base; - vma->vm_end = vma->vm_start + len; - vma->vm_flags |= VM_MAPPED_COPY; + /* we allocated a power-of-2 sized page set, so we need to trim off the + * excess */ + total = 1 << order; + atomic_add(total, &mmap_pages_allocated); + + point = rlen >> PAGE_SHIFT; + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } + + total = rlen >> PAGE_SHIFT; + for (point = 1; point < total; point++) + set_page_refcounted(&pages[point]); -#ifdef WARN_ON_SLACK - if (len + WARN_ON_SLACK <= kobjsize(result)) - printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", - len, current->pid, kobjsize(result) - len); -#endif + base = page_address(pages); + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + rlen; + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; if (vma->vm_file) { /* read the contents of a file into the copy */ @@ -864,26 +1089,27 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < len) - memset(base + ret, 0, len - ret); + if (ret < rlen) + memset(base + ret, 0, rlen - ret); } else { /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, len); + memset(base, 0, rlen); } return 0; error_free: - kfree(base); - vma->vm_start = 0; + free_page_series(region->vm_start, region->vm_end); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 
0; return ret; enomem: @@ -903,13 +1129,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long flags, unsigned long pgoff) { - struct vm_list_struct *vml = NULL; - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; + struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags; - void *result; + unsigned long capabilities, vm_flags, result; int ret; + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -917,73 +1144,120 @@ unsigned long do_mmap_pgoff(struct file *file, * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) + if (ret < 0) { + kleave(" = %d [val]", ret); return ret; + } /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); - /* we're going to need to record the mapping if it works */ - vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); - if (!vml) - goto error_getting_vml; + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + atomic_set(®ion->vm_usage, 1); + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; - down_write(&nommu_vma_sem); + INIT_LIST_HEAD(&vma->anon_vma_node); + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; - /* if we want to share, we need to check for VMAs created by other + if (file) { + region->vm_file = file; + get_file(file); + vma->vm_file = file; + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other * mmap() calls that overlap with our proposed mapping - * - we can only share with an exact match on most regular files + * - we can only share with a superset match on most regular files * - shared mappings on character devices and memory backed files are * permitted to overlap inexactly as far as we are concerned for in * these cases, sharing is handled in the driver or filesystem rather * than here */ if (vm_flags & VM_MAYSHARE) { - unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vmpglen; + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; - /* suppress VMA sharing for shared regions */ - if (vm_flags & VM_SHARED && - capabilities & BDI_CAP_MAP_DIRECT) - goto dont_share_VMAs; + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; - for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { - vma = rb_entry(rb, struct vm_area_struct, vm_rb); + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (!(pregion->vm_flags & VM_MAYSHARE)) continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + if (pregion->vm_file->f_path.dentry->d_inode != + file->f_path.dentry->d_inode) continue; - if (vma->vm_pgoff >= pgoff + pglen) + if (pregion->vm_pgoff >= pgend) continue; - vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; - vmpglen >>= PAGE_SHIFT; - if (pgoff >= vma->vm_pgoff + vmpglen) + rpglen = pregion->vm_end - pregion->vm_start; 
+ rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) continue; - /* handle inexactly overlapping matches between mappings */ - if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) goto sharing_violation; continue; } - /* we've found a VMA we can share */ - atomic_inc(&vma->vm_usage); - - vml->vma = vma; - result = (void *) vma->vm_start; - goto shared; + /* we've found a region we can share */ + atomic_inc(&pregion->vm_usage); + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + atomic_dec(&pregion->vm_usage); + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; } - dont_share_VMAs: - vma = NULL; - /* obtain the address at which to make a shared mapping * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping @@ -994,102 +1268,93 @@ unsigned long do_mmap_pgoff(struct file *file, if (IS_ERR((void *) addr)) { ret = addr; if (ret != (unsigned long) -ENOSYS) - goto error; + goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ ret = (unsigned long) -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) - goto error; + goto error_just_free; capabilities &= ~BDI_CAP_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; } } } - /* we're going to need a VMA struct as well */ - vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); - if (!vma) - goto error_getting_vma; - - INIT_LIST_HEAD(&vma->anon_vma_node); - atomic_set(&vma->vm_usage, 1); - if (file) { - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } - } - vma->vm_file = file; - vma->vm_flags = vm_flags; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - - vml->vma = vma; + vma->vm_region = region; /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) - ret = do_mmap_shared_file(vma, len); + ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, len); + ret = do_mmap_private(vma, region, len); if (ret < 0) - goto error; + goto error_put_region; + + add_nommu_region(region); /* okay... 
we have a mapping; now we have to register it */ - result = (void *) vma->vm_start; + result = vma->vm_start; current->mm->total_vm += len >> PAGE_SHIFT; - add_nommu_vma(vma); +share: + add_vma_to_mm(current->mm, vma); - shared: - add_vma_to_mm(current->mm, vml); - - up_write(&nommu_vma_sem); + up_write(&nommu_region_sem); if (prot & PROT_EXEC) - flush_icache_range((unsigned long) result, - (unsigned long) result + len); + flush_icache_range(result, result + len); -#ifdef DEBUG - printk("do_mmap:\n"); - show_process_blocks(); -#endif + kleave(" = %lx", result); + return result; - return (unsigned long) result; - - error: - up_write(&nommu_vma_sem); - kfree(vml); +error_put_region: + __put_nommu_region(region); if (vma) { if (vma->vm_file) { fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); } - kfree(vma); + kmem_cache_free(vm_area_cachep, vma); } + kleave(" = %d [pr]", ret); return ret; - sharing_violation: - up_write(&nommu_vma_sem); - printk("Attempt to share mismatched mappings\n"); - kfree(vml); - return -EINVAL; +error_just_free: + up_write(&nommu_region_sem); +error: + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); + return ret; + +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; - error_getting_vma: - up_write(&nommu_vma_sem); - kfree(vml); - printk("Allocation of vma for %lu byte allocation from process %d failed\n", +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; - error_getting_vml: - printk("Allocation of vml for %lu byte allocation from process %d failed\n", +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; @@ -1097,77 +1362,180 @@ unsigned long do_mmap_pgoff(struct file *file, EXPORT_SYMBOL(do_mmap_pgoff); /* - * handle mapping disposal for uClinux + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. 
*/ -static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { - if (vma) { - down_write(&nommu_vma_sem); + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; - if (atomic_dec_and_test(&vma->vm_usage)) { - delete_nommu_vma(vma); + kenter(""); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + /* we're only permitted to split anonymous regions that have a single + * owner */ + if (vma->vm_file || + atomic_read(&vma->vm_region->vm_usage) != 1) + return -ENOMEM; - /* IO memory and memory shared directly out of the pagecache from - * ramfs/tmpfs mustn't be released here */ - if (vma->vm_flags & VM_MAPPED_COPY) - kfree((void *) vma->vm_start); + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } - kfree(vma); - } + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; + + if (new_below) { + region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } - up_write(&nommu_vma_sem); + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; } /* - * release a mapping - * - under NOMMU conditions the parameters must match exactly to the mapping to - * be removed + * shrink a VMA by removing the specified chunk from either the beginning or + * the end */ -int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) { - struct vm_list_struct *vml, **parent; - unsigned long end = addr + len; + struct vm_region *region; -#ifdef DEBUG - printk("do_munmap:\n"); -#endif + kenter(""); - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { - if ((*parent)->vma->vm_start > addr) - break; - if ((*parent)->vma->vm_start == addr && - ((len == 0) || ((*parent)->vma->vm_end == end))) - goto found; - } + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); - printk("munmap of non-mmaped memory by process %d (%s): %p\n", - current->pid, current->comm, (void *) addr); - return -EINVAL; + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(atomic_read(®ion->vm_usage) != 1); - found: - vml = *parent; + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) + region->vm_end = from; + else + 
region->vm_start = to; + add_nommu_region(region); + up_write(&nommu_region_sem); - put_vma(mm, vml->vma); + free_page_series(from, to); + return 0; +} - *parent = vml->next; - kfree(vml); +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + struct rb_node *rb; + unsigned long end = start + len; + int ret; - update_hiwater_vm(mm); - mm->total_vm -= len >> PAGE_SHIFT; + kenter(",%lx,%zx", start, len); -#ifdef DEBUG - show_process_blocks(); -#endif + if (len == 0) + return -EINVAL; + + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d (%s):" + " 0x%lx-0x%lx\n", + current->pid, current->comm, start, start + len - 1); + return -EINVAL; + } + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + rb = rb_next(&vma->vm_rb); + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + } while (rb); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } + +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1184,29 +1552,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) } /* - * Release all mappings + * release all the mappings made in a process's VM space */ -void exit_mmap(struct mm_struct * mm) +void exit_mmap(struct mm_struct *mm) { - struct vm_list_struct *tmp; + struct vm_area_struct *vma; - if (mm) { -#ifdef DEBUG - printk("Exit_mmap:\n"); -#endif + if (!mm) + return; - mm->total_vm = 0; + kenter(""); - while ((tmp = mm->context.vmlist)) { - mm->context.vmlist = tmp->next; - put_vma(mm, tmp->vma); - kfree(tmp); - } + mm->total_vm = 0; -#ifdef DEBUG - show_process_blocks(); -#endif + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); } + + kleave(""); } unsigned long do_brk(unsigned long addr, unsigned long len) @@ -1219,8 +1584,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * under NOMMU conditions, we only permit changing a mapping's size, and only - * as long as it stays within the hole allocated by the kmalloc() call in - * do_mmap_pgoff() and the block is not shareable + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable * * MREMAP_FIXED is not supported under NOMMU conditions */ @@ -1231,13 +1596,16 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct 
*vma; /* insanity checks first */ - if (new_len == 0) + if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - vma = find_vma_exact(current->mm, addr); + vma = find_vma_exact(current->mm, addr, old_len); if (!vma) return (unsigned long) -EINVAL; @@ -1247,19 +1615,19 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; - if (new_len > kobjsize((void *) addr)) + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) return (unsigned long) -ENOMEM; /* all checks complete - do it */ vma->vm_end = vma->vm_start + new_len; - return vma->vm_start; } EXPORT_SYMBOL(do_mremap); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +asmlinkage +unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) { unsigned long ret; -- cgit v1.2.3 From dd8632a12e500a684478fea0951f380478d56fed Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make mmap allocation page trimming behaviour configurable. NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to the nearest power-of-2 number of pages. Currently it then discards the excess pages back to the page allocator, making that memory available for use by other things. This can, however, cause a greater amount of fragmentation. To counter this, a sysctl is added in order to fine-tune the trimming behaviour. The default behaviour remains to trim pages aggressively, while this can either be disabled completely or set to a higher page-granular watermark in order to have finer-grained control. vm region vm_top bits taken from an earlier patch by David Howells. Signed-off-by: Paul Mundt Signed-off-by: David Howells Tested-by: Mike Frysinger --- Documentation/nommu-mmap.txt | 15 ++++++++++ Documentation/sysctl/vm.txt | 18 ++++++++++++ include/linux/mm_types.h | 1 + kernel/sysctl.c | 14 ++++++++++ mm/nommu.c | 65 ++++++++++++++++++++++++++++---------------- 5 files changed, 90 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 02b89dcf38ac..b565e8279d13 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -248,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT Provision of shared mappings on block device files is exactly the same as for character devices. If there isn't a real device underneath, then the driver should allocate sufficient contiguous memory to honour any supported mapping. + + +================================= +ADJUSTING PAGE TRIMMING BEHAVIOUR +================================= +
+NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages +when performing an allocation. This can have adverse effects on memory +fragmentation, and as such, is left configurable. The default behaviour is to +aggressively trim allocations and discard any excess pages back into the page +allocator. In order to retain finer-grained control over fragmentation, this +behaviour can either be disabled completely, or bumped up to a higher page +watermark where trimming begins. + +Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'. 
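As a rough sketch of how this could be adjusted from userspace (illustrative only, not part of the patch; the procfs path follows from the sysctl name above):

	/* Hypothetical example: disable NOMMU mmap page trimming. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/vm/nr_trim_pages", "w");

		if (!f) {
			perror("nr_trim_pages");
			return 1;
		}
		fprintf(f, "0\n");	/* 0 disables trimming entirely */
		fclose(f);
		return 0;
	}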
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index cd05994a49e6..a3415070bcac 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - nr_hugepages - nr_overcommit_hugepages +- nr_trim_pages (only if CONFIG_MMU=n) ============================================================== @@ -348,3 +349,20 @@ Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. See Documentation/vm/hugetlbpage.txt + +============================================================== + +nr_trim_pages + +This is available only on NOMMU kernels. + +This value adjusts the excess page trimming behaviour of power-of-2 aligned +NOMMU mmap allocations. + +A value of 0 disables trimming of allocations entirely, while a value of 1 +trims excess pages aggressively. Any value >= 1 acts as the watermark where +trimming of allocations is initiated. + +The default value is 1. + +See Documentation/nommu-mmap.txt for more information. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c1e0d3a1714..92915e81443f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -106,6 +106,7 @@ struct vm_region { unsigned long vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ + unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 92f6e5bc3c24..89d74436318c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ @@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, diff --git a/mm/nommu.c b/mm/nommu.c index 0d363dfcf10e..a6e8ccfbd400 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer - * Copyright (c) 2007 Paul Mundt + * Copyright (c) 2007-2008 Paul Mundt */ #include @@ -66,6 +66,7 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; atomic_t mmap_pages_allocated; @@ -455,6 +456,8 @@ static noinline void validate_nommu_regions(void) last = rb_entry(lastp, struct vm_region, vm_rb); if (unlikely(last->vm_end <= last->vm_start)) BUG(); + if (unlikely(last->vm_top < last->vm_end)) + BUG(); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); @@ -462,7 +465,9 @@ static noinline void 
validate_nommu_regions(void) if (unlikely(region->vm_end <= region->vm_start)) BUG(); - if (unlikely(region->vm_start < last->vm_end)) + if (unlikely(region->vm_top < region->vm_end)) + BUG(); + if (unlikely(region->vm_start < last->vm_top)) BUG(); lastp = p; @@ -536,7 +541,7 @@ static void free_page_series(unsigned long from, unsigned long to) /* * release a reference to a region * - the caller must hold the region semaphore, which this releases - * - the region may not have been added to the tree yet, in which case vm_end + * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ static void __put_nommu_region(struct vm_region *region) @@ -547,7 +552,7 @@ static void __put_nommu_region(struct vm_region *region) BUG_ON(!nommu_region_tree.rb_node); if (atomic_dec_and_test(®ion->vm_usage)) { - if (region->vm_end > region->vm_start) + if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -558,7 +563,7 @@ static void __put_nommu_region(struct vm_region *region) * from ramfs/tmpfs mustn't be released here */ if (region->vm_flags & VM_MAPPED_COPY) { kdebug("free series"); - free_page_series(region->vm_start, region->vm_end); + free_page_series(region->vm_start, region->vm_top); } kmem_cache_free(vm_region_jar, region); } else { @@ -999,6 +1004,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) int ret; ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; + } if (ret != -ENOSYS) return ret; @@ -1027,11 +1036,14 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (vma->vm_file) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); - if (ret != -ENOSYS) { + if (ret == 0) { /* shouldn't return success if we're not sharing */ - BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); - return ret; /* success or a real error */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; } + if (ret != -ENOSYS) + return ret; /* getting an ENOSYS error indicates that direct mmap isn't * possible (as opposed to tried but failed) so we'll try to @@ -1051,23 +1063,25 @@ static int do_mmap_private(struct vm_area_struct *vma, if (!pages) goto enomem; - /* we allocated a power-of-2 sized page set, so we need to trim off the - * excess */ total = 1 << order; atomic_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; - while (total > point) { - order = ilog2(total - point); - n = 1 << order; - kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); - total -= n; - set_page_refcounted(pages + total); - __free_pages(pages + total, order); + + /* we allocated a power-of-2 sized page set, so we may want to trim off + * the excess */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } } - total = rlen >> PAGE_SHIFT; for (point = 1; point < total; point++) set_page_refcounted(&pages[point]); @@ -1075,6 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + rlen; + region->vm_top = region->vm_start + (total << 
PAGE_SHIFT); vma->vm_start = region->vm_start; vma->vm_end = region->vm_start + len; @@ -1110,6 +1125,7 @@ error_free: free_page_series(region->vm_start, region->vm_end); region->vm_start = vma->vm_start = 0; region->vm_end = vma->vm_end = 0; + region->vm_top = 0; return ret; enomem: @@ -1401,7 +1417,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, npages = (addr - vma->vm_start) >> PAGE_SHIFT; if (new_below) { - region->vm_end = new->vm_end = addr; + region->vm_top = region->vm_end = new->vm_end = addr; } else { region->vm_start = new->vm_start = addr; region->vm_pgoff = new->vm_pgoff += npages; @@ -1418,6 +1434,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; } else { vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; } add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); @@ -1454,10 +1471,12 @@ static int shrink_vma(struct mm_struct *mm, down_write(&nommu_region_sem); delete_nommu_region(region); - if (from > region->vm_start) - region->vm_end = from; - else + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { region->vm_start = to; + } add_nommu_region(region); up_write(&nommu_region_sem); -- cgit v1.2.3 From e2387d6c20752ccdb2895ba5de664fa39652f4cc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 17 Nov 2008 14:35:44 +0000 Subject: leds: Make header variable naming consistent There is one place where the struct led_classdev function argument is named differently. Fix it. Signed-off-by: Wolfram Sang Signed-off-by: Richard Purdie --- include/linux/leds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index d3a73f5a48c3..3c1a8ce6a5ea 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -62,7 +62,7 @@ struct led_classdev { extern int led_classdev_register(struct device *parent, struct led_classdev *led_cdev); -extern void led_classdev_unregister(struct led_classdev *lcd); +extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void led_classdev_suspend(struct led_classdev *led_cdev); extern void led_classdev_resume(struct led_classdev *led_cdev); -- cgit v1.2.3 From 934cd3f979a1daacbd403398f2c7a8f6720c33aa Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 3 Dec 2008 08:21:36 +0000 Subject: leds: leds-pca9532 - Move i2c work to a workqueue Apparently these might be called in atomic context, and i2c operations may sleep. BUG found by Ross Burton Signed-off-by: Riku Voipio Signed-off-by: Richard Purdie --- drivers/leds/leds-pca9532.c | 51 +++++++++++++++++++++++++++++++++++++------- include/linux/leds-pca9532.h | 2 ++ 2 files changed, 45 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index 62b60a038e2c..76ec7498e2d5 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -16,6 +16,7 @@ #include #include #include +#include #include static const unsigned short normal_i2c[] = { /*0x60,*/ I2C_CLIENT_END}; @@ -34,6 +35,7 @@ struct pca9532_data { struct pca9532_led leds[16]; struct mutex update_lock; struct input_dev *idev; + struct work_struct work; u8 pwm[2]; u8 psc[2]; }; @@ -63,7 +65,7 @@ static struct i2c_driver pca9532_driver = { * as a compromise we average one pwm to the values requested by all * leds that are not ON/OFF. 
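* e.g. two LEDs requesting brightness 100 and 200 would share a single * PWM set to (100 + 200) / 2 = 150 (figures for illustration only). 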
* */ -static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, +static int pca9532_calcpwm(struct i2c_client *client, int pwm, int blink, enum led_brightness value) { int a = 0, b = 0, i = 0; @@ -84,11 +86,17 @@ static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, b = b/a; if (b > 0xFF) return -EINVAL; - mutex_lock(&data->update_lock); data->pwm[pwm] = b; + data->psc[pwm] = blink; + return 0; +} + +static int pca9532_setpwm(struct i2c_client *client, int pwm) +{ + struct pca9532_data *data = i2c_get_clientdata(client); + mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(client, PCA9532_REG_PWM(pwm), data->pwm[pwm]); - data->psc[pwm] = blink; i2c_smbus_write_byte_data(client, PCA9532_REG_PSC(pwm), data->psc[pwm]); mutex_unlock(&data->update_lock); @@ -124,11 +132,11 @@ static void pca9532_set_brightness(struct led_classdev *led_cdev, led->state = PCA9532_ON; else { led->state = PCA9532_PWM0; /* Thecus: hardcode one pwm */ - err = pca9532_setpwm(led->client, 0, 0, value); + err = pca9532_calcpwm(led->client, 0, 0, value); if (err) return; /* XXX: led api doesn't allow error code? */ } - pca9532_setled(led); + schedule_work(&led->work); } static int pca9532_set_blink(struct led_classdev *led_cdev, @@ -137,6 +145,7 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, struct pca9532_led *led = ldev_to_led(led_cdev); struct i2c_client *client = led->client; int psc; + int err = 0; if (*delay_on == 0 && *delay_off == 0) { /* led subsystem ask us for a blink rate */ @@ -148,7 +157,11 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, /* Thecus specific: only use PSC/PWM 0 */ psc = (*delay_on * 152-1)/1000; - return pca9532_setpwm(client, 0, psc, led_cdev->brightness); + err = pca9532_calcpwm(client, 0, psc, led_cdev->brightness); + if (err) + return err; + schedule_work(&led->work); + return 0; } static int pca9532_event(struct input_dev *dev, unsigned int type, @@ -165,13 +178,28 @@ static int pca9532_event(struct input_dev *dev, unsigned int type, else data->pwm[1] = 0; - dev_info(&dev->dev, "setting beep to %d \n", data->pwm[1]); + schedule_work(&data->work); + + return 0; +} + +static void pca9532_input_work(struct work_struct *work) +{ + struct pca9532_data *data; + data = container_of(work, struct pca9532_data, work); mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(data->client, PCA9532_REG_PWM(1), data->pwm[1]); mutex_unlock(&data->update_lock); +} - return 0; +static void pca9532_led_work(struct work_struct *work) +{ + struct pca9532_led *led; + led = container_of(work, struct pca9532_led, work); + if (led->state == PCA9532_PWM0) + pca9532_setpwm(led->client, 0); + pca9532_setled(led); } static int pca9532_configure(struct i2c_client *client, @@ -204,6 +232,7 @@ static int pca9532_configure(struct i2c_client *client, led->ldev.brightness = LED_OFF; led->ldev.brightness_set = pca9532_set_brightness; led->ldev.blink_set = pca9532_set_blink; + INIT_WORK(&led->work, pca9532_led_work); err = led_classdev_register(&client->dev, &led->ldev); if (err < 0) { dev_err(&client->dev, @@ -233,9 +262,11 @@ static int pca9532_configure(struct i2c_client *client, BIT_MASK(SND_TONE); data->idev->event = pca9532_event; input_set_drvdata(data->idev, data); + INIT_WORK(&data->work, pca9532_input_work); err = input_register_device(data->idev); if (err) { input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; goto exit; } @@ -252,11 +283,13 @@ exit: break; case PCA9532_TYPE_LED: 
led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; @@ -307,11 +340,13 @@ static int pca9532_remove(struct i2c_client *client) break; case PCA9532_TYPE_LED: led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index 81b4207deb95..96eea90f01a8 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -15,6 +15,7 @@ #define __LINUX_PCA9532_H #include +#include enum pca9532_state { PCA9532_OFF = 0x0, @@ -31,6 +32,7 @@ struct pca9532_led { struct i2c_client *client; char *name; struct led_classdev ldev; + struct work_struct work; enum pca9532_type type; enum pca9532_state state; }; -- cgit v1.2.3 From 0081e8020ebd814a99e45720a10e869a54ee08a6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 4 Dec 2008 16:52:33 +0000 Subject: leds: Add WM8350 LED driver The voltage and current regulators on the WM8350 AudioPlus PMIC can be used in concert to provide a power-efficient LED driver. This driver implements support for this within the standard LED class. Platform initialisation code should configure the LED hardware in the init callback provided by the WM8350 core driver. The callback should use wm8350_isink_set_flash(), wm8350_dcdc25_set_mode() and wm8350_dcdc_set_slot() to configure the operating parameters of the regulators for their hardware and then use wm8350_register_led() to instantiate the LED driver. This driver was originally written by Liam Girdwood, though it has been extensively modified since then. Signed-off-by: Mark Brown Signed-off-by: Richard Purdie --- drivers/leds/Kconfig | 7 + drivers/leds/Makefile | 1 + drivers/leds/leds-wm8350.c | 333 +++++++++++++++++++++++++++++++++++ drivers/mfd/wm8350-core.c | 3 + drivers/regulator/wm8350-regulator.c | 91 ++++++++++ include/linux/mfd/wm8350/pmic.h | 36 ++++ 6 files changed, 471 insertions(+) create mode 100644 drivers/leds/leds-wm8350.c (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 3d0d2625f6ab..a4a1ae214630 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -164,6 +164,13 @@ config LEDS_PCA955X LED driver chips accessed via the I2C bus. Supported devices include PCA9550, PCA9551, PCA9552, and PCA9553. +config LEDS_WM8350 + tristate "LED Support for WM8350 AudioPlus PMIC" + depends on LEDS_CLASS && MFD_WM8350 + help + This option enables support for LEDs driven by the Wolfson + Microelectronics WM8350 AudioPlus PMIC. 
+ config LEDS_DA903X tristate "LED Support for DA9030/DA9034 PMIC" depends on LEDS_CLASS && PMIC_DA903X diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index ff4616a49043..bc247cb02e82 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_HP_DISK) += leds-hp-disk.o +obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o # LED Triggers obj-$(CONFIG_LEDS_TRIGGER_TIMER) += ledtrig-timer.o diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c new file mode 100644 index 000000000000..846ed978103f --- /dev/null +++ b/drivers/leds/leds-wm8350.c @@ -0,0 +1,333 @@ +/* + * LED driver for WM8350 driven LEDS. + * + * Copyright(C) 2007, 2008 Wolfson Microelectronics PLC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Microamps */ +static const int isink_cur[] = { + 4, + 5, + 6, + 7, + 8, + 10, + 11, + 14, + 16, + 19, + 23, + 27, + 32, + 39, + 46, + 54, + 65, + 77, + 92, + 109, + 130, + 154, + 183, + 218, + 259, + 308, + 367, + 436, + 518, + 616, + 733, + 872, + 1037, + 1233, + 1466, + 1744, + 2073, + 2466, + 2933, + 3487, + 4147, + 4932, + 5865, + 6975, + 8294, + 9864, + 11730, + 13949, + 16589, + 19728, + 23460, + 27899, + 33178, + 39455, + 46920, + 55798, + 66355, + 78910, + 93840, + 111596, + 132710, + 157820, + 187681, + 223191 +}; + +#define to_wm8350_led(led_cdev) \ + container_of(led_cdev, struct wm8350_led, cdev) + +static void wm8350_led_enable(struct wm8350_led *led) +{ + int ret; + + if (led->enabled) + return; + + ret = regulator_enable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable ISINK: %d\n", ret); + return; + } + + ret = regulator_enable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable DCDC: %d\n", ret); + regulator_disable(led->isink); + return; + } + + led->enabled = 1; +} + +static void wm8350_led_disable(struct wm8350_led *led) +{ + int ret; + + if (!led->enabled) + return; + + ret = regulator_disable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable DCDC: %d\n", ret); + return; + } + + ret = regulator_disable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable ISINK: %d\n", ret); + regulator_enable(led->dcdc); + return; + } + + led->enabled = 0; +} + +static void led_work(struct work_struct *work) +{ + struct wm8350_led *led = container_of(work, struct wm8350_led, work); + int ret; + int uA; + unsigned long flags; + + mutex_lock(&led->mutex); + + spin_lock_irqsave(&led->value_lock, flags); + + if (led->value == LED_OFF) { + spin_unlock_irqrestore(&led->value_lock, flags); + wm8350_led_disable(led); + goto out; + } + + /* This scales linearly into the index of valid current + * settings which results in a linear scaling of perceived + * brightness due to the non-linear current settings provided + * by the hardware. 
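+ * For example (figures for illustration only): with max_uA_index = 40 + * and value = 128, the index computed below is (40 * 128) / 255 = 20, + * LED_FULL being 255. 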
+ */ + uA = (led->max_uA_index * led->value) / LED_FULL; + spin_unlock_irqrestore(&led->value_lock, flags); + BUG_ON(uA >= ARRAY_SIZE(isink_cur)); + + ret = regulator_set_current_limit(led->isink, isink_cur[uA], + isink_cur[uA]); + if (ret != 0) + dev_err(led->cdev.dev, "Failed to set %duA: %d\n", + isink_cur[uA], ret); + + wm8350_led_enable(led); + +out: + mutex_unlock(&led->mutex); +} + +static void wm8350_led_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct wm8350_led *led = to_wm8350_led(led_cdev); + unsigned long flags; + + spin_lock_irqsave(&led->value_lock, flags); + led->value = value; + schedule_work(&led->work); + spin_unlock_irqrestore(&led->value_lock, flags); +} + +#ifdef CONFIG_PM +static int wm8350_led_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + led_classdev_suspend(&led->cdev); + return 0; +} + +static int wm8350_led_resume(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + led_classdev_resume(&led->cdev); + return 0; +} +#else +#define wm8350_led_suspend NULL +#define wm8350_led_resume NULL +#endif + +static void wm8350_led_shutdown(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + mutex_lock(&led->mutex); + led->value = LED_OFF; + wm8350_led_disable(led); + mutex_unlock(&led->mutex); +} + +static int wm8350_led_probe(struct platform_device *pdev) +{ + struct regulator *isink, *dcdc; + struct wm8350_led *led; + struct wm8350_led_platform_data *pdata = pdev->dev.platform_data; + int ret, i; + + if (pdata == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + return -ENODEV; + } + + if (pdata->max_uA < isink_cur[0]) { + dev_err(&pdev->dev, "Invalid maximum current %duA\n", + pdata->max_uA); + return -EINVAL; + } + + isink = regulator_get(&pdev->dev, "led_isink"); + if (IS_ERR(isink)) { + printk(KERN_ERR "%s: cant get ISINK\n", __func__); + return PTR_ERR(isink); + } + + dcdc = regulator_get(&pdev->dev, "led_vcc"); + if (IS_ERR(dcdc)) { + printk(KERN_ERR "%s: cant get DCDC\n", __func__); + ret = PTR_ERR(dcdc); + goto err_isink; + } + + led = kzalloc(sizeof(*led), GFP_KERNEL); + if (led == NULL) { + ret = -ENOMEM; + goto err_dcdc; + } + + led->cdev.brightness_set = wm8350_led_set; + led->cdev.default_trigger = pdata->default_trigger; + led->cdev.name = pdata->name; + led->enabled = regulator_is_enabled(isink); + led->isink = isink; + led->dcdc = dcdc; + + for (i = 0; i < ARRAY_SIZE(isink_cur) - 1; i++) + if (isink_cur[i] >= pdata->max_uA) + break; + led->max_uA_index = i; + if (pdata->max_uA != isink_cur[i]) + dev_warn(&pdev->dev, + "Maximum current %duA is not directly supported," + " check platform data\n", + pdata->max_uA); + + spin_lock_init(&led->value_lock); + mutex_init(&led->mutex); + INIT_WORK(&led->work, led_work); + led->value = LED_OFF; + platform_set_drvdata(pdev, led); + + ret = led_classdev_register(&pdev->dev, &led->cdev); + if (ret < 0) + goto err_led; + + return 0; + + err_led: + kfree(led); + err_dcdc: + regulator_put(dcdc); + err_isink: + regulator_put(isink); + return ret; +} + +static int wm8350_led_remove(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + led_classdev_unregister(&led->cdev); + flush_scheduled_work(); + wm8350_led_disable(led); + regulator_put(led->dcdc); + regulator_put(led->isink); + kfree(led); + return 0; +} + +static struct platform_driver wm8350_led_driver = { + .driver = { + .name = "wm8350-led", + .owner 
= THIS_MODULE, + }, + .probe = wm8350_led_probe, + .remove = wm8350_led_remove, + .shutdown = wm8350_led_shutdown, + .suspend = wm8350_led_suspend, + .resume = wm8350_led_resume, +}; + +static int __devinit wm8350_led_init(void) +{ + return platform_driver_register(&wm8350_led_driver); +} +module_init(wm8350_led_init); + +static void wm8350_led_exit(void) +{ + platform_driver_unregister(&wm8350_led_driver); +} +module_exit(wm8350_led_exit); + +MODULE_AUTHOR("Mark Brown"); +MODULE_DESCRIPTION("WM8350 LED driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:wm8350-led"); diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index 3a273ccef3f2..f92595c8f165 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1453,6 +1453,9 @@ void wm8350_device_exit(struct wm8350 *wm8350) { int i; + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.led); i++) + platform_device_unregister(wm8350->pmic.led[i].pdev); + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.pdev); i++) platform_device_unregister(wm8350->pmic.pdev[i]); diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index c68c496b2c49..7aa35248181b 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1412,6 +1412,97 @@ int wm8350_register_regulator(struct wm8350 *wm8350, int reg, } EXPORT_SYMBOL_GPL(wm8350_register_regulator); +/** + * wm8350_register_led - Register a WM8350 LED output + * + * @param wm8350 The WM8350 device to configure. + * @param lednum LED device index to create. + * @param dcdc The DCDC to use for the LED. + * @param isink The ISINK to use for the LED. + * @param pdata Configuration for the LED. + * + * The WM8350 supports the use of an ISINK together with a DCDC to + * provide a power-efficient LED driver. This function registers the + * regulators and instantiates the platform device for a LED. The + * operating modes for the LED regulators must be configured using + * wm8350_isink_set_flash(), wm8350_dcdc25_set_mode() and + * wm8350_dcdc_set_slot() prior to calling this function. 
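+ * + * A minimal call from board initialisation code might look like the + * following sketch; the platform data values and the dcdc argument are + * illustrative only: + * + *	static struct wm8350_led_platform_data board_led = { + *		.name = "wm8350:led", + *		.default_trigger = "heartbeat", + *		.max_uA = 27899, + *	}; + * + *	wm8350_register_led(wm8350, 0, dcdc, WM8350_ISINK_A, &board_led); 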
+ */ +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata) +{ + struct wm8350_led *led; + struct platform_device *pdev; + int ret; + + if (lednum > ARRAY_SIZE(wm8350->pmic.led) || lednum < 0) { + dev_err(wm8350->dev, "Invalid LED index %d\n", lednum); + return -ENODEV; + } + + led = &wm8350->pmic.led[lednum]; + + if (led->pdev) { + dev_err(wm8350->dev, "LED %d already allocated\n", lednum); + return -EINVAL; + } + + pdev = platform_device_alloc("wm8350-led", lednum); + if (pdev == NULL) { + dev_err(wm8350->dev, "Failed to allocate LED %d\n", lednum); + return -ENOMEM; + } + + led->isink_consumer.dev = &pdev->dev; + led->isink_consumer.supply = "led_isink"; + led->isink_init.num_consumer_supplies = 1; + led->isink_init.consumer_supplies = &led->isink_consumer; + led->isink_init.constraints.min_uA = 0; + led->isink_init.constraints.max_uA = pdata->max_uA; + led->isink_init.constraints.valid_ops_mask = REGULATOR_CHANGE_CURRENT; + led->isink_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, isink, &led->isink_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + led->dcdc_consumer.dev = &pdev->dev; + led->dcdc_consumer.supply = "led_vcc"; + led->dcdc_init.num_consumer_supplies = 1; + led->dcdc_init.consumer_supplies = &led->dcdc_consumer; + led->dcdc_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, dcdc, &led->dcdc_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + switch (isink) { + case WM8350_ISINK_A: + wm8350->pmic.isink_A_dcdc = dcdc; + break; + case WM8350_ISINK_B: + wm8350->pmic.isink_B_dcdc = dcdc; + break; + } + + pdev->dev.platform_data = pdata; + pdev->dev.parent = wm8350->dev; + ret = platform_device_add(pdev); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to register LED %d: %d\n", + lednum, ret); + platform_device_put(pdev); + return ret; + } + + led->pdev = pdev; + + return 0; +} +EXPORT_SYMBOL_GPL(wm8350_register_led); + static struct platform_driver wm8350_regulator_driver = { .probe = wm8350_regulator_probe, .remove = wm8350_regulator_remove, diff --git a/include/linux/mfd/wm8350/pmic.h b/include/linux/mfd/wm8350/pmic.h index 96acbfc8aa12..be3264e286e0 100644 --- a/include/linux/mfd/wm8350/pmic.h +++ b/include/linux/mfd/wm8350/pmic.h @@ -13,6 +13,10 @@ #ifndef __LINUX_MFD_WM8350_PMIC_H #define __LINUX_MFD_WM8350_PMIC_H +#include +#include +#include + /* * Register values. 
*/ @@ -700,6 +704,33 @@ struct wm8350; struct platform_device; struct regulator_init_data; +/* + * WM8350 LED platform data + */ +struct wm8350_led_platform_data { + const char *name; + const char *default_trigger; + int max_uA; +}; + +struct wm8350_led { + struct platform_device *pdev; + struct mutex mutex; + struct work_struct work; + spinlock_t value_lock; + enum led_brightness value; + struct led_classdev cdev; + int max_uA_index; + int enabled; + + struct regulator *isink; + struct regulator_consumer_supply isink_consumer; + struct regulator_init_data isink_init; + struct regulator *dcdc; + struct regulator_consumer_supply dcdc_consumer; + struct regulator_init_data dcdc_init; +}; + struct wm8350_pmic { /* Number of regulators of each type on this device */ int max_dcdc; @@ -717,10 +748,15 @@ struct wm8350_pmic { /* regulator devices */ struct platform_device *pdev[NUM_WM8350_REGULATORS]; + + /* LED devices */ + struct wm8350_led led[2]; }; int wm8350_register_regulator(struct wm8350 *wm8350, int reg, struct regulator_init_data *initdata); +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata); /* * Additional DCDC control not supported via regulator API -- cgit v1.2.3 From c835ee7f4154992e6cf0674d7ee136f5d36247a4 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Tue, 6 Jan 2009 21:00:19 +0000 Subject: backlight: Add suspend/resume support to the backlight core Add suspend/resume support to the backlight core and enable use of it by appropriate drivers. Signed-off-by: Richard Purdie --- drivers/video/backlight/backlight.c | 34 +++++++++++++++++++++++++ drivers/video/backlight/corgi_bl.c | 45 +++++++++------------------------ drivers/video/backlight/mbp_nvidia_bl.c | 1 + include/linux/backlight.h | 16 ++++++++++++ 4 files changed, 63 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index a9c013bb9f20..157057c79ca3 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -40,6 +40,10 @@ static int fb_notifier_callback(struct notifier_block *self, if (!bd->ops->check_fb || bd->ops->check_fb(evdata->info)) { bd->props.fb_blank = *(int *)evdata->data; + if (bd->props.fb_blank == FB_BLANK_UNBLANK) + bd->props.state &= ~BL_CORE_FBBLANK; + else + bd->props.state |= BL_CORE_FBBLANK; backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); @@ -165,6 +169,34 @@ static ssize_t backlight_show_actual_brightness(struct device *dev, static struct class *backlight_class; +static int backlight_suspend(struct device *dev, pm_message_t state) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state |= BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + +static int backlight_resume(struct device *dev) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state &= ~BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + static void bl_device_release(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); @@ -281,6 +313,8 @@ static int __init backlight_class_init(void) } backlight_class->dev_attrs = bl_device_attributes; + backlight_class->suspend = backlight_suspend; + 
backlight_class->resume = backlight_resume; return 0; } diff --git a/drivers/video/backlight/corgi_bl.c b/drivers/video/backlight/corgi_bl.c index 4d4d037e3ec9..c6abf51ee3e0 100644 --- a/drivers/video/backlight/corgi_bl.c +++ b/drivers/video/backlight/corgi_bl.c @@ -24,9 +24,8 @@ static struct backlight_properties corgibl_data; static struct backlight_device *corgi_backlight_device; static struct generic_bl_info *bl_machinfo; -static unsigned long corgibl_flags; -#define CORGIBL_SUSPENDED 0x01 -#define CORGIBL_BATTLOW 0x02 +/* Flag to signal when the battery is low */ +#define CORGIBL_BATTLOW BL_CORE_DRIVER1 static int corgibl_send_intensity(struct backlight_device *bd) { @@ -34,11 +33,11 @@ static int corgibl_send_intensity(struct backlight_device *bd) if (bd->props.power != FB_BLANK_UNBLANK) intensity = 0; - if (bd->props.fb_blank != FB_BLANK_UNBLANK) + if (bd->props.state & BL_CORE_FBBLANK) intensity = 0; - if (corgibl_flags & CORGIBL_SUSPENDED) + if (bd->props.state & BL_CORE_SUSPENDED) intensity = 0; - if (corgibl_flags & CORGIBL_BATTLOW) + if (bd->props.state & CORGIBL_BATTLOW) intensity &= bl_machinfo->limit_mask; bl_machinfo->set_bl_intensity(intensity); @@ -51,29 +50,6 @@ static int corgibl_send_intensity(struct backlight_device *bd) return 0; } -#ifdef CONFIG_PM -static int corgibl_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags |= CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} - -static int corgibl_resume(struct platform_device *pdev) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags &= ~CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} -#else -#define corgibl_suspend NULL -#define corgibl_resume NULL -#endif - static int corgibl_get_intensity(struct backlight_device *bd) { return corgibl_intensity; @@ -85,16 +61,21 @@ static int corgibl_get_intensity(struct backlight_device *bd) */ void corgibl_limit_intensity(int limit) { + struct backlight_device *bd = corgi_backlight_device; + + mutex_lock(&bd->ops_lock); if (limit) - corgibl_flags |= CORGIBL_BATTLOW; + bd->props.state |= CORGIBL_BATTLOW; else - corgibl_flags &= ~CORGIBL_BATTLOW; + bd->props.state &= ~CORGIBL_BATTLOW; backlight_update_status(corgi_backlight_device); + mutex_unlock(&bd->ops_lock); } EXPORT_SYMBOL(corgibl_limit_intensity); static struct backlight_ops corgibl_ops = { + .options = BL_CORE_SUSPENDRESUME, .get_brightness = corgibl_get_intensity, .update_status = corgibl_send_intensity, }; @@ -144,8 +125,6 @@ static int corgibl_remove(struct platform_device *pdev) static struct platform_driver corgibl_driver = { .probe = corgibl_probe, .remove = corgibl_remove, - .suspend = corgibl_suspend, - .resume = corgibl_resume, .driver = { .name = "generic-bl", }, diff --git a/drivers/video/backlight/mbp_nvidia_bl.c b/drivers/video/backlight/mbp_nvidia_bl.c index 06964af761c6..65864c500455 100644 --- a/drivers/video/backlight/mbp_nvidia_bl.c +++ b/drivers/video/backlight/mbp_nvidia_bl.c @@ -70,6 +70,7 @@ static int mbp_get_intensity(struct backlight_device *bd) } static struct backlight_ops mbp_ops = { + .options = BL_CORE_SUSPENDRESUME, .get_brightness = mbp_get_intensity, .update_status = mbp_send_intensity, }; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 1ee9488ca2e4..79ca2da81c87 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -31,6 +31,10 @@ struct backlight_device; struct fb_info; struct backlight_ops { + unsigned 
int options; + +#define BL_CORE_SUSPENDRESUME (1 << 0) + /* Notify the backlight driver some property has changed */ int (*update_status)(struct backlight_device *); /* Return the current backlight brightness (accounting for power, @@ -51,7 +55,19 @@ struct backlight_properties { modes; 4: full off), see FB_BLANK_XXX */ int power; /* FB Blanking active? (values as for power) */ + /* Due to be removed, please use (state & BL_CORE_FBBLANK) */ int fb_blank; + /* Flags used to signal drivers of state changes */ + /* Upper 4 bits are reserved for driver internal use */ + unsigned int state; + +#define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ +#define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ +#define BL_CORE_DRIVER4 (1 << 28) /* reserved for driver specific use */ +#define BL_CORE_DRIVER3 (1 << 29) /* reserved for driver specific use */ +#define BL_CORE_DRIVER2 (1 << 30) /* reserved for driver specific use */ +#define BL_CORE_DRIVER1 (1 << 31) /* reserved for driver specific use */ + }; struct backlight_device { -- cgit v1.2.3 From 1107ba885e46964316c083d441d5dd185b6c9e49 Mon Sep 17 00:00:00 2001 From: Alex Zeffertt Date: Wed, 7 Jan 2009 18:07:11 -0800 Subject: xen: add xenfs to allow usermode <-> Xen interaction The xenfs filesystem exports various interfaces to usermode. Initially this exports a file to allow usermode to interact with xenbus/xenstore. Traditionally this appeared in /proc/xen. Rather than extending procfs, this patch adds a backward-compat mountpoint on /proc/xen, and provides a xenfs filesystem which can be mounted there. Signed-off-by: Alex Zeffertt Signed-off-by: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/Kconfig | 24 ++ drivers/xen/Makefile | 2 + drivers/xen/xenbus/xenbus_probe.c | 28 +- drivers/xen/xenbus/xenbus_xs.c | 1 + drivers/xen/xenfs/Makefile | 3 + drivers/xen/xenfs/super.c | 64 ++++ drivers/xen/xenfs/xenbus.c | 593 ++++++++++++++++++++++++++++++++++++++ drivers/xen/xenfs/xenfs.h | 6 + include/linux/magic.h | 1 + include/xen/xenbus.h | 2 - 10 files changed, 716 insertions(+), 8 deletions(-) create mode 100644 drivers/xen/xenfs/Makefile create mode 100644 drivers/xen/xenfs/super.c create mode 100644 drivers/xen/xenfs/xenbus.c create mode 100644 drivers/xen/xenfs/xenfs.h (limited to 'include/linux') diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 4b75a16de009..526187c8a12d 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -17,3 +17,27 @@ config XEN_SCRUB_PAGES is not accidentally visible to other domains. It is more secure, but slightly less efficient. If in doubt, say yes. + +config XENFS + tristate "Xen filesystem" + depends on XEN + default y + help + The xen filesystem provides a way for domains to share + information with each other and with the hypervisor. + For example, by reading and writing the "xenbus" file, guests + may pass arbitrary information to the initial domain. + If in doubt, say yes. + +config XEN_COMPAT_XENFS + bool "Create compatibility mount point /proc/xen" + depends on XENFS + default y + help + The old xenstore userspace tools expect to find "xenbus" + under /proc/xen, but "xenbus" is now found at the root of the + xenfs filesystem. Selecting this causes the kernel to create + the compatibility mount point /proc/xen if it is running on + a xen platform. + If in doubt, say yes.
+ diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index d2a8fdf0e191..ff8accc9e103 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,5 +1,7 @@ obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ + obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o +obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index b2a03184a246..773d1cf23283 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,10 @@ #include "xenbus_comms.h" #include "xenbus_probe.h" + int xen_store_evtchn; +EXPORT_SYMBOL(xen_store_evtchn); + struct xenstore_domain_interface *xen_store_interface; static unsigned long xen_store_mfn; @@ -166,6 +170,9 @@ static int read_backend_details(struct xenbus_device *xendev) return read_otherend_details(xendev, "backend-id", "backend"); } +static struct device_attribute xenbus_dev_attrs[] = { + __ATTR_NULL +}; /* Bus type for frontend drivers. */ static struct xen_bus_type xenbus_frontend = { @@ -174,12 +181,13 @@ static struct xen_bus_type xenbus_frontend = { .get_bus_id = frontend_bus_id, .probe = xenbus_probe_frontend, .bus = { - .name = "xen", - .match = xenbus_match, - .uevent = xenbus_uevent, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, + .name = "xen", + .match = xenbus_match, + .uevent = xenbus_uevent, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .dev_attrs = xenbus_dev_attrs, }, }; @@ -852,6 +860,14 @@ static int __init xenbus_probe_init(void) if (!xen_initial_domain()) xenbus_probe(NULL); +#ifdef CONFIG_XEN_COMPAT_XENFS + /* + * Create xenfs mountpoint in /proc for compatibility with + * utilities that expect to find "xenbus" under "/proc/xen". + */ + proc_mkdir("xen", NULL); +#endif + return 0; out_unreg_back: diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 7f2f91c0e11d..e325eab4724d 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -184,6 +184,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) return ret; } +EXPORT_SYMBOL(xenbus_dev_request_and_reply); /* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */ static void *xs_talkv(struct xenbus_transaction t, diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile new file mode 100644 index 000000000000..25275c3bbdff --- /dev/null +++ b/drivers/xen/xenfs/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_XENFS) += xenfs.o + +xenfs-objs = super.o xenbus.o \ No newline at end of file diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c new file mode 100644 index 000000000000..515741a8e6b8 --- /dev/null +++ b/drivers/xen/xenfs/super.c @@ -0,0 +1,64 @@ +/* + * xenfs.c - a filesystem for passing info between a domain and + * the hypervisor. + * + * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem + * and /proc/xen compatibility mount point. + * Turned xenfs into a loadable module.
+ */ + +#include +#include +#include +#include +#include + +#include "xenfs.h" + +#include + +MODULE_DESCRIPTION("Xen filesystem"); +MODULE_LICENSE("GPL"); + +static int xenfs_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr xenfs_files[] = { + [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; + + return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); +} + +static int xenfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt); +} + +static struct file_system_type xenfs_type = { + .owner = THIS_MODULE, + .name = "xenfs", + .get_sb = xenfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init xenfs_init(void) +{ + if (xen_pv_domain()) + return register_filesystem(&xenfs_type); + + printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); + return 0; +} + +static void __exit xenfs_exit(void) +{ + if (xen_pv_domain()) + unregister_filesystem(&xenfs_type); +} + +module_init(xenfs_init); +module_exit(xenfs_exit); + diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c new file mode 100644 index 000000000000..875a4c59c594 --- /dev/null +++ b/drivers/xen/xenfs/xenbus.c @@ -0,0 +1,593 @@ +/* + * Driver giving user-space access to the kernel's xenbus connection + * to xenstore. + * + * Copyright (c) 2005, Christian Limpach + * Copyright (c) 2005, Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Changes: + * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem + * and /proc/xen compatibility mount point. + * Turned xenfs into a loadable module. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xenfs.h" +#include "../xenbus/xenbus_comms.h" + +#include +#include + +/* + * An element of a list of outstanding transactions, for which we're + * still waiting a reply. 
+ */ +struct xenbus_transaction_holder { + struct list_head list; + struct xenbus_transaction handle; +}; + +/* + * A buffer of data on the queue. + */ +struct read_buffer { + struct list_head list; + unsigned int cons; + unsigned int len; + char msg[]; +}; + +struct xenbus_file_priv { + /* + * msgbuffer_mutex is held while partial requests are built up + * and complete requests are acted on. It therefore protects + * the "transactions" and "watches" lists, and the partial + * request length and buffer. + * + * reply_mutex protects the reply being built up to return to + * usermode. It nests inside msgbuffer_mutex but may be held + * alone during a watch callback. + */ + struct mutex msgbuffer_mutex; + + /* In-progress transactions */ + struct list_head transactions; + + /* Active watches. */ + struct list_head watches; + + /* Partial request. */ + unsigned int len; + union { + struct xsd_sockmsg msg; + char buffer[PAGE_SIZE]; + } u; + + /* Response queue. */ + struct mutex reply_mutex; + struct list_head read_buffers; + wait_queue_head_t read_waitq; + +}; + +/* Read out any raw xenbus messages queued up. */ +static ssize_t xenbus_file_read(struct file *filp, + char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct xenbus_file_priv *u = filp->private_data; + struct read_buffer *rb; + unsigned i; + int ret; + + mutex_lock(&u->reply_mutex); + while (list_empty(&u->read_buffers)) { + mutex_unlock(&u->reply_mutex); + ret = wait_event_interruptible(u->read_waitq, + !list_empty(&u->read_buffers)); + if (ret) + return ret; + mutex_lock(&u->reply_mutex); + } + + rb = list_entry(u->read_buffers.next, struct read_buffer, list); + i = 0; + while (i < len) { + unsigned sz = min((unsigned)len - i, rb->len - rb->cons); + + ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz); + + i += sz - ret; + rb->cons += sz - ret; + + if (ret != sz) { + if (i == 0) + i = -EFAULT; + goto out; + } + + /* Clear out buffer if it has been consumed */ + if (rb->cons == rb->len) { + list_del(&rb->list); + kfree(rb); + if (list_empty(&u->read_buffers)) + break; + rb = list_entry(u->read_buffers.next, + struct read_buffer, list); + } + } + +out: + mutex_unlock(&u->reply_mutex); + return i; +} + +/* + * Add a buffer to the queue. Caller must hold the appropriate lock + * if the queue is not local. (Commonly the caller will build up + * multiple queued buffers on a temporary local list, and then add it + * to the appropriate list under lock once all the buffers have been + * successfully allocated.) + */ +static int queue_reply(struct list_head *queue, const void *data, size_t len) +{ + struct read_buffer *rb; + + if (len == 0) + return 0; + + rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); + if (rb == NULL) + return -ENOMEM; + + rb->cons = 0; + rb->len = len; + + memcpy(rb->msg, data, len); + + list_add_tail(&rb->list, queue); + return 0; +} + +/* + * Free all the read_buffers on a list. + * Caller must have sole reference to list.
+ */ +static void queue_cleanup(struct list_head *list) +{ + struct read_buffer *rb; + + while (!list_empty(list)) { + rb = list_entry(list->next, struct read_buffer, list); + list_del(list->next); + kfree(rb); + } +} + +struct watch_adapter { + struct list_head list; + struct xenbus_watch watch; + struct xenbus_file_priv *dev_data; + char *token; +}; + +static void free_watch_adapter(struct watch_adapter *watch) +{ + kfree(watch->watch.node); + kfree(watch->token); + kfree(watch); +} + +static struct watch_adapter *alloc_watch_adapter(const char *path, + const char *token) +{ + struct watch_adapter *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (watch == NULL) + goto out_fail; + + watch->watch.node = kstrdup(path, GFP_KERNEL); + if (watch->watch.node == NULL) + goto out_free; + + watch->token = kstrdup(token, GFP_KERNEL); + if (watch->token == NULL) + goto out_free; + + return watch; + +out_free: + free_watch_adapter(watch); + +out_fail: + return NULL; +} + +static void watch_fired(struct xenbus_watch *watch, + const char **vec, + unsigned int len) +{ + struct watch_adapter *adap; + struct xsd_sockmsg hdr; + const char *path, *token; + int path_len, tok_len, body_len, data_len = 0; + int ret; + LIST_HEAD(staging_q); + + adap = container_of(watch, struct watch_adapter, watch); + + path = vec[XS_WATCH_PATH]; + token = adap->token; + + path_len = strlen(path) + 1; + tok_len = strlen(token) + 1; + if (len > 2) + data_len = vec[len] - vec[2] + 1; + body_len = path_len + tok_len + data_len; + + hdr.type = XS_WATCH_EVENT; + hdr.len = body_len; + + mutex_lock(&adap->dev_data->reply_mutex); + + ret = queue_reply(&staging_q, &hdr, sizeof(hdr)); + if (!ret) + ret = queue_reply(&staging_q, path, path_len); + if (!ret) + ret = queue_reply(&staging_q, token, tok_len); + if (!ret && len > 2) + ret = queue_reply(&staging_q, vec[2], data_len); + + if (!ret) { + /* success: pass reply list onto watcher */ + list_splice_tail(&staging_q, &adap->dev_data->read_buffers); + wake_up(&adap->dev_data->read_waitq); + } else + queue_cleanup(&staging_q); + + mutex_unlock(&adap->dev_data->reply_mutex); +} + +static int xenbus_write_transaction(unsigned msg_type, + struct xenbus_file_priv *u) +{ + int rc, ret; + void *reply; + struct xenbus_transaction_holder *trans = NULL; + LIST_HEAD(staging_q); + + if (msg_type == XS_TRANSACTION_START) { + trans = kmalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) { + rc = -ENOMEM; + goto out; + } + } + + reply = xenbus_dev_request_and_reply(&u->u.msg); + if (IS_ERR(reply)) { + kfree(trans); + rc = PTR_ERR(reply); + goto out; + } + + if (msg_type == XS_TRANSACTION_START) { + trans->handle.id = simple_strtoul(reply, NULL, 0); + + list_add(&trans->list, &u->transactions); + } else if (msg_type == XS_TRANSACTION_END) { + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == u->u.msg.tx_id) + break; + BUG_ON(&trans->list == &u->transactions); + list_del(&trans->list); + + kfree(trans); + } + + mutex_lock(&u->reply_mutex); + ret = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg)); + if (!ret) + ret = queue_reply(&staging_q, reply, u->u.msg.len); + if (!ret) { + list_splice_tail(&staging_q, &u->read_buffers); + wake_up(&u->read_waitq); + } else { + queue_cleanup(&staging_q); + rc = ret; + } + mutex_unlock(&u->reply_mutex); + + kfree(reply); + +out: + return rc; +} + +static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) +{ + struct watch_adapter *watch, *tmp_watch; + char *path, *token; + int err, rc; + LIST_HEAD(staging_q); 
+ + path = u->u.buffer + sizeof(u->u.msg); + token = memchr(path, 0, u->u.msg.len); + if (token == NULL) { + rc = -EILSEQ; + goto out; + } + token++; + + if (msg_type == XS_WATCH) { + watch = alloc_watch_adapter(path, token); + if (watch == NULL) { + rc = -ENOMEM; + goto out; + } + + watch->watch.callback = watch_fired; + watch->dev_data = u; + + err = register_xenbus_watch(&watch->watch); + if (err) { + free_watch_adapter(watch); + rc = err; + goto out; + } + list_add(&watch->list, &u->watches); + } else { + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + if (!strcmp(watch->token, token) && + !strcmp(watch->watch.node, path)) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + break; + } + } + } + + /* Success. Synthesize a reply to say all is OK. */ + { + struct { + struct xsd_sockmsg hdr; + char body[3]; + } __packed reply = { + { + .type = msg_type, + .len = sizeof(reply.body) + }, + "OK" + }; + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&u->read_buffers, &reply, sizeof(reply)); + mutex_unlock(&u->reply_mutex); + } + +out: + return rc; +} + +static ssize_t xenbus_file_write(struct file *filp, + const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct xenbus_file_priv *u = filp->private_data; + uint32_t msg_type; + int rc = len; + int ret; + LIST_HEAD(staging_q); + + /* + * We're expecting usermode to be writing properly formed + * xenbus messages. If they write an incomplete message we + * buffer it up. Once it is complete, we act on it. + */ + + /* + * Make sure concurrent writers can't stomp all over each + * other's messages and make a mess of our partial message + * buffer. We don't make any attempt to stop multiple + * writers from making a mess of each other's incomplete + * messages; we're just trying to guarantee our own internal + * consistency and make sure that single writes are handled + * atomically. + */ + mutex_lock(&u->msgbuffer_mutex); + + /* Get this out of the way early to avoid confusion */ + if (len == 0) + goto out; + + /* Can't write a xenbus message larger than we can buffer */ + if ((len + u->len) > sizeof(u->u.buffer)) { + /* On error, dump existing buffer */ + u->len = 0; + rc = -EINVAL; + goto out; + } + + ret = copy_from_user(u->u.buffer + u->len, ubuf, len); + + if (ret == len) { + rc = -EFAULT; + goto out; + } + + /* Deal with a partial copy. */ + len -= ret; + rc = len; + + u->len += len; + + /* Return if we haven't got a full message yet */ + if (u->len < sizeof(u->u.msg)) + goto out; /* not even the header yet */ + + /* If we're expecting a message that's larger than we can + possibly send, dump what we have and return an error. */ + if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) { + rc = -E2BIG; + u->len = 0; + goto out; + } + + if (u->len < (sizeof(u->u.msg) + u->u.msg.len)) + goto out; /* incomplete data portion */ + + /* + * OK, now we have a complete message. Do something with it.
+ */ + + msg_type = u->u.msg.type; + + switch (msg_type) { + case XS_TRANSACTION_START: + case XS_TRANSACTION_END: + case XS_DIRECTORY: + case XS_READ: + case XS_GET_PERMS: + case XS_RELEASE: + case XS_GET_DOMAIN_PATH: + case XS_WRITE: + case XS_MKDIR: + case XS_RM: + case XS_SET_PERMS: + /* Send out a transaction */ + ret = xenbus_write_transaction(msg_type, u); + break; + + case XS_WATCH: + case XS_UNWATCH: + /* (Un)Ask for some path to be watched for changes */ + ret = xenbus_write_watch(msg_type, u); + break; + + default: + ret = -EINVAL; + break; + } + if (ret != 0) + rc = ret; + + /* Buffered message consumed */ + u->len = 0; + + out: + mutex_unlock(&u->msgbuffer_mutex); + return rc; +} + +static int xenbus_file_open(struct inode *inode, struct file *filp) +{ + struct xenbus_file_priv *u; + + if (xen_store_evtchn == 0) + return -ENOENT; + + nonseekable_open(inode, filp); + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&u->transactions); + INIT_LIST_HEAD(&u->watches); + INIT_LIST_HEAD(&u->read_buffers); + init_waitqueue_head(&u->read_waitq); + + mutex_init(&u->reply_mutex); + mutex_init(&u->msgbuffer_mutex); + + filp->private_data = u; + + return 0; +} + +static int xenbus_file_release(struct inode *inode, struct file *filp) +{ + struct xenbus_file_priv *u = filp->private_data; + struct xenbus_transaction_holder *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + + /* + * No need for locking here because there are no other users, + * by definition. + */ + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); + kfree(trans); + } + + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } + + kfree(u); + + return 0; +} + +static unsigned int xenbus_file_poll(struct file *file, poll_table *wait) +{ + struct xenbus_file_priv *u = file->private_data; + + poll_wait(file, &u->read_waitq, wait); + if (!list_empty(&u->read_buffers)) + return POLLIN | POLLRDNORM; + return 0; +} + +const struct file_operations xenbus_file_ops = { + .read = xenbus_file_read, + .write = xenbus_file_write, + .open = xenbus_file_open, + .release = xenbus_file_release, + .poll = xenbus_file_poll, +}; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h new file mode 100644 index 000000000000..51f08b2d0bf1 --- /dev/null +++ b/drivers/xen/xenfs/xenfs.h @@ -0,0 +1,6 @@ +#ifndef _XENFS_XENBUS_H +#define _XENFS_XENBUS_H + +extern const struct file_operations xenbus_file_ops; + +#endif /* _XENFS_XENBUS_H */ diff --git a/include/linux/magic.h b/include/linux/magic.h index f7f3fdddbef0..439f6f3cb0c4 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -13,6 +13,7 @@ #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 +#define XENFS_SUPER_MAGIC 0xabba1974 #define EXT4_SUPER_MAGIC 0xEF53 #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 6369d89c25d5..f87f9614844d 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -136,8 +136,6 @@ struct xenbus_transaction /* Nil transaction ID. 
*/ #define XBT_NIL ((struct xenbus_transaction) { 0 }) -int __init xenbus_dev_init(void); - char **xenbus_directory(struct xenbus_transaction t, const char *dir, const char *node, unsigned int *num); void *xenbus_read(struct xenbus_transaction t, -- cgit v1.2.3 From 18a82eb9f980b5e02cea651e4ecda26265d98933 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Wed, 7 Jan 2009 18:07:19 -0800 Subject: ext2: allocate ->s_blockgroup_lock separately As spotted by kmemtrace, struct ext2_sb_info is 17024 bytes on 64-bit which makes it a very bad fit for SLAB allocators. The culprit of the wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when NR_CPUS >= 32. To fix that, allocate ->s_blockgroup_lock, which fits nicely in an order-2 page in the worst case, separately. This shrinks down struct ext2_sb_info enough to fit a 1 KB slab cache so now we allocate 16 KB + 1 KB instead of 32 KB, saving 15 KB of memory. Acked-by: Andreas Dilger Signed-off-by: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 10 +++++++++- include/linux/ext2_fs_sb.h | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..da8bdeaa2e6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb) percpu_counter_destroy(&sbi->s_dirs_counter); brelse (sbi->s_sbh); sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; @@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; @@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) printk ("EXT2-fs: not enough memory\n"); goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { printk ("EXT2-fs: not enough memory\n"); diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h index dc541f3653d1..1cdb66367c98 100644 --- a/include/linux/ext2_fs_sb.h +++ b/include/linux/ext2_fs_sb.h @@ -101,7 +101,7 @@ struct ext2_sb_info { struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock *s_blockgroup_lock; /* root of the per fs reservation window tree */ spinlock_t s_rsv_window_lock; struct rb_root s_rsv_window_root; @@ -111,7 +111,7 @@ struct ext2_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _LINUX_EXT2_FS_SB */ -- cgit v1.2.3 From 0e090f1e05a563cc9acdda442767176bf1616001 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:20 -0800 Subject: ext2: don't inherit inappropriate inode flags from parent At present BTREE/INDEX is the only flag that new ext2 inodes do NOT inherit from their parent. In addition, prevent the flags DIRTY, ECOMPR, INDEX, IMAGIC and TOPDIR from being inherited.
List inheritable flags explicitly to prevent future flags from accidentally being inherited. This fixes the TOPDIR flag inheritance bug reported at http://bugzilla.kernel.org/show_bug.cgi?id=9866. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ialloc.c | 2 +- include/linux/ext2_fs.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c454d5db28a5..b5598e1393d9 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -565,7 +565,7 @@ got: inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; + ei->i_flags = EXT2_I(dir)->i_flags & EXT2_FL_INHERITED; if (S_ISLNK(mode)) ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); /* dirsync is only applied to directories */ diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 78c775a83f7c..c3a051819363 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -194,6 +194,13 @@ struct ext2_group_desc #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ #define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ + EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\ + EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\ + EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ + EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) + /* * ioctl commands */ -- cgit v1.2.3 From ef8b646183868b2d042fa6cde0eef2a31263ff85 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:21 -0800 Subject: ext2: tighten restrictions on inode flags At the moment there are few restrictions on which flags may be set on which inodes. Specifically, DIRSYNC may only be set on directories, and IMMUTABLE and APPEND may not be set on links. Tighten that so that TOPDIR may not be set on non-directories, and so that only NODUMP and NOATIME may be set on inodes that are neither regular files nor directories. Introduces a flags masking function which masks flags based on mode, and uses it during inode creation and when flags are set via the ioctl, to facilitate future consistency.
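As an illustration of the masking rule (a minimal userspace sketch mirroring the ext2_mask_flags() helper added below, not part of the patch; the flag values are copied from include/linux/ext2_fs.h and should be treated as assumptions of this sketch):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>

	/* Flag values as defined in include/linux/ext2_fs.h (assumed here). */
	#define EXT2_NODUMP_FL   0x00000040
	#define EXT2_NOATIME_FL  0x00000080
	#define EXT2_DIRSYNC_FL  0x00010000
	#define EXT2_TOPDIR_FL   0x00020000

	/* Same logic as the ext2_mask_flags() inline introduced by this patch. */
	static unsigned int mask_flags(mode_t mode, unsigned int flags)
	{
		if (S_ISDIR(mode))
			return flags;		/* directories may keep everything */
		else if (S_ISREG(mode))
			return flags & ~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL);
		else
			return flags & (EXT2_NODUMP_FL | EXT2_NOATIME_FL);
	}

	int main(void)
	{
		unsigned int flags = EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL | EXT2_NOATIME_FL;

		printf("dir:  %#x\n", mask_flags(S_IFDIR, flags)); /* keeps all three */
		printf("reg:  %#x\n", mask_flags(S_IFREG, flags)); /* drops the dir-only flags */
		printf("link: %#x\n", mask_flags(S_IFLNK, flags)); /* keeps only NOATIME */
		return 0;
	}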
Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ialloc.c | 8 ++------ fs/ext2/ioctl.c | 3 +-- include/linux/ext2_fs.h | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index b5598e1393d9..66321a877e74 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -565,12 +565,8 @@ got: inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_flags = EXT2_I(dir)->i_flags & EXT2_FL_INHERITED; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); - /* dirsync is only applied to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT2_DIRSYNC_FL; + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); ei->i_faddr = 0; ei->i_frag_no = 0; ei->i_frag_size = 0; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index de876fa793e1..7cb4badef927 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto setflags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT2_DIRSYNC_FL; + flags = ext2_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index c3a051819363..121720d74e15 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -201,6 +201,23 @@ struct ext2_group_desc EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT2_REG_FLMASK; + else + return flags & EXT2_OTHER_FLMASK; +} + /* * ioctl commands */ -- cgit v1.2.3 From f420d4dc4272fd223986762df2ad06056ddebada Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Jan 2009 18:07:24 -0800 Subject: jbd: improve fsync batching There is a flaw with the way jbd handles fsync batching. If we fsync() a file and we were not the last person to run fsync() on this fs, then we automatically sleep for 1 jiffie in order to wait for new writers to join the transaction before forcing the commit. The problem with this is that with really fast storage (i.e. a Clariion) the time it takes to commit a transaction to disk is far shorter than 1 jiffie in most cases, so sleeping means waiting longer with nothing to do than if we just committed the transaction and kept going. Ric Wheeler noticed this when using fs_mark with more than 1 thread: the throughput would plummet as he added more threads. This patch attempts to fix this problem by recording the average time in nanoseconds that it takes to commit a transaction to disk, and what time we started the transaction. If we run an fsync() and we have been running for less time than it takes to commit the transaction to disk, we sleep for the delta amount of time and then commit to disk. We achieve sub-jiffie sleeping using schedule_hrtimeout.
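In code form, the batching decision this describes (condensed from the fs/jbd/transaction.c hunk of this patch) is:

	u64 commit_time, trans_time;

	/* the average commit time is protected by j_state_lock */
	spin_lock(&journal->j_state_lock);
	commit_time = journal->j_average_commit_time;
	spin_unlock(&journal->j_state_lock);

	/* how long has this transaction been running? */
	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
					   transaction->t_start_time));

	/* never sleep longer than one jiffie, even on very slow storage */
	commit_time = min_t(u64, commit_time, 1000 * jiffies_to_usecs(1));

	/* if the transaction is younger than an average commit, wait */
	if (trans_time < commit_time) {
		ktime_t expires = ktime_add_ns(ktime_get(), commit_time);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
	}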
This means that the wait time is auto-tuned to the speed of the underlying disk, instead of having this static timeout. I weighted the average according to somebody's comments (Andreas Dilger I think) in order to help normalize random outliers where we take way longer or way less time to commit than the average. I also have a min() check in there to make sure we don't sleep longer than a jiffie in case our storage is super slow; this was requested by Andrew. I unfortunately do not have access to a Clariion, so I had to use a ramdisk to represent a super fast array. I tested with a SATA drive with barrier=1 to make sure there was no regression with local disks, I tested with a 4 way multipathed Apple Xserve RAID array and of course the ramdisk. I ran the following command: fs_mark -d /mnt/ext3-test -s 4096 -n 2000 -D 64 -t $i, where $i was 2, 4, 8, 16 and 32. I mkfs'ed the fs each time. Here are my results:

type     threads  with patch  without patch
sata     2        24.6        26.3
sata     4        49.2        48.1
sata     8        70.1        67.0
sata     16       104.0       94.1
sata     32       153.6       142.7
xserve   2        246.4       222.0
xserve   4        480.0       440.8
xserve   8        829.5       730.8
xserve   16       1172.7      1026.9
xserve   32       1816.3      1650.5
ramdisk  2        2538.3      1745.6
ramdisk  4        2942.3      661.9
ramdisk  8        2882.5      999.8
ramdisk  16       2738.7      1801.9
ramdisk  32       2541.9      2394.0

Signed-off-by: Josef Bacik Cc: Andreas Dilger Cc: Arjan van de Ven Cc: Ric Wheeler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 15 +++++++++++++++ fs/jbd/transaction.c | 38 +++++++++++++++++++++++++++++++++----- include/linux/jbd.h | 15 +++++++++++++++ 3 files changed, 63 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c51..3fbffb1ea714 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -913,6 +916,18 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c8808..b51fbd4b2913 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -25,6 +25,7 @@ #include #include #include +#include static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid =
journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -1370,7 +1372,7 @@ int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1399,6 +1401,17 @@ int journal_stop(handle_t *handle) * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... * + * We try to optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * * But don't do this if this process was the most recent one to * perform a synchronous write. We do this to detect the case where a * single process is doing a stream of sync writes. No point in waiting @@ -1406,11 +1419,26 @@ int journal_stop(handle_t *handle) */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 346e2b80be7d..6384b19efe64 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -542,6 +542,11 @@ struct transaction_s */ unsigned long t_expires; + /* + * When this transaction started, in nanoseconds [no locking] + */ + ktime_t t_start_time; + /* * How many handles used this transaction? [t_handle_lock] */ @@ -798,8 +803,18 @@ struct journal_s struct buffer_head **j_wbuf; int j_wbufsize; + /* + * this is the pid of the last person to run a synchronous operation + * through the journal. + */ pid_t j_last_sync_writer; + /* + * the average amount of time in nanoseconds it takes to commit a + * transaction to the disk. [j_state_lock] + */ + u64 j_average_commit_time; + /* * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here -- cgit v1.2.3 From 5df096d67ec2b6578518caed7d57317a4b807aa1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 7 Jan 2009 18:07:25 -0800 Subject: ext3: allocate ->s_blockgroup_lock separately As spotted by kmemtrace, struct ext3_sb_info is 17152 bytes on 64-bit which makes it a very bad fit for SLAB allocators.
The culprit of the wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when NR_CPUS >= 32. To fix that, allocate ->s_blockgroup_lock, which fits nicely in an order-2 page in the worst case, separately. This shrinks down struct ext3_sb_info enough to fit a 1 KB slab cache so now we allocate 16 KB + 1 KB instead of 32 KB, saving 15 KB of memory. Acked-by: Andreas Dilger Signed-off-by: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 10 +++++++++- include/linux/ext3_fs_sb.h | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c22d01467bd1..01c235bc2054 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -1546,6 +1547,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; @@ -1786,7 +1794,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index e024e38248ff..76fdc0f4b028 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -60,7 +60,7 @@ struct ext3_sb_info { struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock *s_blockgroup_lock; /* root of the per fs reservation window tree */ spinlock_t s_rsv_window_lock; @@ -86,7 +86,7 @@ struct ext3_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _LINUX_EXT3_FS_SB */ -- cgit v1.2.3 From 2e8671cb566da993425d324fc355af31edc6e7f1 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:26 -0800 Subject: ext3: don't inherit inappropriate inode flags from parent At present INDEX is the only flag that new ext3 inodes do NOT inherit from their parent. In addition, prevent the flags DIRTY, ECOMPR, IMAGIC and TOPDIR from being inherited. List inheritable flags explicitly to prevent future flags from accidentally being inherited. This fixes the TOPDIR flag inheritance bug reported at http://bugzilla.kernel.org/show_bug.cgi?id=9866.
Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 2 +- include/linux/ext3_fs.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5655fbcbd11f..ba9186a21d1e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,7 +559,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED; if (S_ISLNK(mode)) ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); /* dirsync only applies to directories */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f02918483..b745619a9b8e 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -178,6 +178,13 @@ struct ext3_group_desc #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ + EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + /* * Inode dynamic state flags */ -- cgit v1.2.3 From 04143e2fb9d512c21e1dcfb561dbb0445dcfdc8c Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:26 -0800 Subject: ext3: tighten restrictions on inode flags At the moment there are few restrictions on which flags may be set on which inodes. Specifically, DIRSYNC may only be set on directories, and IMMUTABLE and APPEND may not be set on links. Tighten that so that TOPDIR may not be set on non-directories, and so that only NODUMP and NOATIME may be set on inodes that are neither regular files nor directories. Introduces a flags masking function which masks flags based on mode, and uses it during inode creation and when flags are set via the ioctl, to facilitate future consistency. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 8 ++------ fs/ext3/ioctl.c | 3 +-- include/linux/ext3_fs.h | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index ba9186a21d1e..8de6c720e510 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,12 +559,8 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT3_DIRSYNC_FL; + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); #ifdef EXT3_FRAGMENTS ei->i_faddr = 0; ei->i_frag_no = 0; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8e..5e86ce9a86e0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, goto flags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT3_DIRSYNC_FL; + flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file?
Do not allow user to mess with it */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index b745619a9b8e..d76800f6ecf0 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -185,6 +185,23 @@ struct ext3_group_desc EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + /* * Inode dynamic state flags */ -- cgit v1.2.3 From b2aa30f7bb381e04c93eed106089ba55553955f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:37 -0800 Subject: cgroups: don't put struct cgroupfs_root protected by RCU We don't access struct cgroupfs_root on any fast path, so we should not put struct cgroupfs_root under RCU protection, but the comment on struct cgroup_subsys.root suggested otherwise. struct cgroup_subsys.root is used in these places:

1 find_css_set():
      if (ss->root->subsys_list.next == &ss->sibling)
2 rebind_subsystems():
      if (ss->root != &rootnode)
      rcu_assign_pointer(ss->root, root);
      rcu_assign_pointer(subsys[i]->root, &rootnode);
3 cgroup_has_css_refs():
      if (ss->root != cgrp->root)
4 cgroup_init_subsys():
      ss->root = &rootnode;
5 proc_cgroupstats_show():
      ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled);
6 cgroup_clone():
      root = subsys->root;
      if ((root != subsys->root) ||

In all of these places we either hold cgroup_lock() or never dereference the struct cgroupfs_root pointer. This means we don't need RCU when using struct cgroup_subsys.root, and we should not put struct cgroupfs_root under RCU protection.
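In other words (an illustrative sketch, not part of the diff), the invariant that makes plain assignment safe is:

	/*
	 * Sketch only: every writer of ss->root runs under cgroup_mutex
	 * (taken via cgroup_lock()), and every reader either holds the
	 * same mutex or never dereferences the pointer, so no
	 * rcu_assign_pointer()/grace-period machinery is needed.
	 */
	mutex_lock(&cgroup_mutex);
	ss->root = root;	/* was: rcu_assign_pointer(ss->root, root) */
	mutex_unlock(&cgroup_mutex);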
Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08b78c09b09a..f68dfd8dd53a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,7 +337,6 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* Protected by RCU */ struct cgroupfs_root *root; struct list_head sibling; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a391ab3bdfc6..a288da176e46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -713,7 +713,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -725,7 +725,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); + subsys[i]->root = &rootnode; list_del(&ss->sibling); } else if (bit & final_bits) { /* Subsystem state should already exist */ -- cgit v1.2.3 From a47295e6bc42ad35f9c15ac66f598aa24debd4e2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:07:44 -0800 Subject: cgroups: make cgroup_path() RCU-safe Fix races with /proc/sched_debug by freeing cgroup objects via an RCU callback. Thus any cgroup reference obtained from an RCU-safe source will remain valid during the RCU section. Since dentries are also RCU-safe, this allows us to traverse up the tree safely. Additionally, make cgroup_path() check for a NULL cgrp->dentry to avoid trying to report a path for a partially-created cgroup.
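For callers, the practical effect can be sketched as follows (a hypothetical caller, not part of the patch; the buffer size is arbitrary):

	char buf[256];	/* caller-supplied buffer, size chosen arbitrarily */
	int ret;

	rcu_read_lock();	/* previously cgroup_mutex was required */
	ret = cgroup_path(cgrp, buf, sizeof(buf));
	rcu_read_unlock();

	if (ret)
		return ret;	/* e.g. -ENAMETOOLONG if buf is too small */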
[lizf@cn.fujitsu.com: call deactivate_super() in cgroup_diput()] Signed-off-by: Paul Menage Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 ++++- kernel/cgroup.c | 30 +++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f68dfd8dd53a..73d1c730c3c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -116,7 +116,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry */ + struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -145,6 +145,9 @@ struct cgroup { int pids_use_count; /* Length of the current tasks_pids array */ int pids_length; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; }; /* A css_set is a structure holding pointers to a set of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb7c72b91f46..83ea4f524be5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } +static void free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -1134,14 +1143,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error.
*/ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1154,13 +1165,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) @@ -1663,7 +1675,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); - cgrp->dentry = dentry; + rcu_assign_pointer(cgrp->dentry, dentry); dget(dentry); } dput(dentry); -- cgit v1.2.3 From 7a81b88cb53e335ff7d019e6398c95792c817d93 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:48 -0800 Subject: memcg: introduce charge-commit-cancel style of functions There is a small race in do_swap_page(). When the page being swapped in is charged, its mapcount can be greater than 0. But, at the same time, some process sharing it may call unmap and take the mapcount from 1 to 0, at which point the page is uncharged:

    CPUA                          CPUB
                                  mapcount == 1.
    (1) charge if mapcount==0
                                  zap_pte_range()
                                  (2) mapcount 1 => 0.
                                  (3) uncharge(). (success)
    (4) set page's rmap()
        mapcount 0 => 1

Then this swap page's accounting is leaked. To fix this, I added a new interface:

    - charge: account to res_counter by PAGE_SIZE and try to free pages if necessary.
    - commit: register the page_cgroup and add it to the LRU if necessary.
    - cancel: uncharge PAGE_SIZE because of do_swap_page() failure.

so that CPUA does:

    (1) charge (always)
    (2) set page's rmap (mapcount > 0)
    (3) commit the charge after set_pte(), whether it turned out to be necessary or not.

This protocol uses the PCG_USED bit on page_cgroup to avoid over-accounting. The usual mem_cgroup_charge_common() does charge -> commit in a single step. This patch also adds the following functions to clarify all charges:

    - mem_cgroup_newpage_charge(): replacement for mem_cgroup_charge(), called against newly allocated anon pages.
    - mem_cgroup_charge_migrate_fixup(): called only from remove_migration_ptes(); we'll have to rewrite this later (this patch just keeps the old behavior). This function will be removed by an additional patch to make migration clearer.

Good for clarifying "what we do". Then we have the following 4 charge points:

    - newpage
    - swap-in
    - add-to-cache
    - migration
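Concretely, the calling pattern looks like this (sketched from the do_swap_page() hunk below; the error label is illustrative):

	struct mem_cgroup *ptr = NULL;

	/* (1) charge: accounts PAGE_SIZE against the cgroup and may
	 *     reclaim, but does not touch the page_cgroup yet */
	if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM)
		goto out_fail;

	/* (2) install the pte and rmap, taking mapcount from 0 to 1 */

	/* (3a) success: bind the charge to the page (sets PCG_USED, or
	 *      uncharges again if the page was already charged) */
	mem_cgroup_commit_charge_swapin(page, ptr);
	return 0;

out_fail:
	/* (3b) failure: give the PAGE_SIZE charge back */
	mem_cgroup_cancel_charge_swapin(ptr);
	return VM_FAULT_OOM;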
[akpm@linux-foundation.org: add missing inline directives to stubs] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 36 ++++++++++- mm/memcontrol.c | 155 ++++++++++++++++++++++++++++++++++++--------- mm/memory.c | 12 ++-- mm/migrate.c | 2 +- mm/swapfile.c | 6 +- 5 files changed, 170 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1fbe14d39521..c592f315cd02 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -27,8 +27,17 @@ struct mm_struct; #ifdef CONFIG_CGROUP_MEM_RES_CTLR -extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +extern int mem_cgroup_charge_migrate_fixup(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask); +/* for swap handling */ +extern int mem_cgroup_try_charge(struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **ptr); +extern void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *ptr); +extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); + extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru); @@ -71,7 +80,9 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, #else /* CONFIG_CGROUP_MEM_RES_CTLR */ -static inline int mem_cgroup_charge(struct page *page, +struct mem_cgroup; + +static inline int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; @@ -83,6 +94,27 @@ static inline int mem_cgroup_cache_charge(struct page *page, return 0; } +static inline int mem_cgroup_charge_migrate_fixup(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) +{ + return 0; +} + +static inline int mem_cgroup_try_charge(struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **ptr) +{ + return 0; +} + +static inline void mem_cgroup_commit_charge_swapin(struct page *page, + struct mem_cgroup *ptr) +{ +} + +static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) +{ +} + static inline void mem_cgroup_uncharge_page(struct page *page) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51ee96545579..f568b1964551 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -467,35 +467,31 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, return nr_taken; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit + +/** + * mem_cgroup_try_charge - get charge of PAGE_SIZE. + * @mm: an mm_struct which is charged against. (when *memcg is NULL) + * @gfp_mask: gfp_mask for reclaim. + * @memcg: a pointer to memory cgroup which is charged against. + * + * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated + * memory cgroup from @mm is got and stored in *memcg. + * + * Returns 0 if success. -ENOMEM at failure. 
*/ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype, - struct mem_cgroup *memcg) + +int mem_cgroup_try_charge(struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcg) { struct mem_cgroup *mem; - struct page_cgroup *pc; - unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup_per_zone *mz; - unsigned long flags; - - pc = lookup_page_cgroup(page); - /* can happen at boot */ - if (unlikely(!pc)) - return 0; - prefetchw(pc); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - - if (likely(!memcg)) { + if (likely(!*memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!mem)) { @@ -506,15 +502,17 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, * For every charge from the cgroup, increment reference count */ css_get(&mem->css); + *memcg = mem; rcu_read_unlock(); } else { - mem = memcg; - css_get(&memcg->css); + mem = *memcg; + css_get(&mem->css); } + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { if (!(gfp_mask & __GFP_WAIT)) - goto out; + goto nomem; if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) continue; @@ -531,18 +529,37 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (!nr_retries--) { mem_cgroup_out_of_memory(mem, gfp_mask); - goto out; + goto nomem; } } + return 0; +nomem: + css_put(&mem->css); + return -ENOMEM; +} + +/* + * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be + * USED state. If already USED, uncharge and return. + */ + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) +{ + struct mem_cgroup_per_zone *mz; + unsigned long flags; + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - - goto done; + return; } pc->mem_cgroup = mem; /* @@ -557,15 +574,39 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(pc); +} -done: +/* + * Charge the memory controller for page usage. 
+ * Return + * 0 if the charge was successful + * < 0 if the cgroup is over its limit + */ +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) +{ + struct mem_cgroup *mem; + struct page_cgroup *pc; + int ret; + + pc = lookup_page_cgroup(page); + /* can happen at boot */ + if (unlikely(!pc)) + return 0; + prefetchw(pc); + + mem = memcg; + ret = mem_cgroup_try_charge(mm, gfp_mask, &mem); + if (ret) + return ret; + + __mem_cgroup_commit_charge(mem, pc, ctype); return 0; -out: - css_put(&mem->css); - return -ENOMEM; } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int mem_cgroup_newpage_charge(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { if (mem_cgroup_subsys.disabled) return 0; @@ -586,6 +627,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } +/* + * same as mem_cgroup_newpage_charge(), now. + * But what we assume is different from newpage, and this is special case. + * treat this in special function. easy for maintenance. + */ + +int mem_cgroup_charge_migrate_fixup(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) +{ + if (mem_cgroup_subsys.disabled) + return 0; + + if (PageCompound(page)) + return 0; + + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + + if (unlikely(!mm)) + mm = &init_mm; + + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); +} + + + + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -628,6 +697,30 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } + +void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +{ + struct page_cgroup *pc; + + if (mem_cgroup_subsys.disabled) + return; + if (!ptr) + return; + pc = lookup_page_cgroup(page); + __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) +{ + if (mem_cgroup_subsys.disabled) + return; + if (!mem) + return; + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); +} + + /* * uncharge if !page_mapped(page) */ diff --git a/mm/memory.c b/mm/memory.c index 3f8fa06b963b..7f210f160990 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2000,7 +2000,7 @@ gotten: cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* @@ -2392,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; + struct mem_cgroup *ptr = NULL; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2430,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2460,6 +2461,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & 
VM_LOCKED) || PageMlocked(page)) @@ -2480,7 +2482,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); unlock_page(page); page_cache_release(page); @@ -2510,7 +2512,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; __SetPageUptodate(page); - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2601,7 +2603,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { ret = VM_FAULT_OOM; page_cache_release(page); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 55373983c9c6..246dcb973ae7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -133,7 +133,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, * be reliable, and this charge can actually fail: oh well, we don't * make the situation any worse by proceeding as if it had succeeded. */ - mem_cgroup_charge(new, mm, GFP_ATOMIC); + mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC); get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); diff --git a/mm/swapfile.c b/mm/swapfile.c index eec5ca758a23..fb926efb5167 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -690,17 +690,18 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { + struct mem_cgroup *ptr = NULL; spinlock_t *ptl; pte_t *pte; int ret = 1; - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr)) ret = -ENOMEM; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { if (ret > 0) - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge_swapin(ptr); ret = 0; goto out; } @@ -710,6 +711,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); + mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); /* * Move the page to the active list so it is not -- cgit v1.2.3 From 01b1ae63c2270cbacfd43fea94578c17950eb548 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:50 -0800 Subject: memcg: simple migration handling Currently, management of the "charge" under page migration is handled in the following manner (assume page contents are migrated from oldpage to newpage): before - "newpage" is charged before migration. at success - "oldpage" is uncharged somewhere (unmap, radix-tree-replace). at failure - "newpage" is uncharged. - "oldpage" is charged if necessary. (*1) But (*1) is not reliable, because that charge is done with GFP_ATOMIC and can fail. This patch changes the behavior as follows, using the charge/commit/cancel ops: before - charge PAGE_SIZE (no target page). at success - commit the charge against "newpage". at failure - commit the charge against "oldpage" (the PCG_USED bit effectively avoids double-counting). - if "oldpage" is obsolete, cancel the charge of PAGE_SIZE. A sketch of the resulting sequence follows below.
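Sketched against these ops, the sequence in unmap_and_move() becomes roughly the following (simplified; page locking, the actual copy and the writeback check are elided):

	struct mem_cgroup *mem;
	int charge;

	/* before: charge PAGE_SIZE, with no target page yet */
	charge = mem_cgroup_prepare_migration(page, &mem);
	if (charge == -ENOMEM)
		goto unlock;	/* cannot charge: fail this migration */

	/* ... unmap "page" and try to move its contents to "newpage" ... */

	/* after: commit against "newpage" at success, or back against
	 * "oldpage" at failure; PCG_USED avoids double-counting */
	mem_cgroup_end_migration(mem, page, newpage);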
Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++----- mm/memcontrol.c | 108 ++++++++++++++++++++++----------------------- mm/migrate.c | 42 ++++++------------ 3 files changed, 73 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c592f315cd02..b095f5f6ecf7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,8 +29,6 @@ struct mm_struct; extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern int mem_cgroup_charge_migrate_fixup(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **ptr); @@ -60,8 +58,9 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); ((cgroup) == mem_cgroup_from_task((mm)->owner)) extern int -mem_cgroup_prepare_migration(struct page *page, struct page *newpage); -extern void mem_cgroup_end_migration(struct page *page); +mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); +extern void mem_cgroup_end_migration(struct mem_cgroup *mem, + struct page *oldpage, struct page *newpage); /* * For memory reclaim. @@ -94,12 +93,6 @@ static inline int mem_cgroup_cache_charge(struct page *page, return 0; } -static inline int mem_cgroup_charge_migrate_fixup(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - static inline int mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **ptr) { @@ -144,12 +137,14 @@ static inline int task_in_mem_cgroup(struct task_struct *task, } static inline int -mem_cgroup_prepare_migration(struct page *page, struct page *newpage) +mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) { return 0; } -static inline void mem_cgroup_end_migration(struct page *page) +static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, + struct page *oldpage, + struct page *newpage) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c34eb52bdc3f..b71195e8198b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -627,34 +627,6 @@ int mem_cgroup_newpage_charge(struct page *page, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } -/* - * same as mem_cgroup_newpage_charge(), now. - * But what we assume is different from newpage, and this is special case. - * treat this in special function. easy for maintenance. - */ - -int mem_cgroup_charge_migrate_fixup(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_subsys.disabled) - return 0; - - if (PageCompound(page)) - return 0; - - if (page_mapped(page) || (page->mapping && !PageAnon(page))) - return 0; - - if (unlikely(!mm)) - mm = &init_mm; - - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); -} - - - - int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -697,7 +669,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } - void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) { struct page_cgroup *pc; @@ -782,13 +753,13 @@ void mem_cgroup_uncharge_cache_page(struct page *page) } /* - * Before starting migration, account against new page. 
+ * Before starting migration, account PAGE_SIZE to mem_cgroup that the old + * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) +int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; int ret = 0; if (mem_cgroup_subsys.disabled) @@ -799,42 +770,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); - if (PageCgroupCache(pc)) { - if (page_is_file_cache(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - } } unlock_page_cgroup(pc); + if (mem) { - ret = mem_cgroup_charge_common(newpage, NULL, - GFP_HIGHUSER_MOVABLE, - ctype, mem); + ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); css_put(&mem->css); } + *ptr = mem; return ret; } /* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct page *newpage) +void mem_cgroup_end_migration(struct mem_cgroup *mem, + struct page *oldpage, struct page *newpage) { + struct page *target, *unused; + struct page_cgroup *pc; + enum charge_type ctype; + + if (!mem) + return; + + /* at migration success, oldpage->mapping is NULL. */ + if (oldpage->mapping) { + target = oldpage; + unused = NULL; + } else { + target = newpage; + unused = oldpage; + } + + if (PageAnon(target)) + ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + else if (page_is_file_cache(target)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + /* unused page is not on radix-tree now. */ + if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) + __mem_cgroup_uncharge_common(unused, ctype); + + pc = lookup_page_cgroup(target); /* - * At success, page->mapping is not NULL. - * special rollback care is necessary when - * 1. at migration failure. (newpage->mapping is cleared in this case) - * 2. the newpage was moved but not remapped again because the task - * exits and the newpage is obsolete. In this case, the new page - * may be a swapcache. So, we just call mem_cgroup_uncharge_page() - * always for avoiding mess. The page_cgroup will be removed if - * unnecessary. File cache pages is still on radix-tree. Don't - * care it. + * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. + * So, double-counting is effectively avoided. + */ + __mem_cgroup_commit_charge(mem, pc, ctype); + + /* + * Both of oldpage and newpage are still under lock_page(). + * Then, we don't have to care about race in radix-tree. + * But we have to be careful that this page is unmapped or not. + * + * There is a case for !page_mapped(). At the start of + * migration, oldpage was mapped. But now, it's zapped. + * But we know *target* page is not freed/reused under us. + * mem_cgroup_uncharge_page() does all necessary checks. */ - if (!newpage->mapping) - __mem_cgroup_uncharge_common(newpage, - MEM_CGROUP_CHARGE_TYPE_FORCE); - else if (PageAnon(newpage)) - mem_cgroup_uncharge_page(newpage); + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + mem_cgroup_uncharge_page(target); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 246dcb973ae7..a30ea5fcf9f1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) goto out; - /* - * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge. 
- Failure is not an option here: we're now expected to remove every - migration pte, and will cause crashes otherwise. Normally this - is not an issue: mem_cgroup_prepare_migration bumped up the old - page_cgroup count for safety, that's now attached to the new page, - so this charge should just be another incrementation of the count, - to keep in balance with rmap.c's mem_cgroup_uncharging. But if - there's been a force_empty, those reference counts may no longer - be reliable, and this charge can actually fail: oh well, we don't - make the situation any worse by proceeding as if it had succeeded. - */ - mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC); - get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) @@ -378,9 +364,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) anon = PageAnon(page); page->mapping = NULL; - if (!anon) /* This page was removed from radix-tree. */ - mem_cgroup_uncharge_cache_page(page); - /* * If any waiters have accumulated on the new page then * wake them up. @@ -614,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *newpage = get_new_page(page, private, &result); int rcu_locked = 0; int charge = 0; + struct mem_cgroup *mem; if (!newpage) return -ENOMEM; @@ -623,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto move_newpage; } - charge = mem_cgroup_prepare_migration(page, newpage); - if (charge == -ENOMEM) { - rc = -ENOMEM; - goto move_newpage; - } /* prepare cgroup just returns 0 or -ENOMEM */ - BUG_ON(charge); - rc = -EAGAIN; + if (!trylock_page(page)) { if (!force) goto move_newpage; lock_page(page); } + /* charge against new page */ + charge = mem_cgroup_prepare_migration(page, &mem); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto unlock; + } + BUG_ON(charge); + if (PageWriteback(page)) { if (!force) - goto unlock; + goto uncharge; wait_on_page_writeback(page); } /* @@ -693,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rcu_unlock: if (rcu_locked) rcu_read_unlock(); - +uncharge: + if (!charge) + mem_cgroup_end_migration(mem, page, newpage); unlock: unlock_page(page); @@ -709,8 +697,6 @@ unlock: } move_newpage: - if (!charge) - mem_cgroup_end_migration(newpage); /* * Move the new page to the LRU. If migration was not successful -- cgit v1.2.3 From d13d144309d2e5a3e6ad978b16c1d0226ddc9231 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:56 -0800 Subject: memcg: handle swap caches SwapCache support for the memory resource controller (memcg). Before the mem+swap controller, memcg itself should handle SwapCache in a proper way; this is cut out from that work. In the current memcg, SwapCache is simply leaked and the user can create tons of SwapCache. This is an accounting leak and should be handled. SwapCache accounting is done as follows: charge (anon) - charged when it is mapped (because of readahead, charging at add_to_swap_cache() is not sane). uncharge (anon) - uncharged when it is dropped from the swapcache and fully unmapped; this means it is not uncharged at unmap. Note: deletion from the swap cache at swap-in is done after the rmap information is established. charge (shmem) - charged at swap-in; this prevents a charge at add_to_page_cache(). uncharge (shmem) - uncharged when it is dropped from the swapcache and is not on shmem's radix-tree. At migration, the check against the 'old page' is modified to handle shmem.
Comparing to the old version discussed (and caused troubles), we have advantages of - PCG_USED bit. - simple migrating handling. So, situation is much easier than several months ago, maybe. [hugh@veritas.com: memcg: handle swap caches build fix] Reviewed-by: Daisuke Nishimura Tested-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/controllers/memory.txt | 5 +++ include/linux/swap.h | 22 ++++++++++++ mm/memcontrol.c | 67 ++++++++++++++++++++++++++++++++---- mm/shmem.c | 18 ++++++++-- mm/swap_state.c | 1 + 5 files changed, 105 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index 54253b7a8db2..9fe2d0eabe05 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -137,6 +137,11 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). +Exception: When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + + 2.4 Reclaim Each cgroup maintains a per cgroup LRU that consists of an active diff --git a/include/linux/swap.h b/include/linux/swap.h index 91dee50fe260..f8f3907533f0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -333,6 +333,22 @@ static inline void disable_swap_token(void) put_swap_token(swap_token_mm); } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +extern int mem_cgroup_cache_charge_swapin(struct page *page, + struct mm_struct *mm, gfp_t mask, bool locked); +extern void mem_cgroup_uncharge_swapcache(struct page *page); +#else +static inline +int mem_cgroup_cache_charge_swapin(struct page *page, + struct mm_struct *mm, gfp_t mask, bool locked) +{ + return 0; +} +static inline void mem_cgroup_uncharge_swapcache(struct page *page) +{ +} +#endif + #else /* CONFIG_SWAP */ #define nr_swap_pages 0L @@ -409,6 +425,12 @@ static inline swp_entry_t get_swap_page(void) #define has_swap_token(x) 0 #define disable_swap_token() do { } while(0) +static inline int mem_cgroup_cache_charge_swapin(struct page *page, + struct mm_struct *mm, gfp_t mask, bool locked) +{ + return 0; +} + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index decace3bb57e..7288e9d85ca7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -139,6 +140,7 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_MAPPED, MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ NR_CHARGE_TYPE, }; @@ -780,6 +782,33 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } +#ifdef CONFIG_SWAP +int mem_cgroup_cache_charge_swapin(struct page *page, + struct mm_struct *mm, gfp_t mask, bool locked) +{ + int ret = 0; + + if (mem_cgroup_subsys.disabled) + return 0; + if (unlikely(!mm)) + mm = &init_mm; + if (!locked) + lock_page(page); + /* + * If not locked, the page can be dropped from SwapCache until + * we reach 
here. + */ + if (PageSwapCache(page)) { + ret = mem_cgroup_charge_common(page, mm, mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); + } + if (!locked) + unlock_page(page); + + return ret; +} +#endif + void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) { struct page_cgroup *pc; @@ -817,6 +846,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (mem_cgroup_subsys.disabled) return; + if (PageSwapCache(page)) + return; + /* * Check if our page_cgroup is valid */ @@ -825,12 +857,26 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) return; lock_page_cgroup(pc); - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) - || !PageCgroupUsed(pc)) { - /* This happens at race in zap_pte_range() and do_swap_page()*/ - unlock_page_cgroup(pc); - return; + + if (!PageCgroupUsed(pc)) + goto unlock_out; + + switch (ctype) { + case MEM_CGROUP_CHARGE_TYPE_MAPPED: + if (page_mapped(page)) + goto unlock_out; + break; + case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: + if (!PageAnon(page)) { /* Shared memory */ + if (page->mapping && !page_is_file_cache(page)) + goto unlock_out; + } else if (page_mapped(page)) /* Anon */ + goto unlock_out; + break; + default: + break; } + ClearPageCgroupUsed(pc); mem = pc->mem_cgroup; @@ -844,6 +890,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) css_put(&mem->css); return; + +unlock_out: + unlock_page_cgroup(pc); + return; } void mem_cgroup_uncharge_page(struct page *page) @@ -863,6 +913,11 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +void mem_cgroup_uncharge_swapcache(struct page *page) +{ + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT); +} + /* * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. @@ -920,7 +975,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; /* unused page is not on radix-tree now. */ - if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) + if (unused) __mem_cgroup_uncharge_common(unused, ctype); pc = lookup_page_cgroup(target); diff --git a/mm/shmem.c b/mm/shmem.c index bd9b4ea307b2..adf5c3eedbc9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -928,8 +928,12 @@ found: error = 1; if (!inode) goto out; - /* Charge page using GFP_HIGHUSER_MOVABLE while we can wait */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_HIGHUSER_MOVABLE); + /* + * Charge page using GFP_HIGHUSER_MOVABLE while we can wait. + * charged back to the user(not to caller) when swap account is used. + */ + error = mem_cgroup_cache_charge_swapin(page, + current->mm, GFP_HIGHUSER_MOVABLE, true); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); @@ -1266,6 +1270,16 @@ repeat: goto repeat; } wait_on_page_locked(swappage); + /* + * We want to avoid charge at add_to_page_cache(). + * charge against this swap cache here. 
+ */ + if (mem_cgroup_cache_charge_swapin(swappage, + current->mm, gfp, false)) { + page_cache_release(swappage); + error = -ENOMEM; + goto failed; + } page_cache_release(swappage); goto repeat; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 81c825f67a7f..09291ca11f5f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -118,6 +118,7 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); + mem_cgroup_uncharge_swapcache(page); } /** -- cgit v1.2.3 From c077719be8e9e6b55702117513d1b5f41d80404a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:57 -0800 Subject: memcg: mem+swap controller Kconfig Config and control variable for mem+swap controller. This patch adds CONFIG_CGROUP_MEM_RES_CTLR_SWAP (memory resource controller swap extension.) For accounting swap, it's obvious that we have to use additional memory to remember "who uses swap". This adds more overhead. So, it's better to offer "choice" to users. This patch adds 2 choices. This patch adds 2 parameters to enable swap extension or not. - CONFIG - boot option Reviewed-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ include/linux/memcontrol.h | 3 +++ init/Kconfig | 17 +++++++++++++++++ mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 532eacbbed62..fb849020aea9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1562,6 +1562,9 @@ and is between 256 and 4096 characters. It is defined in the file nosoftlockup [KNL] Disable the soft-lockup detector. + noswapaccount [KNL] Disable accounting of swap in memory resource + controller. (See Documentation/controllers/memory.txt) + nosync [HW,M68K] Disables sync negotiation for all devices. notsc [BUGS=X86-32] Disable Time Stamp Counter diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b095f5f6ecf7..41b46cc9d1f1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,6 +77,9 @@ extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, int priority, enum lru_list lru); +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +extern int do_swap_account; +#endif #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; diff --git a/init/Kconfig b/init/Kconfig index 7cbe1f43ca22..a724a149bf3f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -428,6 +428,23 @@ config CGROUP_MEM_RES_CTLR config MM_OWNER bool +config CGROUP_MEM_RES_CTLR_SWAP + bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)" + depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL + help + Add swap management feature to memory resource controller. When you + enable this, you can limit mem+swap usage per cgroup. In other words, + when you disable this, memory resource controller has no cares to + usage of swap...a process can exhaust all of the swap. This extension + is useful when you want to avoid exhaustion swap but this itself + adds more overheads and consumes memory for remembering information. + Especially if you use 32bit system or small memory system, please + be careful about enabling this. 
When memory resource controller is disabled by boot option, this will be automatically disabled and there will be no overhead from this. Even when you set this config=y, if boot option "noswapaccount" is set, swap will not be accounted. + + endmenu config SYSFS_DEPRECATED diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7288e9d85ca7..59dd8c116372 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -41,6 +41,15 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ +int do_swap_account __read_mostly; +static int really_do_swap_account __initdata = 1; /* for remember boot option*/ +#else +#define do_swap_account (0) +#endif + + /* * Statistics for memory cgroup. */ @@ -1404,6 +1413,18 @@ static void mem_cgroup_free(struct mem_cgroup *mem) } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static void __init enable_swap_cgroup(void) +{ + if (!mem_cgroup_subsys.disabled && really_do_swap_account) + do_swap_account = 1; +} +#else +static void __init enable_swap_cgroup(void) +{ +} +#endif + static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -1419,6 +1440,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* root ? */ + if (cont->parent == NULL) + enable_swap_cgroup(); return &mem->css; free_out: @@ -1490,3 +1514,13 @@ struct cgroup_subsys mem_cgroup_subsys = { .attach = mem_cgroup_move_task, .early_init = 0, }; + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + +static int __init disable_swap_account(char *s) +{ + really_do_swap_account = 0; + return 1; +} +__setup("noswapaccount", disable_swap_account); +#endif -- cgit v1.2.3 From 27a7faa0779dd13729196c1a818c294f44bbd1ee Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:58 -0800 Subject: memcg: swap cgroup for remembering usage For accounting swap, we need at least one record per swap entry. This patch adds the following functions: - swap_cgroup_swapon() .... called from swapon. - swap_cgroup_swapoff() ... called at the end of swapoff. - swap_cgroup_record() .... record information for a swap entry. - swap_cgroup_lookup() .... look up information for a swap entry. This patch just implements "how to record information"; it adds no actual method to limit the usage of swap. These routines use a flat table for record and lookup. A "wise" lookup system such as a radix-tree requires memory allocation when inserting new records, but swap-out is usually called under memory shortage (or when the memcg hits its limit), so I used static allocation. (Dynamic allocation may not be very hard, but it adds an extra memory allocation to the memory-shortage path.) Note1: Here we use a pointer to record information, which means 8 bytes per swap entry. I think we can reduce this once we create an "id of cgroup" in the range 0-65535 or 0-255. A sketch of the intended call pattern follows below.
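Taken together, the intended call pattern of the new routines is roughly (a sketch; swapon/swapoff are wired up in this patch, while the record/lookup callers arrive in the following patches of this series):

	/* at swapon: allocate one record slot per page of the device */
	error = swap_cgroup_swapon(type, maxpages);

	/* at swap-out: remember which memcg owns this entry;
	 * the previous value (possibly NULL) is returned */
	old = swap_cgroup_record(ent, memcg);

	/* at swap-in or swap_free(): read the record, or clear it
	 * by recording NULL */
	memcg = lookup_swap_cgroup(ent);
	memcg = swap_cgroup_record(ent, NULL);

	/* at swapoff: free the flat table again */
	swap_cgroup_swapoff(type);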
Reported-by: Daisuke Nishimura Reviewed-by: Daisuke Nishimura Tested-by: Daisuke Nishimura Reported-by: Hugh Dickins Reported-by: Balbir Singh Reported-by: Andrew Morton Signed-off-by: KAMEZAWA Hiroyuki Cc: Pavel Emelianov Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 35 ++++++++ mm/page_cgroup.c | 197 ++++++++++++++++++++++++++++++++++++++++++++ mm/swapfile.c | 10 +++ 3 files changed, 242 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 1e6d34bfa094..d754b2dfbf2d 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -104,5 +104,40 @@ static inline void page_cgroup_init(void) { } +#endif + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#include +extern struct mem_cgroup * +swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem); +extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent); +extern int swap_cgroup_swapon(int type, unsigned long max_pages); +extern void swap_cgroup_swapoff(int type); +#else +#include + +static inline +struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +{ + return NULL; +} + +static inline +struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +{ + return NULL; +} + +static inline int +swap_cgroup_swapon(int type, unsigned long max_pages) +{ + return 0; +} + +static inline void swap_cgroup_swapoff(int type) +{ + return; +} + #endif #endif diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index df1e54a5ed19..685e7c8e1fd6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -8,6 +8,7 @@ #include #include #include +#include static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) @@ -270,3 +271,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) } #endif + + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; + unsigned long length; +}; + +struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +/* + * This 8bytes seems big..maybe we can reduce this when we can use "id" for + * cgroup rather than pointer. + */ +struct swap_cgroup { + struct mem_cgroup *val; +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) +#define SC_POS_MASK (SC_PER_PAGE - 1) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap. + * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. + * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup. + */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + if (!do_swap_account) + return 0; + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} + +/** + * swap_cgroup_record - record mem_cgroup for this swp_entry. 
+ * @ent: swap entry to be recorded into + * @mem: mem_cgroup to be recorded + * + * Returns old value at success, NULL at failure. + * (Of course, old value can be NULL.) + */ +struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + struct mem_cgroup *old; + + if (!do_swap_account) + return NULL; + + ctrl = &swap_cgroup_ctrl[type]; + + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + old = sc->val; + sc->val = mem; + + return old; +} + +/** + * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns pointer to mem_cgroup at success. NULL at failure. + */ +struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + struct mem_cgroup *ret; + + if (!do_swap_account) + return NULL; + + ctrl = &swap_cgroup_ctrl[type]; + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + ret = sc->val; + return ret; +} + +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long array_size; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return 0; + + length = ((max_pages/SC_PER_PAGE) + 1); + array_size = length * sizeof(void *); + + array = vmalloc(array_size); + if (!array) + goto nomem; + + memset(array, 0, array_size); + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + vfree(array); + mutex_unlock(&swap_cgroup_mutex); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + printk(KERN_INFO + "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" + " and %ld bytes to hold mem_cgroup pointers on swap\n", + array_size, length * PAGE_SIZE); + printk(KERN_INFO + "swap_cgroup can be disabled by noswapaccount boot option.\n"); + + return 0; +nomem: + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); + printk(KERN_INFO + "swap_cgroup can be disabled by noswapaccount boot option\n"); + return -ENOMEM; +} + +void swap_cgroup_swapoff(int type) +{ + int i; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + if (ctrl->map) { + for (i = 0; i < ctrl->length; i++) { + struct page *page = ctrl->map[i]; + if (page) + __free_page(page); + } + vfree(ctrl->map); + ctrl->map = NULL; + ctrl->length = 0; + } + mutex_unlock(&swap_cgroup_mutex); +} + +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index ddc6d92be2cb..1e7a715a3866 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,6 +33,7 @@ #include #include #include +#include static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1494,6 +1495,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + /* Destroy swap account informatin */ + swap_cgroup_swapoff(type); + inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct 
block_device *bdev = I_BDEV(inode); @@ -1811,6 +1815,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } swap_map[page_nr] = SWAP_MAP_BAD; } + + error = swap_cgroup_swapon(type, maxpages); + if (error) + goto bad_swap; + nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - 1 /* header page */; @@ -1882,6 +1891,7 @@ bad_swap: bd_release(bdev); } destroy_swap_extents(p); + swap_cgroup_swapoff(type); bad_swap_2: spin_lock(&swap_lock); p->swap_file = NULL; -- cgit v1.2.3 From 8c7c6e34a1256a5082d38c8e9bd1474476912715 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:00 -0800 Subject: memcg: mem+swap controller core This patch implements a per-cgroup limit on the usage of memory+swap. Because SwapCache exists, double counting of swap-cache and swap-entry is avoided. The mem+swap controller works as follows: - memory usage is limited by memory.limit_in_bytes. - memory + swap usage is limited by memory.memsw.limit_in_bytes. This has the following benefits. - A user can limit the total resource usage of mem+swap. Without this, because the memory resource controller does not account for swap usage, a process can exhaust all the swap (e.g. via a memory leak); this case can now be avoided. Also, swap is a shared resource, but it cannot be reclaimed (moved back to memory) until it is actually used. This characteristic can cause trouble when memory is divided into parts by cpuset or memcg. Assume groups A and B; after some applications run, the system can end up as: Group A -- very large free memory space, but occupying 99% of swap. Group B -- under memory shortage, but unable to use swap... it's nearly full. The ability to set an appropriate swap limit for each group is therefore required. Someone may wonder "why mem+swap rather than just swap?" - The global LRU (kswapd) can swap out arbitrary pages. Swap-out means moving the account from memory to swap, so there is no change in the usage of mem+swap. In other words, when we want to limit the usage of swap without affecting the global LRU, a mem+swap limit is better than just a swap limit. The accounting target information is stored in swap_cgroup, which is a per-swap-entry record. Charging is done as follows: map - charge page and memsw. unmap - uncharge page/memsw if not SwapCache. swap-out (__delete_from_swap_cache) - uncharge page - record the mem_cgroup information in swap_cgroup. swap-in (do_swap_page) - charged as page and memsw; the record in swap_cgroup is cleared and the memsw accounting is decremented. swap-free (swap_free()) - if the swap entry is freed, memsw is uncharged by PAGE_SIZE. Some people work in never-swap environments and consider swap something bad; for them, this mem+swap controller extension is just overhead, and that overhead is avoided by a config or boot option (see Kconfig; the details are not in this patch). TODO: - maybe more optimization can be done in the swap-in path (but it is not very safe). For now we just do simple accounting at this stage. A sketch of the core charge step follows below.
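The heart of the double accounting is the charge step; in outline it becomes (a simplified excerpt of the __mem_cgroup_try_charge() change below, with the reclaim/retry loop elided):

	ret = res_counter_charge(&mem->res, PAGE_SIZE);
	if (!ret && do_swap_account) {
		ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
		if (ret) {
			/* the mem+swap limit was hit: swapping out more
			 * pages cannot help, so roll back the mem charge
			 * and retry reclaim with noswap == true */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
		}
	}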
[nishimura@mxp.nes.nec.co.jp: make resize limit hold mutex] [hugh@veritas.com: memswap controller core swapcache fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/controllers/memory.txt | 29 ++- include/linux/memcontrol.h | 11 +- include/linux/swap.h | 14 +- mm/memcontrol.c | 400 +++++++++++++++++++++++++++++++---- mm/memory.c | 18 +- mm/swap_state.c | 5 +- mm/swapfile.c | 11 +- mm/vmscan.c | 6 +- 8 files changed, 440 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index 9fe2d0eabe05..05fe29ab1e58 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -137,12 +137,32 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). -Exception: When you do swapoff and make swapped-out pages of shmem(tmpfs) to +Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to be backed into memory in force, charges for pages are accounted against the caller of swapoff rather than the users of shmem. -2.4 Reclaim +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +global LRU, mem+swap limit is better than just limiting swap from OS point +of view. + +2.5 Reclaim Each cgroup maintains a per cgroup LRU that consists of an active and inactive list. When a cgroup goes over its limit, we first try @@ -246,6 +266,11 @@ Such charges are freed(at default) or moved to its parent. When moved, both of RSS and CACHES are moved to parent. If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + + 5. Misc. interfaces. 
5.1 force_empty diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 41b46cc9d1f1..ca51ac72d6c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,6 +32,8 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, /* for swap handling */ extern int mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **ptr); +extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, gfp_t mask, struct mem_cgroup **ptr); extern void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr); extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); @@ -80,7 +82,6 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -97,7 +98,13 @@ static inline int mem_cgroup_cache_charge(struct page *page, } static inline int mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **ptr) + gfp_t gfp_mask, struct mem_cgroup **ptr) +{ + return 0; +} + +static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) { return 0; } diff --git a/include/linux/swap.h b/include/linux/swap.h index f8f3907533f0..be938ce4895a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -214,7 +214,7 @@ static inline void lru_cache_add_active_file(struct page *page) extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask); + gfp_t gfp_mask, bool noswap); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; @@ -336,7 +336,7 @@ static inline void disable_swap_token(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR extern int mem_cgroup_cache_charge_swapin(struct page *page, struct mm_struct *mm, gfp_t mask, bool locked); -extern void mem_cgroup_uncharge_swapcache(struct page *page); +extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent); #else static inline int mem_cgroup_cache_charge_swapin(struct page *page, @@ -344,7 +344,15 @@ int mem_cgroup_cache_charge_swapin(struct page *page, { return 0; } -static inline void mem_cgroup_uncharge_swapcache(struct page *page) +static inline void +mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +{ +} +#endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +extern void mem_cgroup_uncharge_swap(swp_entry_t ent); +#else +static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) { } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59dd8c116372..2efcf38f3b73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,10 @@ struct mem_cgroup { * the counter to account for memory usage */ struct res_counter res; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -138,6 +143,8 @@ struct mem_cgroup { struct mem_cgroup_lru_info info; int prev_priority; /* for recording reclaim priority */ + int obsolete; + atomic_t refcnt; /* * statistics. This must be placed at the end of memcg. 
*/ @@ -167,6 +174,17 @@ pcg_default_flags[NR_CHARGE_TYPE] = { 0, /* FORCE */ }; + +/* for encoding cft->private value on file */ +#define _MEM (0) +#define _MEMSWAP (1) +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +static void mem_cgroup_get(struct mem_cgroup *mem); +static void mem_cgroup_put(struct mem_cgroup *mem); + /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ @@ -485,7 +503,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, struct mem_cgroup **memcg, + bool oom) { struct mem_cgroup *mem; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -513,12 +532,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, css_get(&mem->css); } + while (1) { + int ret; + bool noswap = false; - while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { + ret = res_counter_charge(&mem->res, PAGE_SIZE); + if (likely(!ret)) { + if (!do_swap_account) + break; + ret = res_counter_charge(&mem->memsw, PAGE_SIZE); + if (likely(!ret)) + break; + /* mem+swap counter fails */ + res_counter_uncharge(&mem->res, PAGE_SIZE); + noswap = true; + } if (!(gfp_mask & __GFP_WAIT)) goto nomem; - if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) + if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap)) continue; /* @@ -527,8 +559,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * moved to swap cache or just unmapped from the cgroup. * Check the limit again to see if the reclaim reduced the * current usage of the cgroup before giving up + * */ - if (res_counter_check_under_limit(&mem->res)) + if (!do_swap_account && + res_counter_check_under_limit(&mem->res)) + continue; + if (do_swap_account && + res_counter_check_under_limit(&mem->memsw)) continue; if (!nr_retries--) { @@ -582,6 +619,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); return; } @@ -646,6 +685,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, __mem_cgroup_remove_list(from_mz, pc); css_put(&from->css); res_counter_uncharge(&from->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&from->memsw, PAGE_SIZE); pc->mem_cgroup = to; css_get(&to->css); __mem_cgroup_add_list(to_mz, pc, false); @@ -692,8 +733,11 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, /* drop extra refcnt */ css_put(&parent->css); /* uncharge if move fails */ - if (ret) + if (ret) { res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + } return ret; } @@ -791,7 +835,42 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } +int mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, + gfp_t mask, struct mem_cgroup **ptr) +{ + struct mem_cgroup *mem; + swp_entry_t ent; + + if (mem_cgroup_subsys.disabled) + return 0; + + if (!do_swap_account) + goto charge_cur_mm; + + /* + * A racing thread's fault, or swapoff, may have already updated + * the pte, and even removed page from swap cache: return success + * to go on to do_swap_page()'s pte_same() test, which should fail. 
+ */ + if (!PageSwapCache(page)) + return 0; + + ent.val = page_private(page); + + mem = lookup_swap_cgroup(ent); + if (!mem || mem->obsolete) + goto charge_cur_mm; + *ptr = mem; + return __mem_cgroup_try_charge(NULL, mask, ptr, true); +charge_cur_mm: + if (unlikely(!mm)) + mm = &init_mm; + return __mem_cgroup_try_charge(mm, mask, ptr, true); +} + #ifdef CONFIG_SWAP + int mem_cgroup_cache_charge_swapin(struct page *page, struct mm_struct *mm, gfp_t mask, bool locked) { @@ -808,8 +887,28 @@ int mem_cgroup_cache_charge_swapin(struct page *page, * we reach here. */ if (PageSwapCache(page)) { + struct mem_cgroup *mem = NULL; + swp_entry_t ent; + + ent.val = page_private(page); + if (do_swap_account) { + mem = lookup_swap_cgroup(ent); + if (mem && mem->obsolete) + mem = NULL; + if (mem) + mm = NULL; + } ret = mem_cgroup_charge_common(page, mm, mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + + if (!ret && do_swap_account) { + /* avoid double counting */ + mem = swap_cgroup_record(ent, NULL); + if (mem) { + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + mem_cgroup_put(mem); + } + } } if (!locked) unlock_page(page); @@ -828,6 +927,23 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) return; pc = lookup_page_cgroup(page); __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); + /* + * Now swap is on-memory. This means this page may be + * counted both as mem and swap....double count. + * Fix it by uncharging from memsw. This SwapCache is stable + * because we're still under lock_page(). + */ + if (do_swap_account) { + swp_entry_t ent = {.val = page_private(page)}; + struct mem_cgroup *memcg; + memcg = swap_cgroup_record(ent, NULL); + if (memcg) { + /* If memcg is obsolete, memcg can be != ptr */ + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_put(memcg); + } + + } } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) @@ -837,6 +953,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) if (!mem) return; res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); css_put(&mem->css); } @@ -844,29 +962,31 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) /* * uncharge if !page_mapped(page) */ -static void +static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; unsigned long flags; if (mem_cgroup_subsys.disabled) - return; + return NULL; if (PageSwapCache(page)) - return; + return NULL; /* * Check if our page_cgroup is valid */ pc = lookup_page_cgroup(page); if (unlikely(!pc || !PageCgroupUsed(pc))) - return; + return NULL; lock_page_cgroup(pc); + mem = pc->mem_cgroup; + if (!PageCgroupUsed(pc)) goto unlock_out; @@ -886,8 +1006,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + ClearPageCgroupUsed(pc); - mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); @@ -895,14 +1018,13 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - return; + return mem; unlock_out: 
unlock_page_cgroup(pc); - return; + return NULL; } void mem_cgroup_uncharge_page(struct page *page) @@ -922,10 +1044,42 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } -void mem_cgroup_uncharge_swapcache(struct page *page) +/* + * called from __delete_from_swap_cache() and drop "page" account. + * memcg information is recorded to swap_cgroup of "ent" + */ +void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) +{ + struct mem_cgroup *memcg; + + memcg = __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_SWAPOUT); + /* record memcg information */ + if (do_swap_account && memcg) { + swap_cgroup_record(ent, memcg); + mem_cgroup_get(memcg); + } +} + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +/* + * called from swap_entry_free(). remove record in swap_cgroup and + * uncharge "memsw" account. + */ +void mem_cgroup_uncharge_swap(swp_entry_t ent) { - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT); + struct mem_cgroup *memcg; + + if (!do_swap_account) + return; + + memcg = swap_cgroup_record(ent, NULL); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_put(memcg); + } } +#endif /* * Before starting migration, account PAGE_SIZE to mem_cgroup that the old @@ -1034,7 +1188,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) rcu_read_unlock(); do { - progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true); progress += res_counter_check_under_limit(&mem->res); } while (!progress && --retry); @@ -1044,26 +1198,84 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } +static DEFINE_MUTEX(set_limit_mutex); + static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long long val) { int retry_count = MEM_CGROUP_RECLAIM_RETRIES; int progress; + u64 memswlimit; int ret = 0; - while (res_counter_set_limit(&memcg->res, val)) { + while (retry_count) { if (signal_pending(current)) { ret = -EINTR; break; } - if (!retry_count) { - ret = -EBUSY; + /* + * Rather than hide all in some function, I do this in + * open coded manner. You see what this really does. + * We have to guarantee mem->res.limit < mem->memsw.limit. + */ + mutex_lock(&set_limit_mutex); + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) { + ret = -EINVAL; + mutex_unlock(&set_limit_mutex); break; } + ret = res_counter_set_limit(&memcg->res, val); + mutex_unlock(&set_limit_mutex); + + if (!ret) + break; + progress = try_to_free_mem_cgroup_pages(memcg, - GFP_HIGHUSER_MOVABLE); - if (!progress) + GFP_HIGHUSER_MOVABLE, false); + if (!progress) retry_count--; + } + return ret; +} + +int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + u64 memlimit, oldusage, curusage; + int ret; + + if (!do_swap_account) + return -EINVAL; + + while (retry_count) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + /* + * Rather than hide all in some function, I do this in + * open coded manner. You see what this really does. + * We have to guarantee mem->res.limit < mem->memsw.limit. 
+ */ + mutex_lock(&set_limit_mutex); + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit > val) { + ret = -EINVAL; + mutex_unlock(&set_limit_mutex); + break; + } + ret = res_counter_set_limit(&memcg->memsw, val); + mutex_unlock(&set_limit_mutex); + + if (!ret) + break; + + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true); + curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + if (curusage >= oldusage) retry_count--; } return ret; @@ -1193,7 +1405,7 @@ try_to_free: goto out; } progress = try_to_free_mem_cgroup_pages(mem, - GFP_HIGHUSER_MOVABLE); + GFP_HIGHUSER_MOVABLE, false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -1216,8 +1428,25 @@ int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, - cft->private); + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + u64 val = 0; + int type, name; + + type = MEMFILE_TYPE(cft->private); + name = MEMFILE_ATTR(cft->private); + switch (type) { + case _MEM: + val = res_counter_read_u64(&mem->res, name); + break; + case _MEMSWAP: + if (do_swap_account) + val = res_counter_read_u64(&mem->memsw, name); + break; + default: + BUG(); + break; + } + return val; } /* * The user of this function is... @@ -1227,15 +1456,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int type, name; unsigned long long val; int ret; - switch (cft->private) { + type = MEMFILE_TYPE(cft->private); + name = MEMFILE_ATTR(cft->private); + switch (name) { case RES_LIMIT: /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); - if (!ret) + if (ret) + break; + if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); + else + ret = mem_cgroup_resize_memsw_limit(memcg, val); break; default: ret = -EINVAL; /* should be BUG() ? 
*/ @@ -1247,14 +1483,23 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *mem; + int type, name; mem = mem_cgroup_from_cont(cont); - switch (event) { + type = MEMFILE_TYPE(event); + name = MEMFILE_ATTR(event); + switch (name) { case RES_MAX_USAGE: - res_counter_reset_max(&mem->res); + if (type == _MEM) + res_counter_reset_max(&mem->res); + else + res_counter_reset_max(&mem->memsw); break; case RES_FAILCNT: - res_counter_reset_failcnt(&mem->res); + if (type == _MEM) + res_counter_reset_failcnt(&mem->res); + else + res_counter_reset_failcnt(&mem->memsw); break; } return 0; @@ -1315,24 +1560,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", - .private = RES_USAGE, + .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, }, { .name = "max_usage_in_bytes", - .private = RES_MAX_USAGE, + .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, .read_u64 = mem_cgroup_read, }, { .name = "limit_in_bytes", - .private = RES_LIMIT, + .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { .name = "failcnt", - .private = RES_FAILCNT, + .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, .read_u64 = mem_cgroup_read, }, @@ -1346,6 +1591,47 @@ static struct cftype mem_cgroup_files[] = { }, }; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static struct cftype memsw_cgroup_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, + }, +}; + +static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + if (!do_swap_account) + return 0; + return cgroup_add_files(cont, ss, memsw_cgroup_files, + ARRAY_SIZE(memsw_cgroup_files)); +}; +#else +static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} +#endif + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) { struct mem_cgroup_per_node *pn; @@ -1404,14 +1690,44 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return mem; } +/* + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and + * entry which points to this memcg will be ignore at swapin. + * + * Removal of cgroup itself succeeds regardless of refs from swap. 
+ */ + static void mem_cgroup_free(struct mem_cgroup *mem) { + if (atomic_read(&mem->refcnt) > 0) + return; if (mem_cgroup_size() < PAGE_SIZE) kfree(mem); else vfree(mem); } +static void mem_cgroup_get(struct mem_cgroup *mem) +{ + atomic_inc(&mem->refcnt); +} + +static void mem_cgroup_put(struct mem_cgroup *mem) +{ + if (atomic_dec_and_test(&mem->refcnt)) { + if (!mem->obsolete) + return; + mem_cgroup_free(mem); + } +} + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -1436,6 +1752,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return ERR_PTR(-ENOMEM); res_counter_init(&mem->res); + res_counter_init(&mem->memsw); for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) @@ -1456,6 +1773,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + mem->obsolete = 1; mem_cgroup_force_empty(mem, false); } @@ -1474,8 +1792,14 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - return cgroup_add_files(cont, ss, mem_cgroup_files, - ARRAY_SIZE(mem_cgroup_files)); + int ret; + + ret = cgroup_add_files(cont, ss, mem_cgroup_files, + ARRAY_SIZE(mem_cgroup_files)); + + if (!ret) + ret = register_memsw_files(cont, ss); + return ret; } static void mem_cgroup_move_task(struct cgroup_subsys *ss, diff --git a/mm/memory.c b/mm/memory.c index ba5189e322e6..1358012ffa73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge_swapin(mm, page, + GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_nomap; } - /* The page isn't present yet, go ahead with the fault. */ + /* + * The page isn't present yet, go ahead with the fault. + * + * Be careful about the sequence of operations here. + * To get its accounting right, reuse_swap_page() must be called + * while the page is counted on swap but not yet in mapcount i.e. + * before page_add_anon_rmap() and swap_free(); try_to_free_swap() + * must be called after the swap_free(), or it will never succeed. + * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry + * in page->private, must be called before reuse_swap_page(), + * which may delete_from_swap_cache(). 
+ */ + mem_cgroup_commit_charge_swapin(page, ptr); inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && reuse_swap_page(page)) { @@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); - mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 09291ca11f5f..3ecea98ecb45 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -108,6 +109,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t ent = {.val = page_private(page)}; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -118,7 +121,7 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page); + mem_cgroup_uncharge_swapcache(page, ent); } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e7a715a3866..0579d9069b61 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -471,8 +471,9 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) { + unsigned long offset = swp_offset(ent); int count = p->swap_map[offset]; if (count < SWAP_MAP_MAX) { @@ -487,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } } return count; @@ -502,7 +504,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, swp_offset(entry)); + swap_entry_free(p, entry); spin_unlock(&swap_lock); } } @@ -582,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) { + if (swap_entry_free(p, entry) == 1) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -696,7 +698,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, + GFP_HIGHUSER_MOVABLE, &ptr)) ret = -ENOMEM; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); diff --git a/mm/vmscan.c b/mm/vmscan.c index b07c48b09a93..f63b20dd7714 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1661,7 +1661,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) + gfp_t gfp_mask, + bool noswap) { struct scan_control sc = { .may_writepage = !laptop_mode, @@ -1674,6 +1675,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, }; struct zonelist *zonelist; + if (noswap) + sc.may_swap = 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; -- cgit v1.2.3 From 08e552c69c6930d64722de3ec18c51844d06ee28 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 
7 Jan 2009 18:08:01 -0800 Subject: memcg: synchronized LRU

A big patch changing memcg's LRU semantics. Now:

- page_cgroup is linked to the mem_cgroup's own LRU (per zone).
- The LRU of page_cgroup is not synchronous with the global LRU.
- page and page_cgroup are one-to-one and statically allocated.
- To find which LRU a page_cgroup is on, check pc->mem_cgroup, as in
  lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.

When we handle the LRU list of page_cgroup, we currently do the following:

  pc = lookup_page_cgroup(page);
  lock_page_cgroup(pc); .....................(1)
  mz = page_cgroup_zoneinfo(pc);
  spin_lock(&mz->lru_lock);
  .....add to LRU
  spin_unlock(&mz->lru_lock);
  unlock_page_cgroup(pc);

But (1) is a spin_lock, and we have to worry about deadlock against zone->lru_lock, so trylock() is used at (1) for now. Without (1), we can't trust that "mz" is correct. This patch is an attempt to remove that dirty nesting of locks. It changes mz->lru_lock to be zone->lru_lock, after which the sequence above becomes:

  spin_lock(&zone->lru_lock);      # in vmscan.c or swap.c via global LRU
  mem_cgroup_add/remove/etc_lru() {
          pc = lookup_page_cgroup(page);
          mz = page_cgroup_zoneinfo(pc);
          if (PageCgroupUsed(pc)) {
                  ....add to LRU
          }
  }
  spin_unlock(&zone->lru_lock);    # in vmscan.c or swap.c via global LRU

This is much simpler.

(*) We're safe even if we don't take lock_page_cgroup(pc), because:
1. pc->mem_cgroup can only be modified at charge and at account_move().
2. At charge, the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. At account_move(), the page is isolated and not on the LRU.

Pros:
- easy to maintain.
- memcg can make use of the laziness of pagevec.
- we don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
- the LRU status of memcg is synchronized with the global LRU's.
- the number of locks is reduced.
- account_move() is simplified very much.

Cons:
- may increase the cost of LRU rotation. (No impact if memcg is not configured.)
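The lockless read in the new helpers follows a standard publish/consume discipline: the charge side stores pc->mem_cgroup, issues smp_wmb(), and only then sets PCG_USED; the LRU side issues smp_rmb() and refuses to touch pc->mem_cgroup until it sees PCG_USED. A userspace analog of that pattern, as a minimal sketch in C11 atomics (the struct and function names here are illustrative stand-ins, not kernel API):

	#include <stdatomic.h>
	#include <stddef.h>

	struct pc_sim {
		void *mem_cgroup;	/* payload written by the charge side */
		atomic_int used;	/* stands in for the PCG_USED bit */
	};

	/* charge side: publish the pointer before raising the flag */
	static void pc_publish(struct pc_sim *pc, void *mem)
	{
		pc->mem_cgroup = mem;
		atomic_store_explicit(&pc->used, 1, memory_order_release);
	}

	/* LRU side: dereference the pointer only after seeing the flag */
	static void *pc_lookup(struct pc_sim *pc)
	{
		if (!atomic_load_explicit(&pc->used, memory_order_acquire))
			return NULL;	/* not charged yet: skip, as the LRU helpers do */
		return pc->mem_cgroup;
	}

Release/acquire here plays the role of the smp_wmb()/smp_rmb() pair visible in the diff below.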
Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/splice.c | 1 + include/linux/memcontrol.h | 29 +++- include/linux/mm_inline.h | 3 + include/linux/page_cgroup.h | 17 --- mm/memcontrol.c | 323 +++++++++++++++++++------------------------- mm/page_cgroup.c | 1 + mm/swap.c | 1 - mm/vmscan.c | 9 +- 8 files changed, 178 insertions(+), 206 deletions(-) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index 1abab5cee4ba..a54b3e3f10a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca51ac72d6c0..32c07b1852d6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -40,7 +40,12 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru); +extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); +extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); +extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); +extern void mem_cgroup_del_lru(struct page *page); +extern void mem_cgroup_move_lists(struct page *page, + enum lru_list from, enum lru_list to); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask); @@ -131,7 +136,27 @@ static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } -static inline void mem_cgroup_move_lists(struct page *page, bool active) +static inline void mem_cgroup_add_lru_list(struct page *page, int lru) +{ +} + +static inline void mem_cgroup_del_lru_list(struct page *page, int lru) +{ + return ; +} + +static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) +{ + return ; +} + +static inline void mem_cgroup_del_lru(struct page *page) +{ + return ; +} + +static inline void +mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index c948350c378e..37ef13d0f01e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -28,6 +28,7 @@ add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_add(&page->lru, &zone->lru[l].list); __inc_zone_state(zone, NR_LRU_BASE + l); + mem_cgroup_add_lru_list(page, l); } static inline void @@ -35,6 +36,7 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); __dec_zone_state(zone, NR_LRU_BASE + l); + mem_cgroup_del_lru_list(page, l); } static inline void @@ -54,6 +56,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += page_is_file_cache(page); } __dec_zone_state(zone, NR_LRU_BASE + l); + mem_cgroup_del_lru_list(page, l); } /** diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index d754b2dfbf2d..602cc1fdee90 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -26,10 +26,6 @@ enum { PCG_LOCK, /* page cgroup is locked */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. 
*/ - /* flags for LRU placement */ - PCG_ACTIVE, /* page is active in this cgroup */ - PCG_FILE, /* page is file system backed */ - PCG_UNEVICTABLE, /* page is unevictableable */ }; #define TESTPCGFLAG(uname, lname) \ @@ -50,19 +46,6 @@ TESTPCGFLAG(Cache, CACHE) TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) -/* LRU management flags (from global-lru definition) */ -TESTPCGFLAG(File, FILE) -SETPCGFLAG(File, FILE) -CLEARPCGFLAG(File, FILE) - -TESTPCGFLAG(Active, ACTIVE) -SETPCGFLAG(Active, ACTIVE) -CLEARPCGFLAG(Active, ACTIVE) - -TESTPCGFLAG(Unevictable, UNEVICTABLE) -SETPCGFLAG(Unevictable, UNEVICTABLE) -CLEARPCGFLAG(Unevictable, UNEVICTABLE) - static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc->page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efcf38f3b73..8ce4e9e47959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -36,6 +36,7 @@ #include #include #include +#include "internal.h" #include @@ -100,7 +101,6 @@ struct mem_cgroup_per_zone { /* * spin_lock to protect the per cgroup LRU */ - spinlock_t lru_lock; struct list_head lists[NR_LRU_LISTS]; unsigned long count[NR_LRU_LISTS]; }; @@ -163,14 +163,12 @@ enum charge_type { /* only for here (for easy reading.) */ #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) -#define PCGF_ACTIVE (1UL << PCG_ACTIVE) #define PCGF_LOCK (1UL << PCG_LOCK) -#define PCGF_FILE (1UL << PCG_FILE) static const unsigned long pcg_default_flags[NR_CHARGE_TYPE] = { - PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ - PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ - PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 0, /* FORCE */ }; @@ -185,9 +183,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); -/* - * Always modified under lru lock. Then, not necessary to preempt_disable() - */ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, bool charge) @@ -195,10 +190,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat_cpu *cpustat; + int cpu = get_cpu(); - VM_BUG_ON(!irqs_disabled()); - - cpustat = &stat->cpustat[smp_processor_id()]; + cpustat = &stat->cpustat[cpu]; if (PageCgroupCache(pc)) __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); else @@ -210,6 +204,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, else __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); + put_cpu(); } static struct mem_cgroup_per_zone * @@ -264,80 +259,95 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, - struct page_cgroup *pc) -{ - int lru = LRU_BASE; +/* + * Following LRU functions are allowed to be used without PCG_LOCK. + * Operations are called by routine of global LRU independently from memcg. + * What we have to take care of here is validness of pc->mem_cgroup. + * + * Changes to pc->mem_cgroup happens when + * 1. charge + * 2. moving account + * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. + * It is added to LRU before charge. + * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 
+ * When moving account, the page is not on LRU. It's isolated. + */ - if (PageCgroupUnevictable(pc)) - lru = LRU_UNEVICTABLE; - else { - if (PageCgroupActive(pc)) - lru += LRU_ACTIVE; - if (PageCgroupFile(pc)) - lru += LRU_FILE; - } +void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + struct mem_cgroup_per_zone *mz; + if (mem_cgroup_subsys.disabled) + return; + pc = lookup_page_cgroup(page); + /* can happen while we handle swapcache. */ + if (list_empty(&pc->lru)) + return; + mz = page_cgroup_zoneinfo(pc); + mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; - - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); - list_del(&pc->lru); + list_del_init(&pc->lru); + return; } -static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, - struct page_cgroup *pc, bool hot) +void mem_cgroup_del_lru(struct page *page) { - int lru = LRU_BASE; + mem_cgroup_del_lru_list(page, page_lru(page)); +} - if (PageCgroupUnevictable(pc)) - lru = LRU_UNEVICTABLE; - else { - if (PageCgroupActive(pc)) - lru += LRU_ACTIVE; - if (PageCgroupFile(pc)) - lru += LRU_FILE; - } +void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + struct page_cgroup *pc; - MEM_CGROUP_ZSTAT(mz, lru) += 1; - if (hot) - list_add(&pc->lru, &mz->lists[lru]); - else - list_add_tail(&pc->lru, &mz->lists[lru]); + if (mem_cgroup_subsys.disabled) + return; - mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); + pc = lookup_page_cgroup(page); + smp_rmb(); + /* unused page is not rotated. */ + if (!PageCgroupUsed(pc)) + return; + mz = page_cgroup_zoneinfo(pc); + list_move(&pc->lru, &mz->lists[lru]); } -static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) +void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) { - struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); - int active = PageCgroupActive(pc); - int file = PageCgroupFile(pc); - int unevictable = PageCgroupUnevictable(pc); - enum lru_list from = unevictable ? LRU_UNEVICTABLE : - (LRU_FILE * !!file + !!active); + struct page_cgroup *pc; + struct mem_cgroup_per_zone *mz; - if (lru == from) + if (mem_cgroup_subsys.disabled) + return; + pc = lookup_page_cgroup(page); + /* barrier to sync with "charge" */ + smp_rmb(); + if (!PageCgroupUsed(pc)) return; - MEM_CGROUP_ZSTAT(mz, from) -= 1; - /* - * However this is done under mz->lru_lock, another flags, which - * are not related to LRU, will be modified from out-of-lock. - * We have to use atomic set/clear flags. - */ - if (is_unevictable_lru(lru)) { - ClearPageCgroupActive(pc); - SetPageCgroupUnevictable(pc); - } else { - if (is_active_lru(lru)) - SetPageCgroupActive(pc); - else - ClearPageCgroupActive(pc); - ClearPageCgroupUnevictable(pc); - } - + mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; - list_move(&pc->lru, &mz->lists[lru]); + list_add(&pc->lru, &mz->lists[lru]); +} +/* + * To add swapcache into LRU. Be careful to all this function. + * zone->lru_lock shouldn't be held and irq must not be disabled. 
+ */ +static void mem_cgroup_lru_fixup(struct page *page) +{ + if (!isolate_lru_page(page)) + putback_lru_page(page); +} + +void mem_cgroup_move_lists(struct page *page, + enum lru_list from, enum lru_list to) +{ + if (mem_cgroup_subsys.disabled) + return; + mem_cgroup_del_lru_list(page, from); + mem_cgroup_add_lru_list(page, to); } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) @@ -350,37 +360,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * This routine assumes that the appropriate zone's lru lock is already held - */ -void mem_cgroup_move_lists(struct page *page, enum lru_list lru) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - unsigned long flags; - - if (mem_cgroup_subsys.disabled) - return; - - /* - * We cannot lock_page_cgroup while holding zone's lru_lock, - * because other holders of lock_page_cgroup can be interrupted - * with an attempt to rotate_reclaimable_page. But we cannot - * safely get to page_cgroup without it, so just try_lock it: - * mem_cgroup_isolate_pages allows for page left on wrong list. - */ - pc = lookup_page_cgroup(page); - if (!trylock_page_cgroup(pc)) - return; - if (pc && PageCgroupUsed(pc)) { - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_move_lists(pc, lru); - spin_unlock_irqrestore(&mz->lru_lock, flags); - } - unlock_page_cgroup(pc); -} - /* * Calculate mapped_ratio under memory controller. This will be used in * vmscan.c for deteremining we have to reclaim mapped pages. @@ -460,40 +439,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); src = &mz->lists[lru]; - spin_lock(&mz->lru_lock); scan = 0; list_for_each_entry_safe_reverse(pc, tmp, src, lru) { if (scan >= nr_to_scan) break; + + page = pc->page; if (unlikely(!PageCgroupUsed(pc))) continue; - page = pc->page; - if (unlikely(!PageLRU(page))) continue; - /* - * TODO: play better with lumpy reclaim, grabbing anything. - */ - if (PageUnevictable(page) || - (PageActive(page) && !active) || - (!PageActive(page) && active)) { - __mem_cgroup_move_lists(pc, page_lru(page)); - continue; - } - scan++; - list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } } - list_splice(&pc_list, src); - spin_unlock(&mz->lru_lock); - *scanned = scan; return nr_taken; } @@ -608,9 +571,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, struct page_cgroup *pc, enum charge_type ctype) { - struct mem_cgroup_per_zone *mz; - unsigned long flags; - /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) return; @@ -625,17 +585,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, return; } pc->mem_cgroup = mem; - /* - * If a page is accounted as a page cache, insert to inactive list. - * If anon, insert to active list. - */ + smp_wmb(); pc->flags = pcg_default_flags[ctype]; - mz = page_cgroup_zoneinfo(pc); + mem_cgroup_charge_statistics(mem, pc, true); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(mz, pc, true); - spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(pc); } @@ -646,8 +600,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * @to: mem_cgroup which the page is moved to. @from != @to. * * The caller must confirm following. - * 1. disable irq. - * 2. lru_lock of old mem_cgroup(@from) should be held. 
+ * - page is not on LRU (isolate_page() is useful.) * * returns 0 at success, * returns -EBUSY when lock is busy or "pc" is unstable. @@ -663,15 +616,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, int nid, zid; int ret = -EBUSY; - VM_BUG_ON(!irqs_disabled()); VM_BUG_ON(from == to); + VM_BUG_ON(PageLRU(pc->page)); nid = page_cgroup_nid(pc); zid = page_cgroup_zid(pc); from_mz = mem_cgroup_zoneinfo(from, nid, zid); to_mz = mem_cgroup_zoneinfo(to, nid, zid); - if (!trylock_page_cgroup(pc)) return ret; @@ -681,18 +633,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - if (spin_trylock(&to_mz->lru_lock)) { - __mem_cgroup_remove_list(from_mz, pc); - css_put(&from->css); - res_counter_uncharge(&from->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - pc->mem_cgroup = to; - css_get(&to->css); - __mem_cgroup_add_list(to_mz, pc, false); - ret = 0; - spin_unlock(&to_mz->lru_lock); - } + css_put(&from->css); + res_counter_uncharge(&from->res, PAGE_SIZE); + mem_cgroup_charge_statistics(from, pc, false); + if (do_swap_account) + res_counter_uncharge(&from->memsw, PAGE_SIZE); + pc->mem_cgroup = to; + mem_cgroup_charge_statistics(to, pc, true); + css_get(&to->css); + ret = 0; out: unlock_page_cgroup(pc); return ret; @@ -706,39 +655,47 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct mem_cgroup *child, gfp_t gfp_mask) { + struct page *page = pc->page; struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - struct mem_cgroup_per_zone *mz; - unsigned long flags; int ret; /* Is ROOT ? */ if (!pcg) return -EINVAL; + parent = mem_cgroup_from_cont(pcg); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); if (ret) return ret; - mz = mem_cgroup_zoneinfo(child, - page_cgroup_nid(pc), page_cgroup_zid(pc)); + if (!get_page_unless_zero(page)) + return -EBUSY; + + ret = isolate_lru_page(page); + + if (ret) + goto cancel; - spin_lock_irqsave(&mz->lru_lock, flags); ret = mem_cgroup_move_account(pc, child, parent); - spin_unlock_irqrestore(&mz->lru_lock, flags); - /* drop extra refcnt */ + /* drop extra refcnt by try_charge() (move_account increment one) */ css_put(&parent->css); - /* uncharge if move fails */ - if (ret) { - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); + putback_lru_page(page); + if (!ret) { + put_page(page); + return 0; } - + /* uncharge if move fails */ +cancel: + res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + put_page(page); return ret; } @@ -912,6 +869,8 @@ int mem_cgroup_cache_charge_swapin(struct page *page, } if (!locked) unlock_page(page); + /* add this page(page_cgroup) to the LRU we want. */ + mem_cgroup_lru_fixup(page); return ret; } @@ -944,6 +903,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) } } + /* add this page(page_cgroup) to the LRU we want. 
*/ + mem_cgroup_lru_fixup(page); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) @@ -968,7 +929,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct page_cgroup *pc; struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; - unsigned long flags; if (mem_cgroup_subsys.disabled) return NULL; @@ -1010,12 +970,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) res_counter_uncharge(&mem->memsw, PAGE_SIZE); + mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(pc); css_put(&mem->css); @@ -1281,21 +1239,22 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } - /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, - struct mem_cgroup_per_zone *mz, - enum lru_list lru) + int node, int zid, enum lru_list lru) { + struct zone *zone; + struct mem_cgroup_per_zone *mz; struct page_cgroup *pc, *busy; - unsigned long flags; - unsigned long loop; + unsigned long flags, loop; struct list_head *list; int ret = 0; + zone = &NODE_DATA(node)->node_zones[zid]; + mz = mem_cgroup_zoneinfo(mem, node, zid); list = &mz->lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); @@ -1304,19 +1263,19 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, busy = NULL; while (loop--) { ret = 0; - spin_lock_irqsave(&mz->lru_lock, flags); + spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { - spin_unlock_irqrestore(&mz->lru_lock, flags); + spin_unlock_irqrestore(&zone->lru_lock, flags); break; } pc = list_entry(list->prev, struct page_cgroup, lru); if (busy == pc) { list_move(&pc->lru, list); busy = 0; - spin_unlock_irqrestore(&mz->lru_lock, flags); + spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } - spin_unlock_irqrestore(&mz->lru_lock, flags); + spin_unlock_irqrestore(&zone->lru_lock, flags); ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); if (ret == -ENOMEM) @@ -1329,6 +1288,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } else busy = NULL; } + if (!ret && !list_empty(list)) return -EBUSY; return ret; @@ -1364,12 +1324,10 @@ move_account: ret = 0; for_each_node_state(node, N_POSSIBLE) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { - struct mem_cgroup_per_zone *mz; enum lru_list l; - mz = mem_cgroup_zoneinfo(mem, node, zid); for_each_lru(l) { ret = mem_cgroup_force_empty_list(mem, - mz, l); + node, zid, l); if (ret) break; } @@ -1413,6 +1371,7 @@ try_to_free: } } + lru_add_drain(); /* try move_account...there may be some *locked* pages. 
*/ if (mem->res.usage) goto move_account; @@ -1657,7 +1616,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - spin_lock_init(&mz->lru_lock); for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); } @@ -1706,8 +1664,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) static void mem_cgroup_free(struct mem_cgroup *mem) { + int node; + if (atomic_read(&mem->refcnt) > 0) return; + + + for_each_node_state(node, N_POSSIBLE) + free_mem_cgroup_per_zone_info(mem, node); + if (mem_cgroup_size() < PAGE_SIZE) kfree(mem); else @@ -1780,12 +1745,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - int node; - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - - for_each_node_state(node, N_POSSIBLE) - free_mem_cgroup_per_zone_info(mem, node); - mem_cgroup_free(mem_cgroup_from_cont(cont)); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 685e7c8e1fd6..74ae8e01d071 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -16,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) pc->flags = 0; pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); + INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; diff --git a/mm/swap.c b/mm/swap.c index ba2c0e8b8b54..8a98a9c90704 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -168,7 +168,6 @@ void activate_page(struct page *page) lru += LRU_ACTIVE; add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - mem_cgroup_move_lists(page, lru); zone->recent_rotated[!!file]++; zone->recent_scanned[!!file]++; diff --git a/mm/vmscan.c b/mm/vmscan.c index f63b20dd7714..45983af1de3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -512,7 +512,6 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); } - mem_cgroup_move_lists(page, lru); /* * page's status can change while we move it among lru. 
If an evictable @@ -547,7 +546,6 @@ void putback_lru_page(struct page *page) lru = !!TestClearPageActive(page) + page_is_file_cache(page); lru_cache_add_lru(page, lru); - mem_cgroup_move_lists(page, lru); put_page(page); } #endif /* CONFIG_UNEVICTABLE_LRU */ @@ -813,6 +811,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) return ret; ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -821,6 +820,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; + mem_cgroup_del_lru(page); } return ret; @@ -1134,7 +1134,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - mem_cgroup_move_lists(page, lru); if (PageActive(page) && scan_global_lru(sc)) { int file = !!page_is_file_cache(page); zone->recent_rotated[file]++; @@ -1263,7 +1262,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, ClearPageActive(page); list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_move_lists(page, lru); + mem_cgroup_add_lru_list(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -2408,6 +2407,7 @@ retry: __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -2416,6 +2416,7 @@ retry: */ SetPageUnevictable(page); list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); if (page_evictable(page, NULL)) goto retry; } -- cgit v1.2.3 From f8d665422603ee1b8ed04dcad4242f14d623c941 Mon Sep 17 00:00:00 2001 From: Hirokazu Takahashi Date: Wed, 7 Jan 2009 18:08:02 -0800 Subject: memcg: add mem_cgroup_disabled() We check mem_cgroup is disabled or not by checking mem_cgroup_subsys.disabled. I think it has more references than expected, now. replacing if (mem_cgroup_subsys.disabled) with if (mem_cgroup_disabled()) give us good look, I think. 
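The helper also gives the !CONFIG_CGROUP_MEM_RES_CTLR build a natural stub, so every check compiles away when the controller is not built in. Condensed from the diff that follows (the #ifdef pairing is the point; the bodies are as in the patch):

	#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	static inline bool mem_cgroup_disabled(void)
	{
		if (mem_cgroup_subsys.disabled)
			return true;
		return false;
	}
	#else
	static inline bool mem_cgroup_disabled(void)
	{
		return true;	/* controller compiled out: always "disabled" */
	}
	#endif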
[kamezawa.hiroyu@jp.fujitsu.com: fix typo] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++++++++++++++- mm/memcontrol.c | 28 ++++++++++++++-------------- mm/page_cgroup.c | 4 ++-- 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 32c07b1852d6..472efd09118c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -19,7 +19,7 @@ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H - +#include struct mem_cgroup; struct page_cgroup; struct page; @@ -87,6 +87,14 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif + +static inline bool mem_cgroup_disabled(void) +{ + if (mem_cgroup_subsys.disabled) + return true; + return false; +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -214,6 +222,11 @@ static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, { return 0; } + +static inline bool mem_cgroup_disabled(void) +{ + return true; +} #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ce4e9e47959..9846f617115d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -279,7 +279,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ @@ -302,7 +302,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); @@ -319,7 +319,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* barrier to sync with "charge" */ @@ -344,7 +344,7 @@ static void mem_cgroup_lru_fixup(struct page *page) void mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) { - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; mem_cgroup_del_lru_list(page, from); mem_cgroup_add_lru_list(page, to); @@ -731,7 +731,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; if (PageCompound(page)) return 0; @@ -753,7 +753,7 @@ int mem_cgroup_newpage_charge(struct page *page, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; if (PageCompound(page)) return 0; @@ -799,7 +799,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup *mem; swp_entry_t ent; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; if (!do_swap_account) @@ -833,7 +833,7 @@ int mem_cgroup_cache_charge_swapin(struct page *page, { int ret = 0; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; if (unlikely(!mm)) mm = &init_mm; @@ -880,7 +880,7 @@ void 
mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) { struct page_cgroup *pc; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; if (!ptr) return; @@ -909,7 +909,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) { - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; if (!mem) return; @@ -930,7 +930,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return NULL; if (PageSwapCache(page)) @@ -1049,7 +1049,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) struct mem_cgroup *mem = NULL; int ret = 0; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; pc = lookup_page_cgroup(page); @@ -1131,7 +1131,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) int progress = 0; int retry = MEM_CGROUP_RECLAIM_RETRIES; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return 0; if (!mm) return 0; @@ -1697,7 +1697,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem) #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) { - if (!mem_cgroup_subsys.disabled && really_do_swap_account) + if (!mem_cgroup_disabled() && really_do_swap_account) do_swap_account = 1; } #else diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 74ae8e01d071..7006a11350c8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -74,7 +74,7 @@ void __init page_cgroup_init(void) int nid, fail; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; for_each_online_node(nid) { @@ -247,7 +247,7 @@ void __init page_cgroup_init(void) unsigned long pfn; int fail = 0; - if (mem_cgroup_subsys.disabled) + if (mem_cgroup_disabled()) return; for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { -- cgit v1.2.3 From 28dbc4b6a01fb579a9441c7b81e3d3413dc452df Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 7 Jan 2009 18:08:05 -0800 Subject: memcg: memory cgroup resource counters for hierarchy Add support for building hierarchies in resource counters. Cgroups allows us to build a deep hierarchy, but we currently don't link the resource counters belonging to the memory controller control groups, in the same fashion as the corresponding cgroup entries in the cgroup hierarchy. This patch provides the infrastructure for resource counters that have the same hiearchy as their cgroup counter parts. These set of patches are based on the resource counter hiearchy patches posted by Pavel Emelianov. NOTE: Building hiearchies is expensive, deeper hierarchies imply charging the all the way up to the root. It is known that hiearchies are expensive, so the user needs to be careful and aware of the trade-offs before creating very deep ones. 
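The heart of the change is a charge that propagates up the parent chain and unwinds on failure. A minimal standalone sketch of that propagate-and-rollback pattern (simplified counter type, no locking; the real res_counter_charge() in the diff below does the same walk under each counter's spinlock with IRQs off):

	#include <stddef.h>

	struct counter {
		unsigned long usage;
		unsigned long limit;
		struct counter *parent;	/* NULL at the root */
	};

	static int charge_hierarchy(struct counter *c, unsigned long val,
				    struct counter **fail_at)
	{
		struct counter *cur, *undo;

		*fail_at = NULL;
		for (cur = c; cur != NULL; cur = cur->parent) {
			if (cur->usage + val > cur->limit) {
				*fail_at = cur;	/* remember which level hit its limit */
				/* roll back the levels already charged */
				for (undo = c; undo != cur; undo = undo->parent)
					undo->usage -= val;
				return -1;
			}
			cur->usage += val;
		}
		return 0;
	}

The uncharge side is the same walk without the failure path, which is why deep hierarchies make every page charge proportionally more expensive, as the NOTE above warns.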
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Balbir Singh Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: David Rientjes Cc: Pavel Emelianov Cc: Dhaval Giani Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 8 ++++++-- kernel/res_counter.c | 44 +++++++++++++++++++++++++++++++++++--------- mm/memcontrol.c | 20 +++++++++++++------- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 271c1c2c9f6f..dede0a2cfc45 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -43,6 +43,10 @@ struct res_counter { * the routines below consider this to be IRQ-safe */ spinlock_t lock; + /* + * Parent counter, used for hierarchial resource accounting + */ + struct res_counter *parent; }; /** @@ -87,7 +91,7 @@ enum { * helpers for accounting */ -void res_counter_init(struct res_counter *counter); +void res_counter_init(struct res_counter *counter, struct res_counter *parent); /* * charge - try to consume more resource. @@ -103,7 +107,7 @@ void res_counter_init(struct res_counter *counter); int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, - unsigned long val); + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8eca772..bf8e7534c803 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -15,10 +15,11 @@ #include #include -void res_counter_init(struct res_counter *counter) +void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->parent = parent; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) return 0; } -int res_counter_charge(struct res_counter *counter, unsigned long val) +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) { int ret; unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); return ret; } @@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; + struct res_counter *c; - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + res_counter_uncharge_locked(c, val); + spin_unlock(&c->lock); + } + local_irq_restore(flags); } diff --git 
a/mm/memcontrol.c b/mm/memcontrol.c index 9846f617115d..e72fb2b4a7d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -471,6 +471,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, { struct mem_cgroup *mem; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct res_counter *fail_res; /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -499,11 +500,12 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, int ret; bool noswap = false; - ret = res_counter_charge(&mem->res, PAGE_SIZE); + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) break; - ret = res_counter_charge(&mem->memsw, PAGE_SIZE); + ret = res_counter_charge(&mem->memsw, PAGE_SIZE, + &fail_res); if (likely(!ret)) break; /* mem+swap counter fails */ @@ -1709,22 +1711,26 @@ static void __init enable_swap_cgroup(void) static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem; + struct mem_cgroup *mem, *parent; int node; mem = mem_cgroup_alloc(); if (!mem) return ERR_PTR(-ENOMEM); - res_counter_init(&mem->res); - res_counter_init(&mem->memsw); - for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; /* root ? */ - if (cont->parent == NULL) + if (cont->parent == NULL) { enable_swap_cgroup(); + parent = NULL; + } else + parent = mem_cgroup_from_cont(cont->parent); + + res_counter_init(&mem->res, parent ? &parent->res : NULL); + res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL); + return &mem->css; free_out: -- cgit v1.2.3 From 2e4d40915fb85207fe48cfc31201824ec6d7426e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:08:07 -0800 Subject: memcontrol: rcu_read_lock() to protect mm_match_cgroup() mm_match_cgroup() calls cgroup_subsys_state(). We must use rcu_read_lock() to protect cgroup_subsys_state(). Signed-off-by: Lai Jiangshan Cc: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 472efd09118c..2de6504e01fb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -61,8 +61,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -#define mm_match_cgroup(mm, cgroup) \ - ((cgroup) == mem_cgroup_from_task((mm)->owner)) +static inline +int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) +{ + struct mem_cgroup *mem; + rcu_read_lock(); + mem = mem_cgroup_from_task((mm)->owner); + rcu_read_unlock(); + return cgroup == mem; +} extern int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); -- cgit v1.2.3 From a636b327f731143ccc544b966cfd8de6cb6d72c6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:08 -0800 Subject: memcg: avoid unnecessary system-wide-oom-killer Current mmtom has new oom function as pagefault_out_of_memory(). It's added for select bad process rathar than killing current. When memcg hit limit and calls OOM at page_fault, this handler called and system-wide-oom handling happens. (means kernel panics if panic_on_oom is true....) 
To avoid overkill, check memcg's recent behavior before starting system-wide-oom. And this patch also fixes to guarantee "don't accnout against process with TIF_MEMDIE". This is necessary for smooth OOM. [akpm@linux-foundation.org: build fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Badari Pulavarty Cc: Jan Blunck Cc: Hirokazu Takahashi Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 33 +++++++++++++++++++++++++++++---- mm/oom_kill.c | 8 ++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2de6504e01fb..2fdd1380bf0a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,8 @@ static inline bool mem_cgroup_disabled(void) return false; } +extern bool mem_cgroup_oom_called(struct task_struct *task); + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -234,6 +236,11 @@ static inline bool mem_cgroup_disabled(void) { return true; } + +static inline bool mem_cgroup_oom_called(struct task_struct *task) +{ + return false; +} #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 886e2224c5fd..659b0c58f13e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -153,7 +153,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - + unsigned long last_oom_jiffies; int obsolete; atomic_t refcnt; /* @@ -615,6 +615,22 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; } +bool mem_cgroup_oom_called(struct task_struct *task) +{ + bool ret = false; + struct mem_cgroup *mem; + struct mm_struct *mm; + + rcu_read_lock(); + mm = task->mm; + if (!mm) + mm = &init_mm; + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) + ret = true; + rcu_read_unlock(); + return ret; +} /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. @@ -626,6 +642,13 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct res_counter *fail_res; + + if (unlikely(test_thread_flag(TIF_MEMDIE))) { + /* Don't account this! */ + *memcg = NULL; + return 0; + } + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -694,8 +717,10 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, continue; if (!nr_retries--) { - if (oom) + if (oom) { mem_cgroup_out_of_memory(mem, gfp_mask); + mem->last_oom_jiffies = jiffies; + } goto nomem; } } @@ -832,7 +857,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); - if (ret) + if (ret || !parent) return ret; if (!get_page_unless_zero(page)) @@ -883,7 +908,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, mem = memcg; ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); - if (ret) + if (ret || !mem) return ret; __mem_cgroup_commit_charge(mem, pc, ctype); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6b9e758c98a5..fd150e3a2567 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -560,6 +560,13 @@ void pagefault_out_of_memory(void) /* Got some memory back in the last second. 
*/ return; + /* + * If this is from memcg, oom-killer is already invoked. + * and not worth to go system-wide-oom. + */ + if (mem_cgroup_oom_called(current)) + goto rest_and_return; + if (sysctl_panic_on_oom) panic("out of memory from page fault. panic_on_oom is selected.\n"); @@ -571,6 +578,7 @@ void pagefault_out_of_memory(void) * Give "p" a good chance of killing itself before we * retry to allocate memory. */ +rest_and_return: if (!test_thread_flag(TIF_MEMDIE)) schedule_timeout_uninterruptible(1); } -- cgit v1.2.3 From 2c26fdd70c3094fa3e84caf9ef434911933d5477 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:10 -0800 Subject: memcg: revert gfp mask fix My patch, memcg-fix-gfp_mask-of-callers-of-charge.patch changed gfp_mask of callers of charge to be GFP_HIGHUSER_MOVABLE for showing what will happen at memory reclaim. But in recent discussion, it's NACKed because it sounds ugly. This patch is for reverting it and add some clean up to gfp_mask of callers of charge. No behavior change but need review before generating HUNK in deep queue. This patch also adds explanation to meaning of gfp_mask passed to charge functions in memcontrol.h. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/filemap.c | 2 +- mm/memcontrol.c | 10 +++++----- mm/memory.c | 10 ++++------ mm/shmem.c | 8 ++++---- mm/swapfile.c | 3 +-- 6 files changed, 25 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2fdd1380bf0a..59ac95a64508 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -26,6 +26,16 @@ struct page; struct mm_struct; #ifdef CONFIG_CGROUP_MEM_RES_CTLR +/* + * All "charge" functions with gfp_mask should use GFP_KERNEL or + * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't + * alloc memory but reclaims memory from all available zones. So, "where I want + * memory from" bits of gfp_mask has no meaning. So any bits of that field is + * available but adding a rule is better. charge functions' gfp_mask should + * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous + * codes. + * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 
+ */ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index 2f55a1e2baf7..ceba0bd03662 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, VM_BUG_ON(!PageLocked(page)); error = mem_cgroup_cache_charge(page, current->mm, - gfp_mask & ~__GFP_HIGHMEM); + gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9bf5d7c8ede7..b9cd57b667d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1248,7 +1248,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); + ret = mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem); css_put(&mem->css); } *ptr = mem; @@ -1378,7 +1378,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; progress = try_to_free_mem_cgroup_pages(memcg, - GFP_HIGHUSER_MOVABLE, false); + GFP_KERNEL, false); if (!progress) retry_count--; } return ret; @@ -1418,7 +1418,7 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, break; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true); + try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); if (curusage >= oldusage) retry_count--; @@ -1464,7 +1464,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); + ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -1550,7 +1550,7 @@ try_to_free: goto out; } progress = try_to_free_mem_cgroup_pages(mem, - GFP_HIGHUSER_MOVABLE, false); + GFP_KERNEL, false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ diff --git a/mm/memory.c b/mm/memory.c index 1358012ffa73..e5bfbe6b594c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2000,7 +2000,7 @@ gotten: cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_HIGHUSER_MOVABLE)) + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* @@ -2431,8 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge_swapin(mm, page, - GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2524,7 +2523,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_HIGHUSER_MOVABLE)) + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2615,8 +2614,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - if (mem_cgroup_newpage_charge(page, - mm, GFP_HIGHUSER_MOVABLE)) { + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { ret = VM_FAULT_OOM; page_cache_release(page); goto out; diff --git a/mm/shmem.c b/mm/shmem.c index adf5c3eedbc9..bbb7b043c986 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -932,8 +932,8 @@ found: * Charge page using GFP_HIGHUSER_MOVABLE while we can 
wait. * charged back to the user(not to caller) when swap account is used. */ - error = mem_cgroup_cache_charge_swapin(page, - current->mm, GFP_HIGHUSER_MOVABLE, true); + error = mem_cgroup_cache_charge_swapin(page, current->mm, GFP_KERNEL, + true); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); @@ -1275,7 +1275,7 @@ repeat: * charge against this swap cache here. */ if (mem_cgroup_cache_charge_swapin(swappage, - current->mm, gfp, false)) { + current->mm, gfp & GFP_RECLAIM_MASK, false)) { page_cache_release(swappage); error = -ENOMEM; goto failed; @@ -1393,7 +1393,7 @@ repeat: /* Precharge page while we can wait, compensate after */ error = mem_cgroup_cache_charge(filepage, current->mm, - GFP_HIGHUSER_MOVABLE); + GFP_KERNEL); if (error) { page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); diff --git a/mm/swapfile.c b/mm/swapfile.c index 0579d9069b61..da422c47e2ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -698,8 +698,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, - GFP_HIGHUSER_MOVABLE, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) ret = -ENOMEM; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); -- cgit v1.2.3 From f89eb90e33fd4e4e0cc1a6d20afd63c5a561885a Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:14 -0800 Subject: inactive_anon_is_low: move to vmscan inactive_anon_is_low() is called only from vmscan, so it can move to vmscan.c. This patch doesn't have any functional change. Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 19 ------------------- mm/vmscan.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 37ef13d0f01e..7fbb97267556 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -81,23 +81,4 @@ static inline enum lru_list page_lru(struct page *page) return lru; } -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * - * Returns true if the zone does not have enough inactive anon pages, - * meaning some active anon pages need to be deactivated. - */ -static inline int inactive_anon_is_low(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_ANON); - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; -} #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 45983af1de3d..f75d924cb4f4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1291,6 +1291,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, pagevec_release(&pvec); } +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @zone: zone to check + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated.
+ */ +static int inactive_anon_is_low(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { -- cgit v1.2.3 From 6e9015716ae9b59e9635d692fddfcfb9582c146c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:15 -0800 Subject: mm: introduce zone_reclaim struct Add a zone_reclaim_stat struct for later enhancement. A later patch uses this. This patch doesn't make any behavior change (yet). Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 24 ++++++++++++++---------- mm/page_alloc.c | 8 ++++---- mm/swap.c | 12 ++++++++---- mm/vmscan.c | 47 ++++++++++++++++++++++++++++++----------------- 4 files changed, 56 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 35a7b5e19465..09c14e213b63 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -263,6 +263,19 @@ enum zone_type { #error ZONES_SHIFT -- too many zones configured adjust calculation #endif +struct zone_reclaim_stat { + /* + * The pageout code in vmscan.c keeps track of how many of the + * mem/swap backed and file backed pages are referenced. + * The higher the rotated/scanned ratio, the more valuable + * that cache is. + * + * The anon LRU stats live in [0], file LRU stats in [1] + */ + unsigned long recent_rotated[2]; + unsigned long recent_scanned[2]; +}; + struct zone { /* Fields commonly accessed by the page allocator */ unsigned long pages_min, pages_low, pages_high; @@ -315,16 +328,7 @@ struct zone { unsigned long nr_scan; } lru[NR_LRU_LISTS]; - /* - * The pageout code in vmscan.c keeps track of how many of the - * mem/swap backed and file backed pages are refeferenced. - * The higher the rotated/scanned ratio, the more valuable - * that cache is.
- * - * The anon LRU stats live in [0], file LRU stats in [1] - */ - unsigned long recent_rotated[2]; - unsigned long recent_scanned[2]; + struct zone_reclaim_stat reclaim_stat; unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7bf22e045318..5675b3073854 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3523,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, INIT_LIST_HEAD(&zone->lru[l].list); zone->lru[l].nr_scan = 0; } - zone->recent_rotated[0] = 0; - zone->recent_rotated[1] = 0; - zone->recent_scanned[0] = 0; - zone->recent_scanned[1] = 0; + zone->reclaim_stat.recent_rotated[0] = 0; + zone->reclaim_stat.recent_rotated[1] = 0; + zone->reclaim_stat.recent_scanned[0] = 0; + zone->reclaim_stat.recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) diff --git a/mm/swap.c b/mm/swap.c index 8a98a9c90704..26b07e7bc3d3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -157,6 +157,7 @@ void rotate_reclaimable_page(struct page *page) void activate_page(struct page *page) { struct zone *zone = page_zone(page); + struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -169,8 +170,8 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - zone->recent_rotated[!!file]++; - zone->recent_scanned[!!file]++; + reclaim_stat->recent_rotated[!!file]++; + reclaim_stat->recent_scanned[!!file]++; } spin_unlock_irq(&zone->lru_lock); } @@ -385,6 +386,8 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; + struct zone_reclaim_stat *reclaim_stat = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); for (i = 0; i < pagevec_count(pvec); i++) { @@ -396,6 +399,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; + reclaim_stat = &zone->reclaim_stat; spin_lock_irq(&zone->lru_lock); } VM_BUG_ON(PageActive(page)); @@ -403,10 +407,10 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) VM_BUG_ON(PageLRU(page)); SetPageLRU(page); file = is_file_lru(lru); - zone->recent_scanned[file]++; + reclaim_stat->recent_scanned[file]++; if (is_active_lru(lru)) { SetPageActive(page); - zone->recent_rotated[file]++; + reclaim_stat->recent_rotated[file]++; } add_page_to_lru_list(zone, page, lru); } diff --git a/mm/vmscan.c b/mm/vmscan.c index f75d924cb4f4..03ca923c6656 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -130,6 +130,12 @@ static DECLARE_RWSEM(shrinker_rwsem); #define scan_global_lru(sc) (1) #endif +static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, + struct scan_control *sc) +{ + return &zone->reclaim_stat; +} + /* * Add a shrinker callback to be called from the vm */ @@ -1029,6 +1035,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct pagevec pvec; unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); pagevec_init(&pvec, 1); @@ -1072,10 +1079,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; - zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - 
zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + reclaim_stat->recent_scanned[0] += + count[LRU_INACTIVE_ANON]; + reclaim_stat->recent_scanned[0] += + count[LRU_ACTIVE_ANON]; + reclaim_stat->recent_scanned[1] += + count[LRU_INACTIVE_FILE]; + reclaim_stat->recent_scanned[1] += + count[LRU_ACTIVE_FILE]; } spin_unlock_irq(&zone->lru_lock); @@ -1136,7 +1147,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, add_page_to_lru_list(zone, page, lru); if (PageActive(page) && scan_global_lru(sc)) { int file = !!page_is_file_cache(page); - zone->recent_rotated[file]++; + reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1196,6 +1207,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct page *page; struct pagevec pvec; enum lru_list lru; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); spin_lock_irq(&zone->lru_lock); @@ -1208,7 +1220,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; - zone->recent_scanned[!!file] += pgmoved; + reclaim_stat->recent_scanned[!!file] += pgmoved; } if (file) @@ -1251,7 +1263,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * pages in get_scan_ratio. */ if (scan_global_lru(sc)) - zone->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[!!file] += pgmoved; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1344,6 +1356,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); /* If we have no swap space, do not bother scanning anon pages. */ if (nr_swap_pages <= 0) { @@ -1376,17 +1389,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - if (unlikely(zone->recent_scanned[0] > anon / 4)) { + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[0] /= 2; - zone->recent_rotated[0] /= 2; + reclaim_stat->recent_scanned[0] /= 2; + reclaim_stat->recent_rotated[0] /= 2; spin_unlock_irq(&zone->lru_lock); } - if (unlikely(zone->recent_scanned[1] > file / 4)) { + if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[1] /= 2; - zone->recent_rotated[1] /= 2; + reclaim_stat->recent_scanned[1] /= 2; + reclaim_stat->recent_rotated[1] /= 2; spin_unlock_irq(&zone->lru_lock); } @@ -1402,11 +1415,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. 
*/ - ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); - ap /= zone->recent_rotated[0] + 1; + ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); - fp /= zone->recent_rotated[1] + 1; + fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; /* Normalize to percentages */ percent[0] = 100 * ap / (ap + fp + 1); -- cgit v1.2.3 From 14797e2363c2b2f1ce139fd1c5a215e4e05aa1d9 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:18 -0800 Subject: memcg: add inactive_anon_is_low() inactive_anon_is_low() is a key component of active/inactive anon balancing on reclaim. However, the current inactive_anon_is_low() considers only global reclaim. Therefore, we need the following ugly scan_global_lru() condition: if (lru == LRU_ACTIVE_ANON && (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; It causes memcg reclaim to always deactivate pages when shrink_list() is called. Add mem_cgroup_inactive_anon_is_low() to improve active/inactive anon balancing of memcg. Acked-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Cc: Cyrill Gorcunov Cc: "Pekka Enberg" Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++++++ mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 37 +++++++++++++++++++++++-------------- 3 files changed, 77 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 59ac95a64508..aad9377c9828 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -100,6 +100,8 @@ extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, int priority, enum lru_list lru); +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, + struct zone *zone); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -251,6 +253,13 @@ static inline bool mem_cgroup_oom_called(struct task_struct *task) { return false; } + +static inline int +mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +{ + return 1; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 457d671029b8..6611328460e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -156,6 +156,9 @@ struct mem_cgroup { unsigned long last_oom_jiffies; int obsolete; atomic_t refcnt; + + unsigned int inactive_ratio; + /* * statistics. This must be placed at the end of memcg.
*/ @@ -431,6 +434,20 @@ long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, return (nr_pages >> priority); } +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); + active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); + + if (inactive * memcg->inactive_ratio < active) + return 1; + + return 0; +} + unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -1360,6 +1377,29 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } +/* + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * this calculation is straightforward porting from + * page_alloc.c::setup_per_zone_inactive_ratio(). + * it describe more detail. + */ +static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg) +{ + unsigned int gb, ratio; + + gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30; + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; + + memcg->inactive_ratio = ratio; + +} + static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, @@ -1398,6 +1438,10 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, GFP_KERNEL, false); if (!progress) retry_count--; } + + if (!ret) + mem_cgroup_set_inactive_ratio(memcg); + return ret; } @@ -1982,7 +2026,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); } - + mem_cgroup_set_inactive_ratio(mem); mem->last_scanned_child = NULL; return &mem->css; diff --git a/mm/vmscan.c b/mm/vmscan.c index e2b31a522a66..b2bc06bffcfb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1310,14 +1310,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, pagevec_release(&pvec); } -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * - * Returns true if the zone does not have enough inactive anon pages, - * meaning some active anon pages need to be deactivated. - */ -static int inactive_anon_is_low(struct zone *zone) +static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1330,6 +1323,25 @@ static int inactive_anon_is_low(struct zone *zone) return 0; } +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. 
+ */ +static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scan_global_lru(sc)) + low = inactive_anon_is_low_global(zone); + else + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { @@ -1340,8 +1352,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return 0; } - if (lru == LRU_ACTIVE_ANON && - (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } @@ -1509,9 +1520,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); - else if (!scan_global_lru(sc)) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1807,7 +1816,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone)) + if (inactive_anon_is_low(zone, &sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); -- cgit v1.2.3 From a3d8e0549d913e30968fa02e505dfe02c0a23e0d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:19 -0800 Subject: memcg: add mem_cgroup_zone_nr_pages() Introduce mem_cgroup_zone_nr_pages(). It is called by zone_nr_pages() helper function. This patch doesn't have any behavior change. 
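For illustration, the resulting dispatch in vmscan.c is tiny; the sketch below merely restates the zone_nr_pages() hunk that follows (scan_global_lru() distinguishes global reclaim from memcg limit reclaim):

static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
				   enum lru_list lru)
{
	/* memcg limit reclaim: ask the memory controller for its per-zone count */
	if (!scan_global_lru(sc))
		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);

	/* global reclaim: use the zone's vmstat counters */
	return zone_page_state(zone, NR_LRU_BASE + lru);
}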
Acked-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Acked-by: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++ mm/memcontrol.c | 12 +++++++++++- mm/vmscan.c | 3 +++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index aad9377c9828..b1defd6a2783 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,9 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, int priority, enum lru_list lru); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone); +unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, + struct zone *zone, + enum lru_list lru); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -260,6 +263,14 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) return 1; } +static inline unsigned long +mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, + enum lru_list lru) +{ + return 0; +} + + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6611328460e9..313247e6c503 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -186,7 +186,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = { 0, /* FORCE */ }; - /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) @@ -448,6 +447,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) return 0; } +unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, + struct zone *zone, + enum lru_list lru) +{ + int nid = zone->zone_pgdat->node_id; + int zid = zone_idx(zone); + struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); + + return MEM_CGROUP_ZSTAT(mz, lru); +} + unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, diff --git a/mm/vmscan.c b/mm/vmscan.c index b2bc06bffcfb..d958d624d3ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,6 +139,9 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { + if (!scan_global_lru(sc)) + return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); + return zone_page_state(zone, NR_LRU_BASE + lru); } -- cgit v1.2.3 From 3e2f41f1f64744f7942980d93cc93dd3e5924560 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:20 -0800 Subject: memcg: add zone_reclaim_stat Introduce the mem_cgroup_per_zone::reclaim_stat member and its statistics collecting function. Now, get_scan_ratio() can calculate correct values for memcg reclaim.
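As a rough worked example of what this enables (all numbers illustrative, and assuming vmscan's usual anon_prio = swappiness, file_prio = 200 - swappiness): with swappiness 60, a memcg whose stats show anon pages being re-referenced heavily (recent_scanned[0] = 1000, recent_rotated[0] = 900) but file pages barely (recent_scanned[1] = 1000, recent_rotated[1] = 100) gets, per the get_scan_ratio() formulas above:

	ap = (60 + 1) * (1000 + 1) / (900 + 1);		/* ~67   */
	fp = (140 + 1) * (1000 + 1) / (100 + 1);	/* ~1397 */
	/* percent[0] = 100 * 67   / (67 + 1397 + 1) -> ~4%  anon */
	/* percent[1] = 100 * 1397 / (67 + 1397 + 1) -> ~95% file */

so reclaim pressure shifts almost entirely to that cgroup's file LRUs instead of using the global zone's history.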
[hugh@veritas.com: avoid reclaim_stat oops when disabled] Acked-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++++++++++ mm/memcontrol.c | 29 +++++++++++++++++++++++++++++ mm/swap.c | 34 +++++++++++++++++++++++++--------- mm/vmscan.c | 27 +++++++++++++-------------- 4 files changed, 83 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b1defd6a2783..36b8ebb39b82 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,6 +105,10 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); +struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, + struct zone *zone); +struct zone_reclaim_stat* +mem_cgroup_get_reclaim_stat_from_page(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -271,6 +275,18 @@ mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, } +static inline struct zone_reclaim_stat* +mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) +{ + return NULL; +} + +static inline struct zone_reclaim_stat* +mem_cgroup_get_reclaim_stat_from_page(struct page *page) +{ + return NULL; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 313247e6c503..7b7f4dc05035 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,8 @@ struct mem_cgroup_per_zone { */ struct list_head lists[NR_LRU_LISTS]; unsigned long count[NR_LRU_LISTS]; + + struct zone_reclaim_stat reclaim_stat; }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -458,6 +460,33 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, return MEM_CGROUP_ZSTAT(mz, lru); } +struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, + struct zone *zone) +{ + int nid = zone->zone_pgdat->node_id; + int zid = zone_idx(zone); + struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); + + return &mz->reclaim_stat; +} + +struct zone_reclaim_stat * +mem_cgroup_get_reclaim_stat_from_page(struct page *page) +{ + struct page_cgroup *pc; + struct mem_cgroup_per_zone *mz; + + if (mem_cgroup_disabled()) + return NULL; + + pc = lookup_page_cgroup(page); + mz = page_cgroup_zoneinfo(pc); + if (!mz) + return NULL; + + return &mz->reclaim_stat; +} + unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, diff --git a/mm/swap.c b/mm/swap.c index 26b07e7bc3d3..8adb9feb61e1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -151,13 +151,32 @@ void rotate_reclaimable_page(struct page *page) } } +static void update_page_reclaim_stat(struct zone *zone, struct page *page, + int file, int rotated) +{ + struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; + struct zone_reclaim_stat *memcg_reclaim_stat; + + memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); + + reclaim_stat->recent_scanned[file]++; + if (rotated) + reclaim_stat->recent_rotated[file]++; + + if (!memcg_reclaim_stat) + return; + + memcg_reclaim_stat->recent_scanned[file]++; + if (rotated) + 
memcg_reclaim_stat->recent_rotated[file]++; +} + /* * FIXME: speed this up? */ void activate_page(struct page *page) { struct zone *zone = page_zone(page); - struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -170,8 +189,7 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - reclaim_stat->recent_rotated[!!file]++; - reclaim_stat->recent_scanned[!!file]++; + update_page_reclaim_stat(zone, page, !!file, 1); } spin_unlock_irq(&zone->lru_lock); } @@ -386,7 +404,6 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; - struct zone_reclaim_stat *reclaim_stat = NULL; VM_BUG_ON(is_unevictable_lru(lru)); @@ -394,24 +411,23 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) struct page *page = pvec->pages[i]; struct zone *pagezone = page_zone(page); int file; + int active; if (pagezone != zone) { if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; - reclaim_stat = &zone->reclaim_stat; spin_lock_irq(&zone->lru_lock); } VM_BUG_ON(PageActive(page)); VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); + active = is_active_lru(lru); file = is_file_lru(lru); - reclaim_stat->recent_scanned[file]++; - if (is_active_lru(lru)) { + if (active) SetPageActive(page); - reclaim_stat->recent_rotated[file]++; - } + update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); } if (zone) diff --git a/mm/vmscan.c b/mm/vmscan.c index d958d624d3ae..56fc7abe4d23 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -133,6 +133,9 @@ static DECLARE_RWSEM(shrinker_rwsem); static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, struct scan_control *sc) { + if (!scan_global_lru(sc)) + return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + return &zone->reclaim_stat; } @@ -1087,17 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scan_global_lru(sc)) { + if (scan_global_lru(sc)) zone->pages_scanned += nr_scan; - reclaim_stat->recent_scanned[0] += - count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += - count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += - count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += - count[LRU_ACTIVE_FILE]; - } + + reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -1155,7 +1155,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - if (PageActive(page) && scan_global_lru(sc)) { + if (PageActive(page)) { int file = !!page_is_file_cache(page); reclaim_stat->recent_rotated[file]++; } @@ -1230,8 +1230,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; - reclaim_stat->recent_scanned[!!file] += pgmoved; } + reclaim_stat->recent_scanned[!!file] += pgmoved; if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); @@ -1272,8 +1272,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * This 
helps balance scan pressure between file and anonymous * pages in get_scan_ratio. */ - if (scan_global_lru(sc)) - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[!!file] += pgmoved; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); -- cgit v1.2.3 From 9439c1c95b5c25b8031b2a7eb7e1590eb84be7f5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:21 -0800 Subject: memcg: remove mem_cgroup_calc_reclaim() Now that get_scan_ratio() returns correct values for memcg reclaim, mem_cgroup_calc_reclaim() can be removed. memcg reclaim now gets the same anon/file reclaim balancing capability as global reclaim. Acked-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ---------- mm/memcontrol.c | 21 --------------------- mm/vmscan.c | 27 ++++++++++----------------- 3 files changed, 10 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 36b8ebb39b82..8752052da8df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,9 +97,6 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority); extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); - -extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, - int priority, enum lru_list lru); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, @@ -244,13 +241,6 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, { } -static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, - struct zone *zone, int priority, - enum lru_list lru) -{ - return 0; -} - static inline bool mem_cgroup_disabled(void) { return true; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b7f4dc05035..b8c1e5acc25a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -414,27 +414,6 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) mem->prev_priority = priority; } -/* - * Calculate # of pages to be scanned in this priority/zone. - * See also vmscan.c - * - * priority starts from "DEF_PRIORITY" and decremented in each loop.
- * (see include/linux/mmzone.h) - */ - long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, - int priority, enum lru_list lru) -{ - long nr_pages; - int nid = zone->zone_pgdat->node_id; - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - - nr_pages = MEM_CGROUP_ZSTAT(mz, lru); - - return (nr_pages >> priority); -} - int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { unsigned long active; diff --git a/mm/vmscan.c b/mm/vmscan.c index 56fc7abe4d23..66bb6ef44b5f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1466,30 +1466,23 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { - if (scan_global_lru(sc)) { - int file = is_file_lru(l); - int scan; + int file = is_file_lru(l); + int scan; - scan = zone_page_state(zone, NR_LRU_BASE + l); - if (priority) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + if (scan_global_lru(sc)) { zone->lru[l].nr_scan += scan; nr[l] = zone->lru[l].nr_scan; if (nr[l] >= swap_cluster_max) zone->lru[l].nr_scan = 0; else nr[l] = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage - * but because memory controller hits its limit. - * Don't modify zone reclaim related data. - */ - nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, - priority, l); - } + } else + nr[l] = scan; } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || -- cgit v1.2.3 From a7885eb8ad465ec9db99ac5b5e6680f0ca8e11c8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:24 -0800 Subject: memcg: swappiness Currently, /proc/sys/vm/swappiness can change the swappiness ratio for global reclaim. However, memcg reclaim doesn't have a tuning parameter of its own. In general, the optimal swappiness depends on the workload (e.g. HPC workloads need lower swappiness than others). Per-cgroup swappiness therefore improves administrator tunability. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/controllers/memory.txt | 9 +++++ include/linux/swap.h | 3 +- mm/memcontrol.c | 78 ++++++++++++++++++++++++++++++++---- mm/vmscan.c | 7 ++-- 4 files changed, 86 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index d71745cc2f00..e1501964df1e 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -314,6 +314,15 @@ will be charged as a new owner of it. showing for better debug please see the code for meanings. +5.3 swappiness + Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. + + The following cgroups' swappiness can't be changed. + - root cgroup (uses /proc/sys/vm/swappiness). + - a cgroup which uses hierarchy and has child cgroups. + - a cgroup which uses hierarchy and is not the root of the hierarchy. + + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting.
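The lookup helper this patch adds is essentially the sketch below (condensed from the mm/memcontrol.c hunk further down); the root cgroup keeps following the global knob:

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* the root cgroup falls back to /proc/sys/vm/swappiness */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}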
diff --git a/include/linux/swap.h b/include/linux/swap.h index be938ce4895a..4ccca25d0f05 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -214,7 +214,8 @@ static inline void lru_cache_add_active_file(struct page *page) extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap); + gfp_t gfp_mask, bool noswap, + unsigned int swappiness); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 027c0dd7a83e..ab2ecbb95b8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -164,6 +164,9 @@ struct mem_cgroup { int obsolete; atomic_t refcnt; + unsigned int swappiness; + + unsigned int inactive_ratio; /* @@ -636,6 +639,22 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +static unsigned int get_swappiness(struct mem_cgroup *memcg) +{ + struct cgroup *cgrp = memcg->css.cgroup; + unsigned int swappiness; + + /* root ? */ + if (cgrp->parent == NULL) + return vm_swappiness; + + spin_lock(&memcg->reclaim_param_lock); + swappiness = memcg->swappiness; + spin_unlock(&memcg->reclaim_param_lock); + + return swappiness; +} + /* * Dance down the hierarchy if needed to reclaim memory. We remember the * last child we reclaimed from, so that we don't end up penalizing @@ -656,7 +675,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * but there might be left over accounting, even after children * have left. */ - ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap); + ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, + get_swappiness(root_mem)); if (mem_cgroup_check_under_limit(root_mem)) return 0; if (!root_mem->use_hierarchy) @@ -672,7 +692,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, cgroup_unlock(); continue; } - ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap); + ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, + get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) return 0; cgroup_lock(); @@ -1400,7 +1421,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) rcu_read_unlock(); do { - progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true); + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true, + get_swappiness(mem)); progress += mem_cgroup_check_under_limit(mem); } while (!progress && --retry); @@ -1468,7 +1490,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; progress = try_to_free_mem_cgroup_pages(memcg, - GFP_KERNEL, false); + GFP_KERNEL, + false, + get_swappiness(memcg)); if (!progress) retry_count--; } @@ -1512,7 +1536,8 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, break; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true); + try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true, + get_swappiness(memcg)); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); if (curusage >= oldusage) retry_count--; @@ -1643,8 +1668,8 @@ try_to_free: ret = -EINTR; goto out; } - progress = try_to_free_mem_cgroup_pages(mem, - GFP_KERNEL, false); + progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, + false, get_swappiness(mem)); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -1864,6 
+1889,37 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, return 0; } +static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + + return get_swappiness(memcg); +} + +static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, + u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent; + if (val > 100) + return -EINVAL; + + if (cgrp->parent == NULL) + return -EINVAL; + + parent = mem_cgroup_from_cont(cgrp->parent); + /* If under hierarchy, only empty-root can set this value */ + if ((parent->use_hierarchy) || + (memcg->use_hierarchy && !list_empty(&cgrp->children))) + return -EINVAL; + + spin_lock(&memcg->reclaim_param_lock); + memcg->swappiness = val; + spin_unlock(&memcg->reclaim_param_lock); + + return 0; +} + static struct cftype mem_cgroup_files[] = { { @@ -1902,6 +1958,11 @@ static struct cftype mem_cgroup_files[] = { .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "swappiness", + .read_u64 = mem_cgroup_swappiness_read, + .write_u64 = mem_cgroup_swappiness_write, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -2093,6 +2154,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) mem->last_scanned_child = NULL; spin_lock_init(&mem->reclaim_param_lock); + if (parent) + mem->swappiness = get_swappiness(parent); + return &mem->css; free_out: for_each_node_state(node, N_POSSIBLE) diff --git a/mm/vmscan.c b/mm/vmscan.c index f03c239440ad..ece2f405187f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1707,14 +1707,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask, - bool noswap) + gfp_t gfp_mask, + bool noswap, + unsigned int swappiness) { struct scan_control sc = { .may_writepage = !laptop_mode, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, -- cgit v1.2.3 From c772be939e078afd2505ede7d596a30f8f61de95 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 7 Jan 2009 18:08:25 -0800 Subject: memcg: fix calculation of active_ratio Currently, memcg's inactive_ratio is calculated when the limit is set, because page_alloc.c does so and the current implementation is a straightforward port. However, memcg recently introduced the hierarchy feature. Under hierarchy restrictions, the memory limit is decided not only by the current cgroup's memory.limit_in_bytes, but also by parent limits and sibling memory usage. The optimal inactive_ratio therefore changes frequently, so calculating it every time is better.
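For a sense of the numbers involved (illustrative only), the recalculated ratio follows the int_sqrt(10 * gb) curve used in the hunk below, where gb is the cgroup's total anon LRU size in GiB (the ratio is pinned to 1 below 1 GiB):

/*
 *  anon size    inactive_ratio        healthy inactive share
 *     1 GiB     int_sqrt(10)  = 3     >= ~25% of anon pages
 *     4 GiB     int_sqrt(40)  = 6     >= ~14%
 *    16 GiB     int_sqrt(160) = 12    >= ~8%
 *    64 GiB     int_sqrt(640) = 25    >= ~4%
 *
 * inactive_anon_is_low fires whenever inactive * ratio < active,
 * i.e. a healthy list keeps at least active / ratio pages inactive.
 */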
Tested-by: KAMEZAWA Hiroyuki Acked-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++-- mm/memcontrol.c | 64 ++++++++++++++++++++++------------------------ mm/vmscan.c | 2 +- 3 files changed, 34 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8752052da8df..056cf82c0e86 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,8 +97,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority); extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, - struct zone *zone); +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -252,7 +251,7 @@ static inline bool mem_cgroup_oom_called(struct task_struct *task) } static inline int -mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) { return 1; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab2ecbb95b8d..c7d78eca8363 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -166,9 +166,6 @@ struct mem_cgroup { unsigned int swappiness; - - unsigned int inactive_ratio; - /* * statistics. This must be placed at the end of memcg. */ @@ -432,15 +429,43 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) spin_unlock(&mem->reclaim_param_lock); } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; unsigned long inactive; + unsigned long gb; + unsigned long inactive_ratio; inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); - if (inactive * memcg->inactive_ratio < active) + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + + if (present_pages) { + present_pages[0] = inactive; + present_pages[1] = active; + } + + return inactive_ratio; +} + +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + unsigned long present_pages[2]; + unsigned long inactive_ratio; + + inactive_ratio = calc_inactive_ratio(memcg, present_pages); + + inactive = present_pages[0]; + active = present_pages[1]; + + if (inactive * inactive_ratio < active) return 1; return 0; @@ -1432,29 +1457,6 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } -/* - * The inactive anon list should be small enough that the VM never has to - * do too much work, but large enough that each inactive page has a chance - * to be referenced again before it is swapped out. - * - * this calculation is straightforward porting from - * page_alloc.c::setup_per_zone_inactive_ratio(). - * it describe more detail. 
- */ -static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg) -{ - unsigned int gb, ratio; - - gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30; - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; - - memcg->inactive_ratio = ratio; - -} - static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, @@ -1496,9 +1498,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!progress) retry_count--; } - if (!ret) - mem_cgroup_set_inactive_ratio(memcg); - return ret; } @@ -1858,7 +1857,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } #ifdef CONFIG_DEBUG_VM - cb->fill(cb, "inactive_ratio", mem_cont->inactive_ratio); + cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); { int nid, zid; @@ -2150,7 +2149,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); } - mem_cgroup_set_inactive_ratio(mem); mem->last_scanned_child = NULL; spin_lock_init(&mem->reclaim_param_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index ece2f405187f..9a27c44aa327 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1340,7 +1340,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } -- cgit v1.2.3 From a5e924f5f8abf97944e625d74967cc9452cfbce8 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 7 Jan 2009 18:08:28 -0800 Subject: memcg: remove mem_cgroup_try_charge After previous patch, mem_cgroup_try_charge is not used by anyone, so we can remove it. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 -------- mm/memcontrol.c | 21 +-------------------- 2 files changed, 1 insertion(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 056cf82c0e86..8ae6ece8c962 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -40,8 +40,6 @@ struct mm_struct; extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ -extern int mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **ptr); extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t mask, struct mem_cgroup **ptr); extern void mem_cgroup_commit_charge_swapin(struct page *page, @@ -134,12 +132,6 @@ static inline int mem_cgroup_cache_charge(struct page *page, return 0; } -static inline int mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **ptr) -{ - return 0; -} - static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ba72e6556cc..435f08dac8bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -839,27 +839,8 @@ nomem: return -ENOMEM; } -/** - * mem_cgroup_try_charge - get charge of PAGE_SIZE. - * @mm: an mm_struct which is charged against. (when *memcg is NULL) - * @gfp_mask: gfp_mask for reclaim. - * @memcg: a pointer to memory cgroup which is charged against. - * - * charge against memory cgroup pointed by *memcg. 
if *memcg == NULL, estimated - * memory cgroup from @mm is got and stored in *memcg. - * - * Returns 0 if success. -ENOMEM at failure. - * This call can invoke OOM-Killer. - */ - -int mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t mask, struct mem_cgroup **memcg) -{ - return __mem_cgroup_try_charge(mm, mask, memcg, true); -} - /* - * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be + * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. */ -- cgit v1.2.3 From b5a84319a4343a0db753436fd8147e61eaafa7ea Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:35 -0800 Subject: memcg: fix shmem's swap accounting Now, you can see following even when swap accounting is enabled. 1. Create Group 01, and 02. 2. allocate a "file" on tmpfs by a task under 01. 3. swap out the "file" (by memory pressure) 4. Read "file" from a task in group 02. 5. the charge of "file" is moved to group 02. This is not ideal behavior. This is because SwapCache which was loaded by read-ahead is not taken into account.. This is a patch to fix shmem's swapcache behavior. - remove mem_cgroup_cache_charge_swapin(). - Add SwapCache handler routine to mem_cgroup_cache_charge(). By this, shmem's file cache is charged at add_to_page_cache() with GFP_NOWAIT. - pass the page of swapcache to shrink_mem_cgroup. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Paul Menage Cc: Li Zefan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +- include/linux/swap.h | 8 --- mm/memcontrol.c | 134 ++++++++++++++++++++------------------------- mm/shmem.c | 30 ++++------ 4 files changed, 76 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8ae6ece8c962..326f45c86530 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -56,7 +56,8 @@ extern void mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask); +extern int mem_cgroup_shrink_usage(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask); extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, @@ -155,7 +156,8 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) +static inline int mem_cgroup_shrink_usage(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/swap.h b/include/linux/swap.h index 4ccca25d0f05..d30215578877 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,16 +335,8 @@ static inline void disable_swap_token(void) } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -extern int mem_cgroup_cache_charge_swapin(struct page *page, - struct mm_struct *mm, gfp_t mask, bool locked); extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent); #else -static inline -int mem_cgroup_cache_charge_swapin(struct page *page, - struct mm_struct *mm, gfp_t mask, bool locked) -{ - return 0; -} static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f50cb7b1efdb..93a792871804 
100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -893,6 +893,23 @@ nomem: return -ENOMEM; } +static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) +{ + struct mem_cgroup *mem; + swp_entry_t ent; + + if (!PageSwapCache(page)) + return NULL; + + ent.val = page_private(page); + mem = lookup_swap_cgroup(ent); + if (!mem) + return NULL; + if (!css_tryget(&mem->css)) + return NULL; + return mem; +} + /* * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. @@ -1084,6 +1101,9 @@ int mem_cgroup_newpage_charge(struct page *page, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + struct mem_cgroup *mem = NULL; + int ret; + if (mem_cgroup_disabled()) return 0; if (PageCompound(page)) @@ -1096,6 +1116,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, * For GFP_NOWAIT case, the page may be pre-charged before calling * add_to_page_cache(). (See shmem.c) check it here and avoid to call * charge twice. (It works but has to pay a bit larger cost.) + * And when the page is SwapCache, it should take swap information + * into account. This is under lock_page() now. */ if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; @@ -1112,15 +1134,40 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (unlikely(!mm)) + if (do_swap_account && PageSwapCache(page)) { + mem = try_get_mem_cgroup_from_swapcache(page); + if (mem) + mm = NULL; + else + mem = NULL; + /* SwapCache may be still linked to LRU now. */ + mem_cgroup_lru_del_before_commit_swapcache(page); + } + + if (unlikely(!mm && !mem)) mm = &init_mm; if (page_is_file_cache(page)) return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); - else - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); + + ret = mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + if (mem) + css_put(&mem->css); + if (PageSwapCache(page)) + mem_cgroup_lru_add_after_commit_swapcache(page); + + if (do_swap_account && !ret && PageSwapCache(page)) { + swp_entry_t ent = {.val = page_private(page)}; + /* avoid double counting */ + mem = swap_cgroup_record(ent, NULL); + if (mem) { + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + mem_cgroup_put(mem); + } + } + return ret; } /* @@ -1134,7 +1181,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, gfp_t mask, struct mem_cgroup **ptr) { struct mem_cgroup *mem; - swp_entry_t ent; int ret; if (mem_cgroup_disabled()) @@ -1142,7 +1188,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!do_swap_account) goto charge_cur_mm; - /* * A racing thread's fault, or swapoff, may have already updated * the pte, and even removed page from swap cache: return success @@ -1150,14 +1195,9 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, */ if (!PageSwapCache(page)) return 0; - - ent.val = page_private(page); - - mem = lookup_swap_cgroup(ent); + mem = try_get_mem_cgroup_from_swapcache(page); if (!mem) goto charge_cur_mm; - if (!css_tryget(&mem->css)) - goto charge_cur_mm; *ptr = mem; ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); /* drop extra refcnt from tryget */ @@ -1169,62 +1209,6 @@ charge_cur_mm: return __mem_cgroup_try_charge(mm, mask, ptr, true); } -#ifdef CONFIG_SWAP - -int mem_cgroup_cache_charge_swapin(struct page *page, - struct mm_struct *mm, gfp_t mask, bool locked) -{ - int ret = 0; - - if 
(mem_cgroup_disabled()) - return 0; - if (unlikely(!mm)) - mm = &init_mm; - if (!locked) - lock_page(page); - /* - * If not locked, the page can be dropped from SwapCache until - * we reach here. - */ - if (PageSwapCache(page)) { - struct mem_cgroup *mem = NULL; - swp_entry_t ent; - - ent.val = page_private(page); - if (do_swap_account) { - mem = lookup_swap_cgroup(ent); - if (mem) { - if (css_tryget(&mem->css)) - mm = NULL; /* charge to recorded */ - else - mem = NULL; /* charge to current */ - } - } - /* SwapCache may be still linked to LRU now. */ - mem_cgroup_lru_del_before_commit_swapcache(page); - ret = mem_cgroup_charge_common(page, mm, mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - mem_cgroup_lru_add_after_commit_swapcache(page); - /* drop extra refcnt from tryget */ - if (mem) - css_put(&mem->css); - - if (!ret && do_swap_account) { - /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); - if (mem) { - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_put(mem); - } - } - } - if (!locked) - unlock_page(page); - - return ret; -} -#endif - void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) { struct page_cgroup *pc; @@ -1486,18 +1470,20 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, * This is typically used for page reclaiming for shmem for reducing side * effect of page allocation from shmem, which is used by some mem_cgroup. */ -int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) +int mem_cgroup_shrink_usage(struct page *page, + struct mm_struct *mm, + gfp_t gfp_mask) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; int progress = 0; int retry = MEM_CGROUP_RECLAIM_RETRIES; if (mem_cgroup_disabled()) return 0; - if (!mm) - return 0; - - mem = try_get_mem_cgroup_from_mm(mm); + if (page) + mem = try_get_mem_cgroup_from_swapcache(page); + if (!mem && mm) + mem = try_get_mem_cgroup_from_mm(mm); if (unlikely(!mem)) return 0; diff --git a/mm/shmem.c b/mm/shmem.c index bbb7b043c986..5d0de96c9789 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -929,11 +929,11 @@ found: if (!inode) goto out; /* - * Charge page using GFP_HIGHUSER_MOVABLE while we can wait. - * charged back to the user(not to caller) when swap account is used. + * Charge page using GFP_KERNEL while we can wait. + * Charged back to the user(not to caller) when swap account is used. + * add_to_page_cache() will be called with GFP_NOWAIT. */ - error = mem_cgroup_cache_charge_swapin(page, current->mm, GFP_KERNEL, - true); + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); @@ -1270,16 +1270,6 @@ repeat: goto repeat; } wait_on_page_locked(swappage); - /* - * We want to avoid charge at add_to_page_cache(). - * charge against this swap cache here. 
- */ - if (mem_cgroup_cache_charge_swapin(swappage, - current->mm, gfp & GFP_RECLAIM_MASK, false)) { - page_cache_release(swappage); - error = -ENOMEM; - goto failed; - } page_cache_release(swappage); goto repeat; } @@ -1334,15 +1324,19 @@ repeat: } else { shmem_swp_unmap(entry); spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_shrink_usage(current->mm, + error = mem_cgroup_shrink_usage(swappage, + current->mm, gfp); - if (error) + if (error) { + unlock_page(swappage); + page_cache_release(swappage); goto failed; + } } + unlock_page(swappage); + page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { -- cgit v1.2.3 From 999cd8a450f8f93701669a61cac4d3b19eca07e8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:36 -0800 Subject: cgroups: add a per-subsystem hierarchy_mutex These patches introduce new locking/refcount support for cgroups to reduce the need for subsystems to call cgroup_lock(). This will ultimately allow the atomicity of cgroup_rmdir() (which was removed recently) to be restored. These three patches give: 1/3 - introduce a per-subsystem hierarchy_mutex which a subsystem can use to prevent changes to its own cgroup tree 2/3 - use hierarchy_mutex in place of calling cgroup_lock() in the memory controller 3/3 - introduce a css_tryget() function similar to the one recently proposed by Kamezawa, but avoiding spurious refcount failures in the event of a race between a css_tryget() and an unsuccessful cgroup_rmdir() Future patches will likely involve: - using hierarchy mutex in place of cgroup_lock() in more subsystems where appropriate - restoring the atomicity of cgroup_rmdir() with respect to cgroup_create() This patch: Add a hierarchy_mutex to the cgroup_subsys object that protects changes to the hierarchy observed by that subsystem. It is taken by the cgroup subsystem (in addition to cgroup_mutex) for the following operations: - linking a cgroup into that subsystem's cgroup tree - unlinking a cgroup from that subsystem's cgroup tree - moving the subsystem to/from a hierarchy (including across the bind() callback) Thus if the subsystem holds its own hierarchy_mutex, it can safely traverse its own hierarchy. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 2 +- include/linux/cgroup.h | 17 ++++++++++++++++- kernel/cgroup.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 60287e9e9d27..e33ee74eee77 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -528,7 +528,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set up. void bind(struct cgroup_subsys *ss, struct cgroup *root) -(cgroup_mutex held by caller) +(cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy and root cgroup. 
Currently this will only involve movement between diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 73d1c730c3c4..ce1c1f34c30c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -340,8 +340,23 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - struct cgroupfs_root *root; + /* + * Protects sibling/children links of cgroups in this + * hierarchy, plus protects which hierarchy (or none) the + * subsystem is a part of (i.e. root/sibling). To avoid + * potential deadlocks, the following operations should not be + * undertaken while holding any hierarchy_mutex: + * + * - allocating memory + * - initiating hotplug events + */ + struct mutex hierarchy_mutex; + /* + * Link to parent, and list entry in parent's children. + * Protected by this->hierarchy_mutex and cgroup_lock() + */ + struct cgroupfs_root *root; struct list_head sibling; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83ea4f524be5..8b6379cdf637 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -722,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2338,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2386,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -2504,8 +2532,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2547,6 +2579,7 @@ 
static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } -- cgit v1.2.3 From e7c5ec9193d32b9559a3bb8893ceedbda85201ff Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:38 -0800 Subject: cgroups: add css_tryget() Add css_tryget(), that obtains a counted reference on a CSS. It is used in situations where the caller has a "weak" reference to the CSS, i.e. one that does not protect the cgroup from removal via a reference count, but would instead be cleaned up by a destroy() callback. css_tryget() will return true on success, or false if the cgroup is being removed. This is similar to Kamezawa Hiroyuki's patch from a week or two ago, but with the difference that in the event of css_tryget() racing with a cgroup_rmdir(), css_tryget() will only return false if the cgroup really does get removed. This implementation is done by biasing css->refcnt, so that a refcnt of 1 means "releasable" and 0 means "released or releasing". In the event of a race, css_tryget() distinguishes between "released" and "releasing" by checking for the CSS_REMOVED flag in css->flags. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 38 ++++++++++++++++++++++++++----- kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ce1c1f34c30c..e267e62827bb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -52,9 +52,9 @@ struct cgroup_subsys_state { * hierarchy structure */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow - * subsystems to be "busy". Should be accessed via css_get() - * and css_put() */ + /* State maintained by the cgroup system to allow subsystems + * to be "busy". Should be accessed via css_get(), + * css_tryget() and and css_put(). */ atomic_t refcnt; @@ -64,11 +64,14 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { CSS_ROOT, /* This CSS is the root of the subsystem */ + CSS_REMOVED, /* This CSS is dead */ }; /* - * Call css_get() to hold a reference on the cgroup; - * + * Call css_get() to hold a reference on the css; it can be used + * for a reference obtained via: + * - an existing ref-counted reference to the css + * - task->cgroups for a locked task */ static inline void css_get(struct cgroup_subsys_state *css) @@ -77,9 +80,32 @@ static inline void css_get(struct cgroup_subsys_state *css) if (!test_bit(CSS_ROOT, &css->flags)) atomic_inc(&css->refcnt); } + +static inline bool css_is_removed(struct cgroup_subsys_state *css) +{ + return test_bit(CSS_REMOVED, &css->flags); +} + +/* + * Call css_tryget() to take a reference on a css if your existing + * (known-valid) reference isn't already ref-counted. Returns false if + * the css has been destroyed. 
+ */ + +static inline bool css_tryget(struct cgroup_subsys_state *css) +{ + if (test_bit(CSS_ROOT, &css->flags)) + return true; + while (!atomic_inc_not_zero(&css->refcnt)) { + if (test_bit(CSS_REMOVED, &css->flags)) + return false; + } + return true; +} + /* * css_put() should be called to release a reference taken by - * css_get() + * css_get() or css_tryget() */ extern void __css_put(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b6379cdf637..c29831076e7a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2333,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2465,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2486,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. 
This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -2522,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -3078,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } -- cgit v1.2.3 From 6af866af34a96fed24a55979a78b6f73bd4e8e87 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:45 -0800 Subject: cpuset: remove remaining pointers to cpumask_t Impact: cleanups, use new cpumask API Final trivial cleanups: mainly s/cpumask_t/struct cpumask Note there is a FIXME in generate_sched_domains(). A future patch will change struct cpumask *doms to struct cpumask *doms[]. (I suppose Rusty will do this.) Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 10 ++++++---- kernel/cpuset.c | 28 +++++++++++++++------------- 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 51ea2bdea0f9..90c6074a36ca 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,8 +20,9 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? 
*/ extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask); +extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, + struct cpumask *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -86,12 +87,13 @@ static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask) +static inline void cpuset_cpus_allowed(struct task_struct *p, + struct cpumask *mask) { *mask = cpu_possible_map; } static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - cpumask_t *mask) + struct cpumask *mask) { *mask = cpu_possible_map; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc294aa9a97a..647c77a88fcb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -289,7 +289,8 @@ static struct file_system_type cpuset_fs_type = { * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; @@ -610,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -static int generate_sched_domains(cpumask_t **domains, +/* FIXME: see the FIXME in partition_sched_domains() */ +static int generate_sched_domains(struct cpumask **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -618,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct cpumask *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ + int nslot; /* next empty doms[] struct cpumask slot */ doms = NULL; dattr = NULL; @@ -629,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -708,7 +710,7 @@ restart: * Now we know how many domains to create. * Convert to and populate cpu masks. 
*/ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -720,7 +722,7 @@ restart: for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; - cpumask_t *dp; + struct cpumask *dp; int apn = a->pn; if (apn < 0) { @@ -743,7 +745,7 @@ restart: continue; } - cpus_clear(*dp); + cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { @@ -790,7 +792,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; get_online_cpus(); @@ -2044,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; switch (phase) { @@ -2114,7 +2116,7 @@ void __init cpuset_init_smp(void) /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -2122,7 +2124,7 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); cpuset_cpus_allowed_locked(tsk, pmask); @@ -2133,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); -- cgit v1.2.3 From f9fb860f67b9542cd78d1558dec7058092b57d8e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Jan 2009 18:08:46 -0800 Subject: pid: implement ns_of_pid A current problem with the pid namespace is that it is easy to do pid related work after exit_task_namespaces which drops the nsproxy pointer. However if we are doing pid namespace related work we are always operating on some struct pid which retains the pid_namespace pointer of the pid namespace it was allocated in. So provide ns_of_pid which allows us to find the pid namespace a pid was allocated in. Using this we have the needed infrastructure to do pid namespace related work at anytime we have a struct pid, removing the chance of accidentally having a NULL pointer dereference when accessing current->nsproxy. Signed-off-by: Eric W. 
Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: Bastian Blank Cc: Pavel Emelyanov Cc: Nadia Derbey Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index bb206c56d1f0..49f1c2f66e95 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -122,6 +122,24 @@ int next_pidmap(struct pid_namespace *pid_ns, int last); extern struct pid *alloc_pid(struct pid_namespace *ns); extern void free_pid(struct pid *pid); +/* + * ns_of_pid() returns the pid namespace in which the specified pid was + * allocated. + * + * NOTE: + * ns_of_pid() is expected to be called for a process (task) that has + * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid + * is expected to be non-NULL. If @pid is NULL, caller should handle + * the resulting NULL pid-ns. + */ +static inline struct pid_namespace *ns_of_pid(struct pid *pid) +{ + struct pid_namespace *ns = NULL; + if (pid) + ns = pid->numbers[pid->level].ns; + return ns; +} + /* * the helpers to get the pid's id seen from different namespaces * -- cgit v1.2.3 From 61bce0f1371cfff497fe85594fd39d1a0b15ebe1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Jan 2009 18:08:49 -0800 Subject: pid: generalize task_active_pid_ns Currently task_active_pid_ns is not safe to call after a task becomes a zombie and exit_task_namespaces is called, as nsproxy becomes NULL. By reading the pid namespace from the pid of the task we can trivially solve this problem at the cost of one extra memory read in what should be the same cacheline as we read the namespace from. When moving things around I have made task_active_pid_ns out of line because keeping it in pid_namespace.h would require adding includes of pid.h and sched.h that I don't think we want. This change does make task_active_pid_ns unsafe to call during copy_process until we attach a pid on the task_struct which seems to be a reasonable trade off. Signed-off-by: Eric W. 
Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: Bastian Blank Cc: Pavel Emelyanov Cc: Nadia Derbey Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid_namespace.h | 6 +----- kernel/fork.c | 4 ++-- kernel/pid.c | 6 ++++++ 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index d82fe825d62f..38d10326246a 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -79,11 +79,7 @@ static inline void zap_pid_ns_processes(struct pid_namespace *ns) } #endif /* CONFIG_PID_NS */ -static inline struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) -{ - return tsk->nsproxy->pid_ns; -} - +extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pidmap_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..4018308048cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } diff --git a/kernel/pid.c b/kernel/pid.c index af9224cdd6c0..1b3586fe753a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_session_nr_ns); +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + /* * Used by proc to find the first pid that is greater than or equal to nr. * -- cgit v1.2.3 From f06295b44c296c8fb08823a3118468ae343b60f2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Jan 2009 18:08:52 -0800 Subject: ELF: implement AT_RANDOM for glibc PRNG seeding While discussing[1] the need for glibc to have access to random bytes during program load, it seems that an earlier attempt to implement AT_RANDOM got stalled. This implements a random 16-byte string, available to every ELF program via a new auxv AT_RANDOM vector. [1] http://sourceware.org/ml/libc-alpha/2008-10/msg00006.html Ulrich said: glibc needs right after startup a bit of random data for internal protections (stack canary etc). What is now in upstream glibc is that we always unconditionally open /dev/urandom, read some data, and use it. For every process startup. That's slow. ... The solution is to provide a limited amount of random data to the starting process in the aux vector. I suggested 16 bytes and this is what the patch implements. If we need only 16 bytes or less we use the data directly. If we need more we'll use the 16 bytes to seed a PRNG. This avoids the costly /dev/urandom use and it allows the kernel to use the most adequate source of random data for this purpose. It might not be the same pool as that for /dev/urandom. Concerns were expressed about the depletion of the randomness pool. But this patch doesn't make the situation worse; it doesn't deplete entropy more than happens now.
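For illustration, this is how a userspace program can consume the AT_RANDOM bytes. The sketch uses glibc's getauxval() helper, which postdates this patch (it appeared in glibc 2.16); earlier C libraries walked the auxv array at the top of the stack by hand, but the vector being read is exactly the one created above.

#include <stdio.h>
#include <sys/auxv.h>	/* getauxval(), AT_RANDOM */

int main(void)
{
	/* AT_RANDOM carries the address of the 16 kernel-supplied bytes */
	const unsigned char *seed =
		(const unsigned char *)getauxval(AT_RANDOM);
	int i;

	if (!seed)	/* vector absent: kernel predates this patch */
		return 1;
	for (i = 0; i < 16; i++)
		printf("%02x", seed[i]);
	printf("\n");
	return 0;
}
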
Signed-off-by: Kees Cook Cc: Jakub Jelinek Cc: Andi Kleen Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 12 ++++++++++++ include/linux/auxvec.h | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c41fa2af7677..e3ff2b9e602f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *sp; elf_addr_t __user *u_platform; elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; const char *k_platform = ELF_PLATFORM; const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, cred->gid); NEW_AUX_ENT(AT_EGID, cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index d7afa9dd6635..f3b5d4e3a2ac 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -23,16 +23,16 @@ #define AT_PLATFORM 15 /* string identifying CPU for optimizations */ #define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ #define AT_CLKTCK 17 /* frequency at which times() increments */ - +/* AT_* values 18 through 22 are reserved */ #define AT_SECURE 23 /* secure mode boolean */ - #define AT_BASE_PLATFORM 24 /* string identifying real platform, may * differ from AT_PLATFORM. */ +#define AT_RANDOM 25 /* address of 16 random bytes */ #define AT_EXECFN 31 /* filename of program */ #ifdef __KERNEL__ -#define AT_VECTOR_SIZE_BASE 18 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 19 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif -- cgit v1.2.3 From 91f68b7359144aa40bb9668124543d15284750b4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Jan 2009 18:09:12 -0800 Subject: generic swap(): introduce global macro swap(a, b) There have been some local definitions of swap(), it's time to replace them all with a uniform one. Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6b8e2027165e..343df9ef2412 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -476,6 +476,12 @@ static inline char *pack_hex_byte(char *buf, u8 byte) __val = __val < __min ? __min: __val; \ __val > __max ? 
__max: __val; }) + +/* + * swap - swap value of @a and @b + */ +#define swap(a, b) ({ typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; }) + /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. -- cgit v1.2.3 From 859cb7f2a4244ea6da206d3fe9cc8a6810947a68 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Thu, 8 Jan 2009 17:55:03 +0000 Subject: leds: Add suspend/resume to the core class Add suspend/resume to the core class and remove all the now unneeded code from various drivers. Originally the class code couldn't support suspend/resume, but since class_device can, there is no reason for each driver to do its own suspend/resume anymore. --- drivers/leds/led-class.c | 22 ++++++++++++++++++++++ drivers/leds/leds-alix2.c | 30 +----------------------------- drivers/leds/leds-ams-delta.c | 29 +---------------------------- drivers/leds/leds-clevo-mail.c | 21 +-------------------- drivers/leds/leds-fsg.c | 37 ++++++------------------------------- drivers/leds/leds-gpio.c | 36 +----------------------------------- drivers/leds/leds-hp-disk.c | 20 +------------------- drivers/leds/leds-hp6xx.c | 22 ++-------------------- drivers/leds/leds-net48xx.c | 21 +-------------------- drivers/leds/leds-s3c24xx.c | 25 +------------------------ drivers/leds/leds-wm8350.c | 24 +----------------------- drivers/leds/leds-wrap.c | 27 +++------------------------ include/linux/leds.h | 3 +++ 13 files changed, 44 insertions(+), 273 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 1553d93b8c65..52f82e3ea13a 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -91,6 +91,26 @@ void led_classdev_resume(struct led_classdev *led_cdev) } EXPORT_SYMBOL_GPL(led_classdev_resume); +static int led_suspend(struct device *dev, pm_message_t state) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_suspend(led_cdev); + + return 0; +} + +static int led_resume(struct device *dev) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_resume(led_cdev); + + return 0; +} + /** * led_classdev_register - register a new object of led_classdev class. * @parent: The device to register.
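To make the per-driver deletions below easier to follow, this is roughly what a converted driver looks like once the core handles suspend/resume. The device and function names here are invented for illustration; they are not taken from any of the drivers touched by this patch.

#include <linux/init.h>
#include <linux/leds.h>
#include <linux/module.h>
#include <linux/platform_device.h>

/* hypothetical brightness hook; a real driver pokes its hardware here */
static void example_led_set(struct led_classdev *cdev,
			    enum led_brightness value)
{
}

static struct led_classdev example_led = {
	.name		= "example::status",
	.brightness_set	= example_led_set,
	/* this one flag replaces a driver-local suspend/resume pair */
	.flags		= LED_CORE_SUSPENDRESUME,
};

static int example_led_probe(struct platform_device *pdev)
{
	return led_classdev_register(&pdev->dev, &example_led);
}

static int example_led_remove(struct platform_device *pdev)
{
	led_classdev_unregister(&example_led);
	return 0;
}

static struct platform_driver example_led_driver = {
	/* note: no .suspend or .resume methods any more */
	.probe	= example_led_probe,
	.remove	= example_led_remove,
	.driver	= {
		.name	= "example-led",
		.owner	= THIS_MODULE,
	},
};

static int __init example_led_init(void)
{
	return platform_driver_register(&example_led_driver);
}

static void __exit example_led_exit(void)
{
	platform_driver_unregister(&example_led_driver);
}

module_init(example_led_init);
module_exit(example_led_exit);
MODULE_LICENSE("GPL");
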
@@ -174,6 +194,8 @@ static int __init leds_init(void) leds_class = class_create(THIS_MODULE, "leds"); if (IS_ERR(leds_class)) return PTR_ERR(leds_class); + leds_class->suspend = led_suspend; + leds_class->resume = led_resume; return 0; } diff --git a/drivers/leds/leds-alix2.c b/drivers/leds/leds-alix2.c index d7666e6cb425..ddbd7730dfc8 100644 --- a/drivers/leds/leds-alix2.c +++ b/drivers/leds/leds-alix2.c @@ -65,39 +65,13 @@ static struct alix_led alix_leds[] = { }, }; -#ifdef CONFIG_PM - -static int alix_led_suspend(struct platform_device *dev, pm_message_t state) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(alix_leds); i++) - led_classdev_suspend(&alix_leds[i].cdev); - return 0; -} - -static int alix_led_resume(struct platform_device *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(alix_leds); i++) - led_classdev_resume(&alix_leds[i].cdev); - return 0; -} - -#else - -#define alix_led_suspend NULL -#define alix_led_resume NULL - -#endif - static int __init alix_led_probe(struct platform_device *pdev) { int i; int ret; for (i = 0; i < ARRAY_SIZE(alix_leds); i++) { + alix_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; ret = led_classdev_register(&pdev->dev, &alix_leds[i].cdev); if (ret < 0) goto fail; @@ -121,8 +95,6 @@ static int alix_led_remove(struct platform_device *pdev) static struct platform_driver alix_led_driver = { .remove = alix_led_remove, - .suspend = alix_led_suspend, - .resume = alix_led_resume, .driver = { .name = KBUILD_MODNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 072157c3ba72..446050759b4d 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -79,37 +79,12 @@ static struct ams_delta_led ams_delta_leds[] = { }, }; -#ifdef CONFIG_PM -static int ams_delta_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_suspend(&ams_delta_leds[i].cdev); - - return 0; -} - -static int ams_delta_led_resume(struct platform_device *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_resume(&ams_delta_leds[i].cdev); - - return 0; -} -#else -#define ams_delta_led_suspend NULL -#define ams_delta_led_resume NULL -#endif - static int ams_delta_led_probe(struct platform_device *pdev) { int i, ret; for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) { + ams_delta_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; ret = led_classdev_register(&pdev->dev, &ams_delta_leds[i].cdev); if (ret < 0) @@ -136,8 +111,6 @@ static int ams_delta_led_remove(struct platform_device *pdev) static struct platform_driver ams_delta_led_driver = { .probe = ams_delta_led_probe, .remove = ams_delta_led_remove, - .suspend = ams_delta_led_suspend, - .resume = ams_delta_led_resume, .driver = { .name = "ams-delta-led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c index eb3415e88f43..1813c84ea5fc 100644 --- a/drivers/leds/leds-clevo-mail.c +++ b/drivers/leds/leds-clevo-mail.c @@ -142,6 +142,7 @@ static struct led_classdev clevo_mail_led = { .name = "clevo::mail", .brightness_set = clevo_mail_led_set, .blink_set = clevo_mail_led_blink, + .flags = LED_CORE_SUSPENDRESUME, }; static int __init clevo_mail_led_probe(struct platform_device *pdev) @@ -155,29 +156,9 @@ static int clevo_mail_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int clevo_mail_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - 
led_classdev_suspend(&clevo_mail_led); - return 0; -} - -static int clevo_mail_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&clevo_mail_led); - return 0; -} -#else -#define clevo_mail_led_suspend NULL -#define clevo_mail_led_resume NULL -#endif - static struct platform_driver clevo_mail_led_driver = { .probe = clevo_mail_led_probe, .remove = clevo_mail_led_remove, - .suspend = clevo_mail_led_suspend, - .resume = clevo_mail_led_resume, .driver = { .name = KBUILD_MODNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-fsg.c b/drivers/leds/leds-fsg.c index 34935155c1c0..5f7c9c5c09b1 100644 --- a/drivers/leds/leds-fsg.c +++ b/drivers/leds/leds-fsg.c @@ -99,64 +99,43 @@ static void fsg_led_ring_set(struct led_classdev *led_cdev, } - static struct led_classdev fsg_wlan_led = { .name = "fsg:blue:wlan", .brightness_set = fsg_led_wlan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_wan_led = { .name = "fsg:blue:wan", .brightness_set = fsg_led_wan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sata_led = { .name = "fsg:blue:sata", .brightness_set = fsg_led_sata_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_usb_led = { .name = "fsg:blue:usb", .brightness_set = fsg_led_usb_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sync_led = { .name = "fsg:blue:sync", .brightness_set = fsg_led_sync_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_ring_led = { .name = "fsg:blue:ring", .brightness_set = fsg_led_ring_set, + .flags = LED_CORE_SUSPENDRESUME, }; - -#ifdef CONFIG_PM -static int fsg_led_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&fsg_wlan_led); - led_classdev_suspend(&fsg_wan_led); - led_classdev_suspend(&fsg_sata_led); - led_classdev_suspend(&fsg_usb_led); - led_classdev_suspend(&fsg_sync_led); - led_classdev_suspend(&fsg_ring_led); - return 0; -} - -static int fsg_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&fsg_wlan_led); - led_classdev_resume(&fsg_wan_led); - led_classdev_resume(&fsg_sata_led); - led_classdev_resume(&fsg_usb_led); - led_classdev_resume(&fsg_sync_led); - led_classdev_resume(&fsg_ring_led); - return 0; -} -#endif - - static int fsg_led_probe(struct platform_device *pdev) { int ret; @@ -232,10 +211,6 @@ static int fsg_led_remove(struct platform_device *pdev) static struct platform_driver fsg_led_driver = { .probe = fsg_led_probe, .remove = fsg_led_remove, -#ifdef CONFIG_PM - .suspend = fsg_led_suspend, - .resume = fsg_led_resume, -#endif .driver = { .name = "fsg-led", }, diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index b13bd2950e95..2e3df08b649b 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -105,6 +105,7 @@ static int gpio_led_probe(struct platform_device *pdev) } led_dat->cdev.brightness_set = gpio_led_set; led_dat->cdev.brightness = LED_OFF; + led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; gpio_direction_output(led_dat->gpio, led_dat->active_low); @@ -154,44 +155,9 @@ static int __devexit gpio_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int gpio_led_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_suspend(&leds_data[i].cdev); - - return 0; -} - -static int 
gpio_led_resume(struct platform_device *pdev) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_resume(&leds_data[i].cdev); - - return 0; -} -#else -#define gpio_led_suspend NULL -#define gpio_led_resume NULL -#endif - static struct platform_driver gpio_led_driver = { .probe = gpio_led_probe, .remove = __devexit_p(gpio_led_remove), - .suspend = gpio_led_suspend, - .resume = gpio_led_resume, .driver = { .name = "leds-gpio", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-hp-disk.c b/drivers/leds/leds-hp-disk.c index 44fa757d8254..d786adc8c5e3 100644 --- a/drivers/leds/leds-hp-disk.c +++ b/drivers/leds/leds-hp-disk.c @@ -68,25 +68,9 @@ static struct led_classdev hpled_led = { .name = "hp:red:hddprotection", .default_trigger = "heartbeat", .brightness_set = hpled_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hpled_suspend(struct acpi_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hpled_led); - return 0; -} - -static int hpled_resume(struct acpi_device *dev) -{ - led_classdev_resume(&hpled_led); - return 0; -} -#else -#define hpled_suspend NULL -#define hpled_resume NULL -#endif - static int hpled_add(struct acpi_device *device) { int ret; @@ -121,8 +105,6 @@ static struct acpi_driver leds_hp_driver = { .ops = { .add = hpled_add, .remove = hpled_remove, - .suspend = hpled_suspend, - .resume = hpled_resume, } }; diff --git a/drivers/leds/leds-hp6xx.c b/drivers/leds/leds-hp6xx.c index e8fb1baf8a50..e4ce1fd46338 100644 --- a/drivers/leds/leds-hp6xx.c +++ b/drivers/leds/leds-hp6xx.c @@ -45,30 +45,16 @@ static struct led_classdev hp6xx_red_led = { .name = "hp6xx:red", .default_trigger = "hp6xx-charge", .brightness_set = hp6xxled_red_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev hp6xx_green_led = { .name = "hp6xx:green", .default_trigger = "ide-disk", .brightness_set = hp6xxled_green_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hp6xxled_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hp6xx_red_led); - led_classdev_suspend(&hp6xx_green_led); - return 0; -} - -static int hp6xxled_resume(struct platform_device *dev) -{ - led_classdev_resume(&hp6xx_red_led); - led_classdev_resume(&hp6xx_green_led); - return 0; -} -#endif - static int hp6xxled_probe(struct platform_device *pdev) { int ret; @@ -98,10 +84,6 @@ MODULE_ALIAS("platform:hp6xx-led"); static struct platform_driver hp6xxled_driver = { .probe = hp6xxled_probe, .remove = hp6xxled_remove, -#ifdef CONFIG_PM - .suspend = hp6xxled_suspend, - .resume = hp6xxled_resume, -#endif .driver = { .name = "hp6xx-led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-net48xx.c b/drivers/leds/leds-net48xx.c index 054360473c94..93987a12da49 100644 --- a/drivers/leds/leds-net48xx.c +++ b/drivers/leds/leds-net48xx.c @@ -33,26 +33,9 @@ static void net48xx_error_led_set(struct led_classdev *led_cdev, static struct led_classdev net48xx_error_led = { .name = "net48xx::error", .brightness_set = net48xx_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int net48xx_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&net48xx_error_led); - return 0; -} - -static int net48xx_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&net48xx_error_led); - return 0; -} -#else -#define net48xx_led_suspend NULL 
-#define net48xx_led_resume NULL -#endif - static int net48xx_led_probe(struct platform_device *pdev) { return led_classdev_register(&pdev->dev, &net48xx_error_led); @@ -67,8 +50,6 @@ static int net48xx_led_remove(struct platform_device *pdev) static struct platform_driver net48xx_led_driver = { .probe = net48xx_led_probe, .remove = net48xx_led_remove, - .suspend = net48xx_led_suspend, - .resume = net48xx_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index 25a07f2643ad..4d81131542ae 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -82,6 +82,7 @@ static int s3c24xx_led_probe(struct platform_device *dev) led->cdev.brightness_set = s3c24xx_led_set; led->cdev.default_trigger = pdata->def_trigger; led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; led->pdata = pdata; @@ -111,33 +112,9 @@ static int s3c24xx_led_probe(struct platform_device *dev) return ret; } - -#ifdef CONFIG_PM -static int s3c24xx_led_suspend(struct platform_device *dev, pm_message_t state) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_suspend(&led->cdev); - return 0; -} - -static int s3c24xx_led_resume(struct platform_device *dev) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_resume(&led->cdev); - return 0; -} -#else -#define s3c24xx_led_suspend NULL -#define s3c24xx_led_resume NULL -#endif - static struct platform_driver s3c24xx_led_driver = { .probe = s3c24xx_led_probe, .remove = s3c24xx_led_remove, - .suspend = s3c24xx_led_suspend, - .resume = s3c24xx_led_resume, .driver = { .name = "s3c24xx_led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c index 846ed978103f..38c6bcb07e6c 100644 --- a/drivers/leds/leds-wm8350.c +++ b/drivers/leds/leds-wm8350.c @@ -184,27 +184,6 @@ static void wm8350_led_set(struct led_classdev *led_cdev, spin_unlock_irqrestore(&led->value_lock, flags); } -#ifdef CONFIG_PM -static int wm8350_led_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct wm8350_led *led = platform_get_drvdata(pdev); - - led_classdev_suspend(&led->cdev); - return 0; -} - -static int wm8350_led_resume(struct platform_device *pdev) -{ - struct wm8350_led *led = platform_get_drvdata(pdev); - - led_classdev_resume(&led->cdev); - return 0; -} -#else -#define wm8350_led_suspend NULL -#define wm8350_led_resume NULL -#endif - static void wm8350_led_shutdown(struct platform_device *pdev) { struct wm8350_led *led = platform_get_drvdata(pdev); @@ -255,6 +234,7 @@ static int wm8350_led_probe(struct platform_device *pdev) led->cdev.brightness_set = wm8350_led_set; led->cdev.default_trigger = pdata->default_trigger; led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; led->enabled = regulator_is_enabled(isink); led->isink = isink; led->dcdc = dcdc; @@ -311,8 +291,6 @@ static struct platform_driver wm8350_led_driver = { .probe = wm8350_led_probe, .remove = wm8350_led_remove, .shutdown = wm8350_led_shutdown, - .suspend = wm8350_led_suspend, - .resume = wm8350_led_resume, }; static int __devinit wm8350_led_init(void) diff --git a/drivers/leds/leds-wrap.c b/drivers/leds/leds-wrap.c index 2f3aa87f2a1f..2982c86ac4cf 100644 --- a/drivers/leds/leds-wrap.c +++ b/drivers/leds/leds-wrap.c @@ -56,40 +56,21 @@ static struct led_classdev wrap_power_led = { .name = "wrap::power", .brightness_set = wrap_power_led_set, .default_trigger = "default-on", + .flags = LED_CORE_SUSPENDRESUME, }; 
static struct led_classdev wrap_error_led = { .name = "wrap::error", .brightness_set = wrap_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev wrap_extra_led = { .name = "wrap::extra", .brightness_set = wrap_extra_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int wrap_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&wrap_power_led); - led_classdev_suspend(&wrap_error_led); - led_classdev_suspend(&wrap_extra_led); - return 0; -} - -static int wrap_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&wrap_power_led); - led_classdev_resume(&wrap_error_led); - led_classdev_resume(&wrap_extra_led); - return 0; -} -#else -#define wrap_led_suspend NULL -#define wrap_led_resume NULL -#endif - static int wrap_led_probe(struct platform_device *pdev) { int ret; @@ -127,8 +108,6 @@ static int wrap_led_remove(struct platform_device *pdev) static struct platform_driver wrap_led_driver = { .probe = wrap_led_probe, .remove = wrap_led_remove, - .suspend = wrap_led_suspend, - .resume = wrap_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/include/linux/leds.h b/include/linux/leds.h index 3c1a8ce6a5ea..24489da701e3 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -32,7 +32,10 @@ struct led_classdev { int brightness; int flags; + /* Lower 16 bits reflect status */ #define LED_SUSPENDED (1 << 0) + /* Upper 16 bits reflect control information */ +#define LED_CORE_SUSPENDRESUME (1 << 16) /* Set LED brightness level */ /* Must not sleep, use a workqueue if needed */ -- cgit v1.2.3 From 69279fb9a95051971ac03e558c4d46e7ba84ab3a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 31 Dec 2008 12:52:41 +0000 Subject: regulator: Clean up kerneldoc warnings Remove kerneldoc warnings that don't relate to missing documentation, mostly by renaming parameters in the documentation to match their actual names. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 44 +++++++++++++++++++------------------- include/linux/regulator/consumer.h | 8 +++---- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 7ba9491a05cb..ea12c68c327f 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -655,7 +655,7 @@ static void print_constraints(struct regulator_dev *rdev) /** * set_machine_constraints - sets regulator constraints - * @regulator: regulator source + * @rdev: regulator source * * Allows platform initialisation code to define and constrain * regulator circuits e.g. valid voltage/current ranges, etc. NOTE: @@ -730,8 +730,8 @@ out: /** * set_supply - set regulator supply regulator - * @regulator: regulator name - * @supply: supply regulator name + * @rdev: regulator name + * @supply_rdev: supply regulator name * * Called by platform initialisation code to set the supply regulator for this * regulator. This ensures that a regulators supply will also be enabled by the @@ -758,9 +758,9 @@ out: /** * set_consumer_device_supply: Bind a regulator to a symbolic supply - * @regulator: regulator source - * @dev: device the supply applies to - * @supply: symbolic name for supply + * @rdev: regulator source + * @consumer_dev: device the supply applies to + * @supply: symbolic name for supply * * Allows platform initialisation code to map physical regulator * sources to symbolic names for supplies for use by devices. 
Devices @@ -1013,9 +1013,8 @@ static int _regulator_enable(struct regulator_dev *rdev) * * Enable the regulator output at the predefined voltage or current value. * NOTE: the output value can be set by other drivers, boot loader or may be - * hardwired in the regulator. - * NOTE: calls to regulator_enable() must be balanced with calls to - * regulator_disable(). + * hardwired in the regulator. Calls to regulator_enable() must be balanced + * with calls to regulator_disable(). */ int regulator_enable(struct regulator *regulator) { @@ -1074,10 +1073,10 @@ static int _regulator_disable(struct regulator_dev *rdev) * @regulator: regulator source * * Disable the regulator output voltage or current. + * * NOTE: this will only disable the regulator output if no other consumer - * devices have it enabled. - * NOTE: calls to regulator_enable() must be balanced with calls to - * regulator_disable(). + * devices have it enabled. Calls to regulator_enable() must be balanced with + * calls to regulator_disable(). */ int regulator_disable(struct regulator *regulator) { @@ -1200,7 +1199,7 @@ EXPORT_SYMBOL_GPL(regulator_is_enabled); * * NOTE: If the regulator is shared between several devices then the lowest * request voltage that meets the system constraints will be used. - * NOTE: Regulator system constraints must be set for this regulator before + * Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. */ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) @@ -1498,7 +1497,7 @@ EXPORT_SYMBOL_GPL(regulator_set_optimum_mode); /** * regulator_register_notifier - register regulator event notifier * @regulator: regulator source - * @notifier_block: notifier block + * @nb: notifier block * * Register notifier block to receive regulator events. */ @@ -1513,7 +1512,7 @@ EXPORT_SYMBOL_GPL(regulator_register_notifier); /** * regulator_unregister_notifier - unregister regulator event notifier * @regulator: regulator source - * @notifier_block: notifier block + * @nb: notifier block * * Unregister regulator event notifier block. */ @@ -1679,9 +1678,9 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free); /** * regulator_notifier_call_chain - call regulator event notifier - * @regulator: regulator source + * @rdev: regulator source * @event: notifier block - * @data: + * @data: callback-specific data. * * Called by regulator drivers to notify clients a regulator event has * occurred. We also notify regulator clients downstream. @@ -1808,8 +1807,9 @@ static int add_regulator_attributes(struct regulator_dev *rdev) /** * regulator_register - register regulator - * @regulator: regulator source - * @reg_data: private regulator data + * @regulator_desc: regulator to register + * @dev: struct device for the regulator + * @driver_data: private regulator data * * Called by regulator drivers to register a regulator. * Returns 0 on success. @@ -1916,7 +1916,7 @@ EXPORT_SYMBOL_GPL(regulator_register); /** * regulator_unregister - unregister regulator - * @regulator: regulator source + * @rdev: regulator to unregister * * Called by regulator drivers to unregister a regulator. */ @@ -1971,7 +1971,7 @@ EXPORT_SYMBOL_GPL(regulator_suspend_prepare); /** * rdev_get_drvdata - get rdev regulator driver data - * @regulator: regulator + * @rdev: regulator * * Get rdev regulator driver private data. This call can be used in the * regulator driver context. 
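The pattern behind all of these fixes is that kerneldoc matches each @name line against the function's actual parameter list, so a stale name in the comment produces a warning. A contrived example of the correct shape follows; the function and its parameters are invented for illustration and are not part of the regulator API.

/**
 * example_set_limit - apply a current limit to a regulator
 * @rdev: regulator source
 * @max_uA: limit to apply, in microamps
 *
 * Each @name above must match a parameter below exactly; writing
 * "@regulator:" for a parameter actually called rdev is the kind of
 * mismatch that generated the warnings removed in this patch.
 */
static int example_set_limit(struct regulator_dev *rdev, int max_uA)
{
	return 0;	/* hardware access omitted in this sketch */
}
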
@@ -2008,7 +2008,7 @@ EXPORT_SYMBOL_GPL(regulator_set_drvdata); /** * regulator_get_id - get regulator ID - * @regulator: regulator + * @rdev: regulator */ int rdev_get_id(struct regulator_dev *rdev) { diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index afdc4558bb94..801bf77ff4e2 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -104,10 +104,10 @@ struct regulator; /** * struct regulator_bulk_data - Data used for bulk regulator operations. * - * @supply The name of the supply. Initialised by the user before - * using the bulk regulator APIs. - * @consumer The regulator consumer for the supply. This will be managed - * by the bulk API. + * @supply: The name of the supply. Initialised by the user before + * using the bulk regulator APIs. + * @consumer: The regulator consumer for the supply. This will be managed + * by the bulk API. * * The regulator APIs provide a series of regulator_bulk_() API calls as * a convenience to consumers which require multiple supplies. This -- cgit v1.2.3 From c8e7e4640facbe99d10a6e262523b25be129b9b9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 31 Dec 2008 12:52:42 +0000 Subject: regulator: Add missing kerneldoc This is only the documentation that the kerneldoc system warns about. Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 1 + include/linux/regulator/driver.h | 40 +++++++++++++++++++++++++++++++++++++- include/linux/regulator/machine.h | 41 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index ea12c68c327f..fda44009024d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -656,6 +656,7 @@ static void print_constraints(struct regulator_dev *rdev) /** * set_machine_constraints - sets regulator constraints * @rdev: regulator source + * @constraints: constraints to apply * * Allows platform initialisation code to define and constrain * regulator circuits e.g. valid voltage/current ranges, etc. NOTE: diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index e37d80561985..84c3737c2d26 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -24,7 +24,37 @@ struct regulator_init_data; /** * struct regulator_ops - regulator operations. * - * This struct describes regulator operations. + * This struct describes regulator operations which can be implemented by + * regulator chip drivers. + * + * @enable: Enable the regulator. + * @disable: Disable the regulator. + * @is_enabled: Return 1 if the reguator is enabled, 0 otherwise. + * + * @set_voltage: Set the voltage for the regulator within the range specified. + * The driver should select the voltage closest to min_uV. + * @get_voltage: Return the currently configured voltage for the regulator. + * + * @set_current: Set the current for the regulator within the range specified. + * The driver should select the current closest to min_uA. + * @get_current: Return the currently configured current for the regulator. + * + * @set_current_limit: Configure a limit for a current-limited regulator. + * @get_current_limit: Get the limit for a current-limited regulator. + * + * @set_mode: Set the operating mode for the regulator. + * @get_mode: Get the current operating mode for the regulator. 
+ * @get_optimum_mode: Get the most efficient operating mode for the regulator + * when running with the specified parameters. + * + * @set_suspend_voltage: Set the voltage for the regulator when the system + * is suspended. + * @set_suspend_enable: Mark the regulator as enabled when the system is + * suspended. + * @set_suspend_disable: Mark the regulator as disabled when the system is + * suspended. + * @set_suspend_mode: Set the operating mode for the regulator when the + * system is suspended. */ struct regulator_ops { @@ -75,6 +105,14 @@ enum regulator_type { /** * struct regulator_desc - Regulator descriptor * + * Each regulator registered with the core is described with a structure of + * this type. + * + * @name: Identifying name for the regulator. + * @id: Numerical identifier for the regulator. + * @ops: Regulator operations table. + * @type: Indicates if the regulator is a voltage or current regulator. + * @owner: Module providing the regulator, used for refcounting. */ struct regulator_desc { const char *name; diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index c6d69331a81e..3794773b23d2 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -44,6 +44,10 @@ struct regulator; * struct regulator_state - regulator state during low power syatem states * * This describes a regulators state during a system wide low power state. + * + * @uV: Operating voltage during suspend. + * @mode: Operating mode during suspend. + * @enabled: Enabled during suspend. */ struct regulator_state { int uV; /* suspend voltage */ @@ -55,6 +59,30 @@ struct regulator_state { * struct regulation_constraints - regulator operating constraints. * * This struct describes regulator and board/machine specific constraints. + * + * @name: Descriptive name for the constraints, used for display purposes. + * + * @min_uV: Smallest voltage consumers may set. + * @max_uV: Largest voltage consumers may set. + * + * @min_uA: Smallest consumers consumers may set. + * @max_uA: Largest current consumers may set. + * + * @valid_modes_mask: Mask of modes which may be configured by consumers. + * @valid_ops_mask: Operations which may be performed by consumers. + * + * @always_on: Set if the regulator should never be disabled. + * @boot_on: Set if the regulator is enabled when the system is initially + * started. + * @apply_uV: Apply the voltage constraint when initialising. + * + * @input_uV: Input voltage for regulator when supplied by another regulator. + * + * @state_disk: State for regulator when system is suspended in disk mode. + * @state_mem: State for regulator when system is suspended in mem mode. + * @state_standby: State for regulator when system is suspended in standby + * mode. + * @initial_state: Suspend state to set by default. */ struct regulation_constraints { @@ -93,6 +121,9 @@ struct regulation_constraints { * struct regulator_consumer_supply - supply -> device mapping * * This maps a supply name to a device. + * + * @dev: Device structure for the consumer. + * @supply: Name for the supply. */ struct regulator_consumer_supply { struct device *dev; /* consumer */ @@ -103,6 +134,16 @@ struct regulator_consumer_supply { * struct regulator_init_data - regulator platform initialisation data. * * Initialisation constraints, our supply and consumers supplies. + * + * @supply_regulator_dev: Parent regulator (if any). + * + * @constraints: Constraints. These must be specified for the regulator to + * be usable. 
+ * @num_consumer_supplies: Number of consumer device supplies. + * @consumer_supplies: Consumer device supply configuration. + * + * @regulator_init: Callback invoked when the regulator has been registered. + * @driver_data: Data passed to regulator_init. */ struct regulator_init_data { struct device *supply_regulator_dev; /* or NULL for LINE */ -- cgit v1.2.3 From 0ba4887c6329043d6cee5b5b477cfe50c2b57674 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 8 Jan 2009 11:50:23 -0800 Subject: regulator: fix kernel-doc warnings Fix kernel-doc warnings in regulator/driver.h: Warning(linux-next-20090108//include/linux/regulator/driver.h:95): Excess struct/union/enum/typedef member 'set_current' description in 'regulator_ops' Warning(linux-next-20090108//include/linux/regulator/driver.h:95): Excess struct/union/enum/typedef member 'get_current' description in 'regulator_ops' Warning(linux-next-20090108//include/linux/regulator/driver.h:124): No description found for parameter 'irq' Signed-off-by: Randy Dunlap cc: Liam Girdwood cc: Mark Brown Signed-off-by: Liam Girdwood --- include/linux/regulator/driver.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 84c3737c2d26..2dae05705f13 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -29,16 +29,12 @@ struct regulator_init_data; * * @enable: Enable the regulator. * @disable: Disable the regulator. - * @is_enabled: Return 1 if the reguator is enabled, 0 otherwise. + * @is_enabled: Return 1 if the regulator is enabled, 0 otherwise. * * @set_voltage: Set the voltage for the regulator within the range specified. * The driver should select the voltage closest to min_uV. * @get_voltage: Return the currently configured voltage for the regulator. * - * @set_current: Set the current for the regulator within the range specified. - * The driver should select the current closest to min_uA. - * @get_current: Return the currently configured current for the regulator. - * * @set_current_limit: Configure a limit for a current-limited regulator. * @get_current_limit: Get the limit for a current-limited regulator. * @@ -111,6 +107,7 @@ enum regulator_type { * @name: Identifying name for the regulator. * @id: Numerical identifier for the regulator. * @ops: Regulator operations table. + * @irq: Interrupt number for the regulator. * @type: Indicates if the regulator is a voltage or current regulator. * @owner: Module providing the regulator, used for refcounting. */ -- cgit v1.2.3 From f4f6bda00fc6bf995a35d8246db45aacaa9b3f09 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 3 Dec 2008 08:48:52 +0000 Subject: backlight: add support for Toppoly TDO35S series to tdo24m lcd driver Signed-off-by: Mike Rapoport Acked-by: Eric Miao Signed-off-by: Richard Purdie --- drivers/video/backlight/Kconfig | 4 +- drivers/video/backlight/tdo24m.c | 94 ++++++++++++++++++++++++++++++++++++---- include/linux/spi/tdo24m.h | 13 ++++++ 3 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 include/linux/spi/tdo24m.h (limited to 'include/linux') diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 4a4dd9adc328..7ee2313ed6b8 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -52,11 +52,11 @@ config LCD_ILI9320 then say y to include a power driver for it. 
config LCD_TDO24M - tristate "Toppoly TDO24M LCD Panels support" + tristate "Toppoly TDO24M and TDO35S LCD Panels support" depends on LCD_CLASS_DEVICE && SPI_MASTER default n help - If you have a Toppoly TDO24M series LCD panel, say y here to + If you have a Toppoly TDO24M/TDO35S series LCD panel, say y here to include the support for it. config LCD_VGG2432A4 diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c index 8427669162ea..1dae7f8f3c6b 100644 --- a/drivers/video/backlight/tdo24m.c +++ b/drivers/video/backlight/tdo24m.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,9 @@ struct tdo24m { struct spi_transfer xfer; uint8_t *buf; + int (*adj_mode)(struct tdo24m *lcd, int mode); + int color_invert; + int power; int mode; }; @@ -66,7 +70,7 @@ static uint32_t lcd_panel_off[] = { CMD_NULL, }; -static uint32_t lcd_vga_pass_through[] = { +static uint32_t lcd_vga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x80), CMD1(0xE1, 0x00), @@ -75,7 +79,7 @@ static uint32_t lcd_vga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_qvga_pass_through[] = { +static uint32_t lcd_qvga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x81), CMD1(0xE1, 0x00), @@ -84,7 +88,7 @@ static uint32_t lcd_qvga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_vga_transfer[] = { +static uint32_t lcd_vga_transfer_tdo24m[] = { CMD1(0xcf, 0x02), /* Blanking period control (1) */ CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ CMD1(0xd1, 0x01), /* CKV timing control on/off */ @@ -110,6 +114,35 @@ static uint32_t lcd_qvga_transfer[] = { CMD_NULL, }; +static uint32_t lcd_vga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x80), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x00), + CMD_NULL, +}; + +static uint32_t lcd_qvga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x81), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x22), + CMD_NULL, +}; + +static uint32_t lcd_vga_transfer_tdo35s[] = { + CMD1(0xcf, 0x02), /* Blanking period control (1) */ + CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ + CMD1(0xd1, 0x01), /* CKV timing control on/off */ + CMD2(0xd2, 0x00, 0x1e), /* CKV 1,2 timing control */ + CMD2(0xd3, 0x14, 0x28), /* OEV timing control */ + CMD2(0xd4, 0x28, 0x64), /* ASW timing control (1) */ + CMD1(0xd5, 0x28), /* ASW timing control (2) */ + CMD0(0x21), /* Invert for normally black display */ + CMD0(0x29), /* Display on */ + CMD_NULL, +}; + static uint32_t lcd_panel_config[] = { CMD2(0xb8, 0xff, 0xf9), /* Output control */ CMD0(0x11), /* sleep out */ @@ -148,6 +181,8 @@ static int tdo24m_writes(struct tdo24m *lcd, uint32_t *array) int nparams, err = 0; for (; *p != CMD_NULL; p++) { + if (!lcd->color_invert && *p == CMD0(0x21)) + continue; nparams = (*p >> 30) & 0x3; @@ -184,12 +219,33 @@ static int tdo24m_adj_mode(struct tdo24m *lcd, int mode) { switch (mode) { case MODE_VGA: - tdo24m_writes(lcd, lcd_vga_pass_through); + tdo24m_writes(lcd, lcd_vga_pass_through_tdo24m); tdo24m_writes(lcd, lcd_panel_config); - tdo24m_writes(lcd, lcd_vga_transfer); + tdo24m_writes(lcd, lcd_vga_transfer_tdo24m); break; case MODE_QVGA: - tdo24m_writes(lcd, lcd_qvga_pass_through); + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo24m); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_qvga_transfer); + break; + default: + return -EINVAL; + } + + lcd->mode = mode; + return 0; +} + +static int tdo35s_adj_mode(struct tdo24m *lcd, int mode) +{ + switch (mode) { + case MODE_VGA: + tdo24m_writes(lcd, 
lcd_vga_pass_through_tdo35s); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_vga_transfer_tdo35s); + break; + case MODE_QVGA: + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo35s); tdo24m_writes(lcd, lcd_panel_config); tdo24m_writes(lcd, lcd_qvga_transfer); break; @@ -213,7 +269,7 @@ static int tdo24m_power_on(struct tdo24m *lcd) if (err) goto out; - err = tdo24m_adj_mode(lcd, lcd->mode); + err = lcd->adj_mode(lcd, lcd->mode); out: return err; } @@ -262,7 +318,7 @@ static int tdo24m_set_mode(struct lcd_device *ld, struct fb_videomode *m) if (lcd->mode == mode) return 0; - return tdo24m_adj_mode(lcd, mode); + return lcd->adj_mode(lcd, mode); } static struct lcd_ops tdo24m_ops = { @@ -276,8 +332,16 @@ static int __devinit tdo24m_probe(struct spi_device *spi) struct tdo24m *lcd; struct spi_message *m; struct spi_transfer *x; + struct tdo24m_platform_data *pdata; + enum tdo24m_model model; int err; + pdata = spi->dev.platform_data; + if (pdata) + model = pdata->model; + else + model = TDO24M; + spi->bits_per_word = 8; spi->mode = SPI_MODE_3; err = spi_setup(spi); @@ -306,6 +370,20 @@ static int __devinit tdo24m_probe(struct spi_device *spi) x->tx_buf = &lcd->buf[0]; spi_message_add_tail(x, m); + switch (model) { + case TDO24M: + lcd->color_invert = 1; + lcd->adj_mode = tdo24m_adj_mode; + break; + case TDO35S: + lcd->adj_mode = tdo35s_adj_mode; + lcd->color_invert = 0; + break; + default: + dev_err(&spi->dev, "Unsupported model"); + goto out_free; + } + lcd->lcd_dev = lcd_device_register("tdo24m", &spi->dev, lcd, &tdo24m_ops); if (IS_ERR(lcd->lcd_dev)) { diff --git a/include/linux/spi/tdo24m.h b/include/linux/spi/tdo24m.h new file mode 100644 index 000000000000..7572d4e1fe76 --- /dev/null +++ b/include/linux/spi/tdo24m.h @@ -0,0 +1,13 @@ +#ifndef __TDO24M_H__ +#define __TDO24M_H__ + +enum tdo24m_model { + TDO24M, + TDO35S, +}; + +struct tdo24m_platform_data { + enum tdo24m_model model; +}; + +#endif /* __TDO24M_H__ */ -- cgit v1.2.3 From 0c3573f19d135d718264e38c46597295bd6154b7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Jan 2009 08:31:05 +1100 Subject: md: use sysfs_notify_dirent to notify changes to md/sync_action. There is no compelling need for this, but sysfs_notify_dirent is a nicer interface and the change is good for consistency. 
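For illustration, the cached-dirent pattern this patch applies, reduced to a minimal sketch (the calls mirror the diff that follows; error handling and locking are elided):

	/* at setup: resolve the attribute once and cache the handle */
	mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");

	/* on each state change: notify without re-resolving the name */
	if (mddev->sysfs_action)
		sysfs_notify_dirent(mddev->sysfs_action);

	/* at teardown: drop the cached reference */
	if (mddev->sysfs_action)
		sysfs_put(mddev->sysfs_action);
	mddev->sysfs_action = NULL;

Compared with sysfs_notify(&mddev->kobj, NULL, "sync_action"), which has to look the attribute up by name on every call, the cached handle turns each notification into a direct poke of the dirent.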
Signed-off-by: NeilBrown --- drivers/md/md.c | 20 +++++++++++++------- include/linux/raid/md_k.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1b1d32694f6f..ee759b193e9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3046,7 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent(mddev->sysfs_action); return len; } @@ -3684,6 +3684,7 @@ static int do_md_run(mddev_t * mddev) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; @@ -3754,7 +3755,8 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + if (mddev->sysfs_action) + sysfs_notify_dirent(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); return 0; @@ -3854,9 +3856,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; - if (mddev->pers->sync_request) + if (mddev->pers->sync_request) { sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } module_put(mddev->pers->owner); mddev->pers = NULL; /* tell userspace to handle 'inactive' */ @@ -6155,7 +6160,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent(mddev->sysfs_action); md_new_event(mddev); goto unlock; } @@ -6216,7 +6221,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent(mddev->sysfs_action); md_new_event(mddev); } unlock: @@ -6224,7 +6229,8 @@ void md_check_recovery(mddev_t *mddev) clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + if (mddev->sysfs_action) + sysfs_notify_dirent(mddev->sysfs_action); } mddev_unlock(mddev); } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8fc909ef6787..663803eaf0de 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -244,6 +244,7 @@ struct mddev_s struct sysfs_dirent *sysfs_state; /* handle for 'array_state' * file in sysfs. */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ spinlock_t write_lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ -- cgit v1.2.3 From 019c4e2f3e02aac4b44003913b54ca4b332e4371 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 9 Jan 2009 08:31:06 +1100 Subject: md: raid0: Represent device offset in sectors. Rename zone->dev_offset to zone->dev_start to make sure all users have been converted. 
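Since md has historically counted in 1K blocks while the block layer counts in 512-byte sectors, dev_start is simply the old dev_offset doubled. A small self-contained check (plain user-space C with arbitrary values, not from the patch) that the new sector arithmetic in the raid0_make_request hunk below matches the old block-based expression:

	#include <assert.h>

	int main(void)
	{
		unsigned long long chunk = 5, sect_in_chunk = 3;
		unsigned chunksect_bits = 7;                   /* 64K chunks = 128 sectors */
		unsigned long long dev_offset = 1000;          /* old unit: 1K blocks */
		unsigned long long dev_start = dev_offset * 2; /* new unit: sectors */

		unsigned long long old_rsect =
			(((chunk << (chunksect_bits - 1)) + dev_offset) << 1) + sect_in_chunk;
		unsigned long long new_rsect =
			(chunk << chunksect_bits) + dev_start + sect_in_chunk;

		assert(old_rsect == new_rsect);
		return 0;
	}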
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 9 ++++----- include/linux/raid/raid0.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fd65d8806c0f..b860536dc894 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -167,7 +167,7 @@ static int create_strip_zones (mddev_t *mddev) zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; printk("raid0: zone %d\n", i); - zone->dev_offset = current_offset; + zone->dev_start = current_offset * 2; smallest = NULL; c = 0; @@ -452,8 +452,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) x = sector >> chunksect_bits; tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; } - rsect = (((chunk << (chunksect_bits - 1)) + zone->dev_offset)<<1) - + sect_in_chunk; + rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; bio->bi_bdev = tmp_dev->bdev; bio->bi_sector = rsect + tmp_dev->data_offset; @@ -490,9 +489,9 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "%s/", bdevname( conf->strip_zone[j].dev[k]->bdev,b)); - seq_printf(seq, "] zo=%d do=%d s=%d\n", + seq_printf(seq, "] zo=%d ds=%d s=%d\n", conf->strip_zone[j].zone_offset, - conf->strip_zone[j].dev_offset, + conf->strip_zone[j].dev_start, conf->strip_zone[j].size); } #endif diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h index 1b2dda035f8e..61c3d29dc158 100644 --- a/include/linux/raid/raid0.h +++ b/include/linux/raid/raid0.h @@ -6,7 +6,7 @@ struct strip_zone { sector_t zone_offset; /* Zone offset in md_dev */ - sector_t dev_offset; /* Zone offset in real dev */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ sector_t size; /* Zone size */ int nb_dev; /* # of devices attached to the zone */ mdk_rdev_t **dev; /* Devices attached to the zone */ -- cgit v1.2.3 From 6199d3db0fc34f8ada895879d04a353a6ae632bc Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 9 Jan 2009 08:31:07 +1100 Subject: md: raid0: Represent zone->zone_offset in sectors. For the same reason as in the previous patch, rename it from zone_offset to zone_start. 
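The rewritten lookup leans on sector_div(), whose contract is easy to misread: it divides its first argument in place and returns the remainder. A user-space stand-in for illustration (the kernel version is a macro over an lvalue rather than this pointer form):

	#include <stdio.h>

	static unsigned sector_div_sketch(unsigned long long *n, unsigned base)
	{
		unsigned rem = (unsigned)(*n % base); /* remainder is returned */
		*n /= base;                           /* quotient replaces *n */
		return rem;
	}

	int main(void)
	{
		unsigned long long x = 1000003;
		unsigned rem = sector_div_sketch(&x, 8);
		printf("quotient=%llu remainder=%u\n", x, rem); /* 125000, 3 */
		return 0;
	}

So zone->dev[sector_div(x, zone->nb_dev)] in the hunk above selects the member device from the remainder while leaving the per-device chunk number in x.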
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 12 ++++++------ include/linux/raid/raid0.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index b860536dc894..6e12a358f366 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -155,7 +155,7 @@ static int create_strip_zones (mddev_t *mddev) } zone->nb_dev = cnt; zone->size = smallest->size * cnt; - zone->zone_offset = 0; + zone->zone_start = 0; current_offset = smallest->size; curr_zone_offset = zone->size; @@ -194,7 +194,7 @@ static int create_strip_zones (mddev_t *mddev) printk("raid0: zone->nb_dev: %d, size: %llu\n", zone->nb_dev, (unsigned long long)zone->size); - zone->zone_offset = curr_zone_offset; + zone->zone_start = curr_zone_offset * 2; curr_zone_offset += zone->size; current_offset = smallest->size; @@ -437,14 +437,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) zone = conf->hash_table[x]; } - while (sector / 2 >= (zone->zone_offset + zone->size)) + while (sector / 2 >= (zone->zone_start / 2 + zone->size)) zone++; sect_in_chunk = bio->bi_sector & (chunk_sects - 1); { - sector_t x = (sector - zone->zone_offset * 2) >> chunksect_bits; + sector_t x = (sector - zone->zone_start) >> chunksect_bits; sector_div(x, zone->nb_dev); chunk = x; @@ -489,8 +489,8 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "%s/", bdevname( conf->strip_zone[j].dev[k]->bdev,b)); - seq_printf(seq, "] zo=%d ds=%d s=%d\n", - conf->strip_zone[j].zone_offset, + seq_printf(seq, "] zs=%d ds=%d s=%d\n", + conf->strip_zone[j].zone_start, conf->strip_zone[j].dev_start, conf->strip_zone[j].size); } diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h index 61c3d29dc158..eaf4f6ac55f6 100644 --- a/include/linux/raid/raid0.h +++ b/include/linux/raid/raid0.h @@ -5,7 +5,7 @@ struct strip_zone { - sector_t zone_offset; /* Zone offset in md_dev */ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ sector_t size; /* Zone size */ int nb_dev; /* # of devices attached to the zone */ -- cgit v1.2.3 From 83838ed87898e0a8ff8dbf001e54e6c017f0a011 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 9 Jan 2009 08:31:07 +1100 Subject: md: raid0: Represent the size of strip zones in sectors. This completes the block -> sector conversion of struct strip_zone. 
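Taken together with the two previous renames, the zone descriptor ends up fully sector-based. Collected here for reference from the raid0.h hunks (sketch only; see the diff below for the authoritative version):

	struct strip_zone {
		sector_t zone_start; /* zone offset in the md device, in sectors */
		sector_t dev_start;  /* zone offset in each member device, in sectors */
		sector_t sectors;    /* zone length, in sectors */
		int nb_dev;          /* number of devices attached to the zone */
		mdk_rdev_t **dev;    /* devices attached to the zone */
	};

With every field in the same unit, the scattered "* 2" and "/ 2" fixups that the earlier hunks had to carry disappear from the hot paths.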
Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 26 +++++++++++++------------- include/linux/raid/raid0.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c759436b5d84..6e85e88bbae9 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -156,11 +156,11 @@ static int create_strip_zones (mddev_t *mddev) goto abort; } zone->nb_dev = cnt; - zone->size = smallest->size * cnt; + zone->sectors = smallest->size * cnt * 2; zone->zone_start = 0; current_start = smallest->size * 2; - curr_zone_start = zone->size * 2; + curr_zone_start = zone->sectors; /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) @@ -193,12 +193,12 @@ static int create_strip_zones (mddev_t *mddev) } zone->nb_dev = c; - zone->size = (smallest->size - current_start / 2) * c; - printk(KERN_INFO "raid0: zone->nb_dev: %d, size: %llu\n", - zone->nb_dev, (unsigned long long)zone->size); + zone->sectors = (smallest->size * 2 - current_start) * c; + printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", + zone->nb_dev, (unsigned long long)zone->sectors); zone->zone_start = curr_zone_start; - curr_zone_start += zone->size * 2; + curr_zone_start += zone->sectors; current_start = smallest->size * 2; printk(KERN_INFO "raid0: current zone start: %llu\n", @@ -220,7 +220,7 @@ static int create_strip_zones (mddev_t *mddev) sector_t sz = 0; for (j=i; j<conf->nr_strip_zones-1 && sz < min_spacing ; j++) - sz += conf->strip_zone[j].size; + sz += conf->strip_zone[j].sectors / 2; if (sz >= min_spacing && sz < conf->hash_spacing) conf->hash_spacing = sz; } @@ -325,13 +325,13 @@ static int raid0_run (mddev_t *mddev) conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out_free_conf; - size = conf->strip_zone[cur].size; + size = conf->strip_zone[cur].sectors / 2; conf->hash_table[0] = conf->strip_zone + cur; for (i=1; i< nb_zone; i++) { while (size <= conf->hash_spacing) { cur++; - size += conf->strip_zone[cur].size; + size += conf->strip_zone[cur].sectors / 2; } size -= conf->hash_spacing; conf->hash_table[i] = conf->strip_zone + cur; } @@ -439,10 +439,10 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) sector_div(x, (u32)conf->hash_spacing); zone = conf->hash_table[x]; } - - while (sector / 2 >= (zone->zone_start / 2 + zone->size)) + + while (sector >= zone->zone_start + zone->sectors) zone++; - + sect_in_chunk = bio->bi_sector & (chunk_sects - 1); @@ -495,7 +495,7 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "] zs=%d ds=%d s=%d\n", conf->strip_zone[j].zone_start, conf->strip_zone[j].dev_start, - conf->strip_zone[j].size); + conf->strip_zone[j].sectors); } #endif seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h index eaf4f6ac55f6..c12521d027e2 100644 --- a/include/linux/raid/raid0.h +++ b/include/linux/raid/raid0.h @@ -7,7 +7,7 @@ struct strip_zone { sector_t zone_start; /* Zone offset in md_dev (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t size; /* Zone size */ + sector_t sectors; /* Zone size in sectors */ int nb_dev; /* # of devices attached to the zone */ mdk_rdev_t **dev; /* Devices attached to the zone */ }; -- cgit v1.2.3 From ccacc7d2cf03114a24ab903f710118e9e5d43273 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 9 Jan 2009 08:31:08 +1100 Subject: md: raid0:
make hash_spacing and preshift sector-based. This patch renames the hash_spacing and preshift members of struct raid0_private_data to spacing and sector_shift respectively and changes the semantics as follows: We always have spacing = 2 * hash_spacing. In case sizeof(sector_t) > sizeof(u32) we also have sector_shift = preshift + 1 while sector_shift = preshift = 0 otherwise. Note that the values of nb_zone and zone are unaffected by these changes because in the sector_div() preceding the assignment of these two variables both arguments double. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid0.c | 58 +++++++++++++++++++++++----------------------- include/linux/raid/raid0.h | 4 ++-- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6e85e88bbae9..90f5b24f6e6c 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -213,16 +213,16 @@ static int create_strip_zones (mddev_t *mddev) * strip though as it's size has no bearing on the efficacy of the hash * table. */ - conf->hash_spacing = curr_zone_start / 2; - min_spacing = curr_zone_start / 2; + conf->spacing = curr_zone_start; + min_spacing = curr_zone_start; sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); for (i=0; i < conf->nr_strip_zones-1; i++) { - sector_t sz = 0; - for (j=i; j<conf->nr_strip_zones-1 && - sz < min_spacing ; j++) - sz += conf->strip_zone[j].sectors / 2; - if (sz >= min_spacing && sz < conf->hash_spacing) - conf->hash_spacing = sz; + sector_t s = 0; + for (j = i; j < conf->nr_strip_zones - 1 && + s < min_spacing; j++) + s += conf->strip_zone[j].sectors; + if (s >= min_spacing && s < conf->spacing) + conf->spacing = s; } mddev->queue->unplug_fn = raid0_unplug; @@ -265,7 +265,7 @@ static int raid0_mergeable_bvec(struct request_queue *q, static int raid0_run (mddev_t *mddev) { unsigned cur=0, i=0, nb_zone; - s64 size; + s64 sectors; raid0_conf_t *conf; mdk_rdev_t *rdev; struct list_head *tmp; @@ -297,51 +297,51 @@ static int raid0_run (mddev_t *mddev) rdev_for_each(rdev, tmp, mddev) mddev->array_sectors += rdev->size * 2; - printk("raid0 : md_size is %llu blocks.\n", - (unsigned long long)mddev->array_sectors / 2); - printk("raid0 : conf->hash_spacing is %llu blocks.\n", - (unsigned long long)conf->hash_spacing); + printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", + (unsigned long long)mddev->array_sectors); + printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", + (unsigned long long)conf->spacing); { - sector_t s = mddev->array_sectors / 2; - sector_t space = conf->hash_spacing; + sector_t s = mddev->array_sectors; + sector_t space = conf->spacing; int round; - conf->preshift = 0; + conf->sector_shift = 0; if (sizeof(sector_t) > sizeof(u32)) { /*shift down space and s so that sector_div will work */ while (space > (sector_t) (~(u32)0)) { s >>= 1; space >>= 1; s += 1; /* force round-up */ - conf->preshift++; + conf->sector_shift++; } } round = sector_div(s, (u32)space) ?
1 : 0; nb_zone = s + round; } - printk("raid0 : nb_zone is %d.\n", nb_zone); + printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); - printk("raid0 : Allocating %Zd bytes for hash.\n", + printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", nb_zone*sizeof(struct strip_zone*)); conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out_free_conf; - size = conf->strip_zone[cur].sectors / 2; + sectors = conf->strip_zone[cur].sectors; conf->hash_table[0] = conf->strip_zone + cur; for (i=1; i< nb_zone; i++) { - while (size <= conf->hash_spacing) { + while (sectors <= conf->spacing) { cur++; - size += conf->strip_zone[cur].sectors / 2; + sectors += conf->strip_zone[cur].sectors; } - size -= conf->hash_spacing; + sectors -= conf->spacing; conf->hash_table[i] = conf->strip_zone + cur; } - if (conf->preshift) { - conf->hash_spacing >>= conf->preshift; - /* round hash_spacing up so when we divide by it, we + if (conf->sector_shift) { + conf->spacing >>= conf->sector_shift; + /* round spacing up so when we divide by it, we * err on the side of too-low, which is safest */ - conf->hash_spacing++; + conf->spacing++; } /* calculate the max read-ahead size. @@ -435,8 +435,8 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) { - sector_t x = sector >> (conf->preshift + 1); - sector_div(x, (u32)conf->hash_spacing); + sector_t x = sector >> conf->sector_shift; + sector_div(x, (u32)conf->spacing); zone = conf->hash_table[x]; } diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h index c12521d027e2..fd42aa87c391 100644 --- a/include/linux/raid/raid0.h +++ b/include/linux/raid/raid0.h @@ -19,8 +19,8 @@ struct raid0_private_data mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; - sector_t hash_spacing; - int preshift; /* shift this before divide by hash_spacing */ + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ }; typedef struct raid0_private_data raid0_conf_t; -- cgit v1.2.3 From 159ec1fc060ab22b157a62364045f5e98749c4d3 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Fri, 9 Jan 2009 08:31:08 +1100 Subject: md: use list_for_each_entry macro directly The rdev_for_each macro defined in <linux/raid/md_k.h> is identical to list_for_each_entry_safe, from <linux/list.h>; it should be defined in terms of list_for_each_entry_safe instead of reinventing the wheel. But some of these iterations don't really need the safe version: a direct list_for_each_entry is enough, which saves a temp variable (tmp) in every function that used rdev_for_each. In this patch, most rdev_for_each loops are replaced by list_for_each_entry, saving many tmp variables; the safe version is kept only in the places that call list_del to delete an entry.
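The distinction, reduced to a sketch (do_something() and should_remove() are hypothetical placeholders; the list.h semantics are the standard ones):

	/* read-only or non-destructive walk: no cursor variable needed */
	list_for_each_entry(rdev, &mddev->disks, same_set)
		do_something(rdev);

	/* a walk that may unlink the current entry must cache the next
	 * node in tmp, which is exactly what the _safe variant does */
	list_for_each_entry_safe(rdev, tmp, &mddev->disks, same_set)
		if (should_remove(rdev))
			list_del_init(&rdev->same_set);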
Signed-off-by: Cheng Renquan Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 3 +- drivers/md/faulty.c | 3 +- drivers/md/linear.c | 3 +- drivers/md/md.c | 107 +++++++++++++++++++--------------------------- drivers/md/multipath.c | 3 +- drivers/md/raid0.c | 10 ++--- drivers/md/raid1.c | 3 +- drivers/md/raid10.c | 3 +- drivers/md/raid5.c | 8 ++-- include/linux/raid/md_k.h | 11 ++--- 10 files changed, 60 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 666b7ba47ec5..719943763391 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -215,7 +215,6 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, /* choose a good rdev and read the page from there */ mdk_rdev_t *rdev; - struct list_head *tmp; sector_t target; if (!page) @@ -223,7 +222,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, if (!page) return ERR_PTR(-ENOMEM); - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index f26c1f9a475b..86d9adf90e79 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -283,7 +283,6 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) static int run(mddev_t *mddev) { mdk_rdev_t *rdev; - struct list_head *tmp; int i; conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); @@ -296,7 +295,7 @@ static int run(mddev_t *mddev) } conf->nfaults = 0; - rdev_for_each(rdev, tmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) conf->rdev = rdev; mddev->array_sectors = mddev->size * 2; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3b90c5c924ec..1e3aea9eecf1 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -105,7 +105,6 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) int i, nb_zone, cnt; sector_t min_sectors; sector_t curr_sector; - struct list_head *tmp; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), GFP_KERNEL); @@ -115,7 +114,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) cnt = 0; conf->array_sectors = 0; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; diff --git a/drivers/md/md.c b/drivers/md/md.c index ee759b193e9a..58be665bb46b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,25 +307,23 @@ static inline void mddev_unlock(mddev_t * mddev) static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) { - mdk_rdev_t * rdev; - struct list_head *tmp; + mdk_rdev_t *rdev; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->desc_nr == nr) return rdev; - } + return NULL; } static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) { - struct list_head *tmp; mdk_rdev_t *rdev; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->bdev->bd_dev == dev) return rdev; - } + return NULL; } @@ -861,7 +859,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) { mdp_super_t *sb; - struct list_head *tmp; mdk_rdev_t *rdev2; int next_spare = mddev->raid_disks; @@ -933,7 +930,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->state |= (1<disks[0].state = (1<disks, same_set) { mdp_disk_t *d; int desc_nr; if (rdev2->raid_disk >= 0 && test_bit(In_sync, 
&rdev2->flags) @@ -1259,7 +1256,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) { struct mdp_superblock_1 *sb; - struct list_head *tmp; mdk_rdev_t *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ @@ -1307,7 +1303,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } max_dev = 0; - rdev_for_each(rdev2, tmp, mddev) + list_for_each_entry(rdev2, &mddev->disks, same_set) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1316,7 +1312,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); - rdev_for_each(rdev2, tmp, mddev) { + list_for_each_entry(rdev2, &mddev->disks, same_set) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1571,8 +1567,7 @@ static void kick_rdev_from_array(mdk_rdev_t * rdev) static void export_array(mddev_t *mddev) { - struct list_head *tmp; - mdk_rdev_t *rdev; + mdk_rdev_t *rdev, *tmp; rdev_for_each(rdev, tmp, mddev) { if (!rdev->mddev) { @@ -1643,7 +1638,7 @@ static void print_rdev(mdk_rdev_t *rdev) static void md_print_devices(void) { - struct list_head *tmp, *tmp2; + struct list_head *tmp; mdk_rdev_t *rdev; mddev_t *mddev; char b[BDEVNAME_SIZE]; @@ -1658,11 +1653,11 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, tmp2, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - rdev_for_each(rdev, tmp2, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) print_rdev(rdev); } printk("md: **********************************\n"); @@ -1679,9 +1674,8 @@ static void sync_sbs(mddev_t * mddev, int nospares) * with the rest of the array) */ mdk_rdev_t *rdev; - struct list_head *tmp; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -1699,7 +1693,6 @@ static void sync_sbs(mddev_t * mddev, int nospares) static void md_update_sb(mddev_t * mddev, int force_change) { - struct list_head *tmp; mdk_rdev_t *rdev; int sync_req; int nospares = 0; @@ -1790,7 +1783,7 @@ repeat: mdname(mddev),mddev->in_sync); bitmap_update_sb(mddev->bitmap); - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); if (rdev->sb_loaded != 1) @@ -1999,7 +1992,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { mdk_rdev_t *rdev2; - struct list_head *tmp; /* Activating a spare .. or possibly reactivating * if we every get bitmaps working here. 
*/ @@ -2010,7 +2002,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - rdev_for_each(rdev2, tmp, rdev->mddev) + list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) if (rdev2->raid_disk == slot) return -EEXIST; @@ -2125,14 +2117,14 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) */ mddev_t *mddev; int overlap = 0; - struct list_head *tmp, *tmp2; + struct list_head *tmp; mddev_unlock(my_mddev); for_each_mddev(mddev, tmp) { mdk_rdev_t *rdev2; mddev_lock(mddev); - rdev_for_each(rdev2, tmp2, mddev) + list_for_each_entry(rdev2, &mddev->disks, same_set) if (test_bit(AllReserved, &rdev2->flags) || (rdev->bdev == rdev2->bdev && rdev != rdev2 && @@ -2328,8 +2320,7 @@ abort_free: static void analyze_sbs(mddev_t * mddev) { int i; - struct list_head *tmp; - mdk_rdev_t *rdev, *freshest; + mdk_rdev_t *rdev, *freshest, *tmp; char b[BDEVNAME_SIZE]; freshest = NULL; @@ -3501,7 +3492,6 @@ static int do_md_run(mddev_t * mddev) { int err; int chunk_size; - struct list_head *tmp; mdk_rdev_t *rdev; struct gendisk *disk; struct mdk_personality *pers; @@ -3540,7 +3530,7 @@ static int do_md_run(mddev_t * mddev) } /* devices must have minimum size of one chunk */ - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { @@ -3565,7 +3555,7 @@ static int do_md_run(mddev_t * mddev) * the only valid external interface is through the md * device. */ - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3630,10 +3620,10 @@ static int do_md_run(mddev_t * mddev) */ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdk_rdev_t *rdev2; - struct list_head *tmp2; int warned = 0; - rdev_for_each(rdev, tmp, mddev) { - rdev_for_each(rdev2, tmp2, mddev) { + + list_for_each_entry(rdev, &mddev->disks, same_set) + list_for_each_entry(rdev2, &mddev->disks, same_set) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3647,7 +3637,7 @@ static int do_md_run(mddev_t * mddev) warned = 1; } } - } + if (warned) printk(KERN_WARNING "True protection against single-disk" @@ -3695,7 +3685,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - rdev_for_each(rdev, tmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3726,9 +3716,8 @@ static int do_md_run(mddev_t * mddev) * it will remove the drives and not do the right thing */ if (mddev->degraded && !mddev->sync_thread) { - struct list_head *rtmp; int spares = 0; - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -3888,7 +3877,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) */ if (mode == 0) { mdk_rdev_t *rdev; - struct list_head *tmp; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -3900,7 +3888,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } mddev->bitmap_offset = 0; - rdev_for_each(rdev, tmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3961,7 +3949,6 @@ out: static void autorun_array(mddev_t *mddev) { 
mdk_rdev_t *rdev; - struct list_head *tmp; int err; if (list_empty(&mddev->disks)) @@ -3969,7 +3956,7 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -3996,8 +3983,7 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head *tmp; - mdk_rdev_t *rdev0, *rdev; + mdk_rdev_t *rdev0, *rdev, *tmp; mddev_t *mddev; char b[BDEVNAME_SIZE]; @@ -4012,7 +3998,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - rdev_for_each_list(rdev, tmp, pending_raid_disks) + rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -4058,7 +4044,7 @@ static void autorun_devices(int part) } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); mddev->persistent = 1; - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -4069,7 +4055,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); export_rdev(rdev); } @@ -4098,10 +4084,9 @@ static int get_array_info(mddev_t * mddev, void __user * arg) mdu_array_info_t info; int nr,working,active,failed,spare; mdk_rdev_t *rdev; - struct list_head *tmp; nr=working=active=failed=spare=0; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -4619,9 +4604,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) static int update_size(mddev_t *mddev, sector_t num_sectors) { - mdk_rdev_t * rdev; + mdk_rdev_t *rdev; int rv; - struct list_head *tmp; int fit = (num_sectors == 0); if (mddev->pers->resize == NULL) @@ -4643,7 +4627,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * grow, and re-add. 
*/ return -EBUSY; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { sector_t avail; avail = rdev->size * 2; @@ -5192,11 +5176,10 @@ static void status_unused(struct seq_file *seq) { int i = 0; mdk_rdev_t *rdev; - struct list_head *tmp; seq_printf(seq, "unused devices: "); - rdev_for_each_list(rdev, tmp, pending_raid_disks) { + list_for_each_entry(rdev, &pending_raid_disks, same_set) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -5355,7 +5338,6 @@ static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; sector_t size; - struct list_head *tmp2; mdk_rdev_t *rdev; struct mdstat_info *mi = seq->private; struct bitmap *bitmap; @@ -5392,7 +5374,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } size = 0; - rdev_for_each(rdev, tmp2, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -5699,7 +5681,6 @@ void md_do_sync(mddev_t *mddev) struct list_head *tmp; sector_t last_check; int skipped = 0; - struct list_head *rtmp; mdk_rdev_t *rdev; char *desc; @@ -5804,7 +5785,7 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; j = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5954,7 +5935,7 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5990,10 +5971,9 @@ EXPORT_SYMBOL_GPL(md_do_sync); static int remove_and_add_spares(mddev_t *mddev) { mdk_rdev_t *rdev; - struct list_head *rtmp; int spares = 0; - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || @@ -6009,7 +5989,7 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded && ! 
mddev->ro) { - rdev_for_each(rdev, rtmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Blocked, &rdev->flags)) @@ -6061,7 +6041,6 @@ static int remove_and_add_spares(mddev_t *mddev) void md_check_recovery(mddev_t *mddev) { mdk_rdev_t *rdev; - struct list_head *rtmp; if (mddev->bitmap) @@ -6125,7 +6104,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->flags) md_update_sb(mddev, 0); - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (test_and_clear_bit(StateChanged, &rdev->flags)) sysfs_notify_dirent(rdev->sysfs_state); @@ -6154,7 +6133,7 @@ void md_check_recovery(mddev_t *mddev) * information must be scrapped */ if (!mddev->degraded) - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) rdev->saved_raid_disk = -1; mddev->recovery = 0; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index d4ac47d11279..f6d08f241671 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -408,7 +408,6 @@ static int multipath_run (mddev_t *mddev) int disk_idx; struct multipath_info *disk; mdk_rdev_t *rdev; - struct list_head *tmp; if (mddev->level != LEVEL_MULTIPATH) { printk("multipath: %s: raid level not set to multipath IO (%d)\n", @@ -441,7 +440,7 @@ static int multipath_run (mddev_t *mddev) } conf->working_disks = 0; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 90f5b24f6e6c..c605ba805586 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -57,7 +57,6 @@ static int create_strip_zones (mddev_t *mddev) sector_t min_spacing; raid0_conf_t *conf = mddev_to_conf(mddev); mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; - struct list_head *tmp1, *tmp2; struct strip_zone *zone; int cnt; char b[BDEVNAME_SIZE]; @@ -67,11 +66,11 @@ static int create_strip_zones (mddev_t *mddev) */ conf->nr_strip_zones = 0; - rdev_for_each(rdev1, tmp1, mddev) { + list_for_each_entry(rdev1, &mddev->disks, same_set) { printk(KERN_INFO "raid0: looking at %s\n", bdevname(rdev1->bdev,b)); c = 0; - rdev_for_each(rdev2, tmp2, mddev) { + list_for_each_entry(rdev2, &mddev->disks, same_set) { printk(KERN_INFO "raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), (unsigned long long)rdev1->size); @@ -120,7 +119,7 @@ static int create_strip_zones (mddev_t *mddev) cnt = 0; smallest = NULL; zone->dev = conf->devlist; - rdev_for_each(rdev1, tmp1, mddev) { + list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; if (j < 0 || j >= mddev->raid_disks) { @@ -268,7 +267,6 @@ static int raid0_run (mddev_t *mddev) s64 sectors; raid0_conf_t *conf; mdk_rdev_t *rdev; - struct list_head *tmp; if (mddev->chunk_size == 0) { printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); @@ -294,7 +292,7 @@ static int raid0_run (mddev_t *mddev) /* calculate array device size */ mddev->array_sectors = 0; - rdev_for_each(rdev, tmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) mddev->array_sectors += rdev->size * 2; printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9c788e2489b1..c165b1eed8bb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1919,7 +1919,6 @@ static int run(mddev_t *mddev) int i, j, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; if 
(mddev->level != 1) { printk("raid1: %s: raid level not set to mirroring (%d)\n", @@ -1964,7 +1963,7 @@ static int run(mddev_t *mddev) spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 970a96ef9b18..6736d6dff981 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2025,7 +2025,6 @@ static int run(mddev_t *mddev) int i, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; int nc, fc, fo; sector_t stride, size; @@ -2108,7 +2107,7 @@ static int run(mddev_t *mddev) spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a36a7435edf5..a5ba080d303b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3998,7 +3998,6 @@ static int run(mddev_t *mddev) int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - struct list_head *tmp; int working_disks = 0; if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { @@ -4108,7 +4107,7 @@ static int run(mddev_t *mddev) pr_debug("raid5: run(%s) called.\n", mdname(mddev)); - rdev_for_each(rdev, tmp, mddev) { + list_for_each_entry(rdev, &mddev->disks, same_set) { raid_disk = rdev->raid_disk; if (raid_disk >= conf->raid_disks || raid_disk < 0) @@ -4533,7 +4532,6 @@ static int raid5_start_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); mdk_rdev_t *rdev; - struct list_head *rtmp; int spares = 0; int added_devices = 0; unsigned long flags; @@ -4541,7 +4539,7 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; @@ -4563,7 +4561,7 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. */ - rdev_for_each(rdev, rtmp, mddev) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 663803eaf0de..8f9a54c1fb0e 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -335,17 +335,14 @@ static inline char * mdname (mddev_t * mddev) * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. 
*/ -#define rdev_for_each_list(rdev, tmp, list) \ - \ - for ((tmp) = (list).next; \ - (rdev) = (list_entry((tmp), mdk_rdev_t, same_set)), \ - (tmp) = (tmp)->next, (tmp)->prev != &(list) \ - ; ) +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, tmp, mddev) \ - rdev_for_each_list(rdev, tmp, (mddev)->disks) + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) -- cgit v1.2.3 From cd2ac9321c26dc7a76455cd2a4df89123fa2b73e Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Fri, 9 Jan 2009 08:31:08 +1100 Subject: md: need another print_sb for mdp_superblock_1 md_print_devices is called in two code path: MD_BUG(...), and md_ioctl with PRINT_RAID_DEBUG. it will dump out all in use md devices information; However, it wrongly processed two types of superblock in one: The header file has defined two types of superblock, struct mdp_superblock_s (typedefed with mdp_super_t) according to md with metadata 0.90, and struct mdp_superblock_1 according to md with metadata 1.0 and later, These two types of superblock are very different, The md_print_devices code processed them both in mdp_super_t, that would lead to wrong informaton dump like: [ 6742.345877] [ 6742.345887] md: ********************************** [ 6742.345890] md: * * [ 6742.345892] md: ********************************** [ 6742.345896] md1: [ 6742.345907] md: rdev ram7, SZ:00065472 F:0 S:1 DN:3 [ 6742.345909] md: rdev superblock: [ 6742.345914] md: SB: (V:0.90.0) ID:<42ef13c7.598c059a.5f9f1645.801e9ee6> CT:4919856d [ 6742.345918] md: L5 S00065472 ND:4 RD:4 md1 LO:2 CS:65536 [ 6742.345922] md: UT:4919856d ST:1 AD:4 WD:4 FD:0 SD:0 CSUM:b7992907 E:00000001 [ 6742.345924] D 0: DISK [ 6742.345930] D 1: DISK [ 6742.345933] D 2: DISK [ 6742.345937] D 3: DISK [ 6742.345942] md: THIS: DISK ... [ 6742.346058] md0: [ 6742.346067] md: rdev ram3, SZ:00065472 F:0 S:1 DN:3 [ 6742.346070] md: rdev superblock: [ 6742.346073] md: SB: (V:1.0.0) ID:<369aad81.00000000.00000000.00000000> CT:9a322a9c [ 6742.346077] md: L-1507699579 S976570180 ND:48 RD:0 md0 LO:65536 CS:196610 [ 6742.346081] md: UT:00000018 ST:0 AD:131048 WD:0 FD:8 SD:0 CSUM:00000000 E:00000000 [ 6742.346084] D 0: DISK [ 6742.346089] D 1: DISK [ 6742.346092] D 2: DISK [ 6742.346096] D 3: DISK [ 6742.346102] md: THIS: DISK ... [ 6742.346219] md: ********************************** [ 6742.346221] Here md1 is metadata 0.90.0, and md0 is metadata 1.2 After some more code to distinguish these two types of superblock, in this patch, it will generate dump information like: [ 7906.755790] [ 7906.755799] md: ********************************** [ 7906.755802] md: * * [ 7906.755804] md: ********************************** [ 7906.755808] md1: [ 7906.755819] md: rdev ram7, SZ:00065472 F:0 S:1 DN:3 [ 7906.755821] md: rdev superblock (MJ:0): [ 7906.755826] md: SB: (V:0.90.0) ID:<3fca7a0d.a612bfed.5f9f1645.801e9ee6> CT:491989f3 [ 7906.755830] md: L5 S00065472 ND:4 RD:4 md1 LO:2 CS:65536 [ 7906.755834] md: UT:491989f3 ST:1 AD:4 WD:4 FD:0 SD:0 CSUM:00fb52ad E:00000001 [ 7906.755836] D 0: DISK [ 7906.755842] D 1: DISK [ 7906.755845] D 2: DISK [ 7906.755849] D 3: DISK [ 7906.755855] md: THIS: DISK ... 
[ 7906.755972] md0: [ 7906.755981] md: rdev ram3, SZ:00065472 F:0 S:1 DN:3 [ 7906.755984] md: rdev superblock (MJ:1): [ 7906.755989] md: SB: (V:1) (F:0) Array-ID:<5fbcf158:55aa:5fbe:9a79:1e939880dcbd> [ 7906.755990] md: Name: "DG5:0" CT:1226410480 [ 7906.755998] md: L5 SZ130944 RD:4 LO:2 CS:128 DO:24 DS:131048 SO:8 RO:0 [ 7906.755999] md: Dev:00000003 UUID: 9194d744:87f7:a448:85f2:7497b84ce30a [ 7906.756001] md: (F:0) UT:1226410480 Events:0 ResyncOffset:-1 CSUM:0dbcd829 [ 7906.756003] md: (MaxDev:384) ... [ 7906.756113] md: ********************************** [ 7906.756116] this md0 (metadata 1.2) information dumping is exactly according to struct mdp_superblock_1. Signed-off-by: Cheng Renquan Cc: Neil Brown Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: NeilBrown --- drivers/md/md.c | 66 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/raid/md_p.h | 2 ++ 2 files changed, 62 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 58be665bb46b..1f770c16d435 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1588,7 +1588,7 @@ static void print_desc(mdp_disk_t *desc) desc->major,desc->minor,desc->raid_disk,desc->state); } -static void print_sb(mdp_super_t *sb) +static void print_sb_90(mdp_super_t *sb) { int i; @@ -1619,10 +1619,57 @@ static void print_sb(mdp_super_t *sb) } printk(KERN_INFO "md: THIS: "); print_desc(&sb->this_disk); - } -static void print_rdev(mdk_rdev_t *rdev) +static void print_sb_1(struct mdp_superblock_1 *sb) +{ + __u8 *uuid; + + uuid = sb->set_uuid; + printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" + ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" + KERN_INFO "md: Name: \"%s\" CT:%llu\n", + le32_to_cpu(sb->major_version), + le32_to_cpu(sb->feature_map), + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15], + sb->set_name, + (unsigned long long)le64_to_cpu(sb->ctime) + & MD_SUPERBLOCK_1_TIME_SEC_MASK); + + uuid = sb->device_uuid; + printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + " RO:%llu\n" + KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" + ":%02x%02x%02x%02x%02x%02x\n" + KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + KERN_INFO "md: (MaxDev:%u) \n", + le32_to_cpu(sb->level), + (unsigned long long)le64_to_cpu(sb->size), + le32_to_cpu(sb->raid_disks), + le32_to_cpu(sb->layout), + le32_to_cpu(sb->chunksize), + (unsigned long long)le64_to_cpu(sb->data_offset), + (unsigned long long)le64_to_cpu(sb->data_size), + (unsigned long long)le64_to_cpu(sb->super_offset), + (unsigned long long)le64_to_cpu(sb->recovery_offset), + le32_to_cpu(sb->dev_number), + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15], + sb->devflags, + (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, + (unsigned long long)le64_to_cpu(sb->events), + (unsigned long long)le64_to_cpu(sb->resync_offset), + le32_to_cpu(sb->sb_csum), + le32_to_cpu(sb->max_dev) + ); +} + +static void print_rdev(mdk_rdev_t *rdev, int major_version) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", @@ -1630,8 +1677,15 @@ static void print_rdev(mdk_rdev_t *rdev) test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { 
- printk(KERN_INFO "md: rdev superblock:\n"); - print_sb((mdp_super_t*)page_address(rdev->sb_page)); + printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); + switch (major_version) { + case 0: + print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); + break; + case 1: + print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); + break; + } } else printk(KERN_INFO "md: no rdev superblock!\n"); } @@ -1658,7 +1712,7 @@ static void md_print_devices(void) printk("\n"); list_for_each_entry(rdev, &mddev->disks, same_set) - print_rdev(rdev); + print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); printk("\n"); diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8b4de4a41ff1..9491026afe66 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -194,6 +194,8 @@ static inline __u64 md_event(mdp_super_t *sb) { return (ev<<32)| sb->events_lo; } +#define MD_SUPERBLOCK_1_TIME_SEC_MASK ((1ULL<<40) - 1) + /* * The version-1 superblock : * All numeric fields are little-endian. -- cgit v1.2.3 From d3374825ce57ba2214d375023979f6197ccc1385 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Jan 2009 08:31:10 +1100 Subject: md: make devices disappear when they are no longer needed. Currently md devices, once created, never disappear until the module is unloaded. This is essentially because the gendisk holds a reference to the mddev, and the mddev holds a reference to the gendisk, this a circular reference. If we drop the reference from mddev to gendisk, then we need to ensure that the mddev is destroyed when the gendisk is destroyed. However it is not possible to hook into the gendisk destruction process to enable this. So we drop the reference from the gendisk to the mddev and destroy the gendisk when the mddev gets destroyed. However this has a complication. Between the call __blkdev_get->get_gendisk->kobj_lookup->md_probe and the call __blkdev_get->md_open there is no obvious way to hold a reference on the mddev any more, so unless something is done, it will disappear and gendisk will be destroyed prematurely. Also, once we decide to destroy the mddev, there will be an unlockable moment before the gendisk is unlinked (blk_unregister_region) during which a new reference to the gendisk can be created. We need to ensure that this reference can not be used. i.e. the ->open must fail. So: 1/ in md_probe we set a flag in the mddev (hold_active) which indicates that the array should be treated as active, even though there are no references, and no appearance of activity. This is cleared by md_release when the device is closed if it is no longer needed. This ensures that the gendisk will survive between md_probe and md_open. 2/ In md_open we check if the mddev we expect to open matches the gendisk that we did open. If there is a mismatch we return -ERESTARTSYS and modify __blkdev_get to retry from the top in that case. In the -ERESTARTSYS sys case we make sure to wait until the old gendisk (that we succeeded in opening) is really gone so we loop at most once. Some udev configurations will always open an md device when it first appears. If we allow an md device that was just created by an open to disappear on an immediate close, then this can race with such udev configurations and result in an infinite loop the device being opened and closed, then re-open due to the 'ADD' even from the first open, and then close and so on. 
So we make sure an md device, once created by an open, remains active at least until some md 'ioctl' has been made on it. This means that all normal usage of md devices will allow them to disappear promptly when not needed, but the worst that an incorrect usage will do is cause an inactive md device to be left in existence (it can easily be removed). As an array can be stopped by writing to a sysfs attribute (echo clear > /sys/block/mdXXX/md/array_state), we need to use scheduled work for deleting the gendisk and other kobjects. This allows us to wait for any pending gendisk deletion to complete by simply calling flush_scheduled_work(). Signed-off-by: NeilBrown --- drivers/md/md.c | 61 +++++++++++++++++++++++++++++++++++++---------- fs/block_dev.c | 14 +++++++++++ include/linux/raid/md_k.h | 4 ++++ 3 files changed, 67 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 970a8c42ba92..38697283aaf4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -214,16 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev) return mddev; } +static void mddev_delayed_delete(struct work_struct *ws) +{ + mddev_t *mddev = container_of(ws, mddev_t, del_work); + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + static void mddev_put(mddev_t *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks)) { + if (!mddev->raid_disks && list_empty(&mddev->disks) && + !mddev->hold_active) { list_del(&mddev->all_mddevs); - spin_unlock(&all_mddevs_lock); - kobject_put(&mddev->kobj); - } else - spin_unlock(&all_mddevs_lock); + if (mddev->gendisk) { + /* we did a probe so need to clean up. + * Call schedule_work inside the spinlock + * so that flush_scheduled_work() after + * mddev_find will succeed in waiting for the + * work to be done. + */ + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + schedule_work(&mddev->del_work); + } else + kfree(mddev); + } + spin_unlock(&all_mddevs_lock); } static mddev_t * mddev_find(dev_t unit) @@ -242,6 +259,7 @@ static mddev_t * mddev_find(dev_t unit) if (new) { list_add(&new->all_mddevs, &all_mddevs); + mddev->hold_active = UNTIL_IOCTL; spin_unlock(&all_mddevs_lock); return new; } @@ -3435,6 +3453,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; rv = mddev_lock(mddev); + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; if (!rv) { rv = entry->store(mddev, page, length); mddev_unlock(mddev); @@ -3484,6 +3504,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) if (!mddev) return NULL; + /* wait for any previous instance if this device + * to be completed removed (mddev_delayed_delete). + */ + flush_scheduled_work(); + mutex_lock(&disks_mutex); if (mddev->gendisk) { mutex_unlock(&disks_mutex); @@ -3520,7 +3545,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->private_data = mddev; disk->queue = mddev->queue; /* Allow extended partitions. This makes the - * 'mdp' device redundant, but we can really + * 'mdp' device redundant, but we can't really * remove it now.
*/ disk->flags |= GENHD_FL_EXT_DEVT; @@ -3536,6 +3561,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); } + mddev_put(mddev); return NULL; } @@ -5054,6 +5080,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, done_unlock: abort_unlock: + if (mddev->hold_active == UNTIL_IOCTL && + err != -EINVAL) + mddev->hold_active = 0; mddev_unlock(mddev); return err; @@ -5070,14 +5099,25 @@ static int md_open(struct block_device *bdev, fmode_t mode) * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. */ - mddev_t *mddev = bdev->bd_disk->private_data; + mddev_t *mddev = mddev_find(bdev->bd_dev); int err; + if (mddev->gendisk != bdev->bd_disk) { + /* we are racing with mddev_put which is discarding this + * bd_disk. + */ + mddev_put(mddev); + /* Wait until bdev->bd_disk is definitely gone */ + flush_scheduled_work(); + /* Then retry the open from the top */ + return -ERESTARTSYS; + } + BUG_ON(mddev != bdev->bd_disk->private_data); + if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) goto out; err = 0; - mddev_get(mddev); atomic_inc(&mddev->openers); mddev_unlock(mddev); @@ -6436,11 +6476,8 @@ static __exit void md_exit(void) unregister_sysctl_table(raid_table_header); remove_proc_entry("mdstat", NULL); for_each_mddev(mddev, tmp) { - struct gendisk *disk = mddev->gendisk; - if (!disk) - continue; export_array(mddev); - mddev_put(mddev); + mddev->hold_active = 0; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index b957717e25ab..8ebbfdf708c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } lock_kernel(); + restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); @@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (disk->fops->open) { ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + module_put(disk->fops->owner); + put_disk(disk); + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + goto restart; + } if (ret) goto out_clear; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8f9a54c1fb0e..e3d17c7f954e 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -137,6 +137,8 @@ struct mddev_s struct gendisk *gendisk; struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 /* Superblock information */ int major_version, @@ -246,6 +248,8 @@ struct mddev_s */ struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + struct work_struct del_work; /* used for delayed sysfs removal */ + spinlock_t write_lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ -- cgit v1.2.3 From efeb53c0e57213e843b7ef3cc6ebcdea7d6186ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Jan 2009 08:31:10 +1100 Subject: md: Allow md devices to be created by name. Using sequential numbers to identify md devices is somewhat artificial. Using names can be a lot more user-friendly. Also, creating md devices by opening the device special file is a bit awkward. So this patch provides a new option for creating and naming devices. 
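As the following paragraph explains, creation is nothing more than a write to a module parameter. Purely as an illustration (a sketch, not part of the patch, assuming the conventional /sys/module/<module>/parameters layout), a userspace helper could look like this:

/* Sketch: create a named md array by writing to the new_array
 * module parameter.  Error handling kept minimal on purpose. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int create_named_array(const char *name)
{
	const char *param = "/sys/module/md_mod/parameters/new_array";
	int fd = open(param, O_WRONLY);

	if (fd < 0) {
		perror(param);
		return -1;
	}
	/* The kernel side strips a trailing newline and insists on
	 * an "md_" prefix, so "md_home" is a valid example name. */
	if (write(fd, name, strlen(name)) < 0) {
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	return create_named_array("md_home") ? 1 : 0;
}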
Writing a name such as "md_home" to /sys/modules/md_mod/parameters/new_array will cause an array with that name to be created. It will appear in /sys/block/ /proc/partitions and /proc/mdstat as 'md_home'. It will have an arbitrary minor number allocated. md devices that a created by an open are destroyed on the last close when the device is inactive. For named md devices, they will not be destroyed until the array is explicitly stopped, either with the STOP_ARRAY ioctl or by writing 'clear' to /sys/block/md_XXXX/md/array_state. The name of the array must start 'md_' to avoid conflict with other devices. Signed-off-by: NeilBrown --- drivers/md/md.c | 119 +++++++++++++++++++++++++++++++++++++++------- include/linux/raid/md_k.h | 1 + 2 files changed, 102 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 38697283aaf4..f5cbb9d2371a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -249,17 +249,51 @@ static mddev_t * mddev_find(dev_t unit) retry: spin_lock(&all_mddevs_lock); - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); + + if (unit) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } + + if (new) { + list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; + new->hold_active = UNTIL_IOCTL; + return new; } - - if (new) { + } else if (new) { + /* find an unused unit number */ + static int next_minor = 512; + int start = next_minor; + int is_free = 0; + int dev = 0; + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) { + /* Oh dear, all in use. */ + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; + } + + is_free = 1; + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == dev) { + is_free = 0; + break; + } + } + new->unit = dev; + new->md_minor = MINOR(dev); + new->hold_active = UNTIL_STOP; list_add(&new->all_mddevs, &all_mddevs); - mddev->hold_active = UNTIL_IOCTL; spin_unlock(&all_mddevs_lock); return new; } @@ -3491,18 +3525,22 @@ static struct kobj_type md_ktype = { int mdp_major = 0; -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); mddev_t *mddev = mddev_find(dev); struct gendisk *disk; - int partitioned = (MAJOR(dev) != MD_MAJOR); - int shift = partitioned ? MdpMinorShift : 0; - int unit = MINOR(dev) >> shift; + int partitioned; + int shift; + int unit; int error; if (!mddev) - return NULL; + return -ENODEV; + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; /* wait for any previous instance if this device * to be completed removed (mddev_delayed_delete). @@ -3513,14 +3551,29 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) if (mddev->gendisk) { mutex_unlock(&disks_mutex); mddev_put(mddev); - return NULL; + return -EEXIST; + } + + if (name) { + /* Need to ensure that 'name' is not a duplicate. 
+ */ + mddev_t *mddev2; + spin_lock(&all_mddevs_lock); + + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) + if (mddev2->gendisk && + strcmp(mddev2->gendisk->disk_name, name) == 0) { + spin_unlock(&all_mddevs_lock); + return -EEXIST; + } + spin_unlock(&all_mddevs_lock); } mddev->queue = blk_alloc_queue(GFP_KERNEL); if (!mddev->queue) { mutex_unlock(&disks_mutex); mddev_put(mddev); - return NULL; + return -ENOMEM; } /* Can be unlocked because the queue is new: no concurrency */ queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); @@ -3533,11 +3586,13 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) blk_cleanup_queue(mddev->queue); mddev->queue = NULL; mddev_put(mddev); - return NULL; + return -ENOMEM; } - disk->major = MAJOR(dev); + disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; - if (partitioned) + if (name) + strcpy(disk->disk_name, name); + else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); @@ -3562,9 +3617,34 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); } mddev_put(mddev); + return 0; +} + +static struct kobject *md_probe(dev_t dev, int *part, void *data) +{ + md_alloc(dev, NULL); return NULL; } +static int add_named_array(const char *val, struct kernel_param *kp) +{ + /* val must be "md_*" where * is not all digits. + * We allocate an array with a large free minor number, and + * set the name to val. val must not already be an active name. + */ + int len = strlen(val); + char buf[DISK_NAME_LEN]; + + while (len && val[len-1] == '\n') + len--; + if (len >= DISK_NAME_LEN) + return -E2BIG; + strlcpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) != 0) + return -EINVAL; + return md_alloc(0, buf); +} + static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; @@ -4025,6 +4105,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->barriers_work = 0; mddev->safemode = 0; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + if (mddev->hold_active == UNTIL_STOP) + mddev->hold_active = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -6502,6 +6584,7 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); +module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index e3d17c7f954e..dac4217194b8 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -139,6 +139,7 @@ struct mddev_s struct kobject kobj; int hold_active; #define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 /* Superblock information */ int major_version, -- cgit v1.2.3 From 4044ba58dd15cb01797c4fd034f39ef4a75f7cc3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Jan 2009 08:31:11 +1100 Subject: md: don't retry recovery of raid1 that fails due to error on source drive. If a raid1 has only one working drive and it has a sector which gives an error on read, then an attempt to recover onto a spare will fail, but as the single remaining drive is not removed from the array, the recovery will be immediately re-attempted, resulting in an infinite recovery loop. 
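The fix that follows is a simple latch; sketched in isolation (hypothetical structure and helper names, not the md code itself), the logic is:

/* Sketch of the "latch recovery off until topology changes" pattern:
 * a sticky flag set on a hopeless source-drive error and cleared when
 * a device is added, so a failing resync is not re-queued forever. */
struct array_state {
	int degraded;
	int read_only;
	int recovery_disabled;	/* sticky: set when recovery is hopeless */
};

static int should_attempt_recovery(const struct array_state *a)
{
	return a->degraded && !a->read_only && !a->recovery_disabled;
}

static void on_read_error_on_last_drive(struct array_state *a)
{
	/* Recovery from this drive is very likely to fail again,
	 * so stop re-queueing it. */
	a->recovery_disabled = 1;
}

static void on_spare_added(struct array_state *a)
{
	/* May as well allow recovery to be retried once. */
	a->recovery_disabled = 0;
}

Clearing the latch on every device addition keeps the behaviour conservative: the retry happens exactly once per topology change.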
So detect this situation and don't retry recovery once an error on the lone remaining drive is detected. Allow recovery to be retried once every time a spare is added in case the problem wasn't actually a media error. Signed-off-by: NeilBrown --- drivers/md/md.c | 5 ++++- drivers/md/raid1.c | 8 ++++++-- include/linux/raid/md_k.h | 3 +++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index f5cbb9d2371a..41e2509bf896 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1500,6 +1500,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); + + /* May as well allow recovery to be retried once */ + mddev->recovery_disabled = 0; return 0; fail: @@ -6175,7 +6178,7 @@ static int remove_and_add_spares(mddev_t *mddev) } } - if (mddev->degraded && ! mddev->ro) { + if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c165b1eed8bb..7b4f5f7155d8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1016,12 +1016,16 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && (conf->raid_disks - mddev->degraded) == 1) + && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a - * normal single drive + * normal single drive. + * However don't try a recovery from this drive as + * it is very likely to fail. */ + mddev->recovery_disabled = 1; return; + } if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index dac4217194b8..9743e4dbc918 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -218,6 +218,9 @@ struct mddev_s #define MD_RECOVERY_FROZEN 9 unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ int in_sync; /* know to not need resync */ struct mutex reconfig_mutex; -- cgit v1.2.3 From 871af1210f13966ab911ed2166e4ab2ce775b99d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 5 Jan 2009 14:16:39 +0000 Subject: libata: Add 32bit PIO support This matters for some controllers and in one or two cases almost doubles PIO performance. 
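The interesting part of 32-bit PIO is the tail handling: the bulk of the buffer moves as 32-bit words, and a final 1-3 "slop" bytes go through a bounce word. A self-contained sketch of a read in that style (a plain callback stands in for the ioread32()/ioread32_rep() register accessors, and byte order is ignored for brevity; this is an illustration, not the libata code):

#include <stdint.h>
#include <string.h>

typedef uint32_t (*read_word_fn)(void);	/* one 32-bit data-register read */

static unsigned int pio_read32(uint8_t *buf, unsigned int buflen,
			       read_word_fn read_word)
{
	unsigned int words = buflen >> 2;
	unsigned int slop = buflen & 3;
	unsigned int i;

	/* Transfer the multiple-of-4 part word by word. */
	for (i = 0; i < words; i++) {
		uint32_t w = read_word();
		memcpy(buf + 4 * i, &w, 4);
	}
	if (slop) {
		/* One extra full-width read; keep only the bytes needed. */
		uint32_t pad = read_word();
		memcpy(buf + buflen - slop, &pad, slop);
		words++;
	}
	return words << 2;	/* bytes consumed, rounded up to a word */
}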
Add a bmdma32 operations set we can inherit and activate it for some controllers Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/ata_piix.c | 2 +- drivers/ata/libata-sff.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/ata/pata_ali.c | 6 +++--- drivers/ata/pata_amd.c | 4 ++-- drivers/ata/pata_mpiix.c | 3 ++- drivers/ata/pata_sil680.c | 4 ++-- include/linux/libata.h | 3 +++ 7 files changed, 66 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 78659546130c..887d8f46a287 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -310,7 +310,7 @@ static struct scsi_host_template piix_sht = { }; static struct ata_port_operations piix_pata_ops = { - .inherits = &ata_bmdma_port_ops, + .inherits = &ata_bmdma32_port_ops, .cable_detect = ata_cable_40wire, .set_piomode = piix_set_piomode, .set_dmamode = piix_set_dmamode, diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 9033d164c4ec..b58549fac460 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -78,6 +78,13 @@ const struct ata_port_operations ata_bmdma_port_ops = { .bmdma_status = ata_bmdma_status, }; +const struct ata_port_operations ata_bmdma32_port_ops = { + .inherits = &ata_bmdma_port_ops, + + .sff_data_xfer = ata_sff_data_xfer32, +}; +EXPORT_SYMBOL_GPL(ata_bmdma32_port_ops); + /** * ata_fill_sg - Fill PCI IDE PRD table * @qc: Metadata associated with taskfile to be transferred @@ -718,6 +725,52 @@ unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf, return words << 1; } +/** + * ata_sff_data_xfer32 - Transfer data by PIO + * @dev: device to target + * @buf: data buffer + * @buflen: buffer length + * @rw: read/write + * + * Transfer data from/to the device data register by PIO using 32bit + * I/O operations. + * + * LOCKING: + * Inherited from caller. + * + * RETURNS: + * Bytes consumed. 
+ */ + +unsigned int ata_sff_data_xfer32(struct ata_device *dev, unsigned char *buf, + unsigned int buflen, int rw) +{ + struct ata_port *ap = dev->link->ap; + void __iomem *data_addr = ap->ioaddr.data_addr; + unsigned int words = buflen >> 2; + int slop = buflen & 3; + + /* Transfer multiple of 4 bytes */ + if (rw == READ) + ioread32_rep(data_addr, buf, words); + else + iowrite32_rep(data_addr, buf, words); + + if (unlikely(slop)) { + __le32 pad; + if (rw == READ) { + pad = cpu_to_le32(ioread32(ap->ioaddr.data_addr)); + memcpy(buf + buflen - slop, &pad, slop); + } else { + memcpy(&pad, buf + buflen - slop, slop); + iowrite32(le32_to_cpu(pad), ap->ioaddr.data_addr); + } + words++; + } + return words << 2; +} +EXPORT_SYMBOL_GPL(ata_sff_data_xfer32); + /** * ata_sff_data_xfer_noirq - Transfer data by PIO * @dev: device to target diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c index a4f9e39442c6..a7999c19f0c9 100644 --- a/drivers/ata/pata_ali.c +++ b/drivers/ata/pata_ali.c @@ -151,8 +151,7 @@ static void ali_fifo_control(struct ata_port *ap, struct ata_device *adev, int o pci_read_config_byte(pdev, pio_fifo, &fifo); fifo &= ~(0x0F << shift); - if (on) - fifo |= (on << shift); + fifo |= (on << shift); pci_write_config_byte(pdev, pio_fifo, fifo); } @@ -370,10 +369,11 @@ static struct ata_port_operations ali_early_port_ops = { .inherits = &ata_sff_port_ops, .cable_detect = ata_cable_40wire, .set_piomode = ali_set_piomode, + .sff_data_xfer = ata_sff_data_xfer32, }; static const struct ata_port_operations ali_dma_base_ops = { - .inherits = &ata_bmdma_port_ops, + .inherits = &ata_bmdma32_port_ops, .set_piomode = ali_set_piomode, .set_dmamode = ali_set_dmamode, }; diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c index 0ec9c7d9fe9d..63719ab9ea44 100644 --- a/drivers/ata/pata_amd.c +++ b/drivers/ata/pata_amd.c @@ -24,7 +24,7 @@ #include #define DRV_NAME "pata_amd" -#define DRV_VERSION "0.3.10" +#define DRV_VERSION "0.3.11" /** * timing_setup - shared timing computation and load @@ -345,7 +345,7 @@ static struct scsi_host_template amd_sht = { }; static const struct ata_port_operations amd_base_port_ops = { - .inherits = &ata_bmdma_port_ops, + .inherits = &ata_bmdma32_port_ops, .prereset = amd_pre_reset, }; diff --git a/drivers/ata/pata_mpiix.c b/drivers/ata/pata_mpiix.c index 7c8faa48b5f3..aa576cac4d17 100644 --- a/drivers/ata/pata_mpiix.c +++ b/drivers/ata/pata_mpiix.c @@ -35,7 +35,7 @@ #include #define DRV_NAME "pata_mpiix" -#define DRV_VERSION "0.7.6" +#define DRV_VERSION "0.7.7" enum { IDETIM = 0x6C, /* IDE control register */ @@ -146,6 +146,7 @@ static struct ata_port_operations mpiix_port_ops = { .cable_detect = ata_cable_40wire, .set_piomode = mpiix_set_piomode, .prereset = mpiix_pre_reset, + .sff_data_xfer = ata_sff_data_xfer32, }; static int mpiix_init_one(struct pci_dev *dev, const struct pci_device_id *id) diff --git a/drivers/ata/pata_sil680.c b/drivers/ata/pata_sil680.c index 83580a59db58..9e764e5747e6 100644 --- a/drivers/ata/pata_sil680.c +++ b/drivers/ata/pata_sil680.c @@ -32,7 +32,7 @@ #include #define DRV_NAME "pata_sil680" -#define DRV_VERSION "0.4.8" +#define DRV_VERSION "0.4.9" #define SIL680_MMIO_BAR 5 @@ -195,7 +195,7 @@ static struct scsi_host_template sil680_sht = { }; static struct ata_port_operations sil680_port_ops = { - .inherits = &ata_bmdma_port_ops, + .inherits = &ata_bmdma32_port_ops, .cable_detect = sil680_cable_detect, .set_piomode = sil680_set_piomode, .set_dmamode = sil680_set_dmamode, diff --git a/include/linux/libata.h 
b/include/linux/libata.h index 3449de597eff..4f7c8fb4d3fe 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1518,6 +1518,7 @@ extern void sata_pmp_error_handler(struct ata_port *ap); extern const struct ata_port_operations ata_sff_port_ops; extern const struct ata_port_operations ata_bmdma_port_ops; +extern const struct ata_port_operations ata_bmdma32_port_ops; /* PIO only, sg_tablesize and dma_boundary limits can be removed */ #define ATA_PIO_SHT(drv_name) \ @@ -1545,6 +1546,8 @@ extern void ata_sff_exec_command(struct ata_port *ap, const struct ata_taskfile *tf); extern unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf, unsigned int buflen, int rw); +extern unsigned int ata_sff_data_xfer32(struct ata_device *dev, + unsigned char *buf, unsigned int buflen, int rw); extern unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf, unsigned int buflen, int rw); extern u8 ata_sff_irq_on(struct ata_port *ap); -- cgit v1.2.3 From fefae48bf8caab7d56ee4f8181f06602cf73d29e Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Thu, 8 Jan 2009 19:21:27 +0100 Subject: [MTD] CFI: remove major/minor version check for command set 0x0002 The NOR Flash memory K8P2815UQB from Samsung uses the major version number '0'. Add a quirk to cope with it. Signed-off-by: Wolfgang Grandegger Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0002.c | 10 ++++++++++ include/linux/mtd/cfi.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index f9c435a42670..94bb61e19047 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -322,6 +322,14 @@ static struct cfi_fixup fixup_table[] = { }; +static void cfi_fixup_major_minor(struct cfi_private *cfi, + struct cfi_pri_amdstd *extp) +{ + if (cfi->mfr == CFI_MFR_SAMSUNG && cfi->id == 0x257e && + extp->MajorVersion == '0') + extp->MajorVersion = '1'; +} + struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) { struct cfi_private *cfi = map->fldrv_priv; @@ -363,6 +371,8 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) return NULL; } + cfi_fixup_major_minor(cfi, extp); + if (extp->MajorVersion != '1' || (extp->MinorVersion < '0' || extp->MinorVersion > '4')) { printk(KERN_ERR " Unknown Amd/Fujitsu Extended Query " diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 00e2b575021f..88d3d8fbf9f2 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -520,6 +520,7 @@ struct cfi_fixup { #define CFI_MFR_AMD 0x0001 #define CFI_MFR_ATMEL 0x001F +#define CFI_MFR_SAMSUNG 0x00EC #define CFI_MFR_ST 0x0020 /* STMicroelectronics */ void cfi_fixup(struct mtd_info *mtd, struct cfi_fixup* fixups); -- cgit v1.2.3 From 8dd2f36f317569665e454268a2677cfba3e848f1 Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Sat, 2 Aug 2008 22:51:52 +0200 Subject: mISDN: Add feature via MISDN_CTRL_FILL_EMPTY to fill fifo if empty This prevents the FIFO from underrunning once it has been filled, and after an underrun it prevents further underruns caused by jitter. The dsp is improved so that buffers are kept filled with a certain delay, so moderate jitter no longer causes constant underruns and the audio quality is greatly improved. Tones are no longer interrupted by gaps, except when the CPU stalls or is under high load.
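The mechanism itself is easy to sketch against a generic byte FIFO (hypothetical fifo_* helpers; the real drivers below operate on the HFC chips' FIFO counters instead):

/* Sketch: "fill fifo if empty".  'poll' is the number of samples per
 * polling interval; 'silence_block' holds the idle byte for the
 * configured law (0xff for u-law, 0x2a for a-law). */
struct fifo;
unsigned int fifo_level(struct fifo *f);		/* hypothetical */
void fifo_write(struct fifo *f, const unsigned char *d,
		unsigned int len);			/* hypothetical */

static void tx_poll(struct fifo *f, const unsigned char *silence_block,
		    unsigned int poll, int fill_empty)
{
	if (fill_empty && fifo_level(f) == 0)
		/* Underrun: pre-load half a poll interval of silence so
		 * moderate jitter rides on a cushion instead of
		 * emptying the FIFO again immediately. */
		fifo_write(f, silence_block, poll >> 1);

	/* ... then queue the real audio data as usual ... */
}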
Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil --- drivers/isdn/hardware/mISDN/hfc_multi.h | 3 ++ drivers/isdn/hardware/mISDN/hfc_pci.h | 1 + drivers/isdn/hardware/mISDN/hfcmulti.c | 45 ++++++++++++++++++++------ drivers/isdn/hardware/mISDN/hfcpci.c | 41 ++++++++++++++++++++++-- drivers/isdn/mISDN/dsp.h | 1 + drivers/isdn/mISDN/dsp_cmx.c | 56 ++++++++++++++++++++++----------- drivers/isdn/mISDN/dsp_core.c | 37 ++++++++++++++++++++-- include/linux/mISDNhw.h | 25 ++++++++------- include/linux/mISDNif.h | 1 + 9 files changed, 165 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfc_multi.h b/drivers/isdn/hardware/mISDN/hfc_multi.h index 6d74951a1797..4aa6a8b41f50 100644 --- a/drivers/isdn/hardware/mISDN/hfc_multi.h +++ b/drivers/isdn/hardware/mISDN/hfc_multi.h @@ -9,6 +9,7 @@ #define DEBUG_HFCMULTI_MODE 0x00100000 #define DEBUG_HFCMULTI_MSG 0x00200000 #define DEBUG_HFCMULTI_STATE 0x00400000 +#define DEBUG_HFCMULTI_FILL 0x00800000 #define DEBUG_HFCMULTI_SYNC 0x01000000 #define DEBUG_HFCMULTI_DTMF 0x02000000 #define DEBUG_HFCMULTI_LOCK 0x80000000 @@ -166,6 +167,8 @@ struct hfc_multi { u_long chip; /* chip configuration */ int masterclk; /* port that provides master clock -1=off */ + unsigned char silence;/* silence byte */ + unsigned char silence_data[128];/* silence block */ int dtmf; /* flag that dtmf is currently in process */ int Flen; /* F-buffer size */ int Zlen; /* Z-buffer size (must be int for calculation)*/ diff --git a/drivers/isdn/hardware/mISDN/hfc_pci.h b/drivers/isdn/hardware/mISDN/hfc_pci.h index 5783d22a18fe..fd9241ab1802 100644 --- a/drivers/isdn/hardware/mISDN/hfc_pci.h +++ b/drivers/isdn/hardware/mISDN/hfc_pci.h @@ -27,6 +27,7 @@ */ #define HFCPCI_BTRANS_THRESHOLD 128 #define HFCPCI_BTRANS_MAX 256 +#define HFCPCI_FILLEMPTY 64 #define HFCPCI_BTRANS_THRESMASK 0x00 /* defines for PCI config */ diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index ff5ec3cbeb77..3fc2e9d95341 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -180,7 +180,6 @@ static int nt_t1_count[] = { 3840, 1920, 960, 480, 240, 120, 60, 30 }; #define CLKDEL_TE 0x0f /* CLKDEL in TE mode */ #define CLKDEL_NT 0x6c /* CLKDEL in NT mode (0x60 MUST be included!) */ -static u_char silence = 0xff; /* silence by LAW */ #define DIP_4S 0x1 /* DIP Switches for Beronet 1S/2S/4S cards */ #define DIP_8S 0x2 /* DIP Switches for Beronet 8S+ cards */ @@ -1975,6 +1974,17 @@ next_frame: return; /* no data */ } + /* "fill fifo if empty" feature */ + if (bch && test_bit(FLG_FILLEMPTY, &bch->Flags) + && !test_bit(FLG_HDLC, &bch->Flags) && z2 == z1) { + if (debug & DEBUG_HFCMULTI_FILL) + printk(KERN_DEBUG "%s: buffer empty, so we have " + "underrun\n", __func__); + /* fill buffer, to prevent future underrun */ + hc->write_fifo(hc, hc->silence_data, poll >> 1); + Zspace -= (poll >> 1); + } + /* if audio data and connected slot */ if (bch && (!test_bit(FLG_HDLC, &bch->Flags)) && (!*txpending) && slot_tx >= 0) { @@ -2011,7 +2021,6 @@ next_frame: __func__, hc->id + 1, ch, Zspace, z1, z2, ii-i, len-i, temp ? "HDLC":"TRANS"); - /* Have to prep the audio data */ hc->write_fifo(hc, d, ii - i); *idxp = ii; @@ -2050,7 +2059,7 @@ next_frame: * no more data at all. this prevents sending an undefined value. 
*/ if (bch && test_bit(FLG_TRANSPARENT, &bch->Flags)) - HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, silence); + HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, hc->silence); } @@ -2932,7 +2941,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_outb(hc, R_INC_RES_FIFO, V_RES_F); HFC_wait(hc); /* tx silence */ - HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, silence); + HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, hc->silence); HFC_outb(hc, R_SLOT, (((ch / 4) * 8) + ((ch % 4) * 4)) << 1); HFC_outb(hc, A_SL_CFG, 0x80 | 0x20 | (ch << 1)); @@ -2947,7 +2956,7 @@ mode_hfcmulti(struct hfc_multi *hc, int ch, int protocol, int slot_tx, HFC_outb(hc, R_INC_RES_FIFO, V_RES_F); HFC_wait(hc); /* tx silence */ - HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, silence); + HFC_outb_nodebug(hc, A_FIFO_DATA0_NOINC, hc->silence); /* enable RX fifo */ HFC_outb(hc, R_FIFO, (ch<<1)|1); HFC_wait(hc); @@ -3439,7 +3448,7 @@ channel_bctrl(struct bchannel *bch, struct mISDN_ctrl_req *cq) switch (cq->op) { case MISDN_CTRL_GETOP: cq->op = MISDN_CTRL_HFC_OP | MISDN_CTRL_HW_FEATURES_OP - | MISDN_CTRL_RX_OFF; + | MISDN_CTRL_RX_OFF | MISDN_CTRL_FILL_EMPTY; break; case MISDN_CTRL_RX_OFF: /* turn off / on rx stream */ hc->chan[bch->slot].rx_off = !!cq->p1; @@ -3454,6 +3463,12 @@ channel_bctrl(struct bchannel *bch, struct mISDN_ctrl_req *cq) printk(KERN_DEBUG "%s: RX_OFF request (nr=%d off=%d)\n", __func__, bch->nr, hc->chan[bch->slot].rx_off); break; + case MISDN_CTRL_FILL_EMPTY: /* fill fifo, if empty */ + test_and_set_bit(FLG_FILLEMPTY, &bch->Flags); + if (debug & DEBUG_HFCMULTI_MSG) + printk(KERN_DEBUG "%s: FILL_EMPTY request (nr=%d " + "off=%d)\n", __func__, bch->nr, !!cq->p1); + break; case MISDN_CTRL_HW_FEATURES: /* fill features structure */ if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG "%s: HW_FEATURE request\n", @@ -3970,6 +3985,7 @@ open_bchannel(struct hfc_multi *hc, struct dchannel *dch, } if (test_and_set_bit(FLG_OPEN, &bch->Flags)) return -EBUSY; /* b-channel can be only open once */ + test_and_clear_bit(FLG_FILLEMPTY, &bch->Flags); bch->ch.protocol = rq->protocol; hc->chan[ch].rx_off = 0; rq->ch = &bch->ch; @@ -4806,6 +4822,7 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) struct hfc_multi *hc; u_long flags; u_char dips = 0, pmj = 0; /* dip settings, port mode Jumpers */ + int i; if (HFC_cnt >= MAX_CARDS) { printk(KERN_ERR "too many cards (max=%d).\n", @@ -4839,11 +4856,11 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) hc->id = HFC_cnt; hc->pcm = pcm[HFC_cnt]; hc->io_mode = iomode[HFC_cnt]; - if (dslot[HFC_cnt] < 0) { + if (dslot[HFC_cnt] < 0 && hc->type == 1) { hc->dslot = 0; printk(KERN_INFO "HFC-E1 card has disabled D-channel, but " "31 B-channels\n"); - } if (dslot[HFC_cnt] > 0 && dslot[HFC_cnt] < 32) { + } if (dslot[HFC_cnt] > 0 && dslot[HFC_cnt] < 32 && hc->type == 1) { hc->dslot = dslot[HFC_cnt]; printk(KERN_INFO "HFC-E1 card has alternating D-channel on " "time slot %d\n", dslot[HFC_cnt]); @@ -4854,9 +4871,17 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) hc->masterclk = -1; if (type[HFC_cnt] & 0x100) { test_and_set_bit(HFC_CHIP_ULAW, &hc->chip); - silence = 0xff; /* ulaw silence */ + hc->silence = 0xff; /* ulaw silence */ } else - silence = 0x2a; /* alaw silence */ + hc->silence = 0x2a; /* alaw silence */ + if ((poll >> 1) > sizeof(hc->silence_data)) { + printk(KERN_ERR "HFCMULTI error: silence_data too small, " + "please fix\n"); + return -EINVAL; + } + for (i = 0; i < (poll >> 1); i++) + hc->silence_data[i] = hc->silence; 
+ if (!(type[HFC_cnt] & 0x200)) test_and_set_bit(HFC_CHIP_DTMF, &hc->chip); diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index cd8302af40eb..80c356e5dbe1 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -751,6 +751,36 @@ hfcpci_fill_fifo(struct bchannel *bch) /* fcnt contains available bytes in fifo */ fcnt = B_FIFO_SIZE - fcnt; /* remaining bytes to send (bytes in fifo) */ + + /* "fill fifo if empty" feature */ + if (test_bit(FLG_FILLEMPTY, &bch->Flags) && !fcnt) { + /* printk(KERN_DEBUG "%s: buffer empty, so we have " + "underrun\n", __func__); */ + /* fill buffer, to prevent future underrun */ + count = HFCPCI_FILLEMPTY; + new_z1 = le16_to_cpu(*z1t) + count; + /* new buffer Position */ + if (new_z1 >= (B_FIFO_SIZE + B_SUB_VAL)) + new_z1 -= B_FIFO_SIZE; /* buffer wrap */ + dst = bdata + (le16_to_cpu(*z1t) - B_SUB_VAL); + maxlen = (B_FIFO_SIZE + B_SUB_VAL) - le16_to_cpu(*z1t); + /* end of fifo */ + if (bch->debug & DEBUG_HW_BFIFO) + printk(KERN_DEBUG "hfcpci_FFt fillempty " + "fcnt(%d) maxl(%d) nz1(%x) dst(%p)\n", + fcnt, maxlen, new_z1, dst); + fcnt += count; + if (maxlen > count) + maxlen = count; /* limit size */ + memset(dst, 0x2a, maxlen); /* first copy */ + count -= maxlen; /* remaining bytes */ + if (count) { + dst = bdata; /* start of buffer */ + memset(dst, 0x2a, count); + } + *z1t = cpu_to_le16(new_z1); /* now send data */ + } + next_t_frame: count = bch->tx_skb->len - bch->tx_idx; /* maximum fill shall be HFCPCI_BTRANS_MAX */ @@ -1481,11 +1511,17 @@ deactivate_bchannel(struct bchannel *bch) static int channel_bctrl(struct bchannel *bch, struct mISDN_ctrl_req *cq) { - int ret = 0; + int ret = 0; switch (cq->op) { case MISDN_CTRL_GETOP: - cq->op = 0; + cq->op = MISDN_CTRL_FILL_EMPTY; + break; + case MISDN_CTRL_FILL_EMPTY: /* fill fifo, if empty */ + test_and_set_bit(FLG_FILLEMPTY, &bch->Flags); + if (debug & DEBUG_HW_OPEN) + printk(KERN_DEBUG "%s: FILL_EMPTY request (nr=%d " + "off=%d)\n", __func__, bch->nr, !!cq->p1); break; default: printk(KERN_WARNING "%s: unknown Op %x\n", __func__, cq->op); @@ -1903,6 +1939,7 @@ open_bchannel(struct hfc_pci *hc, struct channel_req *rq) bch = &hc->bch[rq->adr.channel - 1]; if (test_and_set_bit(FLG_OPEN, &bch->Flags)) return -EBUSY; /* b-channel can be only open once */ + test_and_clear_bit(FLG_FILLEMPTY, &bch->Flags); bch->ch.protocol = rq->protocol; rq->ch = &bch->ch; /* TODO: E-channel */ if (!try_module_get(THIS_MODULE)) diff --git a/drivers/isdn/mISDN/dsp.h b/drivers/isdn/mISDN/dsp.h index 6c3fed6b8d4f..ded948069335 100644 --- a/drivers/isdn/mISDN/dsp.h +++ b/drivers/isdn/mISDN/dsp.h @@ -198,6 +198,7 @@ struct dsp { /* hardware stuff */ struct dsp_features features; int features_rx_off; /* set if rx_off is featured */ + int features_fill_empty; /* set if fill_empty is featured */ int pcm_slot_rx; /* current PCM slot (or -1) */ int pcm_bank_rx; int pcm_slot_tx; diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index c884511e2d49..fc8ea41ae6a2 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -1168,11 +1168,18 @@ dsp_cmx_receive(struct dsp *dsp, struct sk_buff *skb) dsp->rx_init = 0; if (dsp->features.unordered) { dsp->rx_R = (hh->id & CMX_BUFF_MASK); - dsp->rx_W = (dsp->rx_R + dsp->cmx_delay) - & CMX_BUFF_MASK; + if (dsp->cmx_delay) + dsp->rx_W = (dsp->rx_R + dsp->cmx_delay) + & CMX_BUFF_MASK; + else + dsp->rx_W = (dsp->rx_R + (dsp_poll >> 1)) + & CMX_BUFF_MASK; } else { dsp->rx_R = 0; - dsp->rx_W 
= dsp->cmx_delay; + if (dsp->cmx_delay) + dsp->rx_W = dsp->cmx_delay; + else + dsp->rx_W = dsp_poll >> 1; } } /* if frame contains time code, write directly */ @@ -1190,14 +1197,20 @@ dsp_cmx_receive(struct dsp *dsp, struct sk_buff *skb) "cmx_receive(dsp=%lx): UNDERRUN (or overrun the " "maximum delay), adjusting read pointer! " "(inst %s)\n", (u_long)dsp, dsp->name); - /* flush buffer */ + /* flush rx buffer and set delay to dsp_poll / 2 */ if (dsp->features.unordered) { dsp->rx_R = (hh->id & CMX_BUFF_MASK); - dsp->rx_W = (dsp->rx_R + dsp->cmx_delay) - & CMX_BUFF_MASK; + if (dsp->cmx_delay) + dsp->rx_W = (dsp->rx_R + dsp->cmx_delay) + & CMX_BUFF_MASK; + dsp->rx_W = (dsp->rx_R + (dsp_poll >> 1)) + & CMX_BUFF_MASK; } else { dsp->rx_R = 0; - dsp->rx_W = dsp->cmx_delay; + if (dsp->cmx_delay) + dsp->rx_W = dsp->cmx_delay; + else + dsp->rx_W = dsp_poll >> 1; } memset(dsp->rx_buff, dsp_silence, sizeof(dsp->rx_buff)); } @@ -1360,8 +1373,11 @@ dsp_cmx_send_member(struct dsp *dsp, int len, s32 *c, int members) t = (t+1) & CMX_BUFF_MASK; r = (r+1) & CMX_BUFF_MASK; } - if (r != rr) + if (r != rr) { + printk(KERN_DEBUG "%s: buffer empty\n", + __func__); memset(d, dsp_silence, (rr-r)&CMX_BUFF_MASK); + } /* -> if echo is enabled */ } else { /* @@ -1704,9 +1720,10 @@ dsp_cmx_send(void *arg) } /* * remove rx_delay only if we have delay AND we - * have not preset cmx_delay + * have not preset cmx_delay AND + * the delay is greater dsp_poll */ - if (delay && !dsp->cmx_delay) { + if (delay > dsp_poll && !dsp->cmx_delay) { if (dsp_debug & DEBUG_DSP_CMX) printk(KERN_DEBUG "%s lowest rx_delay of %d bytes for" @@ -1714,7 +1731,8 @@ dsp_cmx_send(void *arg) __func__, delay, dsp->name); r = dsp->rx_R; - rr = (r + delay) & CMX_BUFF_MASK; + rr = (r + delay - (dsp_poll >> 1)) + & CMX_BUFF_MASK; /* delete rx-data */ while (r != rr) { p[r] = dsp_silence; @@ -1736,7 +1754,7 @@ dsp_cmx_send(void *arg) * remove delay only if we have delay AND we * have enabled tx_dejitter */ - if (delay && dsp->tx_dejitter) { + if (delay > dsp_poll && dsp->tx_dejitter) { if (dsp_debug & DEBUG_DSP_CMX) printk(KERN_DEBUG "%s lowest tx_delay of %d bytes for" @@ -1744,7 +1762,8 @@ dsp_cmx_send(void *arg) __func__, delay, dsp->name); r = dsp->tx_R; - rr = (r + delay) & CMX_BUFF_MASK; + rr = (r + delay - (dsp_poll >> 1)) + & CMX_BUFF_MASK; /* delete tx-data */ while (r != rr) { q[r] = dsp_silence; @@ -1797,14 +1816,13 @@ dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb) ww = dsp->tx_R; p = dsp->tx_buff; d = skb->data; - space = ww-w; - if (space <= 0) - space += CMX_BUFF_SIZE; + space = (ww - w - 1) & CMX_BUFF_MASK; /* write-pointer should not overrun nor reach read pointer */ - if (space-1 < skb->len) + if (space < skb->len) { /* write to the space we have left */ - ww = (ww - 1) & CMX_BUFF_MASK; - else + ww = (ww - 1) & CMX_BUFF_MASK; /* end one byte prior tx_R */ + printk(KERN_DEBUG "%s: buffer overflow\n", __func__); + } else /* write until all byte are copied */ ww = (w + skb->len) & CMX_BUFF_MASK; dsp->tx_W = ww; diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c index 1dc21d803410..1d504ba954f5 100644 --- a/drivers/isdn/mISDN/dsp_core.c +++ b/drivers/isdn/mISDN/dsp_core.c @@ -191,6 +191,8 @@ dsp_rx_off_member(struct dsp *dsp) struct mISDN_ctrl_req cq; int rx_off = 1; + memset(&cq, 0, sizeof(cq)); + if (!dsp->features_rx_off) return; @@ -249,6 +251,32 @@ dsp_rx_off(struct dsp *dsp) } } +/* enable "fill empty" feature */ +static void +dsp_fill_empty(struct dsp *dsp) +{ + struct mISDN_ctrl_req cq; + + memset(&cq, 
0, sizeof(cq)); + + if (!dsp->ch.peer) { + if (dsp_debug & DEBUG_DSP_CORE) + printk(KERN_DEBUG "%s: no peer, no fill_empty\n", + __func__); + return; + } + cq.op = MISDN_CTRL_FILL_EMPTY; + cq.p1 = 1; + if (dsp->ch.peer->ctrl(dsp->ch.peer, CONTROL_CHANNEL, &cq)) { + printk(KERN_DEBUG "%s: CONTROL_CHANNEL failed\n", + __func__); + return; + } + if (dsp_debug & DEBUG_DSP_CORE) + printk(KERN_DEBUG "%s: %s set fill_empty = 1\n", + __func__, dsp->name); +} + static int dsp_control_req(struct dsp *dsp, struct mISDNhead *hh, struct sk_buff *skb) { @@ -593,8 +621,6 @@ get_features(struct mISDNchannel *ch) struct dsp *dsp = container_of(ch, struct dsp, ch); struct mISDN_ctrl_req cq; - if (dsp_options & DSP_OPT_NOHARDWARE) - return; if (!ch->peer) { if (dsp_debug & DEBUG_DSP_CORE) printk(KERN_DEBUG "%s: no peer, no features\n", @@ -610,6 +636,10 @@ get_features(struct mISDNchannel *ch) } if (cq.op & MISDN_CTRL_RX_OFF) dsp->features_rx_off = 1; + if (cq.op & MISDN_CTRL_FILL_EMPTY) + dsp->features_fill_empty = 1; + if (dsp_options & DSP_OPT_NOHARDWARE) + return; if ((cq.op & MISDN_CTRL_HW_FEATURES_OP)) { cq.op = MISDN_CTRL_HW_FEATURES; *((u_long *)&cq.p1) = (u_long)&dsp->features; @@ -865,6 +895,9 @@ dsp_function(struct mISDNchannel *ch, struct sk_buff *skb) if (dsp->dtmf.hardware || dsp->dtmf.software) dsp_dtmf_goertzel_init(dsp); get_features(ch); + /* enable fill_empty feature */ + if (dsp->features_fill_empty) + dsp_fill_empty(dsp); /* send ph_activate */ hh->prim = PH_ACTIVATE_REQ; if (ch->peer) diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h index e794dfb87504..9384b92dfc65 100644 --- a/include/linux/mISDNhw.h +++ b/include/linux/mISDNhw.h @@ -57,20 +57,21 @@ #define FLG_L2DATA 14 /* channel use L2 DATA primitivs */ #define FLG_ORIGIN 15 /* channel is on origin site */ /* channel specific stuff */ +#define FLG_FILLEMPTY 16 /* fill fifo on first frame (empty) */ /* arcofi specific */ -#define FLG_ARCOFI_TIMER 16 -#define FLG_ARCOFI_ERROR 17 +#define FLG_ARCOFI_TIMER 17 +#define FLG_ARCOFI_ERROR 18 /* isar specific */ -#define FLG_INITIALIZED 16 -#define FLG_DLEETX 17 -#define FLG_LASTDLE 18 -#define FLG_FIRST 19 -#define FLG_LASTDATA 20 -#define FLG_NMD_DATA 21 -#define FLG_FTI_RUN 22 -#define FLG_LL_OK 23 -#define FLG_LL_CONN 24 -#define FLG_DTMFSEND 25 +#define FLG_INITIALIZED 17 +#define FLG_DLEETX 18 +#define FLG_LASTDLE 19 +#define FLG_FIRST 20 +#define FLG_LASTDATA 21 +#define FLG_NMD_DATA 22 +#define FLG_FTI_RUN 23 +#define FLG_LL_OK 24 +#define FLG_LL_CONN 25 +#define FLG_DTMFSEND 26 /* workq events */ #define FLG_RECVQUEUE 30 diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 8f2d60da04e7..74c903cd7a0a 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -312,6 +312,7 @@ clear_channelmap(u_int nr, u_char *map) #define MISDN_CTRL_SETPEER 0x0040 #define MISDN_CTRL_UNSETPEER 0x0080 #define MISDN_CTRL_RX_OFF 0x0100 +#define MISDN_CTRL_FILL_EMPTY 0x0200 #define MISDN_CTRL_HW_FEATURES_OP 0x2000 #define MISDN_CTRL_HW_FEATURES 0x2001 #define MISDN_CTRL_HFC_OP 0x4000 -- cgit v1.2.3 From 8b6015f736125050722dbe59c4f943e78cd626f0 Mon Sep 17 00:00:00 2001 From: Matthias Urlichs Date: Tue, 12 Aug 2008 10:12:09 +0200 Subject: mISDN: Added an ioctl to change the device name To get persistent device names with hotplug we need to rename devices sometime. 
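As a usage illustration (a sketch, not part of the patch): once the ioctl below exists, a management tool holding an open mISDN base socket could rename a device roughly like this, assuming <linux/mISDNif.h> exports the new structure and ioctl number to userspace:

/* Sketch: rename mISDN device 'id' via the new IMSETDEVNAME ioctl.
 * 'sock' must be an open mISDN base socket, e.g. from
 * socket(PF_ISDN, SOCK_RAW, ISDN_P_BASE). */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/mISDNif.h>

static int misdn_rename(int sock, unsigned int id, const char *newname)
{
	struct mISDN_devrename dn;

	memset(&dn, 0, sizeof(dn));
	dn.id = id;
	/* The kernel truncates to the MISDN_MAX_IDLEN limit on its side. */
	strncpy(dn.name, newname, sizeof(dn.name) - 1);
	return ioctl(sock, IMSETDEVNAME, &dn);
}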
Signed-off-by: Matthias Urlichs Signed-off-by: Karsten Keil --- drivers/isdn/mISDN/socket.c | 15 +++++++++++++++ include/linux/mISDNif.h | 31 +++++++++++++++++++------------ 2 files changed, 34 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 37a2de18cfd0..98fa6c4a4bd6 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -645,6 +645,21 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } else err = -ENODEV; break; + case IMSETDEVNAME: + { + struct mISDN_devrename dn; + if (copy_from_user(&dn, (void __user *)arg, + sizeof(dn))) { + err = -EFAULT; + break; + } + dev = get_mdevice(dn.id); + if (dev) + strlcpy(dev->name, dn.name, MISDN_MAX_IDLEN); + else + err = -ENODEV; + } + break; default: err = -EINVAL; } diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 74c903cd7a0a..be09476ed854 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -36,8 +36,8 @@ * - should be incremented on every checkin */ #define MISDN_MAJOR_VERSION 1 -#define MISDN_MINOR_VERSION 0 -#define MISDN_RELEASE 19 +#define MISDN_MINOR_VERSION 1 +#define MISDN_RELEASE 20 /* primitives for information exchange * generell format @@ -255,16 +255,6 @@ struct sockaddr_mISDN { unsigned char tei; }; -/* timer device ioctl */ -#define IMADDTIMER _IOR('I', 64, int) -#define IMDELTIMER _IOR('I', 65, int) -/* socket ioctls */ -#define IMGETVERSION _IOR('I', 66, int) -#define IMGETCOUNT _IOR('I', 67, int) -#define IMGETDEVINFO _IOR('I', 68, int) -#define IMCTRLREQ _IOR('I', 69, int) -#define IMCLEAR_L2 _IOR('I', 70, int) - struct mISDNversion { unsigned char major; unsigned char minor; @@ -281,6 +271,23 @@ struct mISDN_devinfo { char name[MISDN_MAX_IDLEN]; }; +struct mISDN_devrename { + u_int id; + char name[MISDN_MAX_IDLEN]; /* new name */ +}; + +/* timer device ioctl */ +#define IMADDTIMER _IOR('I', 64, int) +#define IMDELTIMER _IOR('I', 65, int) + +/* socket ioctls */ +#define IMGETVERSION _IOR('I', 66, int) +#define IMGETCOUNT _IOR('I', 67, int) +#define IMGETDEVINFO _IOR('I', 68, int) +#define IMCTRLREQ _IOR('I', 69, int) +#define IMCLEAR_L2 _IOR('I', 70, int) +#define IMSETDEVNAME _IOR('I', 71, struct mISDN_devrename) + static inline int test_channelmap(u_int nr, u_char *map) { -- cgit v1.2.3 From 837468d135dcc49cdabc9fa92fc9550479f60704 Mon Sep 17 00:00:00 2001 From: Matthias Urlichs Date: Sat, 16 Aug 2008 00:04:33 +0200 Subject: mISDN: Use struct device name field struct device already has a 'name' member, use it. 
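The conversion pattern is mechanical and worth seeing once in isolation: keep no private copy of the name, set it through dev_set_name() and read it back through dev_name(). A sketch (illustrative driver structure; the driver-core helpers are the real ones from <linux/device.h>):

/* Sketch of the pattern applied throughout this patch. */
#include <linux/device.h>

struct mydev {
	struct device dev;
	unsigned int id;
};

static void mydev_set_name(struct mydev *d, const char *name)
{
	if (name && name[0])
		dev_set_name(&d->dev, "%s", name);	 /* caller-chosen */
	else
		dev_set_name(&d->dev, "mISDN%d", d->id); /* default */
}

static void mydev_log(struct mydev *d)
{
	/* No cached copy: always read the name back. */
	printk(KERN_DEBUG "device %s (id %u)\n", dev_name(&d->dev), d->id);
}

With the string owned by the driver core, renaming through device_rename() works for free, which is exactly what the previous patch's IMSETDEVNAME handler relies on.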
Signed-off-by: Matthias Urlichs Signed-off-by: Karsten Keil --- drivers/isdn/mISDN/core.c | 8 ++++---- drivers/isdn/mISDN/layer1.c | 2 +- drivers/isdn/mISDN/socket.c | 6 +++--- drivers/isdn/mISDN/stack.c | 43 ++++++++++++++++++++++++------------------- drivers/isdn/mISDN/tei.c | 6 +++--- include/linux/mISDNif.h | 1 - 6 files changed, 35 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index 751665c448d0..43fd97b0fe42 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -82,12 +82,12 @@ mISDN_register_device(struct mISDNdevice *dev, char *name) if (dev->id < 0) return -EBUSY; if (name && name[0]) - strcpy(dev->name, name); + dev_set_name(&dev->dev, "%s", name); else - sprintf(dev->name, "mISDN%d", dev->id); + dev_set_name(&dev->dev, "mISDN%d", dev->id); if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_register %s %d\n", - dev->name, dev->id); + dev_name(&dev->dev), dev->id); err = create_stack(dev); if (err) return err; @@ -104,7 +104,7 @@ mISDN_unregister_device(struct mISDNdevice *dev) { if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_unregister %s %d\n", - dev->name, dev->id); + dev_name(&dev->dev), dev->id); write_lock_irqsave(&device_lock, flags); list_del(&dev->D.list); write_unlock_irqrestore(&device_lock, flags); diff --git a/drivers/isdn/mISDN/layer1.c b/drivers/isdn/mISDN/layer1.c index b73e952d12cf..e826eeb1ecec 100644 --- a/drivers/isdn/mISDN/layer1.c +++ b/drivers/isdn/mISDN/layer1.c @@ -101,7 +101,7 @@ l1m_debug(struct FsmInst *fi, char *fmt, ...) va_list va; va_start(va, fmt); - printk(KERN_DEBUG "%s: ", l1->dch->dev.name); + printk(KERN_DEBUG "%s: ", dev_name(&l1->dch->dev.dev)); vprintk(fmt, va); printk("\n"); va_end(va); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 98fa6c4a4bd6..2f6d6e88ff2c 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -381,7 +381,7 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; - strcpy(di.name, dev->name); + strcpy(di.name, dev_name(&dev->dev)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else @@ -639,7 +639,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; - strcpy(di.name, dev->name); + strcpy(di.name, dev_name(&dev->dev)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else @@ -655,7 +655,7 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } dev = get_mdevice(dn.id); if (dev) - strlcpy(dev->name, dn.name, MISDN_MAX_IDLEN); + err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index d55b14ae4e99..9b9fab47cb2b 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -172,7 +172,8 @@ send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", - __func__, st->dev->name, hh->prim, hh->id); + __func__, dev_name(&st->dev->dev), hh->prim, + hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); @@ -181,11 +182,12 @@ send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", - __func__, st->dev->name, hh->prim, 
hh->id); + __func__, dev_name(&st->dev->dev), hh->prim, + hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", - __func__, st->dev->name, hh->prim); + __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } @@ -209,7 +211,8 @@ mISDNStackd(void *data) unlock_kernel(); #endif if (*debug & DEBUG_MSG_THREAD) - printk(KERN_DEBUG "mISDNStackd %s started\n", st->dev->name); + printk(KERN_DEBUG "mISDNStackd %s started\n", + dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); @@ -245,7 +248,7 @@ mISDNStackd(void *data) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", - __func__, st->dev->name, + __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); @@ -288,7 +291,7 @@ mISDNStackd(void *data) mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", - __func__, st->dev->name, st->status); + __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); @@ -303,15 +306,16 @@ mISDNStackd(void *data) #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", - st->dev->name, st->msg_cnt, st->sleep_cnt, st->stopped_cnt); + dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, + st->stopped_cnt); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%ld) stime(%ld)\n", - st->dev->name, st->thread->utime, st->thread->stime); + dev_name(&st->dev->dev), st->thread->utime, st->thread->stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", - st->dev->name, st->thread->nvcsw, st->thread->nivcsw); + dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", - st->dev->name); + dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); @@ -401,15 +405,16 @@ create_stack(struct mISDNdevice *dev) newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) - printk(KERN_DEBUG "%s: st(%s)\n", __func__, newst->dev->name); + printk(KERN_DEBUG "%s: st(%s)\n", __func__, + dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", - newst->dev->name); + dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", - newst->dev->name, err); + dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else @@ -428,8 +433,8 @@ connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", - __func__, dev->name, protocol, adr->dev, adr->channel, - adr->sapi, adr->tei); + __func__, dev_name(&dev->dev), protocol, adr->dev, + adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: @@ -473,7 +478,7 @@ connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", - __func__, dev->name, protocol, + __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; @@ -529,7 +534,7 @@ create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, if (*debug & 
DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", - __func__, dev->name, protocol, + __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; @@ -590,7 +595,7 @@ delete_channel(struct mISDNchannel *ch) } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, - ch->st->dev->name, ch->protocol); + dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); @@ -643,7 +648,7 @@ delete_stack(struct mISDNdevice *dev) if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, - st->dev->name); + dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c index 5c43d19e7c11..b452dead8fd0 100644 --- a/drivers/isdn/mISDN/tei.c +++ b/drivers/isdn/mISDN/tei.c @@ -968,9 +968,9 @@ create_teimgr(struct manager *mgr, struct channel_req *crq) if (*debug & DEBUG_L2_TEI) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", - __func__, mgr->ch.st->dev->name, crq->protocol, - crq->adr.dev, crq->adr.channel, crq->adr.sapi, - crq->adr.tei); + __func__, dev_name(&mgr->ch.st->dev->dev), + crq->protocol, crq->adr.dev, crq->adr.channel, + crq->adr.sapi, crq->adr.tei); if (crq->adr.sapi != 0) /* not supported yet */ return -EINVAL; if (crq->adr.tei > GROUP_TEI) diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index be09476ed854..a59febb6143a 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -431,7 +431,6 @@ struct mISDN_sock { struct mISDNdevice { struct mISDNchannel D; u_int id; - char name[MISDN_MAX_IDLEN]; u_int Dprotocols; u_int Bprotocols; u_int nrbchan; -- cgit v1.2.3 From 1f28fa19d34c0d9186f274e61e4b3dcfc6428c5c Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Wed, 3 Sep 2008 15:17:45 +0200 Subject: mISDN: Add E-Channel logging features New prim PH_DATA_E_IND. - all E-ch frames are indicated by recv_Echannel(), which pushes E-Channel frames into dch's rqueue - if dchannel is opened with channel nr 0, no E-Channel logging is requested - if dchannel is opened with channel nr 1, E-Channel logging is requested. 
if layer1 does not support that, -EINVAL in return is appropriate Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil --- drivers/isdn/mISDN/hwchannel.c | 19 +++++++++++++++++++ drivers/isdn/mISDN/stack.c | 2 +- include/linux/mISDNhw.h | 1 + include/linux/mISDNif.h | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/isdn/mISDN/hwchannel.c b/drivers/isdn/mISDN/hwchannel.c index 535ceacc05b9..ab1168a110ae 100644 --- a/drivers/isdn/mISDN/hwchannel.c +++ b/drivers/isdn/mISDN/hwchannel.c @@ -165,6 +165,25 @@ recv_Dchannel(struct dchannel *dch) } EXPORT_SYMBOL(recv_Dchannel); +void +recv_Echannel(struct dchannel *ech, struct dchannel *dch) +{ + struct mISDNhead *hh; + + if (ech->rx_skb->len < 2) { /* at least 2 for sapi / tei */ + dev_kfree_skb(ech->rx_skb); + ech->rx_skb = NULL; + return; + } + hh = mISDN_HEAD_P(ech->rx_skb); + hh->prim = PH_DATA_E_IND; + hh->id = get_sapi_tei(ech->rx_skb->data); + skb_queue_tail(&dch->rqueue, ech->rx_skb); + ech->rx_skb = NULL; + schedule_event(dch, FLG_RECVQUEUE); +} +EXPORT_SYMBOL(recv_Echannel); + void recv_Bchannel(struct bchannel *bch) { diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 9b9fab47cb2b..8cff570bb8df 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -453,7 +453,7 @@ connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; - rq.adr.channel = 0; + rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h index 9384b92dfc65..97ffdc1d3442 100644 --- a/include/linux/mISDNhw.h +++ b/include/linux/mISDNhw.h @@ -184,6 +184,7 @@ extern void queue_ch_frame(struct mISDNchannel *, u_int, extern int dchannel_senddata(struct dchannel *, struct sk_buff *); extern int bchannel_senddata(struct bchannel *, struct sk_buff *); extern void recv_Dchannel(struct dchannel *); +extern void recv_Echannel(struct dchannel *, struct dchannel *); extern void recv_Bchannel(struct bchannel *); extern void recv_Dchannel_skb(struct dchannel *, struct sk_buff *); extern void recv_Bchannel_skb(struct bchannel *, struct sk_buff *); diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index a59febb6143a..f75d596c5316 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -80,6 +80,7 @@ #define PH_DEACTIVATE_IND 0x0202 #define PH_DEACTIVATE_CNF 0x4202 #define PH_DATA_IND 0x2002 +#define PH_DATA_E_IND 0x3002 #define MPH_ACTIVATE_IND 0x0502 #define MPH_DEACTIVATE_IND 0x0602 #define MPH_INFORMATION_IND 0x0702 -- cgit v1.2.3 From 3bd69ad197a4a3d0085a5dc3b5796111bf176b12 Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Sat, 6 Sep 2008 09:03:46 +0200 Subject: mISDN: Add ISDN sample clock API to mISDN core Add ISDN sample clock API to mISDN core (new file clock.c) hfcmulti and mISDNdsp use clock API. 
Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil --- drivers/isdn/hardware/mISDN/hfc_multi.h | 3 + drivers/isdn/hardware/mISDN/hfcmulti.c | 45 +++++-- drivers/isdn/mISDN/Makefile | 2 +- drivers/isdn/mISDN/clock.c | 216 ++++++++++++++++++++++++++++++++ drivers/isdn/mISDN/core.c | 1 + drivers/isdn/mISDN/core.h | 2 + drivers/isdn/mISDN/dsp_cmx.c | 40 +++--- include/linux/mISDNif.h | 17 +++ 8 files changed, 294 insertions(+), 32 deletions(-) create mode 100644 drivers/isdn/mISDN/clock.c (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfc_multi.h b/drivers/isdn/hardware/mISDN/hfc_multi.h index 4aa6a8b41f50..663b77f578be 100644 --- a/drivers/isdn/hardware/mISDN/hfc_multi.h +++ b/drivers/isdn/hardware/mISDN/hfc_multi.h @@ -197,6 +197,9 @@ struct hfc_multi { spinlock_t lock; /* the lock */ + struct mISDNclock *iclock; /* isdn clock support */ + int iclock_on; + /* * the channel index is counted from 0, regardless where the channel * is located on the hfc-channel. diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 3fc2e9d95341..592db93105f9 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -133,6 +133,12 @@ * Give the value of the clock control register (A_ST_CLK_DLY) * of the S/T interfaces in TE mode. * This register is needed for the TBR3 certification, so don't change it. + * + * clock: + * NOTE: only one clockdelay_te value must be given once + * Selects interface with clock source for mISDN and applications. + * Set to card number starting with 1. Set to -1 to disable. + * By default, the first card is used as clock source. */ /* @@ -190,12 +196,13 @@ static int nt_t1_count[] = { 3840, 1920, 960, 480, 240, 120, 60, 30 }; */ static uint type[MAX_CARDS]; -static uint pcm[MAX_CARDS]; -static uint dslot[MAX_CARDS]; +static int pcm[MAX_CARDS]; +static int dslot[MAX_CARDS]; static uint iomode[MAX_CARDS]; static uint port[MAX_PORTS]; static uint debug; static uint poll; +static int clock; static uint timer; static uint clockdelay_te = CLKDEL_TE; static uint clockdelay_nt = CLKDEL_NT; @@ -207,12 +214,13 @@ MODULE_LICENSE("GPL"); MODULE_VERSION(HFC_MULTI_VERSION); module_param(debug, uint, S_IRUGO | S_IWUSR); module_param(poll, uint, S_IRUGO | S_IWUSR); +module_param(clock, int, S_IRUGO | S_IWUSR); module_param(timer, uint, S_IRUGO | S_IWUSR); module_param(clockdelay_te, uint, S_IRUGO | S_IWUSR); module_param(clockdelay_nt, uint, S_IRUGO | S_IWUSR); module_param_array(type, uint, NULL, S_IRUGO | S_IWUSR); -module_param_array(pcm, uint, NULL, S_IRUGO | S_IWUSR); -module_param_array(dslot, uint, NULL, S_IRUGO | S_IWUSR); +module_param_array(pcm, int, NULL, S_IRUGO | S_IWUSR); +module_param_array(dslot, int, NULL, S_IRUGO | S_IWUSR); module_param_array(iomode, uint, NULL, S_IRUGO | S_IWUSR); module_param_array(port, uint, NULL, S_IRUGO | S_IWUSR); @@ -2629,6 +2637,7 @@ hfcmulti_interrupt(int intno, void *dev_id) iqcnt = 0; } #endif + if (!r_irq_statech && !(status & (V_DTMF_STA | V_LOST_STA | V_EXT_IRQSTA | V_MISC_IRQSTA | V_FR_IRQSTA))) { @@ -2683,11 +2692,13 @@ hfcmulti_interrupt(int intno, void *dev_id) plxsd_checksync(hc, 0); } } - if (r_irq_misc & V_TI_IRQ) + if (r_irq_misc & V_TI_IRQ) { + if (hc->iclock_on) + mISDN_clock_update(hc->iclock, poll, NULL); handle_timer_irq(hc); + } if (r_irq_misc & V_DTMF_IRQ) { - /* -> DTMF IRQ */ hfcmulti_dtmf(hc); } if (r_irq_misc & V_IRQ_PROC) { @@ -4075,6 +4086,15 @@ hfcm_dctrl(struct mISDNchannel *ch, u_int cmd, void *arg) return 
err; } +static int +clockctl(void *priv, int enable) +{ + struct hfc_multi *hc = priv; + + hc->iclock_on = enable; + return 0; +} + /* * initialize the card */ @@ -4489,10 +4509,14 @@ release_card(struct hfc_multi *hc) printk(KERN_WARNING "%s: release card (%d) entered\n", __func__, hc->id); + /* unregister clock source */ + if (hc->iclock) + mISDN_unregister_clock(hc->iclock); + + /* disable irq */ spin_lock_irqsave(&hc->lock, flags); disable_hwirq(hc); spin_unlock_irqrestore(&hc->lock, flags); - udelay(1000); /* dimm leds */ @@ -5003,6 +5027,10 @@ hfcmulti_init(struct pci_dev *pdev, const struct pci_device_id *ent) list_add_tail(&hc->list, &HFClist); spin_unlock_irqrestore(&HFClock, flags); + /* use as clock source */ + if (clock == HFC_cnt + 1) + hc->iclock = mISDN_register_clock("HFCMulti", 0, clockctl, hc); + /* initialize hardware */ ret_err = init_card(hc); if (ret_err) { @@ -5273,6 +5301,9 @@ HFCmulti_init(void) } + if (!clock) + clock = 1; + err = pci_register_driver(&hfcmultipci_driver); if (err < 0) { printk(KERN_ERR "error registering pci driver: %x\n", err); diff --git a/drivers/isdn/mISDN/Makefile b/drivers/isdn/mISDN/Makefile index 1cb5e633cf75..0a6bd2a9e730 100644 --- a/drivers/isdn/mISDN/Makefile +++ b/drivers/isdn/mISDN/Makefile @@ -8,6 +8,6 @@ obj-$(CONFIG_MISDN_L1OIP) += l1oip.o # multi objects -mISDN_core-objs := core.o fsm.o socket.o hwchannel.o stack.o layer1.o layer2.o tei.o timerdev.o +mISDN_core-objs := core.o fsm.o socket.o clock.o hwchannel.o stack.o layer1.o layer2.o tei.o timerdev.o mISDN_dsp-objs := dsp_core.o dsp_cmx.o dsp_tones.o dsp_dtmf.o dsp_audio.o dsp_blowfish.o dsp_pipeline.o dsp_hwec.o l1oip-objs := l1oip_core.o l1oip_codec.o diff --git a/drivers/isdn/mISDN/clock.c b/drivers/isdn/mISDN/clock.c new file mode 100644 index 000000000000..44d9c3d5d33d --- /dev/null +++ b/drivers/isdn/mISDN/clock.c @@ -0,0 +1,216 @@ +/* + * Copyright 2008 by Andreas Eversberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Quick API description: + * + * A clock source registers using mISDN_register_clock: + * name = text string to name clock source + * priority = value to priorize clock sources (0 = default) + * ctl = callback function to enable/disable clock source + * priv = private pointer of clock source + * return = pointer to clock source structure; + * + * Note: Callback 'ctl' can be called before mISDN_register_clock returns! + * Also it can be called during mISDN_unregister_clock. + * + * A clock source calls mISDN_clock_update with given samples elapsed, if + * enabled. If function call is delayed, tv must be set with the timestamp + * of the actual event. + * + * A clock source unregisters using mISDN_unregister_clock. + * + * To get current clock, call mISDN_clock_get. The signed short value + * counts the number of samples since. Time since last clock event is added. 
+ * + */ + +#include +#include +#include +#include +#include "core.h" + +static u_int *debug; +static LIST_HEAD(iclock_list); +DEFINE_RWLOCK(iclock_lock); +u16 iclock_count; /* counter of last clock */ +struct timeval iclock_tv; /* time stamp of last clock */ +int iclock_tv_valid; /* already received one timestamp */ +struct mISDNclock *iclock_current; + +void +mISDN_init_clock(u_int *dp) +{ + debug = dp; + do_gettimeofday(&iclock_tv); +} + +static void +select_iclock(void) +{ + struct mISDNclock *iclock, *bestclock = NULL, *lastclock = NULL; + int pri = -128; + + list_for_each_entry(iclock, &iclock_list, list) { + if (iclock->pri > pri) { + pri = iclock->pri; + bestclock = iclock; + } + if (iclock_current == iclock) + lastclock = iclock; + } + if (lastclock && bestclock != lastclock) { + /* last used clock source still exists but changes, disable */ + if (*debug & DEBUG_CLOCK) + printk(KERN_DEBUG "Old clock source '%s' disable.\n", + lastclock->name); + lastclock->ctl(lastclock->priv, 0); + } + if (bestclock && bestclock != iclock_current) { + /* new clock source selected, enable */ + if (*debug & DEBUG_CLOCK) + printk(KERN_DEBUG "New clock source '%s' enable.\n", + bestclock->name); + bestclock->ctl(bestclock->priv, 1); + } + if (bestclock != iclock_current) { + /* no clock received yet */ + iclock_tv_valid = 0; + } + iclock_current = bestclock; +} + +struct mISDNclock +*mISDN_register_clock(char *name, int pri, clockctl_func_t *ctl, void *priv) +{ + u_long flags; + struct mISDNclock *iclock; + + if (*debug & (DEBUG_CORE | DEBUG_CLOCK)) + printk(KERN_DEBUG "%s: %s %d\n", __func__, name, pri); + iclock = kzalloc(sizeof(struct mISDNclock), GFP_ATOMIC); + if (!iclock) { + printk(KERN_ERR "%s: No memory for clock entry.\n", __func__); + return NULL; + } + strncpy(iclock->name, name, sizeof(iclock->name)-1); + iclock->pri = pri; + iclock->priv = priv; + iclock->ctl = ctl; + write_lock_irqsave(&iclock_lock, flags); + list_add_tail(&iclock->list, &iclock_list); + select_iclock(); + write_unlock_irqrestore(&iclock_lock, flags); + return iclock; +} +EXPORT_SYMBOL(mISDN_register_clock); + +void +mISDN_unregister_clock(struct mISDNclock *iclock) +{ + u_long flags; + + if (*debug & (DEBUG_CORE | DEBUG_CLOCK)) + printk(KERN_DEBUG "%s: %s %d\n", __func__, iclock->name, + iclock->pri); + write_lock_irqsave(&iclock_lock, flags); + if (iclock_current == iclock) { + if (*debug & DEBUG_CLOCK) + printk(KERN_DEBUG + "Current clock source '%s' unregisters.\n", + iclock->name); + iclock->ctl(iclock->priv, 0); + } + list_del(&iclock->list); + select_iclock(); + write_unlock_irqrestore(&iclock_lock, flags); +} +EXPORT_SYMBOL(mISDN_unregister_clock); + +void +mISDN_clock_update(struct mISDNclock *iclock, int samples, struct timeval *tv) +{ + u_long flags; + struct timeval tv_now; + time_t elapsed_sec; + int elapsed_8000th; + + write_lock_irqsave(&iclock_lock, flags); + if (iclock_current != iclock) { + printk(KERN_ERR "%s: '%s' sends us clock updates, but we do " + "listen to '%s'. This is a bug!\n", __func__, + iclock->name, + iclock_current ? 
iclock_current->name : "nothing"); + iclock->ctl(iclock->priv, 0); + write_unlock_irqrestore(&iclock_lock, flags); + return; + } + if (iclock_tv_valid) { + /* increment sample counter by given samples */ + iclock_count += samples; + if (tv) { /* tv must be set, if function call is delayed */ + iclock_tv.tv_sec = tv->tv_sec; + iclock_tv.tv_usec = tv->tv_usec; + } else + do_gettimeofday(&iclock_tv); + } else { + /* calc elapsed time by system clock */ + if (tv) { /* tv must be set, if function call is delayed */ + tv_now.tv_sec = tv->tv_sec; + tv_now.tv_usec = tv->tv_usec; + } else + do_gettimeofday(&tv_now); + elapsed_sec = tv_now.tv_sec - iclock_tv.tv_sec; + elapsed_8000th = (tv_now.tv_usec / 125) + - (iclock_tv.tv_usec / 125); + if (elapsed_8000th < 0) { + elapsed_sec -= 1; + elapsed_8000th += 8000; + } + /* add elapsed time to counter and set new timestamp */ + iclock_count += elapsed_sec * 8000 + elapsed_8000th; + iclock_tv.tv_sec = tv_now.tv_sec; + iclock_tv.tv_usec = tv_now.tv_usec; + iclock_tv_valid = 1; + if (*debug & DEBUG_CLOCK) + printk("Received first clock from source '%s'.\n", + iclock_current ? iclock_current->name : "nothing"); + } + write_unlock_irqrestore(&iclock_lock, flags); +} +EXPORT_SYMBOL(mISDN_clock_update); + +unsigned short +mISDN_clock_get(void) +{ + u_long flags; + struct timeval tv_now; + time_t elapsed_sec; + int elapsed_8000th; + u16 count; + + read_lock_irqsave(&iclock_lock, flags); + /* calc elapsed time by system clock */ + do_gettimeofday(&tv_now); + elapsed_sec = tv_now.tv_sec - iclock_tv.tv_sec; + elapsed_8000th = (tv_now.tv_usec / 125) - (iclock_tv.tv_usec / 125); + if (elapsed_8000th < 0) { + elapsed_sec -= 1; + elapsed_8000th += 8000; + } + /* add elapsed time to counter */ + count = iclock_count + elapsed_sec * 8000 + elapsed_8000th; + read_unlock_irqrestore(&iclock_lock, flags); + return count; +} +EXPORT_SYMBOL(mISDN_clock_get); + diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index 43fd97b0fe42..9116f54def2c 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -199,6 +199,7 @@ mISDNInit(void) printk(KERN_INFO "Modular ISDN core version %d.%d.%d\n", MISDN_MAJOR_VERSION, MISDN_MINOR_VERSION, MISDN_RELEASE); + mISDN_init_clock(&debug); mISDN_initstack(&debug); err = mISDN_inittimer(&debug); if (err) diff --git a/drivers/isdn/mISDN/core.h b/drivers/isdn/mISDN/core.h index 7da7233b4c1a..7ac2f81a812b 100644 --- a/drivers/isdn/mISDN/core.h +++ b/drivers/isdn/mISDN/core.h @@ -74,4 +74,6 @@ extern void l1_cleanup(void); extern int Isdnl2_Init(u_int *); extern void Isdnl2_cleanup(void); +extern void mISDN_init_clock(u_int *); + #endif diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index 04dbb407f7a0..efe4c7430e6d 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -1557,13 +1557,11 @@ send_packet: schedule_work(&dsp->workq); } -static u32 samplecount; +static u32 jittercount; /* counter for jitter check */; struct timer_list dsp_spl_tl; u32 dsp_spl_jiffies; /* calculate the next time to fire */ -#ifdef UNUSED -static u32 dsp_start_jiffies; /* jiffies at the time, the calculation begins */ -#endif /* UNUSED */ -static struct timeval dsp_start_tv; /* time at start of calculation */ +static u16 dsp_count; /* last sample count */ +static int dsp_count_valid ; /* if we have last sample count */ void dsp_cmx_send(void *arg) @@ -1577,38 +1575,32 @@ dsp_cmx_send(void *arg) int r, rr; int jittercheck = 0, delay, i; u_long flags; - struct timeval tv; - u32 elapsed; - s16 
length; + u16 length, count; /* lock */ spin_lock_irqsave(&dsp_lock, flags); - if (!dsp_start_tv.tv_sec) { - do_gettimeofday(&dsp_start_tv); + if (!dsp_count_valid) { + dsp_count = mISDN_clock_get(); length = dsp_poll; + dsp_count_valid = 1; } else { - do_gettimeofday(&tv); - elapsed = ((tv.tv_sec - dsp_start_tv.tv_sec) * 8000) - + ((s32)(tv.tv_usec / 125) - (dsp_start_tv.tv_usec / 125)); - dsp_start_tv.tv_sec = tv.tv_sec; - dsp_start_tv.tv_usec = tv.tv_usec; - length = elapsed; + count = mISDN_clock_get(); + length = count - dsp_count; + dsp_count = count; } if (length > MAX_POLL + 100) length = MAX_POLL + 100; -/* printk(KERN_DEBUG "len=%d dsp_count=0x%x.%04x dsp_poll_diff=0x%x.%04x\n", - length, dsp_count >> 16, dsp_count & 0xffff, dsp_poll_diff >> 16, - dsp_poll_diff & 0xffff); - */ + /* printk(KERN_DEBUG "len=%d dsp_count=0x%x\n", length, dsp_count); */ /* - * check if jitter needs to be checked - * (this is about every second = 8192 samples) + * check if jitter needs to be checked (this is every second) */ - samplecount += length; - if ((samplecount & 8191) < length) + jittercount += length; + if (jittercount >= 8000) { + jittercount -= 8000; jittercheck = 1; + } /* loop all members that do not require conference mixing */ list_for_each_entry(dsp, &dsp_ilist, list) { diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index f75d596c5316..364f1018f0d1 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -371,6 +371,7 @@ struct mISDN_ctrl_req { #define DEBUG_L2_TEI 0x00100000 #define DEBUG_L2_TEIFSM 0x00200000 #define DEBUG_TIMER 0x01000000 +#define DEBUG_CLOCK 0x02000000 #define mISDN_HEAD_P(s) ((struct mISDNhead *)&s->cb[0]) #define mISDN_HEAD_PRIM(s) (((struct mISDNhead *)&s->cb[0])->prim) @@ -384,6 +385,7 @@ struct mISDN_ctrl_req { struct mISDNchannel; struct mISDNdevice; struct mISDNstack; +struct mISDNclock; struct channel_req { u_int protocol; @@ -460,6 +462,16 @@ struct mISDNstack { #endif }; +typedef int (clockctl_func_t)(void *, int); + +struct mISDNclock { + struct list_head list; + char name[64]; + int pri; + clockctl_func_t *ctl; + void *priv; +}; + /* global alloc/queue functions */ static inline struct sk_buff * @@ -510,8 +522,13 @@ extern int mISDN_register_device(struct mISDNdevice *, char *name); extern void mISDN_unregister_device(struct mISDNdevice *); extern int mISDN_register_Bprotocol(struct Bprotocol *); extern void mISDN_unregister_Bprotocol(struct Bprotocol *); +extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, + void *); +extern void mISDN_unregister_clock(struct mISDNclock *); extern void set_channel_address(struct mISDNchannel *, u_int, u_int); +extern void mISDN_clock_update(struct mISDNclock *, int, struct timeval *); +extern unsigned short mISDN_clock_get(void); #endif /* __KERNEL__ */ #endif /* mISDNIF_H */ -- cgit v1.2.3 From 02282eee56b75a35e6bbc42cc34c9005eb1653f4 Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Mon, 8 Sep 2008 15:57:48 +0200 Subject: mISDN: Add ISDN_P_TE_UP0 / ISDN_P_NT_UP0 - new layer1 protocols for UP0 bus - helper #defines to test for TE/NT/S0/E1/UP0 Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil --- include/linux/mISDNif.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 364f1018f0d1..7f65aa0c1cc5 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -200,6 +200,18 @@ #define ISDN_P_NT_S0 0x02 #define ISDN_P_TE_E1 0x03 #define ISDN_P_NT_E1 0x04 
+#define ISDN_P_TE_UP0 0x05 +#define ISDN_P_NT_UP0 0x06 + +#define IS_ISDN_P_TE(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_TE_E1) || \ + (p == ISDN_P_TE_UP0)) +#define IS_ISDN_P_NT(p) ((p == ISDN_P_NT_S0) || (p == ISDN_P_NT_E1) || \ + (p == ISDN_P_NT_UP0)) +#define IS_ISDN_P_S0(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_NT_S0)) +#define IS_ISDN_P_E1(p) ((p == ISDN_P_TE_E1) || (p == ISDN_P_NT_E1)) +#define IS_ISDN_P_UP0(p) ((p == ISDN_P_TE_UP0) || (p == ISDN_P_NT_UP0)) + + #define ISDN_P_LAPD_TE 0x10 #define ISDN_P_LAPD_NT 0x11 -- cgit v1.2.3 From 1b4d33121f1d991f6ae226cc3333428ff87627bb Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Sun, 14 Sep 2008 12:30:18 +0200 Subject: mISDN: Fix deactivation, if peer IP is removed from l1oip instance. Added GETPEER operation. Socket now checks if device is already busy at a differen mode. Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil --- drivers/isdn/mISDN/l1oip_core.c | 22 +++++++++++++++++++++- drivers/isdn/mISDN/socket.c | 20 ++++++++++++++++++++ drivers/isdn/mISDN/stack.c | 18 ------------------ include/linux/mISDNif.h | 5 +++-- 4 files changed, 44 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index 0884dd6892f8..3ddcd2e09dc9 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -777,6 +777,8 @@ fail: static void l1oip_socket_close(struct l1oip *hc) { + struct dchannel *dch = hc->chan[hc->d_idx].dch; + /* kill thread */ if (hc->socket_thread) { if (debug & DEBUG_L1OIP_SOCKET) @@ -785,6 +787,16 @@ l1oip_socket_close(struct l1oip *hc) send_sig(SIGTERM, hc->socket_thread, 0); wait_for_completion(&hc->socket_complete); } + + /* if active, we send up a PH_DEACTIVATE and deactivate */ + if (test_bit(FLG_ACTIVE, &dch->Flags)) { + if (debug & (DEBUG_L1OIP_MSG|DEBUG_L1OIP_SOCKET)) + printk(KERN_DEBUG "%s: interface become deactivated " + "due to timeout\n", __func__); + test_and_clear_bit(FLG_ACTIVE, &dch->Flags); + _queue_data(&dch->dev.D, PH_DEACTIVATE_IND, MISDN_ID_ANY, 0, + NULL, GFP_ATOMIC); + } } static int @@ -944,7 +956,8 @@ channel_dctrl(struct dchannel *dch, struct mISDN_ctrl_req *cq) switch (cq->op) { case MISDN_CTRL_GETOP: - cq->op = MISDN_CTRL_SETPEER | MISDN_CTRL_UNSETPEER; + cq->op = MISDN_CTRL_SETPEER | MISDN_CTRL_UNSETPEER + | MISDN_CTRL_GETPEER; break; case MISDN_CTRL_SETPEER: hc->remoteip = (u32)cq->p1; @@ -964,6 +977,13 @@ channel_dctrl(struct dchannel *dch, struct mISDN_ctrl_req *cq) hc->remoteip = 0; l1oip_socket_open(hc); break; + case MISDN_CTRL_GETPEER: + if (debug & DEBUG_L1OIP_SOCKET) + printk(KERN_DEBUG "%s: getting ip address.\n", + __func__); + (u32)cq->p1 = hc->remoteip; + cq->p2 = hc->remoteport | (hc->localport << 16); + break; default: printk(KERN_WARNING "%s: unknown Op %x\n", __func__, cq->op); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 2f6d6e88ff2c..916569ca156d 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -460,6 +460,8 @@ data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; + struct hlist_node *node; + struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) @@ -480,6 +482,24 @@ data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = -ENODEV; goto done; } + + read_lock_bh(&data_sockets.lock); + sk_for_each(csk, node, &data_sockets.head) { + if (sk == csk) + continue; + if 
(_pms(csk)->dev != _pms(sk)->dev) + continue; + if (csk->sk_protocol >= ISDN_P_B_START) + continue; + if (IS_ISDN_P_TE(csk->sk_protocol) + == IS_ISDN_P_TE(sk->sk_protocol)) + continue; + read_unlock_bh(&data_sockets.lock); + err = -EBUSY; + goto done; + } + read_unlock_bh(&data_sockets.lock); + _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 8cff570bb8df..63afa8cf9e07 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -440,15 +440,6 @@ connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: -#ifdef PROTOCOL_CHECK - /* this should be enhanced */ - if (!list_empty(&dev->D.st->layer2) - && dev->D.protocol != protocol) - return -EBUSY; - if (!hlist_empty(&dev->D.st->l1sock.head) - && dev->D.protocol != protocol) - return -EBUSY; -#endif ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; @@ -546,15 +537,6 @@ create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; case ISDN_P_LAPD_TE: -#ifdef PROTOCOL_CHECK - /* this should be enhanced */ - if (!list_empty(&dev->D.st->layer2) - && dev->D.protocol != protocol) - return -EBUSY; - if (!hlist_empty(&dev->D.st->l1sock.head) - && dev->D.protocol != protocol) - return -EBUSY; -#endif ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 7f65aa0c1cc5..3f9988849f32 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -204,9 +204,9 @@ #define ISDN_P_NT_UP0 0x06 #define IS_ISDN_P_TE(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_TE_E1) || \ - (p == ISDN_P_TE_UP0)) + (p == ISDN_P_TE_UP0) || (p == ISDN_P_LAPD_TE)) #define IS_ISDN_P_NT(p) ((p == ISDN_P_NT_S0) || (p == ISDN_P_NT_E1) || \ - (p == ISDN_P_NT_UP0)) + (p == ISDN_P_NT_UP0) || (p == ISDN_P_LAPD_NT)) #define IS_ISDN_P_S0(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_NT_S0)) #define IS_ISDN_P_E1(p) ((p == ISDN_P_TE_E1) || (p == ISDN_P_NT_E1)) #define IS_ISDN_P_UP0(p) ((p == ISDN_P_TE_UP0) || (p == ISDN_P_NT_UP0)) @@ -333,6 +333,7 @@ clear_channelmap(u_int nr, u_char *map) #define MISDN_CTRL_UNSETPEER 0x0080 #define MISDN_CTRL_RX_OFF 0x0100 #define MISDN_CTRL_FILL_EMPTY 0x0200 +#define MISDN_CTRL_GETPEER 0x0400 #define MISDN_CTRL_HW_FEATURES_OP 0x2000 #define MISDN_CTRL_HW_FEATURES 0x2001 #define MISDN_CTRL_HFC_OP 0x4000 -- cgit v1.2.3 From b36b654a7e82308cea063cdf909a7f246105c2a3 Mon Sep 17 00:00:00 2001 From: Matthias Urlichs Date: Sat, 16 Aug 2008 00:09:24 +0200 Subject: mISDN: Create /sys/class/mISDN Create /sys/class/mISDN and implement functions to handle device renames. 
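With this change every registered device appears under /sys/class/mISDN/<name> carrying the read-only attributes defined in this patch (id, d_protocols, b_protocols, protocol, channelmap, nrbchan, name). A minimal userspace sketch of reading one of them; the attribute names come from the patch, while the device name hfc-pci.1 is only an assumption for illustration:

#include <stdio.h>

int main(void)
{
	char buf[80];
	/* device name is hypothetical; the path layout is standard sysfs */
	FILE *f = fopen("/sys/class/mISDN/hfc-pci.1/channelmap", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("channelmap: %s\n", buf);	/* one '1'/'0' per channel */
	fclose(f);
	return 0;
}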
Signed-Off-By: Matthias Urlichs Signed-off-by: Karsten Keil --- drivers/isdn/hardware/mISDN/hfcmulti.c | 6 +- drivers/isdn/hardware/mISDN/hfcpci.c | 100 ++++++------ drivers/isdn/mISDN/core.c | 271 ++++++++++++++++++++++++++------- drivers/isdn/mISDN/l1oip_core.c | 3 +- include/linux/mISDNif.h | 8 +- 5 files changed, 273 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index cd3f3994b7f8..97f4708b3879 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -4718,7 +4718,7 @@ init_e1_port(struct hfc_multi *hc, struct hm_map *m) } else hc->chan[hc->dslot].jitter = 2; /* default */ snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-e1.%d", HFC_cnt + 1); - ret = mISDN_register_device(&dch->dev, name); + ret = mISDN_register_device(&dch->dev, &hc->pci_dev->dev, name); if (ret) goto free_chan; hc->created[0] = 1; @@ -4826,9 +4826,9 @@ init_multi_port(struct hfc_multi *hc, int pt) test_and_set_bit(HFC_CFG_DIS_ECHANNEL, &hc->chan[i + 2].cfg); } - snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-%ds.%d/%d", + snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-%ds.%d-%d", hc->type, HFC_cnt + 1, pt + 1); - ret = mISDN_register_device(&dch->dev, name); + ret = mISDN_register_device(&dch->dev, &hc->pci_dev->dev, name); if (ret) goto free_chan; hc->created[pt] = 1; diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index cd5d26c9909f..8df12bf02af3 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -64,9 +64,6 @@ MODULE_LICENSE("GPL"); module_param(debug, uint, 0); module_param(poll, uint, S_IRUGO | S_IWUSR); -static LIST_HEAD(HFClist); -static DEFINE_RWLOCK(HFClock); - enum { HFC_CCD_2BD0, HFC_CCD_B000, @@ -136,7 +133,6 @@ struct hfcPCI_hw { struct hfc_pci { - struct list_head list; u_char subtype; u_char chanlimit; u_char initdone; @@ -1227,41 +1223,6 @@ hfcpci_int(int intno, void *dev_id) return IRQ_HANDLED; } -static void -hfcpci_softirq(void *arg) -{ - u_long flags; - struct bchannel *bch; - struct hfc_pci *hc; - - write_lock_irqsave(&HFClock, flags); - list_for_each_entry(hc, &HFClist, list) { - if (hc->hw.int_m2 & HFCPCI_IRQ_ENABLE) { - spin_lock(&hc->lock); - bch = Sel_BCS(hc, hc->hw.bswapped ? 2 : 1); - if (bch && bch->state == ISDN_P_B_RAW) { /* B1 rx&tx */ - main_rec_hfcpci(bch); - tx_birq(bch); - } - bch = Sel_BCS(hc, hc->hw.bswapped ? 1 : 2); - if (bch && bch->state == ISDN_P_B_RAW) { /* B2 rx&tx */ - main_rec_hfcpci(bch); - tx_birq(bch); - } - spin_unlock(&hc->lock); - } - } - write_unlock_irqrestore(&HFClock, flags); - - /* if next event would be in the past ... */ - if ((s32)(hfc_jiffies + tics - jiffies) <= 0) - hfc_jiffies = jiffies + 1; - else - hfc_jiffies += tics; - hfc_tl.expires = hfc_jiffies; - add_timer(&hfc_tl); -} - /* * timer callback for D-chan busy resolution. 
Currently no function */ @@ -2131,7 +2092,6 @@ release_card(struct hfc_pci *hc) { mISDN_freebchannel(&hc->bch[1]); mISDN_freebchannel(&hc->bch[0]); mISDN_freedchannel(&hc->dch); - list_del(&hc->list); pci_set_drvdata(hc->pdev, NULL); kfree(hc); } @@ -2141,7 +2101,6 @@ setup_card(struct hfc_pci *card) { int err = -EINVAL; u_int i; - u_long flags; char name[MISDN_MAX_IDLEN]; card->dch.debug = debug; @@ -2169,13 +2128,10 @@ setup_card(struct hfc_pci *card) if (err) goto error; snprintf(name, MISDN_MAX_IDLEN - 1, "hfc-pci.%d", HFC_cnt + 1); - err = mISDN_register_device(&card->dch.dev, name); + err = mISDN_register_device(&card->dch.dev, &card->pdev->dev, name); if (err) goto error; HFC_cnt++; - write_lock_irqsave(&HFClock, flags); - list_add_tail(&card->list, &HFClist); - write_unlock_irqrestore(&HFClock, flags); printk(KERN_INFO "HFC %d cards installed\n", HFC_cnt); return 0; error: @@ -2311,15 +2267,12 @@ static void __devexit hfc_remove_pci(struct pci_dev *pdev) { struct hfc_pci *card = pci_get_drvdata(pdev); - u_long flags; - if (card) { - write_lock_irqsave(&HFClock, flags); + if (card) release_card(card); - write_unlock_irqrestore(&HFClock, flags); - } else + else if (debug) - printk(KERN_WARNING "%s: drvdata allready removed\n", + printk(KERN_WARNING "%s: drvdata already removed\n", __func__); } @@ -2331,6 +2284,46 @@ static struct pci_driver hfc_driver = { .id_table = hfc_ids, }; +static int +_hfcpci_softirq(struct device *dev, void *arg) +{ + struct hfc_pci *hc = dev_get_drvdata(dev); + struct bchannel *bch; + if (hc == NULL) + return 0; + + if (hc->hw.int_m2 & HFCPCI_IRQ_ENABLE) { + spin_lock(&hc->lock); + bch = Sel_BCS(hc, hc->hw.bswapped ? 2 : 1); + if (bch && bch->state == ISDN_P_B_RAW) { /* B1 rx&tx */ + main_rec_hfcpci(bch); + tx_birq(bch); + } + bch = Sel_BCS(hc, hc->hw.bswapped ? 1 : 2); + if (bch && bch->state == ISDN_P_B_RAW) { /* B2 rx&tx */ + main_rec_hfcpci(bch); + tx_birq(bch); + } + spin_unlock(&hc->lock); + } + return 0; +} + +static void +hfcpci_softirq(void *arg) +{ + (void) driver_for_each_device(&hfc_driver.driver, NULL, arg, + _hfcpci_softirq); + + /* if next event would be in the past ... 
*/ + if ((s32)(hfc_jiffies + tics - jiffies) <= 0) + hfc_jiffies = jiffies + 1; + else + hfc_jiffies += tics; + hfc_tl.expires = hfc_jiffies; + add_timer(&hfc_tl); +} + static int __init HFC_init(void) { @@ -2375,14 +2368,9 @@ HFC_init(void) static void __exit HFC_cleanup(void) { - struct hfc_pci *card, *next; - if (timer_pending(&hfc_tl)) del_timer(&hfc_tl); - list_for_each_entry_safe(card, next, &HFClist, list) { - release_card(card); - } pci_unregister_driver(&hfc_driver); } diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index 9116f54def2c..9426c9827e47 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -25,39 +25,183 @@ MODULE_AUTHOR("Karsten Keil"); MODULE_LICENSE("GPL"); module_param(debug, uint, S_IRUGO | S_IWUSR); -static LIST_HEAD(devices); -static DEFINE_RWLOCK(device_lock); static u64 device_ids; #define MAX_DEVICE_ID 63 static LIST_HEAD(Bprotocols); static DEFINE_RWLOCK(bp_lock); +static void mISDN_dev_release(struct device *dev) +{ + /* nothing to do: the device is part of its parent's data structure */ +} + +static ssize_t _show_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return -ENODEV; + return sprintf(buf, "%d\n", mdev->id); +} + +static ssize_t _show_nrbchan(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return -ENODEV; + return sprintf(buf, "%d\n", mdev->nrbchan); +} + +static ssize_t _show_d_protocols(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return -ENODEV; + return sprintf(buf, "%d\n", mdev->Dprotocols); +} + +static ssize_t _show_b_protocols(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return -ENODEV; + return sprintf(buf, "%d\n", mdev->Bprotocols | get_all_Bprotocols()); +} + +static ssize_t _show_protocol(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return -ENODEV; + return sprintf(buf, "%d\n", mdev->D.protocol); +} + +static ssize_t _show_name(struct device *dev, + struct device_attribute *attr, char *buf) +{ + strcpy(buf, dev_name(dev)); + return strlen(buf); +} + +#if 0 /* hangs */ +static ssize_t _set_name(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int err = 0; + char *out = kmalloc(count + 1, GFP_KERNEL); + + if (!out) + return -ENOMEM; + + memcpy(out, buf, count); + if (count && out[count - 1] == '\n') + out[--count] = 0; + if (count) + err = device_rename(dev, out); + kfree(out); + + return (err < 0) ? err : count; +} +#endif + +static ssize_t _show_channelmap(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + char *bp = buf; + int i; + + for (i = 0; i <= mdev->nrbchan; i++) + *bp++ = test_channelmap(i, mdev->channelmap) ? 
'1' : '0'; + + return bp - buf; +} + +static struct device_attribute mISDN_dev_attrs[] = { + __ATTR(id, S_IRUGO, _show_id, NULL), + __ATTR(d_protocols, S_IRUGO, _show_d_protocols, NULL), + __ATTR(b_protocols, S_IRUGO, _show_b_protocols, NULL), + __ATTR(protocol, S_IRUGO, _show_protocol, NULL), + __ATTR(channelmap, S_IRUGO, _show_channelmap, NULL), + __ATTR(nrbchan, S_IRUGO, _show_nrbchan, NULL), + __ATTR(name, S_IRUGO, _show_name, NULL), +/* __ATTR(name, S_IRUGO|S_IWUSR, _show_name, _set_name), */ + {} +}; + +#ifdef CONFIG_HOTPLUG +static int mISDN_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return 0; + + if (add_uevent_var(env, "nchans=%d", mdev->nrbchan)) + return -ENOMEM; + + return 0; +} +#endif + +static void mISDN_class_release(struct class *cls) +{ + /* do nothing, it's static */ +} + +static struct class mISDN_class = { + .name = "mISDN", + .owner = THIS_MODULE, +#ifdef CONFIG_HOTPLUG + .dev_uevent = mISDN_uevent, +#endif + .dev_attrs = mISDN_dev_attrs, + .dev_release = mISDN_dev_release, + .class_release = mISDN_class_release, +}; + +static int +_get_mdevice(struct device *dev, void *id) +{ + struct mISDNdevice *mdev = dev_to_mISDN(dev); + + if (!mdev) + return 0; + if (mdev->id != *(u_int *)id) + return 0; + return 1; +} + struct mISDNdevice *get_mdevice(u_int id) { - struct mISDNdevice *dev; + return dev_to_mISDN(class_find_device(&mISDN_class, NULL, &id, + _get_mdevice)); +} - read_lock(&device_lock); - list_for_each_entry(dev, &devices, D.list) - if (dev->id == id) { - read_unlock(&device_lock); - return dev; - } - read_unlock(&device_lock); - return NULL; +static int +_get_mdevice_count(struct device *dev, void *cnt) +{ + *(int *)cnt += 1; + return 0; } int get_mdevice_count(void) { - struct mISDNdevice *dev; - int cnt = 0; + int cnt = 0; - read_lock(&device_lock); - list_for_each_entry(dev, &devices, D.list) - cnt++; - read_unlock(&device_lock); + class_for_each_device(&mISDN_class, NULL, &cnt, _get_mdevice_count); return cnt; } @@ -68,19 +212,24 @@ get_free_devid(void) for (i = 0; i <= MAX_DEVICE_ID; i++) if (!test_and_set_bit(i, (u_long *)&device_ids)) - return i; - return -1; + break; + if (i > MAX_DEVICE_ID) + return -1; + return i; } int -mISDN_register_device(struct mISDNdevice *dev, char *name) +mISDN_register_device(struct mISDNdevice *dev, + struct device *parent, char *name) { - u_long flags; int err; dev->id = get_free_devid(); + err = -EBUSY; if (dev->id < 0) - return -EBUSY; + goto error1; + + device_initialize(&dev->dev); if (name && name[0]) dev_set_name(&dev->dev, "%s", name); else @@ -90,26 +239,39 @@ mISDN_register_device(struct mISDNdevice *dev, char *name) dev_name(&dev->dev), dev->id); err = create_stack(dev); if (err) - return err; - write_lock_irqsave(&device_lock, flags); - list_add_tail(&dev->D.list, &devices); - write_unlock_irqrestore(&device_lock, flags); + goto error1; + + dev->dev.class = &mISDN_class; + dev->dev.platform_data = dev; + dev->dev.parent = parent; + dev_set_drvdata(&dev->dev, dev); + + err = device_add(&dev->dev); + if (err) + goto error3; return 0; + +error3: + delete_stack(dev); + return err; +error1: + return err; + } EXPORT_SYMBOL(mISDN_register_device); void mISDN_unregister_device(struct mISDNdevice *dev) { - u_long flags; - if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_unregister %s %d\n", dev_name(&dev->dev), dev->id); - write_lock_irqsave(&device_lock, flags); - list_del(&dev->D.list); - write_unlock_irqrestore(&device_lock, flags); + /* 
sysfs_remove_link(&dev->dev.kobj, "device"); */ + device_del(&dev->dev); + dev_set_drvdata(&dev->dev, NULL); + test_and_clear_bit(dev->id, (u_long *)&device_ids); delete_stack(dev); + put_device(&dev->dev); } EXPORT_SYMBOL(mISDN_unregister_device); @@ -201,42 +363,43 @@ mISDNInit(void) MISDN_MAJOR_VERSION, MISDN_MINOR_VERSION, MISDN_RELEASE); mISDN_init_clock(&debug); mISDN_initstack(&debug); + err = class_register(&mISDN_class); + if (err) + goto error1; err = mISDN_inittimer(&debug); if (err) - goto error; + goto error2; err = l1_init(&debug); - if (err) { - mISDN_timer_cleanup(); - goto error; - } + if (err) + goto error3; err = Isdnl2_Init(&debug); - if (err) { - mISDN_timer_cleanup(); - l1_cleanup(); - goto error; - } + if (err) + goto error4; err = misdn_sock_init(&debug); - if (err) { - mISDN_timer_cleanup(); - l1_cleanup(); - Isdnl2_cleanup(); - } -error: + if (err) + goto error5; + return 0; + +error5: + Isdnl2_cleanup(); +error4: + l1_cleanup(); +error3: + mISDN_timer_cleanup(); +error2: + class_unregister(&mISDN_class); +error1: return err; } static void mISDN_cleanup(void) { misdn_sock_cleanup(); - mISDN_timer_cleanup(); - l1_cleanup(); Isdnl2_cleanup(); + l1_cleanup(); + mISDN_timer_cleanup(); + class_unregister(&mISDN_class); - if (!list_empty(&devices)) - printk(KERN_ERR "%s devices still registered\n", __func__); - - if (!list_empty(&Bprotocols)) - printk(KERN_ERR "%s Bprotocols still registered\n", __func__); printk(KERN_DEBUG "mISDNcore unloaded\n"); } diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index a6a9e1af2ad6..abe574989572 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -1433,7 +1433,8 @@ init_card(struct l1oip *hc, int pri, int bundle) hc->chan[i + ch].bch = bch; set_channelmap(bch->nr, dch->dev.channelmap); } - ret = mISDN_register_device(&dch->dev, hc->name); + /* TODO: create a parent device for this driver */ + ret = mISDN_register_device(&dch->dev, NULL, hc->name); if (ret) return ret; hc->registered = 1; diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 3f9988849f32..d4229aebf648 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -531,7 +531,8 @@ _queue_data(struct mISDNchannel *ch, u_int prim, /* global register/unregister functions */ -extern int mISDN_register_device(struct mISDNdevice *, char *name); +extern int mISDN_register_device(struct mISDNdevice *, + struct device *parent, char *name); extern void mISDN_unregister_device(struct mISDNdevice *); extern int mISDN_register_Bprotocol(struct Bprotocol *); extern void mISDN_unregister_Bprotocol(struct Bprotocol *); @@ -539,6 +540,11 @@ extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, void *); extern void mISDN_unregister_clock(struct mISDNclock *); +static inline struct mISDNdevice *dev_to_mISDN(struct device *dev) +{ + return dev_get_drvdata(dev); +} + extern void set_channel_address(struct mISDNchannel *, u_int, u_int); extern void mISDN_clock_update(struct mISDNclock *, int, struct timeval *); extern unsigned short mISDN_clock_get(void); -- cgit v1.2.3 From 3f75e84a6a697c5cffb78ee15e79498a35473e05 Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Tue, 4 Nov 2008 14:11:22 +0100 Subject: mISDN: Add layer1 prim MPH_INFORMATION_REQ MPH_INFORMATION provides full D- and B-Channel status overview - new layer1 primitive: MPF_INFORMATON_REQ - layer1 replies with MPH_INFORMATION_IND containing - dch->[state,Flags,nrbchan] - bch[]->[protocol,Flags] - hardware driver should 
send MPH_INFORMATION_IND on all ph state changes and BChannel state changes to MISDN_ID_ANY Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil --- include/linux/mISDNif.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index d4229aebf648..557477ac3d5b 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -289,6 +289,23 @@ struct mISDN_devrename { char name[MISDN_MAX_IDLEN]; /* new name */ }; +/* MPH_INFORMATION_REQ payload */ +struct ph_info_ch { + __u32 protocol; + __u64 Flags; +}; + +struct ph_info_dch { + struct ph_info_ch ch; + __u16 state; + __u16 num_bch; +}; + +struct ph_info { + struct ph_info_dch dch; + struct ph_info_ch bch[]; +}; + /* timer device ioctl */ #define IMADDTIMER _IOR('I', 64, int) #define IMDELTIMER _IOR('I', 65, int) -- cgit v1.2.3 From 4db8e282f2d1dfa43d51ce2a4817901312c9134d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 9 Jan 2009 14:32:46 -0800 Subject: Revert "driver core: move knode_bus into private structure" This reverts commit b9daa99ee533578e3f88231e7a16784dcb44ec42. Turns out that device_initialize shouldn't fail silently. This series needs to be reworked in order to get into proper shape. Reported-by: Stefan Richter Cc: Alan Cox Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 4 ---- drivers/base/bus.c | 40 +++++++++++++--------------------------- include/linux/device.h | 1 + 3 files changed, 14 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index b676f8f801f8..8af0bb2c0aa8 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -69,7 +69,6 @@ struct class_private { * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list * @knode_driver - node in driver list - * @knode_bus - node in bus list * @device - pointer back to the struct class that this structure is * associated with. * @@ -79,15 +78,12 @@ struct device_private { struct klist klist_children; struct klist_node knode_parent; struct klist_node knode_driver; - struct klist_node knode_bus; struct device *device; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) #define to_device_private_driver(obj) \ container_of(obj, struct device_private, knode_driver) -#define to_device_private_bus(obj) \ - container_of(obj, struct device_private, knode_bus) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 0f0a50444672..83f32b891fa9 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -253,14 +253,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus, static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - struct device *dev = NULL; - struct device_private *dev_prv; - - if (n) { - dev_prv = to_device_private_bus(n); - dev = dev_prv->device; - } - return dev; + return n ? container_of(n, struct device, knode_bus) : NULL; } /** @@ -293,7 +286,7 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, return -EINVAL; klist_iter_init_node(&bus->p->klist_devices, &i, - (start ? &start->p->knode_bus : NULL)); + (start ? 
&start->knode_bus : NULL)); while ((dev = next_device(&i)) && !error) error = fn(dev, data); klist_iter_exit(&i); @@ -327,7 +320,7 @@ struct device *bus_find_device(struct bus_type *bus, return NULL; klist_iter_init_node(&bus->p->klist_devices, &i, - (start ? &start->p->knode_bus : NULL)); + (start ? &start->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; @@ -514,8 +507,7 @@ void bus_attach_device(struct device *dev) ret = device_attach(dev); WARN_ON(ret < 0); if (ret >= 0) - klist_add_tail(&dev->p->knode_bus, - &bus->p->klist_devices); + klist_add_tail(&dev->knode_bus, &bus->p->klist_devices); } } @@ -536,8 +528,8 @@ void bus_remove_device(struct device *dev) sysfs_remove_link(&dev->bus->p->devices_kset->kobj, dev_name(dev)); device_remove_attrs(dev->bus, dev); - if (klist_node_attached(&dev->p->knode_bus)) - klist_del(&dev->p->knode_bus); + if (klist_node_attached(&dev->knode_bus)) + klist_del(&dev->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); @@ -839,16 +831,14 @@ static void bus_remove_attrs(struct bus_type *bus) static void klist_devices_get(struct klist_node *n) { - struct device_private *dev_prv = to_device_private_bus(n); - struct device *dev = dev_prv->device; + struct device *dev = container_of(n, struct device, knode_bus); get_device(dev); } static void klist_devices_put(struct klist_node *n) { - struct device_private *dev_prv = to_device_private_bus(n); - struct device *dev = dev_prv->device; + struct device *dev = container_of(n, struct device, knode_bus); put_device(dev); } @@ -1003,20 +993,18 @@ static void device_insertion_sort_klist(struct device *a, struct list_head *list { struct list_head *pos; struct klist_node *n; - struct device_private *dev_prv; struct device *b; list_for_each(pos, list) { n = container_of(pos, struct klist_node, n_node); - dev_prv = to_device_private_bus(n); - b = dev_prv->device; + b = container_of(n, struct device, knode_bus); if (compare(a, b) <= 0) { - list_move_tail(&a->p->knode_bus.n_node, - &b->p->knode_bus.n_node); + list_move_tail(&a->knode_bus.n_node, + &b->knode_bus.n_node); return; } } - list_move_tail(&a->p->knode_bus.n_node, list); + list_move_tail(&a->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, @@ -1026,7 +1014,6 @@ void bus_sort_breadthfirst(struct bus_type *bus, LIST_HEAD(sorted_devices); struct list_head *pos, *tmp; struct klist_node *n; - struct device_private *dev_prv; struct device *dev; struct klist *device_klist; @@ -1035,8 +1022,7 @@ void bus_sort_breadthfirst(struct bus_type *bus, spin_lock(&device_klist->k_lock); list_for_each_safe(pos, tmp, &device_klist->k_list) { n = container_of(pos, struct klist_node, n_node); - dev_prv = to_device_private_bus(n); - dev = dev_prv->device; + dev = container_of(n, struct device, knode_bus); device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); diff --git a/include/linux/device.h b/include/linux/device.h index 7d9da4b4993f..8987f4776064 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -366,6 +366,7 @@ struct device_dma_parameters { }; struct device { + struct klist_node knode_bus; struct device *parent; struct device_private *p; -- cgit v1.2.3 From cda5e83fdea476dce9c0a9b1152cd6ca46832cc4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 9 Jan 2009 14:44:18 -0800 Subject: Revert "driver core: move knode_driver into private structure" This reverts commit 
93e746db183b3bdbbda67900f79b5835f9cb388f. Turns out that device_initialize shouldn't fail silently. This series needs to be reworked in order to get into proper shape. Reported-by: Stefan Richter Cc: Alan Cox Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 4 ---- drivers/base/dd.c | 13 +++++-------- drivers/base/driver.c | 13 +++---------- include/linux/device.h | 1 + 4 files changed, 9 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 8af0bb2c0aa8..f5cf31c664d7 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -68,7 +68,6 @@ struct class_private { * * @klist_children - klist containing all children of this device * @knode_parent - node in sibling list - * @knode_driver - node in driver list * @device - pointer back to the struct class that this structure is * associated with. * @@ -77,13 +76,10 @@ struct class_private { struct device_private { struct klist klist_children; struct klist_node knode_parent; - struct klist_node knode_driver; struct device *device; }; #define to_device_private_parent(obj) \ container_of(obj, struct device_private, knode_parent) -#define to_device_private_driver(obj) \ - container_of(obj, struct device_private, knode_driver) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 6fdaf76f033f..315bed8d5e7f 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -28,7 +28,7 @@ static void driver_bound(struct device *dev) { - if (klist_node_attached(&dev->p->knode_driver)) { + if (klist_node_attached(&dev->knode_driver)) { printk(KERN_WARNING "%s: device %s already bound\n", __func__, kobject_name(&dev->kobj)); return; @@ -41,7 +41,7 @@ static void driver_bound(struct device *dev) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_BOUND_DRIVER, dev); - klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); + klist_add_tail(&dev->knode_driver, &dev->driver->p->klist_devices); } static int driver_sysfs_add(struct device *dev) @@ -310,7 +310,7 @@ static void __device_release_driver(struct device *dev) drv->remove(dev); devres_release_all(dev); dev->driver = NULL; - klist_remove(&dev->p->knode_driver); + klist_remove(&dev->knode_driver); } } @@ -340,7 +340,6 @@ EXPORT_SYMBOL_GPL(device_release_driver); */ void driver_detach(struct device_driver *drv) { - struct device_private *dev_prv; struct device *dev; for (;;) { @@ -349,10 +348,8 @@ void driver_detach(struct device_driver *drv) spin_unlock(&drv->p->klist_devices.k_lock); break; } - dev_prv = list_entry(drv->p->klist_devices.k_list.prev, - struct device_private, - knode_driver.n_node); - dev = dev_prv->device; + dev = list_entry(drv->p->klist_devices.k_list.prev, + struct device, knode_driver.n_node); get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); diff --git a/drivers/base/driver.c b/drivers/base/driver.c index b76cc69f1106..1e2bda780e48 100644 --- a/drivers/base/driver.c +++ b/drivers/base/driver.c @@ -19,14 +19,7 @@ static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - struct device *dev = NULL; - struct device_private *dev_prv; - - if (n) { - dev_prv = to_device_private_driver(n); - dev = dev_prv->device; - } - return dev; + return n ? 
container_of(n, struct device, knode_driver) : NULL; } /** @@ -49,7 +42,7 @@ int driver_for_each_device(struct device_driver *drv, struct device *start, return -EINVAL; klist_iter_init_node(&drv->p->klist_devices, &i, - start ? &start->p->knode_driver : NULL); + start ? &start->knode_driver : NULL); while ((dev = next_device(&i)) && !error) error = fn(dev, data); klist_iter_exit(&i); @@ -83,7 +76,7 @@ struct device *driver_find_device(struct device_driver *drv, return NULL; klist_iter_init_node(&drv->p->klist_devices, &i, - (start ? &start->p->knode_driver : NULL)); + (start ? &start->knode_driver : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; diff --git a/include/linux/device.h b/include/linux/device.h index 8987f4776064..c66ceb15acd8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -366,6 +366,7 @@ struct device_dma_parameters { }; struct device { + struct klist_node knode_driver; struct klist_node knode_bus; struct device *parent; -- cgit v1.2.3 From e2d4077678c7ec7661003c268120582adc544897 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 9 Jan 2009 14:55:37 -0800 Subject: Revert "driver core: move klist_children into private structure" This reverts commit 11c3b5c3e08f4d855cbef52883c266b9ab9df879. Turns out that device_initialize shouldn't fail silently. This series needs to be reworked in order to get into proper shape. Reported-by: Stefan Richter Cc: Alan Cox Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 6 ------ drivers/base/core.c | 37 +++++++++++++------------------------ include/linux/device.h | 2 ++ 3 files changed, 15 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index f5cf31c664d7..6b20809b5fd4 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -66,20 +66,14 @@ struct class_private { /** * struct device_private - structure to hold the private to the driver core portions of the device structure. * - * @klist_children - klist containing all children of this device - * @knode_parent - node in sibling list * @device - pointer back to the struct class that this structure is * associated with. * * Nothing outside of the driver core should ever touch these fields. 
*/ struct device_private { - struct klist klist_children; - struct klist_node knode_parent; struct device *device; }; -#define to_device_private_parent(obj) \ - container_of(obj, struct device_private, knode_parent) /* initialisation functions */ extern int devices_init(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index 61df508fa62b..5c2acf7116d7 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -509,16 +509,14 @@ EXPORT_SYMBOL_GPL(device_schedule_callback_owner); static void klist_children_get(struct klist_node *n) { - struct device_private *p = to_device_private_parent(n); - struct device *dev = p->device; + struct device *dev = container_of(n, struct device, knode_parent); get_device(dev); } static void klist_children_put(struct klist_node *n) { - struct device_private *p = to_device_private_parent(n); - struct device *dev = p->device; + struct device *dev = container_of(n, struct device, knode_parent); put_device(dev); } @@ -548,7 +546,7 @@ void device_initialize(struct device *dev) dev->p->device = dev; dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); - klist_init(&dev->p->klist_children, klist_children_get, + klist_init(&dev->klist_children, klist_children_get, klist_children_put); INIT_LIST_HEAD(&dev->dma_pools); init_MUTEX(&dev->sem); @@ -932,8 +930,7 @@ int device_add(struct device *dev) kobject_uevent(&dev->kobj, KOBJ_ADD); bus_attach_device(dev); if (parent) - klist_add_tail(&dev->p->knode_parent, - &parent->p->klist_children); + klist_add_tail(&dev->knode_parent, &parent->klist_children); if (dev->class) { mutex_lock(&dev->class->p->class_mutex); @@ -1047,7 +1044,7 @@ void device_del(struct device *dev) device_pm_remove(dev); dpm_sysfs_remove(dev); if (parent) - klist_del(&dev->p->knode_parent); + klist_del(&dev->knode_parent); if (MAJOR(dev->devt)) { device_remove_sys_dev_entry(dev); device_remove_file(dev, &devt_attr); @@ -1108,14 +1105,7 @@ void device_unregister(struct device *dev) static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); - struct device *dev = NULL; - struct device_private *p; - - if (n) { - p = to_device_private_parent(n); - dev = p->device; - } - return dev; + return n ? 
container_of(n, struct device, knode_parent) : NULL; } /** @@ -1137,7 +1127,7 @@ int device_for_each_child(struct device *parent, void *data, struct device *child; int error = 0; - klist_iter_init(&parent->p->klist_children, &i); + klist_iter_init(&parent->klist_children, &i); while ((child = next_device(&i)) && !error) error = fn(child, data); klist_iter_exit(&i); @@ -1168,7 +1158,7 @@ struct device *device_find_child(struct device *parent, void *data, if (!parent) return NULL; - klist_iter_init(&parent->p->klist_children, &i); + klist_iter_init(&parent->klist_children, &i); while ((child = next_device(&i))) if (match(child, data) && get_device(child)) break; @@ -1582,10 +1572,9 @@ int device_move(struct device *dev, struct device *new_parent) old_parent = dev->parent; dev->parent = new_parent; if (old_parent) - klist_remove(&dev->p->knode_parent); + klist_remove(&dev->knode_parent); if (new_parent) { - klist_add_tail(&dev->p->knode_parent, - &new_parent->p->klist_children); + klist_add_tail(&dev->knode_parent, &new_parent->klist_children); set_dev_node(dev, dev_to_node(new_parent)); } @@ -1597,11 +1586,11 @@ int device_move(struct device *dev, struct device *new_parent) device_move_class_links(dev, new_parent, old_parent); if (!kobject_move(&dev->kobj, &old_parent->kobj)) { if (new_parent) - klist_remove(&dev->p->knode_parent); + klist_remove(&dev->knode_parent); dev->parent = old_parent; if (old_parent) { - klist_add_tail(&dev->p->knode_parent, - &old_parent->p->klist_children); + klist_add_tail(&dev->knode_parent, + &old_parent->klist_children); set_dev_node(dev, dev_to_node(old_parent)); } } diff --git a/include/linux/device.h b/include/linux/device.h index c66ceb15acd8..2975351635d3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -366,6 +366,8 @@ struct device_dma_parameters { }; struct device { + struct klist klist_children; + struct klist_node knode_parent; /* node in sibling list */ struct klist_node knode_driver; struct klist_node knode_bus; struct device *parent; -- cgit v1.2.3 From 926beadb3dfaddccb3348a5b9e6c2a1f8290a220 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 9 Jan 2009 15:06:12 -0800 Subject: Revert "driver core: create a private portion of struct device" This reverts commit 2831fe6f9cc4e16c103504ee09a47a084297c0f3. Turns out that device_initialize shouldn't fail silently. This series needs to be reworked in order to get into proper shape. Reported-by: Stefan Richter Cc: Alan Cox Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 12 ------------ drivers/base/core.c | 8 -------- include/linux/device.h | 3 --- 3 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 6b20809b5fd4..0a5f055dffba 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -63,18 +63,6 @@ struct class_private { #define to_class(obj) \ container_of(obj, struct class_private, class_subsys.kobj) -/** - * struct device_private - structure to hold the private to the driver core portions of the device structure. - * - * @device - pointer back to the struct class that this structure is - * associated with. - * - * Nothing outside of the driver core should ever touch these fields. 
- */ -struct device_private { - struct device *device; -}; - /* initialisation functions */ extern int devices_init(void); extern int buses_init(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index 5c2acf7116d7..8079afca4972 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -109,7 +109,6 @@ static struct sysfs_ops dev_sysfs_ops = { static void device_release(struct kobject *kobj) { struct device *dev = to_dev(kobj); - struct device_private *p = dev->p; if (dev->release) dev->release(dev); @@ -121,7 +120,6 @@ static void device_release(struct kobject *kobj) WARN(1, KERN_ERR "Device '%s' does not have a release() " "function, it is broken and must be fixed.\n", dev_name(dev)); - kfree(p); } static struct kobj_type device_ktype = { @@ -538,12 +536,6 @@ static void klist_children_put(struct klist_node *n) */ void device_initialize(struct device *dev) { - dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL); - if (!dev->p) { - WARN_ON(1); - return; - } - dev->p->device = dev; dev->kobj.kset = devices_kset; kobject_init(&dev->kobj, &device_ktype); klist_init(&dev->klist_children, klist_children_get, diff --git a/include/linux/device.h b/include/linux/device.h index 2975351635d3..45e5b1921fbb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -28,7 +28,6 @@ #define BUS_ID_SIZE 20 struct device; -struct device_private; struct device_driver; struct driver_private; struct class; @@ -372,8 +371,6 @@ struct device { struct klist_node knode_bus; struct device *parent; - struct device_private *p; - struct kobject kobj; char bus_id[BUS_ID_SIZE]; /* position on parent bus */ unsigned uevent_suppress:1; -- cgit v1.2.3 From 85c210edc46d602a1562aeea0fc74919349c8cf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 9 Jan 2009 16:40:53 -0800 Subject: compiler-gcc.h: add more comments to RELOC_HIDE Requested by C. Lameter Signed-off-by: Andi Kleen Cc: Christoph Lameter Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Mike Travis Cc: Ingo Molnar Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index af40f8eb86f0..1514d534deeb 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -11,9 +11,19 @@ /* The "volatile" is due to gcc bugs */ #define barrier() __asm__ __volatile__("": : :"memory") -/* This macro obfuscates arithmetic on a variable address so that gcc - shouldn't recognize the original var, and make assumptions about it */ /* + * This macro obfuscates arithmetic on a variable address so that gcc + * shouldn't recognize the original var, and make assumptions about it. + * + * This is needed because the C standard makes it undefined to do + * pointer arithmetic on "objects" outside their boundaries and the + * gcc optimizers assume this is the case. In particular they + * assume such arithmetic does not wrap. + * + * A miscompilation has been observed because of this on PPC. + * To work around it we hide the relationship of the pointer and the object + * using this macro. + * * Versions of the ppc64 compiler before 4.1 had a bug where use of * RELOC_HIDE could trash r30. 
The bug can be worked around by changing * the inline assembly constraint from =g to =r, in this particular -- cgit v1.2.3 From 69347a236b22c3962ea812511495e502dedfd50c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 9 Jan 2009 16:40:56 -0800 Subject: memstick: annotate endianness of attribute structs The code was shifting the endianness appropriately everywhere, annotate the structs to avoid the sparse warnings when assigning the endian types to the struct members, or passing them to be[16|32]_to_cpu: drivers/memstick/core/mspro_block.c:331:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:333:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:335:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:337:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:341:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:347:4: warning: cast to restricted __be32 drivers/memstick/core/mspro_block.c:356:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:358:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:364:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:367:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:369:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:371:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:377:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:478:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:480:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:482:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:484:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:486:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:689:22: expected unsigned int [unsigned] [assigned] data_address drivers/memstick/core/mspro_block.c:689:22: got restricted __be32 [usertype] drivers/memstick/core/mspro_block.c:697:3: warning: cast to restricted __be32 drivers/memstick/core/mspro_block.c:960:17: warning: incorrect type in initializer (different base types) drivers/memstick/core/mspro_block.c:960:17: expected unsigned short [unsigned] data_count drivers/memstick/core/mspro_block.c:960:17: got restricted __be16 [usertype] drivers/memstick/core/mspro_block.c:993:6: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:995:28: warning: cast to restricted __be16 Signed-off-by: Harvey Harrison Cc: Alex Dubov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/memstick/core/mspro_block.c | 43 ++++++++++++++++++------------------- include/linux/memstick.h | 4 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 1f1e3982b1aa..de143deb06f0 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -52,14 +52,14 @@ struct mspro_sys_attr { }; struct mspro_attr_entry { - unsigned int address; - unsigned int size; + __be32 address; + __be32 size; unsigned char id; unsigned char reserved[3]; } __attribute__((packed)); struct mspro_attribute { - unsigned short signature; + __be16 signature; unsigned short version; unsigned char count; unsigned char reserved[11]; @@ -69,28 +69,28 @@ struct mspro_attribute { struct 
mspro_sys_info { unsigned char class; unsigned char reserved0; - unsigned short block_size; - unsigned short block_count; - unsigned short user_block_count; - unsigned short page_size; + __be16 block_size; + __be16 block_count; + __be16 user_block_count; + __be16 page_size; unsigned char reserved1[2]; unsigned char assembly_date[8]; - unsigned int serial_number; + __be32 serial_number; unsigned char assembly_maker_code; unsigned char assembly_model_code[3]; - unsigned short memory_maker_code; - unsigned short memory_model_code; + __be16 memory_maker_code; + __be16 memory_model_code; unsigned char reserved2[4]; unsigned char vcc; unsigned char vpp; - unsigned short controller_number; - unsigned short controller_function; - unsigned short start_sector; - unsigned short unit_size; + __be16 controller_number; + __be16 controller_function; + __be16 start_sector; + __be16 unit_size; unsigned char ms_sub_class; unsigned char reserved3[4]; unsigned char interface_type; - unsigned short controller_code; + __be16 controller_code; unsigned char format_type; unsigned char reserved4; unsigned char device_type; @@ -124,11 +124,11 @@ struct mspro_specfile { } __attribute__((packed)); struct mspro_devinfo { - unsigned short cylinders; - unsigned short heads; - unsigned short bytes_per_track; - unsigned short bytes_per_sector; - unsigned short sectors_per_track; + __be16 cylinders; + __be16 heads; + __be16 bytes_per_track; + __be16 bytes_per_sector; + __be16 sectors_per_track; unsigned char reserved[6]; } __attribute__((packed)); @@ -338,8 +338,7 @@ static ssize_t mspro_block_attr_show_sysinfo(struct device *dev, rc += scnprintf(buffer + rc, PAGE_SIZE - rc, "assembly date: " "GMT%+d:%d %04u-%02u-%02u %02u:%02u:%02u\n", date_tz, date_tz_f, - be16_to_cpu(*(unsigned short *) - &x_sys->assembly_date[1]), + be16_to_cpup((__be16 *)&x_sys->assembly_date[1]), x_sys->assembly_date[3], x_sys->assembly_date[4], x_sys->assembly_date[5], x_sys->assembly_date[6], x_sys->assembly_date[7]); diff --git a/include/linux/memstick.h b/include/linux/memstick.h index d0c37e682234..690c35a9d4cc 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -100,8 +100,8 @@ struct mspro_param_register { #define MEMSTICK_SYS_PAR8 0x40 #define MEMSTICK_SYS_SERIAL 0x80 - unsigned short data_count; - unsigned int data_address; + __be16 data_count; + __be32 data_address; unsigned char tpc_param; } __attribute__((packed)); -- cgit v1.2.3 From c4be0c1dc4cdc37b175579be1460f15ac6495e9a Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:58 -0800 Subject: filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. 
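(A minimal userspace sketch of steps 1-3 above, for illustration only: it assumes the FIFREEZE/FITHAW ioctls that a follow-on patch in this series adds to include/linux/fs.h, and the mountpoint path and snapshot command are invented placeholders, not part of this patch.)

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Values match the FIFREEZE/FITHAW patch later in this series. */
#ifndef FIFREEZE
#define FIFREEZE _IOWR('X', 119, int)
#define FITHAW   _IOWR('X', 120, int)
#endif

int main(void)
{
	/* Step 1: freeze through an fd on the mountpoint (path is made up). */
	int fd = open("/mnt/data", O_RDONLY);

	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0) {
		perror("freeze");
		return 1;
	}

	/* Step 2: separate the replication volume or create the snapshot
	 * with the storage device's feature (placeholder command). */
	if (system("lvcreate -s -n snap /dev/vg0/data") != 0)
		fprintf(stderr, "snapshot failed, thawing anyway\n");

	/* Step 3: unfreeze. Step 4, the backup, then reads the snapshot. */
	if (ioctl(fd, FITHAW, 0) < 0) {
		perror("thaw");
		return 1;
	}

	close(fd);
	return 0;
}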
This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Renamed the super block operations write_super_lockfs and unlockfs to freeze_fs and unfreeze_fs to avoid confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep the current behavior. Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 8 +++---- Documentation/filesystems/vfs.txt | 8 +++---- fs/buffer.c | 8 +++---- fs/ext3/super.c | 45 +++++++++++++++++++++++++-------------- fs/ext4/super.c | 45 +++++++++++++++++++++++++++------------ fs/gfs2/ops_super.c | 16 ++++++++------ fs/jfs/super.c | 10 +++++---- fs/reiserfs/super.c | 10 +++++---- fs/xfs/linux-2.6/xfs_super.c | 8 +++---- fs/xfs/xfs_fsops.c | 11 ++++++---- fs/xfs/xfs_fsops.h | 2 +- include/linux/fs.h | 4 ++-- 12 files changed, 107 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index cfbfa15a46ba..ec6a9392a173 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -97,8 +97,8 @@ prototypes: void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -119,8 +119,8 @@ delete_inode: no put_super: yes yes no write_super: no yes read sync_fs: no no read -write_super_lockfs: ? -unlockfs: ? +freeze_fs: ? +unfreeze_fs: ? statfs: no no no remount_fs: yes yes maybe (see below) clear_inode: no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ef19afa186a9..deeeed0faa8f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -210,8 +210,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -270,11 +270,11 @@ or bottom half). a superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. - write_super_lockfs: called when VFS is locking a filesystem and + freeze_fs: called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently used by the Logical Volume Manager (LVM).
- unlockfs: called when VFS is unlocking a filesystem and making it writable + unfreeze_fs: called when VFS is unlocking a filesystem and making it writable again. statfs: called when the VFS needs to get filesystem statistics. This diff --git a/fs/buffer.c b/fs/buffer.c index c26da785938a..87f9e537b8c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -221,8 +221,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->write_super_lockfs) - sb->s_op->write_super_lockfs(sb); + if (sb->s_op->freeze_fs) + sb->s_op->freeze_fs(sb); } sync_blockdev(bdev); @@ -242,8 +242,8 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb) if (sb) { BUG_ON(sb->s_bdev != bdev); - if (sb->s_op->unlockfs) - sb->s_op->unlockfs(sb); + if (sb->s_op->unfreeze_fs) + sb->s_op->unfreeze_fs(sb); sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5d047a030a73..b70d90e08a3c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, unsigned int); -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync); static void ext3_mark_recovery_complete(struct super_block * sb, struct ext3_super_block * es); @@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext3_unlockfs(struct super_block *sb); +static int ext3_unfreeze(struct super_block *sb); static void ext3_write_super (struct super_block * sb); -static void ext3_write_super_lockfs(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. @@ -759,8 +759,8 @@ static const struct super_operations ext3_sops = { .put_super = ext3_put_super, .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, - .write_super_lockfs = ext3_write_super_lockfs, - .unlockfs = ext3_unlockfs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, .statfs = ext3_statfs, .remount_fs = ext3_remount, .clear_inode = ext3_clear_inode, @@ -2311,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb, return 0; } -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync) { struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) - sync_dirty_buffer(sbh); + error = sync_dirty_buffer(sbh); + return error; } @@ -2439,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. 
*/ -static void ext3_write_super_lockfs(struct super_block *sb) +static int ext3_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT3_SB(sb)->s_journal; + journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ journal_lock_updates(journal); @@ -2453,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we failed * to flush the journal. */ - if (journal_flush(journal) < 0) - return; + error = journal_flush(journal); + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; + +out: + journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext3_unlockfs(struct super_block *sb) +static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -2476,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb) unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } + return 0; } static int ext3_remount (struct super_block * sb, int * flags, char * data) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f7e0be8ab1b..e5f06a5f045e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,7 +51,7 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -62,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); -static void ext4_unlockfs(struct super_block *sb); +static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); -static void ext4_write_super_lockfs(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -978,8 +978,8 @@ static const struct super_operations ext4_sops = { .put_super = ext4_put_super, .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, - .write_super_lockfs = ext4_write_super_lockfs, - .unlockfs = ext4_unlockfs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .remount_fs = ext4_remount, .clear_inode = ext4_clear_inode, @@ -2888,13 +2888,14 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; if (buffer_write_io_error(sbh)) { /* * Oh, dear. 
A previous attempt to write the @@ -2918,14 +2919,19 @@ static void ext4_commit_super(struct super_block *sb, BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { - sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } } + return error; } @@ -3058,12 +3064,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext4_write_super_lockfs(struct super_block *sb) +static int ext4_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT4_SB(sb)->s_journal; + journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -3073,21 +3081,29 @@ static void ext4_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we * failed to flush the journal. */ - if (jbd2_journal_flush(journal) < 0) - return; + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; +out: + jbd2_journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. 
*/ -static void ext4_unlockfs(struct super_block *sb) +static int ext4_unfreeze(struct super_block *sb) { if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -3097,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb) unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } + return 0; } static int ext4_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 777783deddcb..320323d03479 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) } /** - * gfs2_write_super_lockfs - prevent further writes to the filesystem + * gfs2_freeze - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_write_super_lockfs(struct super_block *sb) +static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return; + return -EINVAL; for (;;) { error = gfs2_freeze_fs(sdp); @@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } + return 0; } /** - * gfs2_unlockfs - reallow writes to the filesystem + * gfs2_unfreeze - reallow writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_unlockfs(struct super_block *sb) +static int gfs2_unfreeze(struct super_block *sb) { gfs2_unfreeze_fs(sb->s_fs_info); + return 0; } /** @@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = { .put_super = gfs2_put_super, .write_super = gfs2_write_super, .sync_fs = gfs2_sync_fs, - .write_super_lockfs = gfs2_write_super_lockfs, - .unlockfs = gfs2_unlockfs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, .statfs = gfs2_statfs, .remount_fs = gfs2_remount_fs, .clear_inode = gfs2_clear_inode, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0dae345e481b..b37d1f78b854 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -543,7 +543,7 @@ out_kfree: return ret; } -static void jfs_write_super_lockfs(struct super_block *sb) +static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb) lmLogShutdown(log); updateSuper(sb, FM_CLEAN); } + return 0; } -static void jfs_unlockfs(struct super_block *sb) +static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb) else txResume(sb); } + return 0; } static int jfs_get_sb(struct file_system_type *fs_type, @@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = { .delete_inode = jfs_delete_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, - .write_super_lockfs = jfs_write_super_lockfs, - .unlockfs = jfs_unlockfs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .remount_fs = jfs_remount, .show_options = jfs_show_options, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c55651f1407c..f3c820b75829 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s) reiserfs_sync_fs(s, 1); } -static void reiserfs_write_super_lockfs(struct super_block *s) +static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; 
reiserfs_write_lock(s); @@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s) } s->s_dirt = 0; reiserfs_write_unlock(s); + return 0; } -static void reiserfs_unlockfs(struct super_block *s) +static int reiserfs_unfreeze(struct super_block *s) { reiserfs_allow_writes(s); + return 0; } extern const struct in_core_key MAX_IN_CORE_KEY; @@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = { .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, - .write_super_lockfs = reiserfs_write_super_lockfs, - .unlockfs = reiserfs_unlockfs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, .show_options = generic_show_options, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index be846d606ae8..95a971080368 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1269,14 +1269,14 @@ xfs_fs_remount( * need to take care of the metadata. Once that's done write a dummy * record to dirty the log in case of a crash while frozen. */ -STATIC void -xfs_fs_lockfs( +STATIC int +xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); xfs_quiesce_attr(mp); - xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp); } STATIC int @@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, - .write_super_lockfs = xfs_fs_lockfs, + .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 852b6d32e8d0..680d0e0ec932 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -595,17 +595,19 @@ out: return 0; } -void +int xfs_fs_log_dummy( xfs_mount_t *mp) { xfs_trans_t *tp; xfs_inode_t *ip; + int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } ip = mp->m_rootip; @@ -615,9 +617,10 @@ xfs_fs_log_dummy( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61ad..88435e0a77c9 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern void xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b87b29f4797..3e59182de9df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1377,8 +1377,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs 
*); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); -- cgit v1.2.3 From fcccf502540e3d752d33b2d8e976034dee81f9f7 Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:59 -0800 Subject: filesystem freeze: implement generic freeze feature The ioctls for the generic freeze feature are below. o Freeze the filesystem int ioctl(int fd, int FIFREEZE, arg) fd: The file descriptor of the mountpoint FIFREEZE: request code for the freeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 o Unfreeze the filesystem int ioctl(int fd, int FITHAW, arg) fd: The file descriptor of the mountpoint FITHAW: request code for unfreeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 Error number: If the filesystem has already been unfrozen, errno is set to EINVAL. [akpm@linux-foundation.org: fix CONFIG_BLOCK=n] Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 ++ fs/buffer.c | 74 +++++++++++++++++++++++++++++++++++++++------ fs/ioctl.c | 46 ++++++++++++++++++++++++++++ include/linux/buffer_head.h | 11 ++++++- include/linux/fs.h | 7 +++++ 5 files changed, 130 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index ac7031f12ea5..b3c1efff5e1d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -285,6 +285,8 @@ static void init_once(void *foo) INIT_LIST_HEAD(&bdev->bd_holder_list); #endif inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); } static inline void __bd_forget(struct inode *inode) diff --git a/fs/buffer.c b/fs/buffer.c index 87f9e537b8c3..b6e8b8632e2f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev) * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. 
*/ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; down(&bdev->bd_mount_sem); sb = get_super(bdev); @@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->freeze_fs) - sb->s_op->freeze_fs(sb); + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } } sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ } EXPORT_SYMBOL(freeze_bdev); @@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev); * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -void thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev, struct super_block *sb) { + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + if (sb) { BUG_ON(sb->s_bdev != bdev); - - if (sb->s_op->unfreeze_fs) - sb->s_op->unfreeze_fs(sb); - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } drop_super(sb); } up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } EXPORT_SYMBOL(thaw_bdev); diff --git a/fs/ioctl.c b/fs/ioctl.c index cc3f1aa1cf7b..20b0a8a24c6b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -439,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, return error; } +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL) + return -EOPNOTSUPP; + + /* If a blockdevice-backed filesystem isn't specified, return. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Freeze */ + sb = freeze_bdev(sb->s_bdev); + if (IS_ERR(sb)) + return PTR_ERR(sb); + return 0; +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Thaw */ + return thaw_bdev(sb->s_bdev, sb); +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. 
@@ -486,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, } else error = -ENOTTY; break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 8605f8a74df9..bd7ac793be19 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -171,7 +171,7 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); int fsync_bdev(struct block_device *); struct super_block *freeze_bdev(struct block_device *); -void thaw_bdev(struct block_device *, struct super_block *); +int thaw_bdev(struct block_device *, struct super_block *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, @@ -346,6 +346,15 @@ static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bdev(struct block_device *bdev) {} +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e59182de9df..6022f44043f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -234,6 +234,8 @@ struct inodes_stat_t { #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ +#define FIFREEZE _IOWR('X', 119, int) /* Freeze */ +#define FITHAW _IOWR('X', 120, int) /* Thaw */ #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) @@ -591,6 +593,11 @@ struct block_device { * care to not mess up bd_private for that case. */ unsigned long bd_private; + + /* The counter of freeze processes */ + int bd_fsfreeze_count; + /* Mutex for freeze */ + struct mutex bd_fsfreeze_mutex; }; /* -- cgit v1.2.3 From f4b477c47332367d35686bd2b808c2156b96d7c7 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 10 Jan 2009 11:12:09 +0000 Subject: rbtree: add const qualifier to some functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'rb_first()', 'rb_last()', 'rb_next()' and 'rb_prev()' calls take a pointer to an RB node or RB root. They do not change the pointed objects, so add a 'const' qualifier in order to make life of the users of these functions easier. 
Indeed, if I have my own constant pointer &const struct my_type *p, and I call 'rb_next(&p->rb)', I get a GCC warning: warning: passing argument 1 of ‘rb_next’ discards qualifiers from pointer target type Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 8 ++++---- lib/rbtree.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 344bc3495ddb..9c295411d01f 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -140,10 +140,10 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ -extern struct rb_node *rb_next(struct rb_node *); -extern struct rb_node *rb_prev(struct rb_node *); -extern struct rb_node *rb_first(struct rb_root *); -extern struct rb_node *rb_last(struct rb_root *); +extern struct rb_node *rb_next(const struct rb_node *); +extern struct rb_node *rb_prev(const struct rb_node *); +extern struct rb_node *rb_first(const struct rb_root *); +extern struct rb_node *rb_last(const struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, diff --git a/lib/rbtree.c b/lib/rbtree.c index 48499c2d88cc..9956b99649f0 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL(rb_erase); /* * This function returns the first node (in sort order) of the tree. */ -struct rb_node *rb_first(struct rb_root *root) +struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; @@ -305,7 +305,7 @@ struct rb_node *rb_first(struct rb_root *root) } EXPORT_SYMBOL(rb_first); -struct rb_node *rb_last(struct rb_root *root) +struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; @@ -318,7 +318,7 @@ struct rb_node *rb_last(struct rb_root *root) } EXPORT_SYMBOL(rb_last); -struct rb_node *rb_next(struct rb_node *node) +struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; @@ -331,7 +331,7 @@ struct rb_node *rb_next(struct rb_node *node) node = node->rb_right; while (node->rb_left) node=node->rb_left; - return node; + return (struct rb_node *)node; } /* No right-hand children. Everything down and left is @@ -347,7 +347,7 @@ struct rb_node *rb_next(struct rb_node *node) } EXPORT_SYMBOL(rb_next); -struct rb_node *rb_prev(struct rb_node *node) +struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; @@ -360,7 +360,7 @@ struct rb_node *rb_prev(struct rb_node *node) node = node->rb_left; while (node->rb_right) node=node->rb_right; - return node; + return (struct rb_node *)node; } /* No left-hand children. Go up till we find an ancestor which -- cgit v1.2.3 From 886ad09fc83342aa1c5a02a0b6d3298b78a8067f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 9 Jan 2009 15:54:07 -0800 Subject: libata: Add a per-host flag to opt-in into parallel port probes This patch adds a per host flag that allows drivers to opt in into having its busses scanned in parallel. Drivers that do not set this flag get their ports scanned in the "original" sequence. 
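As a sketch of what opting in looks like for a low-level driver (the example_* helper below is invented; ahci, in the hunk that follows, gates the flag on the absence of staggered spin-up, i.e. HOST_CAP_SSS):

#include <linux/libata.h>

/* Opt a host in to parallel port scanning only when the hardware
 * imposes no probe ordering, mirroring the ahci change below. */
static void example_enable_parallel_scan(struct ata_host *host,
					 bool staggered_spinup)
{
	if (!staggered_spinup)
		host->flags |= ATA_HOST_PARALLEL_SCAN;
}

Drivers that never set the flag keep the fully serialized probe order, because async_port_probe() waits on its cookie before scanning.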
Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- drivers/ata/ahci.c | 3 +++ drivers/ata/libata-core.c | 9 +++++++++ include/linux/libata.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 7f701cbe14ab..96039671e3b9 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -2660,6 +2660,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) host->iomap = pcim_iomap_table(pdev); host->private_data = hpriv; + if (!(hpriv->cap & HOST_CAP_SSS)) + host->flags |= ATA_HOST_PARALLEL_SCAN; + if (pi.flags & ATA_FLAG_EM) ahci_reset_em(host); diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c507a9ac78f4..f810078fafcc 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5920,6 +5920,15 @@ static void async_port_probe(void *data, async_cookie_t cookie) { int rc; struct ata_port *ap = data; + + /* + * If we're not allowed to scan this host in parallel, + * we need to wait until all previous scans have completed + * before going further. + */ + if (!(ap->host->flags & ATA_HOST_PARALLEL_SCAN)) + async_synchronize_cookie(cookie); + /* probe */ if (ap->ops->error_handler) { struct ata_eh_info *ehi = &ap->link.eh_info; diff --git a/include/linux/libata.h b/include/linux/libata.h index 4f7c8fb4d3fe..b6b8a7f3ec66 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -239,6 +239,7 @@ enum { /* host set flags */ ATA_HOST_SIMPLEX = (1 << 0), /* Host is simplex, one DMA channel per host only */ ATA_HOST_STARTED = (1 << 1), /* Host started */ + ATA_HOST_PARALLEL_SCAN = (1 << 2), /* Ports on this host can be scanned in parallel */ /* bits 24:31 of host->flags are reserved for LLD specific flags */ -- cgit v1.2.3 From f52046b14b1e1a8a02ae48d0c69d39c5e204644f Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:01 +0100 Subject: mfd: PCF50633 core driver This patch implements the core of the PCF50633 driver. This core driver has generic register read/write functions and does interrupt management for its sub devices. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 9 + drivers/mfd/Makefile | 2 + drivers/mfd/pcf50633-core.c | 710 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/core.h | 218 ++++++++++++ 4 files changed, 939 insertions(+) create mode 100644 drivers/mfd/pcf50633-core.c create mode 100644 include/linux/mfd/pcf50633/core.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 416f9e7286ba..3ac32e45b03b 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -217,6 +217,15 @@ config MFD_WM8350_I2C I2C as the control interface. Additional options must be selected to enable support for the functionality of the chip. +config MFD_PCF50633 + tristate "Support for NXP PCF50633" + depends on I2C + help + Say yes here if you have NXP PCF50633 chip on your board. + This core driver provides register access and IRQ handling + facilities, and registers devices for the various functions + so that function-specific drivers can bind to them. 
+ endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 0c9418b36c26..23d4d10981b3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -37,3 +37,5 @@ endif obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o + +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c new file mode 100644 index 000000000000..24508e28e3fb --- /dev/null +++ b/drivers/mfd/pcf50633-core.c @@ -0,0 +1,710 @@ +/* NXP PCF50633 Power Management Unit (PMU) driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Harald Welte + * Balaji Rao + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Two MBCS registers used during cold start */ +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_MBCS1_USBPRES 0x01 +#define PCF50633_MBCS1_ADAPTPRES 0x01 + +static int __pcf50633_read(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_read_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error reading %d regs at %d\n", num, reg); + + return ret; +} + +static int __pcf50633_write(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_write_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error writing %d regs at %d\n", num, reg); + + return ret; + +} + +/* Read a block of upto 32 regs */ +int pcf50633_read_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_read_block); + +/* Write a block of upto 32 regs */ +int pcf50633_write_block(struct pcf50633 *pcf , u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_write_block); + +u8 pcf50633_reg_read(struct pcf50633 *pcf, u8 reg) +{ + u8 val; + + mutex_lock(&pcf->lock); + __pcf50633_read(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_read); + +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_write); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val) +{ + int ret; + u8 tmp; + + val &= mask; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~mask; + tmp |= val; + ret = __pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_set_bit_mask); + +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + u8 tmp; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~val; + ret = 
__pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_clear_bits); + +/* sysfs attributes */ +static ssize_t show_dump_regs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + u8 dump[16]; + int n, n1, idx = 0; + char *buf1 = buf; + static u8 address_no_read[] = { /* must be ascending */ + PCF50633_REG_INT1, + PCF50633_REG_INT2, + PCF50633_REG_INT3, + PCF50633_REG_INT4, + PCF50633_REG_INT5, + 0 /* terminator */ + }; + + for (n = 0; n < 256; n += sizeof(dump)) { + for (n1 = 0; n1 < sizeof(dump); n1++) + if (n == address_no_read[idx]) { + idx++; + dump[n1] = 0x00; + } else + dump[n1] = pcf50633_reg_read(pcf, n + n1); + + hex_dump_to_buffer(dump, sizeof(dump), 16, 1, buf1, 128, 0); + buf1 += strlen(buf1); + *buf1++ = '\n'; + *buf1 = '\0'; + } + + return buf1 - buf; +} +static DEVICE_ATTR(dump_regs, 0400, show_dump_regs, NULL); + +static ssize_t show_resume_reason(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + int n; + + n = sprintf(buf, "%02x%02x%02x%02x%02x\n", + pcf->resume_reason[0], + pcf->resume_reason[1], + pcf->resume_reason[2], + pcf->resume_reason[3], + pcf->resume_reason[4]); + + return n; +} +static DEVICE_ATTR(resume_reason, 0400, show_resume_reason, NULL); + +static struct attribute *pcf_sysfs_entries[] = { + &dev_attr_dump_regs.attr, + &dev_attr_resume_reason.attr, + NULL, +}; + +static struct attribute_group pcf_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf_sysfs_entries, +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ || !handler) + return -EINVAL; + + if (WARN_ON(pcf->irq_handler[irq].handler)) + return -EBUSY; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = handler; + pcf->irq_handler[irq].data = data; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_register_irq); + +int pcf50633_free_irq(struct pcf50633 *pcf, int irq) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ) + return -EINVAL; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = NULL; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_free_irq); + +static int __pcf50633_irq_mask_set(struct pcf50633 *pcf, int irq, u8 mask) +{ + u8 reg, bits, tmp; + int ret = 0, idx; + + idx = irq >> 3; + reg = PCF50633_REG_INT1M + idx; + bits = 1 << (irq & 0x07); + + mutex_lock(&pcf->lock); + + if (mask) { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp |= bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + pcf->mask_regs[idx] |= bits; + } else { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + } +out: + mutex_unlock(&pcf->lock); + + return ret; +} + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Masking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 1); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask); + +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Unmasking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 0); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_unmask); + +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq) 
+{ + u8 reg, bits; + + reg = irq >> 3; + bits = 1 << (irq & 0x07); + + return pcf->mask_regs[reg] & bits; +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask_get); + +static void pcf50633_irq_call_handler(struct pcf50633 *pcf, int irq) +{ + if (pcf->irq_handler[irq].handler) + pcf->irq_handler[irq].handler(irq, pcf->irq_handler[irq].data); +} + +/* Maximum amount of time ONKEY is held before emergency action is taken */ +#define PCF50633_ONKEY1S_TIMEOUT 8 + +static void pcf50633_irq_worker(struct work_struct *work) +{ + struct pcf50633 *pcf; + int ret, i, j; + u8 pcf_int[5], chgstat; + + pcf = container_of(work, struct pcf50633, irq_work); + + /* Read the 5 INT regs in one transaction */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1, + ARRAY_SIZE(pcf_int), pcf_int); + if (ret != ARRAY_SIZE(pcf_int)) { + dev_err(pcf->dev, "Error reading INT registers\n"); + + /* + * If this doesn't ACK the interrupt to the chip, we'll be + * called once again as we're level triggered. + */ + goto out; + } + + /* We immediately read the usb and adapter status. We thus make sure + * only of USBINS/USBREM IRQ handlers are called */ + if (pcf_int[0] & (PCF50633_INT1_USBINS | PCF50633_INT1_USBREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_USBREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_USBINS); + } + + /* Make sure only one of ADPINS or ADPREM is set */ + if (pcf_int[0] & (PCF50633_INT1_ADPINS | PCF50633_INT1_ADPREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPINS); + } + + dev_dbg(pcf->dev, "INT1=0x%02x INT2=0x%02x INT3=0x%02x " + "INT4=0x%02x INT5=0x%02x\n", pcf_int[0], + pcf_int[1], pcf_int[2], pcf_int[3], pcf_int[4]); + + /* Some revisions of the chip don't have a 8s standby mode on + * ONKEY1S press. We try to manually do it in such cases. */ + if ((pcf_int[0] & PCF50633_INT1_SECOND) && pcf->onkey1s_held) { + dev_info(pcf->dev, "ONKEY1S held for %d secs\n", + pcf->onkey1s_held); + if (pcf->onkey1s_held++ == PCF50633_ONKEY1S_TIMEOUT) + if (pcf->pdata->force_shutdown) + pcf->pdata->force_shutdown(pcf); + } + + if (pcf_int[2] & PCF50633_INT3_ONKEY1S) { + dev_info(pcf->dev, "ONKEY1S held\n"); + pcf->onkey1s_held = 1 ; + + /* Unmask IRQ_SECOND */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT1M, + PCF50633_INT1_SECOND); + + /* Unmask IRQ_ONKEYR */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR); + } + + if ((pcf_int[1] & PCF50633_INT2_ONKEYR) && pcf->onkey1s_held) { + pcf->onkey1s_held = 0; + + /* Mask SECOND and ONKEYR interrupts */ + if (pcf->mask_regs[0] & PCF50633_INT1_SECOND) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT1M, + PCF50633_INT1_SECOND, + PCF50633_INT1_SECOND); + + if (pcf->mask_regs[1] & PCF50633_INT2_ONKEYR) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR, + PCF50633_INT2_ONKEYR); + } + + /* Have we just resumed ? 
*/ + if (pcf->is_suspended) { + pcf->is_suspended = 0; + + /* Set the resume reason filtering out non resumers */ + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) + pcf->resume_reason[i] = pcf_int[i] & + pcf->pdata->resumers[i]; + + /* Make sure we don't pass on any ONKEY events to + * userspace now */ + pcf_int[1] &= ~(PCF50633_INT2_ONKEYR | PCF50633_INT2_ONKEYF); + } + + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) { + /* Unset masked interrupts */ + pcf_int[i] &= ~pcf->mask_regs[i]; + + for (j = 0; j < 8 ; j++) + if (pcf_int[i] & (1 << j)) + pcf50633_irq_call_handler(pcf, (i * 8) + j); + } + +out: + put_device(pcf->dev); + enable_irq(pcf->irq); +} + +static irqreturn_t pcf50633_irq(int irq, void *data) +{ + struct pcf50633 *pcf = data; + + dev_dbg(pcf->dev, "pcf50633_irq\n"); + + get_device(pcf->dev); + disable_irq(pcf->irq); + schedule_work(&pcf->irq_work); + + return IRQ_HANDLED; +} + +static void +pcf50633_client_dev_register(struct pcf50633 *pcf, const char *name, + struct platform_device **pdev) +{ + struct pcf50633_subdev_pdata *subdev_pdata; + int ret; + + *pdev = platform_device_alloc(name, -1); + if (!*pdev) { + dev_err(pcf->dev, "Falied to allocate %s\n", name); + return; + } + + subdev_pdata = kmalloc(sizeof(*subdev_pdata), GFP_KERNEL); + if (!subdev_pdata) { + dev_err(pcf->dev, "Error allocating subdev pdata\n"); + platform_device_put(*pdev); + } + + subdev_pdata->pcf = pcf; + platform_device_add_data(*pdev, subdev_pdata, sizeof(*subdev_pdata)); + + (*pdev)->dev.parent = pcf->dev; + + ret = platform_device_add(*pdev); + if (ret) { + dev_err(pcf->dev, "Failed to register %s: %d\n", name, ret); + platform_device_put(*pdev); + *pdev = NULL; + } +} + +#ifdef CONFIG_PM +static int pcf50633_suspend(struct device *dev, pm_message_t state) +{ + struct pcf50633 *pcf; + int ret = 0, i; + u8 res[5]; + + pcf = dev_get_drvdata(dev); + + /* Make sure our interrupt handlers are not called + * henceforth */ + disable_irq(pcf->irq); + + /* Make sure that any running IRQ worker has quit */ + cancel_work_sync(&pcf->irq_work); + + /* Save the masks */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) { + dev_err(pcf->dev, "error saving irq masks\n"); + goto out; + } + + /* Write wakeup irq masks */ + for (i = 0; i < ARRAY_SIZE(res); i++) + res[i] = ~pcf->pdata->resumers[i]; + + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(res), &res[0]); + if (ret < 0) { + dev_err(pcf->dev, "error writing wakeup irq masks\n"); + goto out; + } + + pcf->is_suspended = 1; + +out: + return ret; +} + +static int pcf50633_resume(struct device *dev) +{ + struct pcf50633 *pcf; + int ret; + + pcf = dev_get_drvdata(dev); + + /* Write the saved mask registers */ + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) + dev_err(pcf->dev, "Error restoring saved suspend masks\n"); + + /* Restore regulators' state */ + + + get_device(pcf->dev); + + /* + * Clear any pending interrupts and set resume reason if any. 
+ * This will leave with enable_irq() + */ + pcf50633_irq_worker(&pcf->irq_work); + + return 0; +} +#else +#define pcf50633_suspend NULL +#define pcf50633_resume NULL +#endif + +static int __devinit pcf50633_probe(struct i2c_client *client, + const struct i2c_device_id *ids) +{ + struct pcf50633 *pcf; + struct pcf50633_platform_data *pdata = client->dev.platform_data; + int i, ret = 0; + int version, variant; + + pcf = kzalloc(sizeof(*pcf), GFP_KERNEL); + if (!pcf) + return -ENOMEM; + + pcf->pdata = pdata; + + mutex_init(&pcf->lock); + + i2c_set_clientdata(client, pcf); + pcf->dev = &client->dev; + pcf->i2c_client = client; + pcf->irq = client->irq; + + INIT_WORK(&pcf->irq_work, pcf50633_irq_worker); + + version = pcf50633_reg_read(pcf, 0); + variant = pcf50633_reg_read(pcf, 1); + if (version < 0 || variant < 0) { + dev_err(pcf->dev, "Unable to probe pcf50633\n"); + ret = -ENODEV; + goto err; + } + + dev_info(pcf->dev, "Probed device version %d variant %d\n", + version, variant); + + /* Enable all interrupts except RTC SECOND */ + pcf->mask_regs[0] = 0x80; + pcf50633_reg_write(pcf, PCF50633_REG_INT1M, pcf->mask_regs[0]); + pcf50633_reg_write(pcf, PCF50633_REG_INT2M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT3M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT4M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT5M, 0x00); + + /* Create sub devices */ + pcf50633_client_dev_register(pcf, "pcf50633-input", + &pcf->input_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-rtc", + &pcf->rtc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-mbc", + &pcf->mbc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-adc", + &pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) { + struct platform_device *pdev; + + pdev = platform_device_alloc("pcf50633-regltr", i); + if (!pdev) { + dev_err(pcf->dev, "Cannot create regulator\n"); + continue; + } + + pdev->dev.parent = pcf->dev; + pdev->dev.platform_data = &pdata->reg_init_data[i]; + pdev->dev.driver_data = pcf; + pcf->regulator_pdev[i] = pdev; + + platform_device_add(pdev); + } + + if (client->irq) { + set_irq_handler(client->irq, handle_level_irq); + ret = request_irq(client->irq, pcf50633_irq, + IRQF_TRIGGER_LOW, "pcf50633", pcf); + + if (ret) { + dev_err(pcf->dev, "Failed to request IRQ %d\n", ret); + goto err; + } + } else { + dev_err(pcf->dev, "No IRQ configured\n"); + goto err; + } + + if (enable_irq_wake(client->irq) < 0) + dev_err(pcf->dev, "IRQ %u cannot be enabled as wake-up source" + "in this hardware revision", client->irq); + + ret = sysfs_create_group(&client->dev.kobj, &pcf_attr_group); + if (ret) + dev_err(pcf->dev, "error creating sysfs entries\n"); + + if (pdata->probe_done) + pdata->probe_done(pcf); + + return 0; + +err: + kfree(pcf); + return ret; +} + +static int __devexit pcf50633_remove(struct i2c_client *client) +{ + struct pcf50633 *pcf = i2c_get_clientdata(client); + int i; + + free_irq(pcf->irq, pcf); + + platform_device_unregister(pcf->input_pdev); + platform_device_unregister(pcf->rtc_pdev); + platform_device_unregister(pcf->mbc_pdev); + platform_device_unregister(pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) + platform_device_unregister(pcf->regulator_pdev[i]); + + kfree(pcf); + + return 0; +} + +static struct i2c_device_id pcf50633_id_table[] = { + {"pcf50633", 0x73}, +}; + +static struct i2c_driver pcf50633_driver = { + .driver = { + .name = "pcf50633", + .suspend = pcf50633_suspend, + .resume = pcf50633_resume, + }, + .id_table = pcf50633_id_table, + .probe = pcf50633_probe, + 
.remove = __devexit_p(pcf50633_remove), +}; + +static int __init pcf50633_init(void) +{ + return i2c_add_driver(&pcf50633_driver); +} + +static void __exit pcf50633_exit(void) +{ + i2c_del_driver(&pcf50633_driver); +} + +MODULE_DESCRIPTION("I2C chip driver for NXP PCF50633 PMU"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); + +module_init(pcf50633_init); +module_exit(pcf50633_exit); diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h new file mode 100644 index 000000000000..4455b212d75a --- /dev/null +++ b/include/linux/mfd/pcf50633/core.h @@ -0,0 +1,218 @@ +/* + * core.h -- Core driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_CORE_H +#define __LINUX_MFD_PCF50633_CORE_H + +#include +#include +#include +#include +#include + +struct pcf50633; + +#define PCF50633_NUM_REGULATORS 11 + +struct pcf50633_platform_data { + struct regulator_init_data reg_init_data[PCF50633_NUM_REGULATORS]; + + char **batteries; + int num_batteries; + + /* Callbacks */ + void (*probe_done)(struct pcf50633 *); + void (*mbc_event_callback)(struct pcf50633 *, int); + void (*regulator_registered)(struct pcf50633 *, int); + void (*force_shutdown)(struct pcf50633 *); + + u8 resumers[5]; +}; + +struct pcf50633_subdev_pdata { + struct pcf50633 *pcf; +}; + +struct pcf50633_irq { + void (*handler) (int, void *); + void *data; +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data); +int pcf50633_free_irq(struct pcf50633 *pcf, int irq); + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq); + +int pcf50633_read_block(struct pcf50633 *, u8 reg, + int nr_regs, u8 *data); +int pcf50633_write_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data); +u8 pcf50633_reg_read(struct pcf50633 *, u8 reg); +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val); +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 bits); + +/* Interrupt registers */ + +#define PCF50633_REG_INT1 0x02 +#define PCF50633_REG_INT2 0x03 +#define PCF50633_REG_INT3 0x04 +#define PCF50633_REG_INT4 0x05 +#define PCF50633_REG_INT5 0x06 + +#define PCF50633_REG_INT1M 0x07 +#define PCF50633_REG_INT2M 0x08 +#define PCF50633_REG_INT3M 0x09 +#define PCF50633_REG_INT4M 0x0a +#define PCF50633_REG_INT5M 0x0b + +enum { + /* Chip IRQs */ + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_RESERVED1, + PCF50633_IRQ_RESERVED2, + PCF50633_IRQ_ALARM, + PCF50633_IRQ_SECOND, + PCF50633_IRQ_ONKEYR, + PCF50633_IRQ_ONKEYF, + PCF50633_IRQ_EXTON1R, + PCF50633_IRQ_EXTON1F, + PCF50633_IRQ_EXTON2R, + PCF50633_IRQ_EXTON2F, + PCF50633_IRQ_EXTON3R, + PCF50633_IRQ_EXTON3F, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_ADCRDY, + PCF50633_IRQ_ONKEY1S, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, + PCF50633_IRQ_HIGHTMP, + PCF50633_IRQ_AUTOPWRFAIL, + PCF50633_IRQ_DWN1PWRFAIL, + 
PCF50633_IRQ_DWN2PWRFAIL, + PCF50633_IRQ_LEDPWRFAIL, + PCF50633_IRQ_LEDOVP, + PCF50633_IRQ_LDO1PWRFAIL, + PCF50633_IRQ_LDO2PWRFAIL, + PCF50633_IRQ_LDO3PWRFAIL, + PCF50633_IRQ_LDO4PWRFAIL, + PCF50633_IRQ_LDO5PWRFAIL, + PCF50633_IRQ_LDO6PWRFAIL, + PCF50633_IRQ_HCLDOPWRFAIL, + PCF50633_IRQ_HCLDOOVL, + + /* Always last */ + PCF50633_NUM_IRQ, +}; + +struct pcf50633 { + struct device *dev; + struct i2c_client *i2c_client; + + struct pcf50633_platform_data *pdata; + int irq; + struct pcf50633_irq irq_handler[PCF50633_NUM_IRQ]; + struct work_struct irq_work; + struct mutex lock; + + u8 mask_regs[5]; + + u8 suspend_irq_masks[5]; + u8 resume_reason[5]; + int is_suspended; + + int onkey1s_held; + + struct platform_device *rtc_pdev; + struct platform_device *mbc_pdev; + struct platform_device *adc_pdev; + struct platform_device *input_pdev; + struct platform_device *regulator_pdev[PCF50633_NUM_REGULATORS]; +}; + +enum pcf50633_reg_int1 { + PCF50633_INT1_ADPINS = 0x01, /* Adapter inserted */ + PCF50633_INT1_ADPREM = 0x02, /* Adapter removed */ + PCF50633_INT1_USBINS = 0x04, /* USB inserted */ + PCF50633_INT1_USBREM = 0x08, /* USB removed */ + /* reserved */ + PCF50633_INT1_ALARM = 0x40, /* RTC alarm time is reached */ + PCF50633_INT1_SECOND = 0x80, /* RTC periodic second interrupt */ +}; + +enum pcf50633_reg_int2 { + PCF50633_INT2_ONKEYR = 0x01, /* ONKEY rising edge */ + PCF50633_INT2_ONKEYF = 0x02, /* ONKEY falling edge */ + PCF50633_INT2_EXTON1R = 0x04, /* EXTON1 rising edge */ + PCF50633_INT2_EXTON1F = 0x08, /* EXTON1 falling edge */ + PCF50633_INT2_EXTON2R = 0x10, /* EXTON2 rising edge */ + PCF50633_INT2_EXTON2F = 0x20, /* EXTON2 falling edge */ + PCF50633_INT2_EXTON3R = 0x40, /* EXTON3 rising edge */ + PCF50633_INT2_EXTON3F = 0x80, /* EXTON3 falling edge */ +}; + +enum pcf50633_reg_int3 { + PCF50633_INT3_BATFULL = 0x01, /* Battery full */ + PCF50633_INT3_CHGHALT = 0x02, /* Charger halt */ + PCF50633_INT3_THLIMON = 0x04, + PCF50633_INT3_THLIMOFF = 0x08, + PCF50633_INT3_USBLIMON = 0x10, + PCF50633_INT3_USBLIMOFF = 0x20, + PCF50633_INT3_ADCRDY = 0x40, /* ADC result ready */ + PCF50633_INT3_ONKEY1S = 0x80, /* ONKEY pressed 1 second */ +}; + +enum pcf50633_reg_int4 { + PCF50633_INT4_LOWSYS = 0x01, + PCF50633_INT4_LOWBAT = 0x02, + PCF50633_INT4_HIGHTMP = 0x04, + PCF50633_INT4_AUTOPWRFAIL = 0x08, + PCF50633_INT4_DWN1PWRFAIL = 0x10, + PCF50633_INT4_DWN2PWRFAIL = 0x20, + PCF50633_INT4_LEDPWRFAIL = 0x40, + PCF50633_INT4_LEDOVP = 0x80, +}; + +enum pcf50633_reg_int5 { + PCF50633_INT5_LDO1PWRFAIL = 0x01, + PCF50633_INT5_LDO2PWRFAIL = 0x02, + PCF50633_INT5_LDO3PWRFAIL = 0x04, + PCF50633_INT5_LDO4PWRFAIL = 0x08, + PCF50633_INT5_LDO5PWRFAIL = 0x10, + PCF50633_INT5_LDO6PWRFAIL = 0x20, + PCF50633_INT5_HCLDOPWRFAIL = 0x40, + PCF50633_INT5_HCLDOOVL = 0x80, +}; + +/* misc. registers */ +#define PCF50633_REG_OOCSHDWN 0x0c + +/* LED registers */ +#define PCF50633_REG_LEDOUT 0x28 +#define PCF50633_REG_LEDENA 0x29 +#define PCF50633_REG_LEDCTL 0x2a +#define PCF50633_REG_LEDDIM 0x2b + +#endif + -- cgit v1.2.3 From 08c3e06a5eb27d43b712adef18379f8464425e71 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:26 +0100 Subject: mfd: PCF50633 adc driver This patch adds basic support for the PCF50633 ADC. The subtractive mode is not supported yet. Since we don't have an ADC subsystem, it currently lives in drivers/mfd.
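As a usage sketch only (not part of the patch; the helper names below are invented, and the 'pcf' handle would come from pcf50633_subdev_pdata): a client can either block for a result with pcf50633_adc_sync_read(), which wraps the asynchronous path in a completion, or pass a callback that is invoked with the result when the ADCRDY interrupt fires:

	/* Invented callback: called from the ADC IRQ path with the result. */
	static void example_batt_done(struct pcf50633 *pcf, void *param, int result)
	{
		dev_info(pcf->dev, "BATSNS conversion result: %d\n", result);
	}

	/* Invented helper: start a 16-sample averaged battery-sense read. */
	static int example_read_batt(struct pcf50633 *pcf)
	{
		return pcf50633_adc_async_read(pcf, PCF50633_ADCC1_MUX_BATSNS_RES,
					       PCF50633_ADCC1_AVERAGE_16,
					       example_batt_done, NULL);
	}

pcf50633_adc_async_read() only queues the request; conversions are started one at a time from the head of the driver's small FIFO, and the sync variant sleeps on a completion until its request has been processed.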
Signed-off-by: Balaji Rao Cc: Andy Green Acked-by: Jonathan Cameron Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 + drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-adc.c | 277 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/adc.h | 72 ++++++++++ 4 files changed, 358 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-adc.c create mode 100644 include/linux/mfd/pcf50633/adc.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 3ac32e45b03b..2fa3504e165a 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -226,6 +226,13 @@ config MFD_PCF50633 facilities, and registers devices for the various functions so that function-specific drivers can bind to them. +config PCF50633_ADC + tristate "Support for NXP PCF50633 ADC" + depends on MFD_PCF50633 + help + Say yes here if you want to include support for ADC in the + NXP PCF50633 chip. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 23d4d10981b3..138f9c4d3d7b 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -38,4 +38,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o -obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-adc.c b/drivers/mfd/pcf50633-adc.c new file mode 100644 index 000000000000..c2d05becfa97 --- /dev/null +++ b/drivers/mfd/pcf50633-adc.c @@ -0,0 +1,277 @@ +/* NXP PCF50633 ADC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * NOTE: This driver does not yet support subtractive ADC mode, which means + * you can do only one measurement per read request. 
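+ *
+ * For example (illustration only; 'pcf' stands for a driver's device
+ * handle), a single blocking measurement in this one-shot mode looks like:
+ *
+ *	ret = pcf50633_adc_sync_read(pcf, PCF50633_ADCC1_MUX_BATSNS_RES,
+ *				     PCF50633_ADCC1_AVERAGE_16);
+ *
+ * which returns the conversion result, or -ENOMEM if the request could
+ * not be allocated.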
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_adc_request { + int mux; + int avg; + int result; + void (*callback)(struct pcf50633 *, void *, int); + void *callback_param; + + /* Used in case of sync requests */ + struct completion completion; + +}; + +#define PCF50633_MAX_ADC_FIFO_DEPTH 8 + +struct pcf50633_adc { + struct pcf50633 *pcf; + + /* Private stuff */ + struct pcf50633_adc_request *queue[PCF50633_MAX_ADC_FIFO_DEPTH]; + int queue_head; + int queue_tail; + struct mutex queue_mutex; +}; + +static inline struct pcf50633_adc *__to_adc(struct pcf50633 *pcf) +{ + return platform_get_drvdata(pcf->adc_pdev); +} + +static void adc_setup(struct pcf50633 *pcf, int channel, int avg) +{ + channel &= PCF50633_ADCC1_ADCMUX_MASK; + + /* kill ratiometric, but enable ACCSW biasing */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC2, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_ADCC3, 0x01); + + /* start ADC conversion on selected channel */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC1, channel | avg | + PCF50633_ADCC1_ADCSTART | PCF50633_ADCC1_RES_10BIT); +} + +static void trigger_next_adc_job_if_any(struct pcf50633 *pcf) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + + if (!adc->queue[head]) { + mutex_unlock(&adc->queue_mutex); + return; + } + mutex_unlock(&adc->queue_mutex); + + adc_setup(pcf, adc->queue[head]->mux, adc->queue[head]->avg); +} + +static int +adc_enqueue_request(struct pcf50633 *pcf, struct pcf50633_adc_request *req) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head, tail; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + tail = adc->queue_tail; + + if (adc->queue[tail]) { + mutex_unlock(&adc->queue_mutex); + return -EBUSY; + } + + adc->queue[tail] = req; + adc->queue_tail = (tail + 1) & (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + trigger_next_adc_job_if_any(pcf); + + return 0; +} + +static void +pcf50633_adc_sync_read_callback(struct pcf50633 *pcf, void *param, int result) +{ + struct pcf50633_adc_request *req = param; + + req->result = result; + complete(&req->completion); +} + +int pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = pcf50633_adc_sync_read_callback; + req->callback_param = req; + + init_completion(&req->completion); + adc_enqueue_request(pcf, req); + wait_for_completion(&req->completion); + + return req->result; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_sync_read); + +int pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = callback; + req->callback_param = callback_param; + + adc_enqueue_request(pcf, req); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_async_read); + +static int adc_result(struct pcf50633 *pcf) +{ + u8 adcs1, adcs3; + u16 result; + + adcs1 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS1); + adcs3 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS3); + result = (adcs1 << 2) | (adcs3 & 
PCF50633_ADCS3_ADCDAT1L_MASK); + + dev_dbg(pcf->dev, "adc result = %d\n", result); + + return result; +} + +static void pcf50633_adc_irq(int irq, void *data) +{ + struct pcf50633_adc *adc = data; + struct pcf50633 *pcf = adc->pcf; + struct pcf50633_adc_request *req; + int head; + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + req = adc->queue[head]; + if (WARN_ON(!req)) { + dev_err(pcf->dev, "pcf50633-adc irq: ADC queue empty!\n"); + mutex_unlock(&adc->queue_mutex); + return; + } + adc->queue[head] = NULL; + adc->queue_head = (head + 1) & + (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + req->callback(pcf, req->callback_param, adc_result(pcf)); + kfree(req); + + trigger_next_adc_job_if_any(pcf); +} + +static int __devinit pcf50633_adc_probe(struct platform_device *pdev) +{ + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + struct pcf50633_adc *adc; + + adc = kzalloc(sizeof(*adc), GFP_KERNEL); + if (!adc) + return -ENOMEM; + + adc->pcf = pdata->pcf; + platform_set_drvdata(pdev, adc); + + pcf50633_register_irq(pdata->pcf, PCF50633_IRQ_ADCRDY, + pcf50633_adc_irq, adc); + + mutex_init(&adc->queue_mutex); + + return 0; +} + +static int __devexit pcf50633_adc_remove(struct platform_device *pdev) +{ + struct pcf50633_adc *adc = platform_get_drvdata(pdev); + int i, head; + + pcf50633_free_irq(adc->pcf, PCF50633_IRQ_ADCRDY); + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + if (WARN_ON(adc->queue[head])) + dev_err(adc->pcf->dev, + "adc driver removed with request pending\n"); + + for (i = 0; i < PCF50633_MAX_ADC_FIFO_DEPTH; i++) + kfree(adc->queue[i]); + + mutex_unlock(&adc->queue_mutex); + kfree(adc); + + return 0; +} + +static struct platform_driver pcf50633_adc_driver = { + .driver = { + .name = "pcf50633-adc", + }, + .probe = pcf50633_adc_probe, + .remove = __devexit_p(pcf50633_adc_remove), +}; + +static int __init pcf50633_adc_init(void) +{ + return platform_driver_register(&pcf50633_adc_driver); +} +module_init(pcf50633_adc_init); + +static void __exit pcf50633_adc_exit(void) +{ + platform_driver_unregister(&pcf50633_adc_driver); +} +module_exit(pcf50633_adc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 adc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-adc"); + diff --git a/include/linux/mfd/pcf50633/adc.h b/include/linux/mfd/pcf50633/adc.h new file mode 100644 index 000000000000..56669b4183ad --- /dev/null +++ b/include/linux/mfd/pcf50633/adc.h @@ -0,0 +1,72 @@ +/* + * adc.h -- Driver for NXP PCF50633 ADC + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
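+ *
+ * Note on the result format, summarized from this driver's adc_result():
+ * a 10-bit conversion is split across two registers, with ADCS1 holding
+ * the eight MSBs and ADCS3 bits [1:0] the two LSBs, recombined as
+ * (ADCS1 << 2) | (ADCS3 & PCF50633_ADCS3_ADCDAT1L_MASK).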
+ */ + +#ifndef __LINUX_MFD_PCF50633_ADC_H +#define __LINUX_MFD_PCF50633_ADC_H + +#include +#include + +/* ADC Registers */ +#define PCF50633_REG_ADCC3 0x52 +#define PCF50633_REG_ADCC2 0x53 +#define PCF50633_REG_ADCC1 0x54 +#define PCF50633_REG_ADCS1 0x55 +#define PCF50633_REG_ADCS2 0x56 +#define PCF50633_REG_ADCS3 0x57 + +#define PCF50633_ADCC1_ADCSTART 0x01 +#define PCF50633_ADCC1_RES_10BIT 0x02 +#define PCF50633_ADCC1_AVERAGE_NO 0x00 +#define PCF50633_ADCC1_AVERAGE_4 0x04 +#define PCF50633_ADCC1_AVERAGE_8 0x08 +#define PCF50633_ADCC1_AVERAGE_16 0x0c +#define PCF50633_ADCC1_MUX_BATSNS_RES 0x00 +#define PCF50633_ADCC1_MUX_BATSNS_SUBTR 0x10 +#define PCF50633_ADCC1_MUX_ADCIN2_RES 0x20 +#define PCF50633_ADCC1_MUX_ADCIN2_SUBTR 0x30 +#define PCF50633_ADCC1_MUX_BATTEMP 0x60 +#define PCF50633_ADCC1_MUX_ADCIN1 0x70 +#define PCF50633_ADCC1_AVERAGE_MASK 0x0c +#define PCF50633_ADCC1_ADCMUX_MASK 0xf0 + +#define PCF50633_ADCC2_RATIO_NONE 0x00 +#define PCF50633_ADCC2_RATIO_BATTEMP 0x01 +#define PCF50633_ADCC2_RATIO_ADCIN1 0x02 +#define PCF50633_ADCC2_RATIO_BOTH 0x03 +#define PCF50633_ADCC2_RATIOSETTL_100US 0x04 + +#define PCF50633_ADCC3_ACCSW_EN 0x01 +#define PCF50633_ADCC3_NTCSW_EN 0x04 +#define PCF50633_ADCC3_RES_DIV_TWO 0x10 +#define PCF50633_ADCC3_RES_DIV_THREE 0x00 + +#define PCF50633_ADCS3_REF_NTCSW 0x00 +#define PCF50633_ADCS3_REF_ACCSW 0x10 +#define PCF50633_ADCS3_REF_2V0 0x20 +#define PCF50633_ADCS3_REF_VISA 0x30 +#define PCF50633_ADCS3_REF_2V0_2 0x70 +#define PCF50633_ADCS3_ADCRDY 0x80 + +#define PCF50633_ADCS3_ADCDAT1L_MASK 0x03 +#define PCF50633_ADCS3_ADCDAT2L_MASK 0x0c +#define PCF50633_ADCS3_ADCDAT2L_SHIFT 2 +#define PCF50633_ADCS3_REF_MASK 0x70 + +extern int +pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param); +extern int +pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg); + +#endif /* __LINUX_MFD_PCF50633_ADC_H */ -- cgit v1.2.3 From 6a3d119b4ce29cf32bfe91eb61d46e9dbd8ce38a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:37 +0100 Subject: mfd: PCF50633 gpio support What the PCF50633 calls a 'GPIO' is much more than a GPIO in the Linux sense, and there are only four of them, so gpiolib is not used here. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 +++ drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-gpio.c | 118 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/gpio.h | 52 +++++++++++++++++ 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-gpio.c create mode 100644 include/linux/mfd/pcf50633/gpio.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 2fa3504e165a..06a2b0f7737c 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -233,6 +233,13 @@ config PCF50633_ADC Say yes here if you want to include support for ADC in the NXP PCF50633 chip. +config PCF50633_GPIO + tristate "Support for NXP PCF50633 GPIO" + depends on MFD_PCF50633 + help + Say yes here if you want to include support for the GPIO pins on + the PCF50633 chip.
+ endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 138f9c4d3d7b..3afb5192e4da 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -39,4 +39,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o -obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o +obj-$(CONFIG_PCF50633_GPIO) += pcf50633-gpio.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-gpio.c b/drivers/mfd/pcf50633-gpio.c new file mode 100644 index 000000000000..2fa2eca5c9cc --- /dev/null +++ b/drivers/mfd/pcf50633-gpio.c @@ -0,0 +1,118 @@ +/* NXP PCF50633 GPIO Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include + +#include +#include + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_HCLDOOUT 0x39 + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val) +{ + u8 reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + + return pcf50633_reg_set_bit_mask(pcf, reg, 0x07, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_set); + +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg) & 0x07; + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_get); + +int pcf50633_gpio_invert_set(struct pcf50633 *pcf, int gpio, int invert) +{ + u8 val, reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = !!invert << 3; + + return pcf50633_reg_set_bit_mask(pcf, reg, 1 << 3, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_set); + +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + 
PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg); + + return val & (1 << 3); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_get); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *pcf, + int gpio, int regulator, int on) +{ + u8 reg, val, mask; + + /* the *ENA register is always one after the *OUT register */ + reg = pcf50633_regulator_registers[regulator] + 1; + + val = !!on << (gpio - PCF50633_GPIO1); + mask = 1 << (gpio - PCF50633_GPIO1); + + return pcf50633_reg_set_bit_mask(pcf, reg, mask, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_power_supply_set); diff --git a/include/linux/mfd/pcf50633/gpio.h b/include/linux/mfd/pcf50633/gpio.h new file mode 100644 index 000000000000..a42b845efc54 --- /dev/null +++ b/include/linux/mfd/pcf50633/gpio.h @@ -0,0 +1,52 @@ +/* + * gpio.h -- GPIO driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_GPIO_H +#define __LINUX_MFD_PCF50633_GPIO_H + +#include + +#define PCF50633_GPIO1 1 +#define PCF50633_GPIO2 2 +#define PCF50633_GPIO3 3 +#define PCF50633_GPO 4 + +#define PCF50633_REG_GPIO1CFG 0x14 +#define PCF50633_REG_GPIO2CFG 0x15 +#define PCF50633_REG_GPIO3CFG 0x16 +#define PCF50633_REG_GPOCFG 0x17 + +#define PCF50633_GPOCFG_GPOSEL_MASK 0x07 + +enum pcf50633_reg_gpocfg { + PCF50633_GPOCFG_GPOSEL_0 = 0x00, + PCF50633_GPOCFG_GPOSEL_LED_NFET = 0x01, + PCF50633_GPOCFG_GPOSEL_SYSxOK = 0x02, + PCF50633_GPOCFG_GPOSEL_CLK32K = 0x03, + PCF50633_GPOCFG_GPOSEL_ADAPUSB = 0x04, + PCF50633_GPOCFG_GPOSEL_USBxOK = 0x05, + PCF50633_GPOCFG_GPOSEL_ACTPH4 = 0x06, + PCF50633_GPOCFG_GPOSEL_1 = 0x07, + PCF50633_GPOCFG_GPOSEL_INVERSE = 0x08, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val); +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_invert_set(struct pcf50633 *, int gpio, int invert); +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *, + int gpio, int regulator, int on); +#endif /* __LINUX_MFD_PCF50633_GPIO_H */ + + -- cgit v1.2.3 From f5714dc97d63cc0dd1219bd0eb2e1f8df1e4347a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:50:55 +0100 Subject: power_supply: PCF50633 battery charger driver Signed-off-by: Balaji Rao Cc: Andy Green Cc: David Woodhouse Acked-by: Anton Vorontsov Signed-off-by: Samuel Ortiz --- drivers/power/Kconfig | 6 + drivers/power/Makefile | 1 + drivers/power/pcf50633-charger.c | 358 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/mbc.h | 134 +++++++++++++++ 4 files changed, 499 insertions(+) create mode 100644 drivers/power/pcf50633-charger.c create mode 100644 include/linux/mfd/pcf50633/mbc.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 668472405a57..33da1127992a 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -82,4 +82,10 @@ config BATTERY_DA9030 Say Y here to enable support for batteries charger integrated into DA9030 PMIC. +config CHARGER_PCF50633 + tristate "NXP PCF50633 MBC" + depends on MFD_PCF50633 + help + Say Y to include support for NXP PCF50633 Main Battery Charger. 
+ endif # POWER_SUPPLY diff --git a/drivers/power/Makefile b/drivers/power/Makefile index eebb15505a40..2fcf41d13e5c 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o +obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o \ No newline at end of file diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c new file mode 100644 index 000000000000..e988ec130fcd --- /dev/null +++ b/drivers/power/pcf50633-charger.c @@ -0,0 +1,358 @@ +/* NXP PCF50633 Main Battery Charger Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_mbc { + struct pcf50633 *pcf; + + int adapter_active; + int adapter_online; + int usb_active; + int usb_online; + + struct power_supply usb; + struct power_supply adapter; +}; + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int ret = 0; + u8 bits; + + if (ma >= 1000) + bits = PCF50633_MBCC7_USB_1000mA; + else if (ma >= 500) + bits = PCF50633_MBCC7_USB_500mA; + else if (ma >= 100) + bits = PCF50633_MBCC7_USB_100mA; + else + bits = PCF50633_MBCC7_USB_SUSPEND; + + ret = pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC7, + PCF50633_MBCC7_USB_MASK, bits); + if (ret) + dev_err(pcf->dev, "error setting usb curlim to %d mA\n", ma); + else + dev_info(pcf->dev, "usb curlim to %d mA\n", ma); + + power_supply_changed(&mbc->usb); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_usb_curlim_set); + +int pcf50633_mbc_get_status(struct pcf50633 *pcf) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int status = 0; + + if (mbc->usb_online) + status |= PCF50633_MBC_USB_ONLINE; + if (mbc->usb_active) + status |= PCF50633_MBC_USB_ACTIVE; + if (mbc->adapter_online) + status |= PCF50633_MBC_ADAPTER_ONLINE; + if (mbc->adapter_active) + status |= PCF50633_MBC_ADAPTER_ACTIVE; + + return status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_get_status); + +void pcf50633_mbc_set_status(struct pcf50633 *pcf, int what, int status) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + + if (what & PCF50633_MBC_USB_ONLINE) + mbc->usb_online = !!status; + if (what & PCF50633_MBC_USB_ACTIVE) + mbc->usb_active = !!status; + if (what & PCF50633_MBC_ADAPTER_ONLINE) + mbc->adapter_online = !!status; + if (what & PCF50633_MBC_ADAPTER_ACTIVE) + mbc->adapter_active = !!status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_set_status); + +static ssize_t +show_chgmode(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + + u8 mbcs2 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS2); + u8 chgmod = (mbcs2 & PCF50633_MBCS2_MBC_MASK); + + return sprintf(buf, "%d\n", chgmod); +} +static DEVICE_ATTR(chgmode, S_IRUGO, show_chgmode, NULL); + +static ssize_t +show_usblim(struct 
device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + u8 usblim = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCC7) & + PCF50633_MBCC7_USB_MASK; + unsigned int ma; + + if (usblim == PCF50633_MBCC7_USB_1000mA) + ma = 1000; + else if (usblim == PCF50633_MBCC7_USB_500mA) + ma = 500; + else if (usblim == PCF50633_MBCC7_USB_100mA) + ma = 100; + else + ma = 0; + + return sprintf(buf, "%u\n", ma); +} + +static ssize_t set_usblim(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + unsigned long ma; + int ret; + + ret = strict_strtoul(buf, 10, &ma); + if (ret) + return -EINVAL; + + pcf50633_mbc_usb_curlim_set(mbc->pcf, ma); + + return count; +} + +static DEVICE_ATTR(usb_curlim, S_IRUGO | S_IWUSR, show_usblim, set_usblim); + +static struct attribute *pcf50633_mbc_sysfs_entries[] = { + &dev_attr_chgmode.attr, + &dev_attr_usb_curlim.attr, + NULL, +}; + +static struct attribute_group mbc_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf50633_mbc_sysfs_entries, +}; + +static void +pcf50633_mbc_irq_handler(int irq, void *data) +{ + struct pcf50633_mbc *mbc = data; + + /* USB */ + if (irq == PCF50633_IRQ_USBINS) { + mbc->usb_online = 1; + } else if (irq == PCF50633_IRQ_USBREM) { + mbc->usb_online = 0; + mbc->usb_active = 0; + pcf50633_mbc_usb_curlim_set(mbc->pcf, 0); + } + + /* Adapter */ + if (irq == PCF50633_IRQ_ADPINS) { + mbc->adapter_online = 1; + mbc->adapter_active = 1; + } else if (irq == PCF50633_IRQ_ADPREM) { + mbc->adapter_online = 0; + mbc->adapter_active = 0; + } + + if (irq == PCF50633_IRQ_BATFULL) { + mbc->usb_active = 0; + mbc->adapter_active = 0; + } + + power_supply_changed(&mbc->usb); + power_supply_changed(&mbc->adapter); + + if (mbc->pcf->pdata->mbc_event_callback) + mbc->pcf->pdata->mbc_event_callback(mbc->pcf, irq); +} + +static int adapter_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, adapter); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->adapter_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int usb_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->usb_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static enum power_supply_property power_props[] = { + POWER_SUPPLY_PROP_ONLINE, +}; + +static const u8 mbc_irq_handlers[] = { + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, +}; + +static int __devinit pcf50633_mbc_probe(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc; + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + int ret; + int i; + u8 mbcs1; + + mbc = kzalloc(sizeof(*mbc), GFP_KERNEL); + if (!mbc) + return -ENOMEM; + + platform_set_drvdata(pdev, mbc); + mbc->pcf = pdata->pcf; + + /* Set up IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_register_irq(mbc->pcf,
mbc_irq_handlers[i], + pcf50633_mbc_irq_handler, mbc); + + /* Create power supplies */ + mbc->adapter.name = "adapter"; + mbc->adapter.type = POWER_SUPPLY_TYPE_MAINS; + mbc->adapter.properties = power_props; + mbc->adapter.num_properties = ARRAY_SIZE(power_props); + mbc->adapter.get_property = &adapter_get_property; + mbc->adapter.supplied_to = mbc->pcf->pdata->batteries; + mbc->adapter.num_supplicants = mbc->pcf->pdata->num_batteries; + + mbc->usb.name = "usb"; + mbc->usb.type = POWER_SUPPLY_TYPE_USB; + mbc->usb.properties = power_props; + mbc->usb.num_properties = ARRAY_SIZE(power_props); + mbc->usb.get_property = usb_get_property; + mbc->usb.supplied_to = mbc->pcf->pdata->batteries; + mbc->usb.num_supplicants = mbc->pcf->pdata->num_batteries; + + ret = power_supply_register(&pdev->dev, &mbc->adapter); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register adapter\n"); + kfree(mbc); + return ret; + } + + ret = power_supply_register(&pdev->dev, &mbc->usb); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register usb\n"); + power_supply_unregister(&mbc->adapter); + kfree(mbc); + return ret; + } + + ret = sysfs_create_group(&pdev->dev.kobj, &mbc_attr_group); + if (ret) + dev_err(mbc->pcf->dev, "failed to create sysfs entries\n"); + + mbcs1 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS1); + if (mbcs1 & PCF50633_MBCS1_USBPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_USBINS, mbc); + if (mbcs1 & PCF50633_MBCS1_ADAPTPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_ADPINS, mbc); + + return 0; +} + +static int __devexit pcf50633_mbc_remove(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pdev); + int i; + + /* Remove IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_free_irq(mbc->pcf, mbc_irq_handlers[i]); + + power_supply_unregister(&mbc->usb); + power_supply_unregister(&mbc->adapter); + + kfree(mbc); + + return 0; +} + +static struct platform_driver pcf50633_mbc_driver = { + .driver = { + .name = "pcf50633-mbc", + }, + .probe = pcf50633_mbc_probe, + .remove = __devexit_p(pcf50633_mbc_remove), +}; + +static int __init pcf50633_mbc_init(void) +{ + return platform_driver_register(&pcf50633_mbc_driver); +} +module_init(pcf50633_mbc_init); + +static void __exit pcf50633_mbc_exit(void) +{ + platform_driver_unregister(&pcf50633_mbc_driver); +} +module_exit(pcf50633_mbc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 mbc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-mbc"); diff --git a/include/linux/mfd/pcf50633/mbc.h b/include/linux/mfd/pcf50633/mbc.h new file mode 100644 index 000000000000..6e17619b773a --- /dev/null +++ b/include/linux/mfd/pcf50633/mbc.h @@ -0,0 +1,134 @@ +/* + * mbc.h -- Driver for NXP PCF50633 Main Battery Charger + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
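+ *
+ * Note, summarized from the driver code: pcf50633_mbc_usb_curlim_set()
+ * rounds the requested limit down to the nearest supported step of
+ * 1000, 500 or 100 mA; any request below 100 mA selects USB suspend
+ * mode.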
+ */ + +#ifndef __LINUX_MFD_PCF50633_MBC_H +#define __LINUX_MFD_PCF50633_MBC_H + +#include +#include + +#define PCF50633_REG_MBCC1 0x43 +#define PCF50633_REG_MBCC2 0x44 +#define PCF50633_REG_MBCC3 0x45 +#define PCF50633_REG_MBCC4 0x46 +#define PCF50633_REG_MBCC5 0x47 +#define PCF50633_REG_MBCC6 0x48 +#define PCF50633_REG_MBCC7 0x49 +#define PCF50633_REG_MBCC8 0x4a +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_REG_MBCS3 0x4d + +enum pcf50633_reg_mbcc1 { + PCF50633_MBCC1_CHGENA = 0x01, /* Charger enable */ + PCF50633_MBCC1_AUTOSTOP = 0x02, + PCF50633_MBCC1_AUTORES = 0x04, /* automatic resume */ + PCF50633_MBCC1_RESUME = 0x08, /* explicit resume cmd */ + PCF50633_MBCC1_RESTART = 0x10, /* restart charging */ + PCF50633_MBCC1_PREWDTIME_60M = 0x20, /* max. precharging time */ + PCF50633_MBCC1_WDTIME_1H = 0x00, + PCF50633_MBCC1_WDTIME_2H = 0x40, + PCF50633_MBCC1_WDTIME_4H = 0x80, + PCF50633_MBCC1_WDTIME_6H = 0xc0, +}; +#define PCF50633_MBCC1_WDTIME_MASK 0xc0 + +enum pcf50633_reg_mbcc2 { + PCF50633_MBCC2_VBATCOND_2V7 = 0x00, + PCF50633_MBCC2_VBATCOND_2V85 = 0x01, + PCF50633_MBCC2_VBATCOND_3V0 = 0x02, + PCF50633_MBCC2_VBATCOND_3V15 = 0x03, + PCF50633_MBCC2_VMAX_4V = 0x00, + PCF50633_MBCC2_VMAX_4V20 = 0x28, + PCF50633_MBCC2_VRESDEBTIME_64S = 0x80, /* debounce time (32/64sec) */ +}; + +enum pcf50633_reg_mbcc7 { + PCF50633_MBCC7_USB_100mA = 0x00, + PCF50633_MBCC7_USB_500mA = 0x01, + PCF50633_MBCC7_USB_1000mA = 0x02, + PCF50633_MBCC7_USB_SUSPEND = 0x03, + PCF50633_MBCC7_BATTEMP_EN = 0x04, + PCF50633_MBCC7_BATSYSIMAX_1A6 = 0x00, + PCF50633_MBCC7_BATSYSIMAX_1A8 = 0x40, + PCF50633_MBCC7_BATSYSIMAX_2A0 = 0x80, + PCF50633_MBCC7_BATSYSIMAX_2A2 = 0xc0, +}; +#define PCF50633_MBCC7_USB_MASK 0x03 + +enum pcf50633_reg_mbcc8 { + PCF50633_MBCC8_USBENASUS = 0x10, +}; + +enum pcf50633_reg_mbcs1 { + PCF50633_MBCS1_USBPRES = 0x01, + PCF50633_MBCS1_USBOK = 0x02, + PCF50633_MBCS1_ADAPTPRES = 0x04, + PCF50633_MBCS1_ADAPTOK = 0x08, + PCF50633_MBCS1_TBAT_OK = 0x00, + PCF50633_MBCS1_TBAT_ABOVE = 0x10, + PCF50633_MBCS1_TBAT_BELOW = 0x20, + PCF50633_MBCS1_TBAT_UNDEF = 0x30, + PCF50633_MBCS1_PREWDTEXP = 0x40, + PCF50633_MBCS1_WDTEXP = 0x80, +}; + +enum pcf50633_reg_mbcs2_mbcmod { + PCF50633_MBCS2_MBC_PLAY = 0x00, + PCF50633_MBCS2_MBC_USB_PRE = 0x01, + PCF50633_MBCS2_MBC_USB_PRE_WAIT = 0x02, + PCF50633_MBCS2_MBC_USB_FAST = 0x03, + PCF50633_MBCS2_MBC_USB_FAST_WAIT = 0x04, + PCF50633_MBCS2_MBC_USB_SUSPEND = 0x05, + PCF50633_MBCS2_MBC_ADP_PRE = 0x06, + PCF50633_MBCS2_MBC_ADP_PRE_WAIT = 0x07, + PCF50633_MBCS2_MBC_ADP_FAST = 0x08, + PCF50633_MBCS2_MBC_ADP_FAST_WAIT = 0x09, + PCF50633_MBCS2_MBC_BAT_FULL = 0x0a, + PCF50633_MBCS2_MBC_HALT = 0x0b, +}; +#define PCF50633_MBCS2_MBC_MASK 0x0f +enum pcf50633_reg_mbcs2_chgstat { + PCF50633_MBCS2_CHGS_NONE = 0x00, + PCF50633_MBCS2_CHGS_ADAPTER = 0x10, + PCF50633_MBCS2_CHGS_USB = 0x20, + PCF50633_MBCS2_CHGS_BOTH = 0x30, +}; +#define PCF50633_MBCS2_RESSTAT_AUTO 0x40 + +enum pcf50633_reg_mbcs3 { + PCF50633_MBCS3_USBLIM_PLAY = 0x01, + PCF50633_MBCS3_USBLIM_CGH = 0x02, + PCF50633_MBCS3_TLIM_PLAY = 0x04, + PCF50633_MBCS3_TLIM_CHG = 0x08, + PCF50633_MBCS3_ILIM = 0x10, /* 1: Ibat > Icutoff */ + PCF50633_MBCS3_VLIM = 0x20, /* 1: Vbat == Vmax */ + PCF50633_MBCS3_VBATSTAT = 0x40, /* 1: Vbat > Vbatcond */ + PCF50633_MBCS3_VRES = 0x80, /* 1: Vbat > Vth(RES) */ +}; + +#define PCF50633_MBCC2_VBATCOND_MASK 0x03 +#define PCF50633_MBCC2_VMAX_MASK 0x3c + +/* Charger status */ +#define PCF50633_MBC_USB_ONLINE 0x01 +#define PCF50633_MBC_USB_ACTIVE 0x02 +#define 
PCF50633_MBC_ADAPTER_ONLINE 0x04 +#define PCF50633_MBC_ADAPTER_ACTIVE 0x08 + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma); + +int pcf50633_mbc_get_status(struct pcf50633 *); +void pcf50633_mbc_set_status(struct pcf50633 *, int what, int status); + +#endif + -- cgit v1.2.3 From 5ec271e745350c7df6a6ebca24b43cb7a10bfa4a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:51:01 +0100 Subject: regulator: PCF50633 pmic driver Changes from V1: - Removed support for suspend_enable & suspend_disable functions. Signed-off-by: Balaji Rao Cc: Andy Green Cc: Liam Girdwood Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/pcf50633-regulator.c | 329 +++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/pmic.h | 67 +++++++ 4 files changed, 404 insertions(+) create mode 100644 drivers/regulator/pcf50633-regulator.c create mode 100644 include/linux/mfd/pcf50633/pmic.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 39360e2a4540..e7e0cf102d6d 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -73,4 +73,11 @@ config REGULATOR_DA903X Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. +config REGULATOR_PCF50633 + tristate "PCF50633 regulator driver" + depends on MFD_PCF50633 + help + Say Y here to support the voltage regulators and convertors + on PCF50633 + endif diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 254d40c02ee8..61b30c6ddecc 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o +obj-$(CONFIG_REGULATOR_PCF50633) += pcf50633-regulator.o ccflags-$(CONFIG_REGULATOR_DEBUG) += -DDEBUG diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c new file mode 100644 index 000000000000..4cc85ec6e120 --- /dev/null +++ b/drivers/regulator/pcf50633-regulator.c @@ -0,0 +1,329 @@ +/* NXP PCF50633 PMIC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte and Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
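+ *
+ * Voltage encoding, summarized from the helpers in this file:
+ * AUTO uses reg = (mV - 625) / 25 over 1800..3800 mV, DOWN1/DOWN2 use
+ * reg = (mV - 625) / 25 over 625..3000 mV, and the LDOs and HCLDO use
+ * reg = (mV - 900) / 100 over 900..3600 mV; requests outside a range
+ * are clamped.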
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCF50633_REGULATOR(_name, _id) \ + { \ + .name = _name, \ + .id = _id, \ + .ops = &pcf50633_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + } + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +/* Bits from voltage value */ +static u8 auto_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 1800) + return 0; + if (millivolts > 3800) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 down_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 625) + return 0; + else if (millivolts > 3000) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 ldo_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 900) + return 0; + else if (millivolts > 3600) + return 0x1f; + + millivolts -= 900; + return millivolts / 100; +} + +/* Obtain voltage value from bits */ +static unsigned int auto_voltage_value(u8 bits) +{ + if (bits < 0x2f) + return 0; + + return 625 + (bits * 25); +} + + +static unsigned int down_voltage_value(u8 bits) +{ + return 625 + (bits * 25); +} + + +static unsigned int ldo_voltage_value(u8 bits) +{ + bits &= 0x1f; + + return 900 + (bits * 100); +} + +static int pcf50633_regulator_set_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts; + u8 volt_bits, regnr; + + pcf = rdev_get_drvdata(rdev); + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + millivolts = min_uV / 1000; + + regnr = pcf50633_regulator_registers[regulator_id]; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + volt_bits = auto_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN1: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN2: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + volt_bits = ldo_voltage_bits(millivolts); + break; + default: + return -EINVAL; + } + + return pcf50633_reg_write(pcf, regnr, volt_bits); +} + +static int pcf50633_regulator_get_voltage(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts, volt_bits; + u8 regnr; + + pcf = rdev_get_drvdata(rdev);; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + regnr = pcf50633_regulator_registers[regulator_id]; + + volt_bits = pcf50633_reg_read(pcf, regnr); + if (volt_bits < 0) + return -1; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + millivolts = auto_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN1: + millivolts = 
down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN2: + millivolts = down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + millivolts = ldo_voltage_value(volt_bits); + break; + default: + return -EINVAL; + } + + return millivolts * 1000; +} + +static int pcf50633_regulator_enable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* The *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, PCF50633_REGULATOR_ON, + PCF50633_REGULATOR_ON); +} + +static int pcf50633_regulator_disable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, + PCF50633_REGULATOR_ON, 0); +} + +static int pcf50633_regulator_is_enabled(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id = rdev_get_id(rdev); + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_read(pcf, regnr) & PCF50633_REGULATOR_ON; +} + +static struct regulator_ops pcf50633_regulator_ops = { + .set_voltage = pcf50633_regulator_set_voltage, + .get_voltage = pcf50633_regulator_get_voltage, + .enable = pcf50633_regulator_enable, + .disable = pcf50633_regulator_disable, + .is_enabled = pcf50633_regulator_is_enabled, +}; + +static struct regulator_desc regulators[] = { + [PCF50633_REGULATOR_AUTO] = + PCF50633_REGULATOR("auto", PCF50633_REGULATOR_AUTO), + [PCF50633_REGULATOR_DOWN1] = + PCF50633_REGULATOR("down1", PCF50633_REGULATOR_DOWN1), + [PCF50633_REGULATOR_DOWN2] = + PCF50633_REGULATOR("down2", PCF50633_REGULATOR_DOWN2), + [PCF50633_REGULATOR_LDO1] = + PCF50633_REGULATOR("ldo1", PCF50633_REGULATOR_LDO1), + [PCF50633_REGULATOR_LDO2] = + PCF50633_REGULATOR("ldo2", PCF50633_REGULATOR_LDO2), + [PCF50633_REGULATOR_LDO3] = + PCF50633_REGULATOR("ldo3", PCF50633_REGULATOR_LDO3), + [PCF50633_REGULATOR_LDO4] = + PCF50633_REGULATOR("ldo4", PCF50633_REGULATOR_LDO4), + [PCF50633_REGULATOR_LDO5] = + PCF50633_REGULATOR("ldo5", PCF50633_REGULATOR_LDO5), + [PCF50633_REGULATOR_LDO6] = + PCF50633_REGULATOR("ldo6", PCF50633_REGULATOR_LDO6), + [PCF50633_REGULATOR_HCLDO] = + PCF50633_REGULATOR("hcldo", PCF50633_REGULATOR_HCLDO), + [PCF50633_REGULATOR_MEMLDO] = + PCF50633_REGULATOR("memldo", PCF50633_REGULATOR_MEMLDO), +}; + +static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) +{ + struct regulator_dev *rdev; + struct pcf50633 *pcf; + + /* Already set by core driver */ + pcf = platform_get_drvdata(pdev); + + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (pcf->pdata->regulator_registered) + 
pcf->pdata->regulator_registered(pcf, pdev->id); + + return 0; +} + +static int __devexit pcf50633_regulator_remove(struct platform_device *pdev) +{ + struct regulator_dev *rdev = platform_get_drvdata(pdev); + + regulator_unregister(rdev); + + return 0; +} + +static struct platform_driver pcf50633_regulator_driver = { + .driver = { + .name = "pcf50633-regltr", + }, + .probe = pcf50633_regulator_probe, + .remove = __devexit_p(pcf50633_regulator_remove), +}; + +static int __init pcf50633_regulator_init(void) +{ + return platform_driver_register(&pcf50633_regulator_driver); +} +module_init(pcf50633_regulator_init); + +static void __exit pcf50633_regulator_exit(void) +{ + platform_driver_unregister(&pcf50633_regulator_driver); +} +module_exit(pcf50633_regulator_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 regulator driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-regulator"); diff --git a/include/linux/mfd/pcf50633/pmic.h b/include/linux/mfd/pcf50633/pmic.h new file mode 100644 index 000000000000..2d3dbe53b235 --- /dev/null +++ b/include/linux/mfd/pcf50633/pmic.h @@ -0,0 +1,67 @@ +#ifndef __LINUX_MFD_PCF50633_PMIC_H +#define __LINUX_MFD_PCF50633_PMIC_H + +#include +#include + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_AUTOENA 0x1b +#define PCF50633_REG_AUTOCTL 0x1c +#define PCF50633_REG_AUTOMXC 0x1d +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN1ENA 0x1f +#define PCF50633_REG_DOWN1CTL 0x20 +#define PCF50633_REG_DOWN1MXC 0x21 +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_DOWN2ENA 0x23 +#define PCF50633_REG_DOWN2CTL 0x24 +#define PCF50633_REG_DOWN2MXC 0x25 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_MEMLDOENA 0x27 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO1ENA 0x2e +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO2ENA 0x30 +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO3ENA 0x32 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO4ENA 0x34 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO5ENA 0x36 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_LDO6ENA 0x38 +#define PCF50633_REG_HCLDOOUT 0x39 +#define PCF50633_REG_HCLDOENA 0x3a +#define PCF50633_REG_HCLDOOVL 0x40 + +enum pcf50633_regulator_enable { + PCF50633_REGULATOR_ON = 0x01, + PCF50633_REGULATOR_ON_GPIO1 = 0x02, + PCF50633_REGULATOR_ON_GPIO2 = 0x04, + PCF50633_REGULATOR_ON_GPIO3 = 0x08, +}; +#define PCF50633_REGULATOR_ON_MASK 0x0f + +enum pcf50633_regulator_phase { + PCF50633_REGULATOR_ACTPH1 = 0x00, + PCF50633_REGULATOR_ACTPH2 = 0x10, + PCF50633_REGULATOR_ACTPH3 = 0x20, + PCF50633_REGULATOR_ACTPH4 = 0x30, +}; +#define PCF50633_REGULATOR_ACTPH_MASK 0x30 + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; +#endif + -- cgit v1.2.3 From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:08 -0800 Subject: smp_call_function_single(): be slightly less stupid If you do smp_call_function_single(expression-with-side-effects, ...) then expression-with-side-effects never gets evaluated on UP builds. As always, implementing it in C is the correct thing to do. While we're there, uninline it for size and possible header dependency reasons. 
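To make the pitfall concrete (an invented caller, not code from this patch): in the UP macro removed below, the 'wait' argument is never expanded at all, and the cpu expression only ever appears inside WARN_ON(), so a call like

	smp_call_function_single(pick_poll_cpu(), do_poll, NULL, poll_budget());

can silently drop the side effects of the hypothetical pick_poll_cpu() and poll_budget() helpers on uniprocessor builds, whereas the out-of-line C function evaluates every argument exactly once, as an ordinary call does.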
And create a new kernel/up.c, as a place in which to put uniprocessor-specific code and storage. It should mirror kernel/smp.c. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/smp.h | 13 +++---------- kernel/Makefile | 6 +++++- kernel/up.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 kernel/up.c (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index b82466968101..715196b09d67 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -24,6 +24,9 @@ struct call_single_data { /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; +int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, + int wait); + #ifdef CONFIG_SMP #include @@ -79,8 +82,6 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -140,14 +141,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, wait) \ -({ \ - WARN_ON(cpuid != 0); \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - 0; \ -}) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32f..2aebc4cd7878 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) +obj-y += smp.o +else +obj-y += up.o +endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 000000000000..ce62cc9e9f71 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,18 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include +#include +#include + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpu != 0); + local_irq_disable(); + (func)(info); + local_irq_enable(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 649274d993212e7c23c0cb734572c2311c200872 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 11 Jan 2009 00:20:39 -0800 Subject: net_dma: acquire/release dma channels on ifup/ifdown The recent dmaengine rework removed the capability to remove dma device driver modules while net_dma is active. Rather than notify dmaengine-clients that channels are trying to be removed, we now rely on clients to notify dmaengine when they no longer have a need for channels. Teach net_dma to release channels by taking dmaengine references at netdevice open and dropping references at netdevice close. Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams Signed-off-by: David S.
Miller --- include/linux/dmaengine.h | 10 ++++++++++ net/core/dev.c | 13 ++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 64dea2ab326c..c73f1e2b59b7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -270,8 +270,18 @@ struct dma_device { /* --- public DMA engine API --- */ +#ifdef CONFIG_DMA_ENGINE void dmaengine_get(void); void dmaengine_put(void); +#else +static inline void dmaengine_get(void) +{ +} +static inline void dmaengine_put(void) +{ +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 5f736f1ceeae..b715a55cccc4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1087,6 +1087,11 @@ int dev_open(struct net_device *dev) */ dev->flags |= IFF_UP; + /* + * Enable NET_DMA + */ + dmaengine_get(); + /* * Initialize multicasting status */ @@ -1164,6 +1169,11 @@ int dev_close(struct net_device *dev) */ call_netdevice_notifiers(NETDEV_DOWN, dev); + /* + * Shutdown NET_DMA + */ + dmaengine_put(); + return 0; } @@ -5151,9 +5161,6 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); - #ifdef CONFIG_NET_DMA - dmaengine_get(); - #endif rc = 0; out: return rc; -- cgit v1.2.3 From 57de16e612d63138bd2c618449af9d8312466e25 Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Sun, 26 Oct 2008 13:30:09 +0100 Subject: BUGFIX: used NULL pointer at ioctl(sk,IMGETDEVINFO,&devinfo) when devinfo.id not registered daxtar example # modprobe hfcsusb daxtar example # modprobe mISDN_l1loop daxtar example # ./misdnportinfo Found 3 devices id: 0 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: HFC-S_USB.1 id: 1 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.1 id: 2 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.2 daxtar example # rmmod hfcsusb daxtar example # ./misdnportinfo Found 2 devices *Segmentation* *fault* dmesg: [ 9914.939718] BUG: unable to handle kernel NULL pointer dereference at 000000d4 [ 9914.939721] IP: [] :mISDN_core:get_mdevice+0x19/0x22 [ 9914.939729] *pde = 00000000 [ 9914.939732] Oops: 0000 [#14] PREEMPT SMP [ 9914.939734] Modules linked in: mISDN_l1loop mISDN_core vmnet vmblock vmci vmmon coretemp w83627ehf hwmon_vid rfcomm l2cap blue tooth usbhid snd_usb_audio snd_usb_lib snd_rawmidi snd_hwdep fuse nvidia(P) uhci_hcd i2c_i801 ehci_hcd snd_hda_intel atl1 usbcore i2c_core parport_seria l [last unloaded: hfcsusb] [ 9914.939751] Pid: 29618, comm: misdnportinfo Tainted: P D (2.6.27.3 #5) [ 9914.939753] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 9914.939758] EIP is at get_mdevice+0x19/0x22 [mISDN_core] [ 9914.939760] EAX: 00000000 EBX: f8fa791c ECX: f6afaa58 EDX: f7960cf4 [ 9914.939762] ESI: 80044944 EDI: bfc2e62c EBP: bfc2e62c ESP: f5adbef4 [ 9914.939763] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 9914.939765] Process misdnportinfo (pid: 29618, ti=f5ada000 task=f6bec430 task.ti=f5ada000) [ 9914.939767] Stack: f8f9f4e0 00000000 f8f9f867 bfc2e62c 0000000a c02461e8 00200246 c042dde8 [ 9914.939771] 00000003 c042dde4 00000000 00000001 00200082 c0114775 00000000 00000000 [ 9914.939775] 00000003 f7088010 00200282 f8fa791c 80044944 bfc2e62c bfc2e62c c02f6615 [ 9914.939780] Call Trace: [ 9914.939782] [] _get_mdevice+0x0/0x18 [mISDN_core] [ 
From 57de16e612d63138bd2c618449af9d8312466e25 Mon Sep 17 00:00:00 2001
From: Martin Bachem
Date: Sun, 26 Oct 2008 13:30:09 +0100
Subject: BUGFIX: NULL pointer used at ioctl(sk, IMGETDEVINFO, &devinfo) when
 devinfo.id is not registered

daxtar example # modprobe hfcsusb
daxtar example # modprobe mISDN_l1loop
daxtar example # ./misdnportinfo
Found 3 devices
	id:		0
	Dprotocols:	00000006
	Bprotocols:	0000000e
	protocol:	0
	nrbchan:	2
	name:		HFC-S_USB.1
	id:		1
	Dprotocols:	00000006
	Bprotocols:	0000000e
	protocol:	0
	nrbchan:	2
	name:		mISDN_l1loop.1
	id:		2
	Dprotocols:	00000006
	Bprotocols:	0000000e
	protocol:	0
	nrbchan:	2
	name:		mISDN_l1loop.2
daxtar example # rmmod hfcsusb
daxtar example # ./misdnportinfo
Found 2 devices
*Segmentation* *fault*

dmesg:
[ 9914.939718] BUG: unable to handle kernel NULL pointer dereference at 000000d4
[ 9914.939721] IP: [] :mISDN_core:get_mdevice+0x19/0x22
[ 9914.939729] *pde = 00000000
[ 9914.939732] Oops: 0000 [#14] PREEMPT SMP
[ 9914.939734] Modules linked in: mISDN_l1loop mISDN_core vmnet vmblock vmci vmmon coretemp w83627ehf hwmon_vid rfcomm l2cap bluetooth usbhid snd_usb_audio snd_usb_lib snd_rawmidi snd_hwdep fuse nvidia(P) uhci_hcd i2c_i801 ehci_hcd snd_hda_intel atl1 usbcore i2c_core parport_serial [last unloaded: hfcsusb]
[ 9914.939751] Pid: 29618, comm: misdnportinfo Tainted: P D (2.6.27.3 #5)
[ 9914.939753] EIP: 0060:[] EFLAGS: 00210246 CPU: 0
[ 9914.939758] EIP is at get_mdevice+0x19/0x22 [mISDN_core]
[ 9914.939760] EAX: 00000000 EBX: f8fa791c ECX: f6afaa58 EDX: f7960cf4
[ 9914.939762] ESI: 80044944 EDI: bfc2e62c EBP: bfc2e62c ESP: f5adbef4
[ 9914.939763] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[ 9914.939765] Process misdnportinfo (pid: 29618, ti=f5ada000 task=f6bec430 task.ti=f5ada000)
[ 9914.939767] Stack: f8f9f4e0 00000000 f8f9f867 bfc2e62c 0000000a c02461e8 00200246 c042dde8
[ 9914.939771]        00000003 c042dde4 00000000 00000001 00200082 c0114775 00000000 00000000
[ 9914.939775]        00000003 f7088010 00200282 f8fa791c 80044944 bfc2e62c bfc2e62c c02f6615
[ 9914.939780] Call Trace:
[ 9914.939782]  [] _get_mdevice+0x0/0x18 [mISDN_core]
[ 9914.939789]  [] base_sock_ioctl+0x7a/0x129 [mISDN_core]
[ 9914.939789]  [] opost+0x171/0x182
[ 9914.939789]  [] __wake_up+0x29/0x39
[ 9914.939789]  [] sock_ioctl+0x1b5/0x1d9
[ 9914.939789]  [] sock_ioctl+0x0/0x1d9
[ 9914.939789]  [] vfs_ioctl+0x1c/0x5d
[ 9914.939789]  [] do_vfs_ioctl+0x23e/0x24e
[ 9914.939789]  [] sys_ioctl+0x2c/0x45
[ 9914.939789]  [] sysenter_do_call+0x12/0x21
[ 9914.939789]  [] pci_fixup_i450gx+0x4e/0x56
[ 9914.939789] =======================
[ 9914.939789] Code: 00 68 02 f0 f9 f8 e8 ae b4 2c c7 8b 44 24 04 5a 59 c3 83 ec 04 31 d2 89 04 24 89 e1 b8 ac df fa f8 68 e0 f4 f9 f8 e8 4a b5 2c c7 <8b> 80 d4 00 00 00 5a 59 c3 53 89 cb 8d 90 9c 00 00 00 89 c8 e8
[ 9914.939789] EIP: [] get_mdevice+0x19/0x22 [mISDN_core] SS:ESP 0068:f5adbef4
[ 9914.939858] ---[ end trace 50e18a715b019424 ]---

Signed-off-by: Martin Bachem
Signed-off-by: Karsten Keil
---
 include/linux/mISDNif.h |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h
index 557477ac3d5b..5da3d95b27f1 100644
--- a/include/linux/mISDNif.h
+++ b/include/linux/mISDNif.h
@@ -559,7 +559,10 @@ extern void mISDN_unregister_clock(struct mISDNclock *);
 
 static inline struct mISDNdevice *dev_to_mISDN(struct device *dev)
 {
-	return dev_get_drvdata(dev);
+	if (dev)
+		return dev_get_drvdata(dev);
+	else
+		return NULL;
 }
 
 extern void set_channel_address(struct mISDNchannel *, u_int, u_int);
--
cgit v1.2.3
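The root cause: once `rmmod hfcsusb` unregisters device id 0, the device
lookup behind get_mdevice() legitimately returns NULL, and the old
dev_to_mISDN() dereferenced that NULL via dev_get_drvdata() -- the 000000d4
offset in the oops. A simplified model of the failing path; find_dev() is a
hypothetical stand-in for the real class-based lookup, not a kernel API:

#include <linux/device.h>
#include <linux/mISDNif.h>

/* Hypothetical: returns the device for id, or NULL if unregistered. */
extern struct device *find_dev(unsigned int id);

static struct mISDNdevice *example_get_device(unsigned int id)
{
	struct device *dev = find_dev(id);	/* NULL after rmmod */

	/*
	 * With the fix, dev_to_mISDN(NULL) returns NULL instead of
	 * reading drvdata out of a NULL struct device, so the ioctl
	 * handler can fail cleanly rather than oops.
	 */
	return dev_to_mISDN(dev);
}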
"dio"); - device_register(&dio_bus.dev); + error = device_register(&dio_bus.dev); + if (error) { + pr_err("DIO: Error registering dio_bus\n"); + return error; + } /* Request all resources */ dio_bus.num_resources = (hp300_model == HP_320 ? 1 : 2); @@ -252,8 +257,15 @@ static int __init dio_init(void) if (scode >= DIOII_SCBASE) iounmap(va); - device_register(&dev->dev); - dio_create_sysfs_dev_files(dev); + error = device_register(&dev->dev); + if (error) { + pr_err("DIO: Error registering device %s\n", + dev->name); + continue; + } + error = dio_create_sysfs_dev_files(dev); + if (error) + dev_err(&dev->dev, "Error creating sysfs files\n"); } return 0; } diff --git a/include/linux/dio.h b/include/linux/dio.h index 1e65ebc2a3db..b2dd31ca1710 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -241,7 +241,7 @@ struct dio_driver { extern int dio_find(int deviceid); extern unsigned long dio_scodetophysaddr(int scode); -extern void dio_create_sysfs_dev_files(struct dio_dev *); +extern int dio_create_sysfs_dev_files(struct dio_dev *); /* New-style probing */ extern int dio_register_driver(struct dio_driver *); -- cgit v1.2.3