From d7e5a5462f68270ed66efff22b1981be57a28c19 Mon Sep 17 00:00:00 2001
From: Segher Boessenkool <segher@kernel.crashing.org>
Date: Wed, 2 May 2007 12:18:41 +0200
Subject: [RSLIB] Support non-canonical GF representations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the CAFÉ NAND controller, we need to support non-canonical
representations of the Galois field. Allow the caller to provide its own
function for generating the field, and CAFÉ can use rslib instead of its
own implementation.

Signed-off-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/rslib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rslib.h b/include/linux/rslib.h
index ace25acfdc97..746580c1939c 100644
--- a/include/linux/rslib.h
+++ b/include/linux/rslib.h
@@ -34,6 +34,7 @@
  * @prim:	Primitive element, index form
  * @iprim:	prim-th root of 1, index form
  * @gfpoly:	The primitive generator polynominal
+ * @gffunc:	Function to generate the field, if non-canonical representation
  * @users:	Users of this structure
  * @list:	List entry for the rs control list
 */
@@ -48,6 +49,7 @@ struct rs_control {
 	int 		prim;
 	int 		iprim;
 	int		gfpoly;
+	int		(*gffunc)(int);
 	int		users;
 	struct list_head list;
 };
@@ -77,6 +79,8 @@ int decode_rs16(struct rs_control *rs, uint16_t *data, uint16_t *par, int len,
 /* Create or get a matching rs control structure */
 struct rs_control *init_rs(int symsize, int gfpoly, int fcr, int prim,
 			   int nroots);
+struct rs_control *init_rs_non_canonical(int symsize, int (*func)(int),
+                                         int fcr, int prim, int nroots);
 
 /* Release a rs control structure */
 void free_rs(struct rs_control *rs);
-- 
cgit v1.2.3


From 972edcb79ec8c8512ed5b29ca6718065328d6992 Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Sun, 6 May 2007 18:46:57 +0400
Subject: [MTD] [NAND] platform NAND driver: update header

This patch extends nand.h in order to enable platform NAND driver.

Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/mtd/nand.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index cf197ad62da6..d2365c8dcacc 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -560,6 +560,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @chip_delay:		R/B delay value in us
  * @options:		Option flags, e.g. 16bit buswidth
  * @ecclayout:		ecc layout info structure
+ * @part_probe_types:	NULL-terminated array of probe types
  * @priv:		hardware controller specific settings
  */
 struct platform_nand_chip {
@@ -570,6 +571,7 @@ struct platform_nand_chip {
 	struct nand_ecclayout	*ecclayout;
 	int			chip_delay;
 	unsigned int		options;
+	const char		**part_probe_types;
 	void			*priv;
 };
 
@@ -578,6 +580,8 @@ struct platform_nand_chip {
  * @hwcontrol:		platform specific hardware control structure
  * @dev_ready:		platform specific function to read ready/busy pin
  * @select_chip:	platform specific chip select function
+ * @cmd_ctrl:		platform specific function for controlling
+ *			ALE/CLE/nCE. Also used to write command and address
  * @priv:		private data to transport driver specific settings
  *
  * All fields are optional and depend on the hardware driver requirements
@@ -586,9 +590,21 @@ struct platform_nand_ctrl {
 	void		(*hwcontrol)(struct mtd_info *mtd, int cmd);
 	int		(*dev_ready)(struct mtd_info *mtd);
 	void		(*select_chip)(struct mtd_info *mtd, int chip);
+	void		(*cmd_ctrl)(struct mtd_info *mtd, int dat,
+				    unsigned int ctrl);
 	void		*priv;
 };
 
+/**
+ * struct platform_nand_data - container structure for platform-specific data
+ * @chip:		chip level chip structure
+ * @ctrl:		controller level device structure
+ */
+struct platform_nand_data {
+	struct platform_nand_chip	chip;
+	struct platform_nand_ctrl	ctrl;
+};
+
 /* Some helpers to access the data structures */
 static inline
 struct platform_nand_chip *get_platform_nandchip(struct mtd_info *mtd)
-- 
cgit v1.2.3


From 225c7b1feef1b41170f7037a5b10a65cd8a42c54 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Tue, 8 May 2007 18:00:38 -0700
Subject: IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters

Add an InfiniBand driver for Mellanox ConnectX adapters.  Because
these adapters can also be used as ethernet NICs and Fibre Channel
HBAs, the driver is split into two modules:

  mlx4_core: Handles low-level things like device initialization and
    processing firmware commands.  Also controls resource allocation
    so that the InfiniBand, ethernet and FC functions can share a
    device without stepping on each other.

  mlx4_ib: Handles InfiniBand-specific things; plugs into the
    InfiniBand midlayer.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/Kconfig            |    2 +
 drivers/infiniband/Makefile           |    1 +
 drivers/infiniband/hw/mlx4/Kconfig    |    9 +
 drivers/infiniband/hw/mlx4/Makefile   |    3 +
 drivers/infiniband/hw/mlx4/ah.c       |  100 +++
 drivers/infiniband/hw/mlx4/cq.c       |  525 +++++++++++++
 drivers/infiniband/hw/mlx4/doorbell.c |  216 ++++++
 drivers/infiniband/hw/mlx4/mad.c      |  339 +++++++++
 drivers/infiniband/hw/mlx4/main.c     |  651 +++++++++++++++++
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |  285 ++++++++
 drivers/infiniband/hw/mlx4/mr.c       |  184 +++++
 drivers/infiniband/hw/mlx4/qp.c       | 1294 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx4/srq.c      |  334 +++++++++
 drivers/infiniband/hw/mlx4/user.h     |   92 +++
 drivers/net/Kconfig                   |   14 +
 drivers/net/Makefile                  |    1 +
 drivers/net/mlx4/Makefile             |    4 +
 drivers/net/mlx4/alloc.c              |  179 +++++
 drivers/net/mlx4/catas.c              |   70 ++
 drivers/net/mlx4/cmd.c                |  429 +++++++++++
 drivers/net/mlx4/cq.c                 |  254 +++++++
 drivers/net/mlx4/eq.c                 |  696 ++++++++++++++++++
 drivers/net/mlx4/fw.c                 |  775 ++++++++++++++++++++
 drivers/net/mlx4/fw.h                 |  167 +++++
 drivers/net/mlx4/icm.c                |  379 ++++++++++
 drivers/net/mlx4/icm.h                |  135 ++++
 drivers/net/mlx4/intf.c               |  165 +++++
 drivers/net/mlx4/main.c               |  936 ++++++++++++++++++++++++
 drivers/net/mlx4/mcg.c                |  380 ++++++++++
 drivers/net/mlx4/mlx4.h               |  348 +++++++++
 drivers/net/mlx4/mr.c                 |  479 ++++++++++++
 drivers/net/mlx4/pd.c                 |  102 +++
 drivers/net/mlx4/profile.c            |  238 ++++++
 drivers/net/mlx4/qp.c                 |  280 +++++++
 drivers/net/mlx4/reset.c              |  181 +++++
 drivers/net/mlx4/srq.c                |  227 ++++++
 include/linux/mlx4/cmd.h              |  178 +++++
 include/linux/mlx4/cq.h               |  123 ++++
 include/linux/mlx4/device.h           |  331 +++++++++
 include/linux/mlx4/doorbell.h         |   97 +++
 include/linux/mlx4/driver.h           |   59 ++
 include/linux/mlx4/qp.h               |  288 ++++++++
 include/linux/mlx4/srq.h              |   42 ++
 43 files changed, 11592 insertions(+)
 create mode 100644 drivers/infiniband/hw/mlx4/Kconfig
 create mode 100644 drivers/infiniband/hw/mlx4/Makefile
 create mode 100644 drivers/infiniband/hw/mlx4/ah.c
 create mode 100644 drivers/infiniband/hw/mlx4/cq.c
 create mode 100644 drivers/infiniband/hw/mlx4/doorbell.c
 create mode 100644 drivers/infiniband/hw/mlx4/mad.c
 create mode 100644 drivers/infiniband/hw/mlx4/main.c
 create mode 100644 drivers/infiniband/hw/mlx4/mlx4_ib.h
 create mode 100644 drivers/infiniband/hw/mlx4/mr.c
 create mode 100644 drivers/infiniband/hw/mlx4/qp.c
 create mode 100644 drivers/infiniband/hw/mlx4/srq.c
 create mode 100644 drivers/infiniband/hw/mlx4/user.h
 create mode 100644 drivers/net/mlx4/Makefile
 create mode 100644 drivers/net/mlx4/alloc.c
 create mode 100644 drivers/net/mlx4/catas.c
 create mode 100644 drivers/net/mlx4/cmd.c
 create mode 100644 drivers/net/mlx4/cq.c
 create mode 100644 drivers/net/mlx4/eq.c
 create mode 100644 drivers/net/mlx4/fw.c
 create mode 100644 drivers/net/mlx4/fw.h
 create mode 100644 drivers/net/mlx4/icm.c
 create mode 100644 drivers/net/mlx4/icm.h
 create mode 100644 drivers/net/mlx4/intf.c
 create mode 100644 drivers/net/mlx4/main.c
 create mode 100644 drivers/net/mlx4/mcg.c
 create mode 100644 drivers/net/mlx4/mlx4.h
 create mode 100644 drivers/net/mlx4/mr.c
 create mode 100644 drivers/net/mlx4/pd.c
 create mode 100644 drivers/net/mlx4/profile.c
 create mode 100644 drivers/net/mlx4/qp.c
 create mode 100644 drivers/net/mlx4/reset.c
 create mode 100644 drivers/net/mlx4/srq.c
 create mode 100644 include/linux/mlx4/cmd.h
 create mode 100644 include/linux/mlx4/cq.h
 create mode 100644 include/linux/mlx4/device.h
 create mode 100644 include/linux/mlx4/doorbell.h
 create mode 100644 include/linux/mlx4/driver.h
 create mode 100644 include/linux/mlx4/qp.h
 create mode 100644 include/linux/mlx4/srq.h

(limited to 'include/linux')

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 82afba5c0bf4..37deaae49190 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -45,6 +45,8 @@ source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 
+source "drivers/infiniband/hw/mlx4/Kconfig"
+
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
 source "drivers/infiniband/ulp/srp/Kconfig"
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index da2066c4f22c..75f325e40b54 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_IPATH)		+= hw/ipath/
 obj-$(CONFIG_INFINIBAND_EHCA)		+= hw/ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)	+= hw/amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
+obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)		+= ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)		+= ulp/iser/
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 000000000000..b8912cdb9663
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,9 @@
+config MLX4_INFINIBAND
+	tristate "Mellanox ConnectX HCA support"
+	depends on INFINIBAND
+	select MLX4_CORE
+	---help---
+	  This driver provides low-level InfiniBand support for
+	  Mellanox ConnectX PCI Express host channel adapters (HCAs).
+	  This is required to use InfiniBand protocols such as
+	  IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 000000000000..70f09c7826da
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
+
+mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 000000000000..c75ac9463e20
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+	struct mlx4_ib_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	memset(&ah->av, 0, sizeof ah->av);
+
+	ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.g_slid  = ah_attr->src_path_bits;
+	ah->av.dlid    = cpu_to_be16(ah_attr->dlid);
+	if (ah_attr->static_rate) {
+		ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.stat_rate;
+	}
+	ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		ah->av.g_slid   |= 0x80;
+		ah->av.gid_index = ah_attr->grh.sgid_index;
+		ah->av.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16);
+	}
+
+	return &ah->ibah;
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah = to_mah(ibah);
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid	       = be16_to_cpu(ah->av.dlid);
+	ah_attr->sl	       = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
+	ah_attr->port_num      = be32_to_cpu(ah->av.port_pd) >> 24;
+	if (ah->av.stat_rate)
+		ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET;
+	ah_attr->src_path_bits = ah->av.g_slid & 0x7F;
+
+	if (mlx4_ib_ah_grh_present(ah)) {
+		ah_attr->ah_flags = IB_AH_GRH;
+
+		ah_attr->grh.traffic_class =
+			be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20;
+		ah_attr->grh.flow_label =
+			be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff;
+		ah_attr->grh.hop_limit  = ah->av.hop_limit;
+		ah_attr->grh.sgid_index = ah->av.gid_index;
+		memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16);
+	}
+
+	return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+	kfree(to_mah(ah));
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 000000000000..b2a290c6703a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+	ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_cq *ibcq;
+
+	if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+		printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+		       "on CQ %06x\n", type, cq->cqn);
+		return;
+	}
+
+	ibcq = &to_mibcq(cq)->ibcq;
+	if (ibcq->event_handler) {
+		event.device     = ibcq->device;
+		event.event      = IB_EVENT_CQ_ERR;
+		event.element.cq = ibcq;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+	int offset = n * sizeof (struct mlx4_cqe);
+
+	if (buf->buf.nbufs == 1)
+		return buf->buf.u.direct.buf + offset;
+	else
+		return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+
+	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_cq *cq;
+	struct mlx4_uar *uar;
+	int buf_size;
+	int err;
+
+	if (entries < 1 || entries > dev->dev->caps.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	entries      = roundup_pow_of_two(entries + 1);
+	cq->ibcq.cqe = entries - 1;
+	buf_size     = entries * sizeof (struct mlx4_cqe);
+	spin_lock_init(&cq->lock);
+
+	if (context) {
+		struct mlx4_ib_create_cq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_cq;
+		}
+
+		cq->umem = ib_umem_get(context, ucmd.buf_addr, buf_size,
+				       IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(cq->umem)) {
+			err = PTR_ERR(cq->umem);
+			goto err_cq;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(cq->umem),
+				    ilog2(cq->umem->page_size), &cq->buf.mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &cq->buf.mtt, cq->umem);
+		if (err)
+			goto err_mtt;
+
+		err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+					  &cq->db);
+		if (err)
+			goto err_mtt;
+
+		uar = &to_mucontext(context)->uar;
+	} else {
+		err = mlx4_ib_db_alloc(dev, &cq->db, 1);
+		if (err)
+			goto err_cq;
+
+		cq->mcq.set_ci_db  = cq->db.db;
+		cq->mcq.arm_db     = cq->db.db + 1;
+		*cq->mcq.set_ci_db = 0;
+		*cq->mcq.arm_db    = 0;
+
+		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &cq->buf.buf)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		err = mlx4_mtt_init(dev->dev, cq->buf.buf.npages, cq->buf.buf.page_shift,
+				    &cq->buf.mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &cq->buf.mtt, &cq->buf.buf);
+		if (err)
+			goto err_mtt;
+
+		uar = &dev->priv_uar;
+	}
+
+	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+			    cq->db.dma, &cq->mcq);
+	if (err)
+		goto err_dbmap;
+
+	cq->mcq.comp  = mlx4_ib_cq_comp;
+	cq->mcq.event = mlx4_ib_cq_event;
+
+	if (context)
+		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+			err = -EFAULT;
+			goto err_dbmap;
+		}
+
+	return &cq->ibcq;
+
+err_dbmap:
+	if (context)
+		mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+err_buf:
+	if (context)
+		ib_umem_release(cq->umem);
+	else
+		mlx4_buf_free(dev->dev, entries * sizeof (struct mlx4_cqe),
+			      &cq->buf.buf);
+
+err_db:
+	if (!context)
+		mlx4_ib_db_free(dev, &cq->db);
+
+err_cq:
+	kfree(cq);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+	struct mlx4_ib_dev *dev = to_mdev(cq->device);
+	struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+	mlx4_cq_free(dev->dev, &mcq->mcq);
+	mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+	if (cq->uobject) {
+		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+		ib_umem_release(mcq->umem);
+	} else {
+		mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe),
+			      &mcq->buf.buf);
+		mlx4_ib_db_free(dev, &mcq->db);
+	}
+
+	kfree(mcq);
+
+	return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+	__be32 *buf = cqe;
+
+	printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+	       be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+	       be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+				     struct ib_wc *wc)
+{
+	if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+		printk(KERN_DEBUG "local QP operation err "
+		       "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+		       "opcode = %02x)\n",
+		       be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+		       cqe->vendor_err_syndrome,
+		       cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+		dump_cqe(cqe);
+	}
+
+	switch (cqe->syndrome) {
+	case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+		wc->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+		wc->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+		wc->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+		wc->status = IB_WC_MW_BIND_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+		wc->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+		wc->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+		wc->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		wc->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+		wc->status = IB_WC_REM_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+		wc->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+		wc->status = IB_WC_REM_ABORT_ERR;
+		break;
+	default:
+		wc->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+			    struct mlx4_ib_qp **cur_qp,
+			    struct ib_wc *wc)
+{
+	struct mlx4_cqe *cqe;
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_wq *wq;
+	struct mlx4_ib_srq *srq;
+	int is_send;
+	int is_error;
+	u16 wqe_ctr;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	++cq->mcq.cons_index;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+		MLX4_CQE_OPCODE_ERROR;
+
+	if (!*cur_qp ||
+	    (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+				       be32_to_cpu(cqe->my_qpn));
+		if (unlikely(!mqp)) {
+			printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+			       cq->mcq.cqn, be32_to_cpu(cqe->my_qpn) & 0xffffff);
+			return -EINVAL;
+		}
+
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp = &(*cur_qp)->ibqp;
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wq->tail += wqe_ctr - (u16) wq->tail;
+		wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+		++wq->tail;
+	} else if ((*cur_qp)->ibqp.srq) {
+		srq = to_msrq((*cur_qp)->ibqp.srq);
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+	} else {
+		wq	  = &(*cur_qp)->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+		++wq->tail;
+	}
+
+	if (unlikely(is_error)) {
+		mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+		return 0;
+	}
+
+	wc->status = IB_WC_SUCCESS;
+
+	if (is_send) {
+		wc->wc_flags = 0;
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_OPCODE_RDMA_WRITE_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+		case MLX4_OPCODE_RDMA_WRITE:
+			wc->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MLX4_OPCODE_SEND_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+		case MLX4_OPCODE_SEND:
+			wc->opcode    = IB_WC_SEND;
+			break;
+		case MLX4_OPCODE_RDMA_READ:
+			wc->opcode    = IB_WC_SEND;
+			wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MLX4_OPCODE_ATOMIC_CS:
+			wc->opcode    = IB_WC_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_ATOMIC_FA:
+			wc->opcode    = IB_WC_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_BIND_MW:
+			wc->opcode    = IB_WC_BIND_MW;
+			break;
+		}
+	} else {
+		wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+			wc->opcode   = IB_WC_RECV_RDMA_WITH_IMM;
+			wc->wc_flags = IB_WC_WITH_IMM;
+			wc->imm_data = cqe->immed_rss_invalid;
+			break;
+		case MLX4_RECV_OPCODE_SEND:
+			wc->opcode   = IB_WC_RECV;
+			wc->wc_flags = 0;
+			break;
+		case MLX4_RECV_OPCODE_SEND_IMM:
+			wc->opcode   = IB_WC_RECV;
+			wc->wc_flags = IB_WC_WITH_IMM;
+			wc->imm_data = cqe->immed_rss_invalid;
+			break;
+		}
+
+		wc->slid	   = be16_to_cpu(cqe->rlid);
+		wc->sl		   = cqe->sl >> 4;
+		wc->src_qp	   = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff;
+		wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f;
+		wc->wc_flags      |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ?
+			IB_WC_GRH : 0;
+		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) >> 16;
+	}
+
+	return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+	struct mlx4_ib_qp *cur_qp = NULL;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+		if (err)
+			break;
+	}
+
+	if (npolled)
+		mlx4_cq_set_ci(&cq->mcq);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (err == 0 || err == -EAGAIN)
+		return npolled;
+	else
+		return err;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+		    to_mdev(ibcq->device)->uar_map,
+		    MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+	return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	u32 prod_index;
+	int nfreed = 0;
+	struct mlx4_cqe *cqe;
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+			break;
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
+			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+			++nfreed;
+		} else if (nfreed)
+			memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+			       cqe, sizeof *cqe);
+	}
+
+	if (nfreed) {
+		cq->mcq.cons_index += nfreed;
+		/*
+		 * Make sure update of buffer contents is done before
+		 * updating consumer index.
+		 */
+		wmb();
+		mlx4_cq_set_ci(&cq->mcq);
+	}
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	spin_lock_irq(&cq->lock);
+	__mlx4_ib_cq_clean(cq, qpn, srq);
+	spin_unlock_irq(&cq->lock);
+}
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 000000000000..1c36087aef14
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_db_pgdir {
+	struct list_head	list;
+	DECLARE_BITMAP(order0, MLX4_IB_DB_PER_PAGE);
+	DECLARE_BITMAP(order1, MLX4_IB_DB_PER_PAGE / 2);
+	unsigned long	       *bits[2];
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+static struct mlx4_ib_db_pgdir *mlx4_ib_alloc_db_pgdir(struct mlx4_ib_dev *dev)
+{
+	struct mlx4_ib_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->order1, MLX4_IB_DB_PER_PAGE / 2);
+	pgdir->bits[0] = pgdir->order0;
+	pgdir->bits[1] = pgdir->order1;
+	pgdir->db_page = dma_alloc_coherent(dev->ib_dev.dma_device,
+					    PAGE_SIZE, &pgdir->db_dma,
+					    GFP_KERNEL);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx4_ib_alloc_db_from_pgdir(struct mlx4_ib_db_pgdir *pgdir,
+				       struct mlx4_ib_db *db, int order)
+{
+	int o;
+	int i;
+
+	for (o = order; o <= 1; ++o) {
+		i = find_first_bit(pgdir->bits[o], MLX4_IB_DB_PER_PAGE >> o);
+		if (i < MLX4_IB_DB_PER_PAGE >> o)
+			goto found;
+	}
+
+	return -ENOMEM;
+
+found:
+	clear_bit(i, pgdir->bits[o]);
+
+	i <<= o;
+
+	if (o > order)
+		set_bit(i ^ 1, pgdir->bits[order]);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	db->db      = pgdir->db_page + db->index;
+	db->dma     = pgdir->db_dma  + db->index * 4;
+	db->order   = order;
+
+	return 0;
+}
+
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order)
+{
+	struct mlx4_ib_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&dev->pgdir_mutex);
+
+	list_for_each_entry(pgdir, &dev->pgdir_list, list)
+		if (!mlx4_ib_alloc_db_from_pgdir(pgdir, db, order))
+			goto out;
+
+	pgdir = mlx4_ib_alloc_db_pgdir(dev);
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &dev->pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx4_ib_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+	mutex_unlock(&dev->pgdir_mutex);
+
+	return ret;
+}
+
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db)
+{
+	int o;
+	int i;
+
+	mutex_lock(&dev->pgdir_mutex);
+
+	o = db->order;
+	i = db->index;
+
+	if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+		clear_bit(i ^ 1, db->u.pgdir->order0);
+		++o;
+	}
+
+	i >>= o;
+	set_bit(i, db->u.pgdir->bits[o]);
+
+	if (bitmap_full(db->u.pgdir->order1, MLX4_IB_DB_PER_PAGE / 2)) {
+		dma_free_coherent(dev->ib_dev.dma_device, PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&dev->pgdir_mutex);
+}
+
+struct mlx4_ib_user_db_page {
+	struct list_head	list;
+	struct ib_umem	       *umem;
+	unsigned long		user_virt;
+	int			refcnt;
+};
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_ib_db *db)
+{
+	struct mlx4_ib_user_db_page *page;
+	struct ib_umem_chunk *chunk;
+	int err = 0;
+
+	mutex_lock(&context->db_page_mutex);
+
+	list_for_each_entry(page, &context->db_page_list, list)
+		if (page->user_virt == (virt & PAGE_MASK))
+			goto found;
+
+	page = kmalloc(sizeof *page, GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	page->user_virt = (virt & PAGE_MASK);
+	page->refcnt    = 0;
+	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+				      PAGE_SIZE, 0);
+	if (IS_ERR(page->umem)) {
+		err = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &context->db_page_list);
+
+found:
+	chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+	db->dma		= sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+	db->u.user_page = page;
+	++page->refcnt;
+
+out:
+	mutex_unlock(&context->db_page_mutex);
+
+	return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db)
+{
+	mutex_lock(&context->db_page_mutex);
+
+	if (!--db->u.user_page->refcnt) {
+		list_del(&db->u.user_page->list);
+		ib_umem_release(db->u.user_page->umem);
+		kfree(db->u.user_page);
+	}
+
+	mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 000000000000..333091787c5f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+
+enum {
+	MLX4_IB_VENDOR_CLASS1 = 0x9,
+	MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	void *inbox;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+
+	inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+	inbox = inmailbox->buf;
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	memcpy(inbox, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	if (in_wc) {
+		struct {
+			__be32		my_qpn;
+			u32		reserved1;
+			__be32		rqpn;
+			u8		sl;
+			u8		g_path;
+			u16		reserved2[2];
+			__be16		pkey;
+			u32		reserved3[11];
+			u8		grh[40];
+		} *ext_info;
+
+		memset(inbox + 256, 0, 256);
+		ext_info = inbox + 256;
+
+		ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+		ext_info->rqpn   = cpu_to_be32(in_wc->src_qp);
+		ext_info->sl     = in_wc->sl << 4;
+		ext_info->g_path = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		ext_info->pkey   = cpu_to_be16(in_wc->pkey_index);
+
+		if (in_grh)
+			memcpy(ext_info->grh, in_grh, 40);
+
+		op_modifier |= 0x4;
+
+		in_modifier |= in_wc->slid << 16;
+	}
+
+	err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
+			   in_modifier, op_modifier,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+
+	if (!err);
+		memcpy(response_mad, outmailbox->buf, 256);
+
+	mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+	return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_ah *new_ah;
+	struct ib_ah_attr ah_attr;
+
+	if (!dev->send_agent[port_num - 1][0])
+		return;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = lid;
+	ah_attr.sl       = sl;
+	ah_attr.port_num = port_num;
+
+	new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+			      &ah_attr);
+	if (IS_ERR(new_ah))
+		return;
+
+	spin_lock(&dev->sm_lock);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = new_ah;
+	spin_unlock(&dev->sm_lock);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
+{
+	struct ib_event event;
+
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+			struct ib_port_info *pinfo =
+				(struct ib_port_info *) ((struct ib_smp *) mad)->data;
+
+			update_sm_ah(to_mdev(ibdev), port_num,
+				     be16_to_cpu(pinfo->sm_lid),
+				     pinfo->neighbormtu_mastersmsl & 0xf);
+
+			event.device	       = ibdev;
+			event.element.port_num = port_num;
+
+			if(pinfo->clientrereg_resv_subnetto & 0x80)
+				event.event    = IB_EVENT_CLIENT_REREGISTER;
+			else
+				event.event    = IB_EVENT_LID_CHANGE;
+
+			ib_dispatch_event(&event);
+		}
+
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+			event.device	       = ibdev;
+			event.event	       = IB_EVENT_PKEY_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+	}
+}
+
+static void node_desc_override(struct ib_device *dev,
+			       struct ib_mad *mad)
+{
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+	    mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+		spin_lock(&to_mdev(dev)->sm_lock);
+		memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+		spin_unlock(&to_mdev(dev)->sm_lock);
+	}
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+
+	if (agent) {
+		send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+					      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+		/*
+		 * We rely here on the fact that MLX QPs don't use the
+		 * address handle after the send is posted (this is
+		 * wrong following the IB spec strictly, but we know
+		 * it's OK for our devices).
+		 */
+		spin_lock(&dev->sm_lock);
+		memcpy(send_buf->mad, mad, sizeof *mad);
+		if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+			ret = ib_post_send_mad(send_buf, NULL);
+		else
+			ret = -EINVAL;
+		spin_unlock(&dev->sm_lock);
+
+		if (ret)
+			ib_free_send_mad(send_buf);
+	}
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	u16 slid;
+	int err;
+
+	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+		     IB_SMP_ATTR_VENDOR_MASK))
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev),
+			   mad_flags & IB_MAD_IGNORE_MKEY,
+			   mad_flags & IB_MAD_IGNORE_BKEY,
+			   port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err)
+		return IB_MAD_RESULT_FAILURE;
+
+	if (!out_mad->mad_hdr.status) {
+		smp_snoop(ibdev, port_num, in_mad);
+		node_desc_override(ibdev, out_mad);
+	}
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+	int ret;
+
+	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+		for (q = 0; q <= 1; ++q) {
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL);
+			if (IS_ERR(agent)) {
+				ret = PTR_ERR(agent);
+				goto err;
+			}
+			dev->send_agent[p][q] = agent;
+		}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+	return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->dev->caps.num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			agent = dev->send_agent[p][q];
+			dev->send_agent[p][q] = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 000000000000..688ecb4c39f3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+#define DRV_NAME	"mlx4_ib"
+#define DRV_VERSION	"0.01"
+#define DRV_RELDATE	"May 1, 2006"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const char mlx4_ib_version[] __devinitdata =
+	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method	   = IB_MGMT_METHOD_GET;
+}
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	props->fw_ver = dev->dev->caps.fw_ver;
+	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT		|
+		IB_DEVICE_SYS_IMAGE_GUID		|
+		IB_DEVICE_RC_RNR_NAK_GEN;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id	   = be16_to_cpup((__be16 *) (out_mad->data + 30));
+	props->hw_ver		   = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
+
+	props->max_mr_size	   = ~0ull;
+	props->page_size_cap	   = dev->dev->caps.page_size_cap;
+	props->max_qp		   = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+	props->max_qp_wr	   = dev->dev->caps.max_wqes;
+	props->max_sge		   = min(dev->dev->caps.max_sq_sg,
+					 dev->dev->caps.max_rq_sg);
+	props->max_cq		   = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+	props->max_cqe		   = dev->dev->caps.max_cqes;
+	props->max_mr		   = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+	props->max_pd		   = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+	props->max_qp_rd_atom	   = dev->dev->caps.max_qp_dest_rdma;
+	props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq		   = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes;
+	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;
+	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
+	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->max_pkeys	   = dev->dev->caps.pkey_table_len;
+	props->max_mcast_grp	   = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	props->lid		= be16_to_cpup((__be16 *) (out_mad->data + 16));
+	props->lmc		= out_mad->data[34] & 0x7;
+	props->sm_lid		= be16_to_cpup((__be16 *) (out_mad->data + 18));
+	props->sm_sl		= out_mad->data[36] & 0xf;
+	props->state		= out_mad->data[32] & 0xf;
+	props->phys_state	= out_mad->data[33] >> 4;
+	props->port_cap_flags	= be32_to_cpup((__be32 *) (out_mad->data + 20));
+	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len;
+	props->max_msg_sz	= 0x80000000;
+	props->pkey_tbl_len	= to_mdev(ibdev)->dev->caps.pkey_table_len;
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->active_width	= out_mad->data[31] & 0xf;
+	props->active_speed	= out_mad->data[35] >> 4;
+	props->max_mtu		= out_mad->data[41] & 0xf;
+	props->active_mtu	= out_mad->data[36] >> 4;
+	props->subnet_timeout	= out_mad->data[51] & 0x1f;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= out_mad->data[41] >> 4;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			     union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			      u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	*pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+				 struct ib_device_modify *props)
+{
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+		spin_lock(&to_mdev(ibdev)->sm_lock);
+		memcpy(ibdev->node_desc, props->node_desc, 64);
+		spin_unlock(&to_mdev(ibdev)->sm_lock);
+	}
+
+	return 0;
+}
+
+static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+			 u32 cap_mask)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, 256);
+	*(u8 *) mailbox->buf	     = !!reset_qkey_viols << 6;
+	((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+
+	err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev->dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+			       struct ib_port_modify *props)
+{
+	struct ib_port_attr attr;
+	u32 cap_mask;
+	int err;
+
+	mutex_lock(&to_mdev(ibdev)->cap_mask_mutex);
+
+	err = mlx4_ib_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mlx4_SET_PORT(to_mdev(ibdev), port,
+			    !!(mask & IB_PORT_RESET_QKEY_CNTR),
+			    cap_mask);
+
+out:
+	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	return err;
+}
+
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+						  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_alloc_ucontext_resp resp;
+	int err;
+
+	resp.qp_tab_size      = dev->dev->caps.num_qps;
+	resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
+	resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+	if (err) {
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	INIT_LIST_HEAD(&context->db_page_list);
+	mutex_init(&context->db_page_mutex);
+
+	err = ib_copy_to_udata(udata, &resp, sizeof resp);
+	if (err) {
+		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+		kfree(context);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return &context->ibucontext;
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+	mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+	kfree(context);
+
+	return 0;
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	if (vma->vm_pgoff == 0) {
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				       to_mucontext(context)->uar.pfn,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+		/* FIXME want pgprot_writecombine() for BlueFlame pages */
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				       to_mucontext(context)->uar.pfn +
+				       dev->dev->caps.num_uars,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct mlx4_ib_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context)
+		if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+			mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+
+	return &pd->ibpd;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+	kfree(pd);
+
+	return 0;
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return mlx4_multicast_attach(to_mdev(ibqp->device)->dev,
+				     &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return mlx4_multicast_detach(to_mdev(ibqp->device)->dev,
+				     &to_mqp(ibqp)->mqp, gid->raw);
+}
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+	struct mlx4_ib_dev *ibdev;
+
+	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+	if (!ibdev) {
+		dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+		return NULL;
+	}
+
+	if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+		goto err_dealloc;
+
+	if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+		goto err_pd;
+
+	ibdev->uar_map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!ibdev->uar_map)
+		goto err_uar;
+
+	INIT_LIST_HEAD(&ibdev->pgdir_list);
+	mutex_init(&ibdev->pgdir_mutex);
+
+	ibdev->dev = dev;
+
+	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+	ibdev->ib_dev.owner		= THIS_MODULE;
+	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	ibdev->ib_dev.phys_port_cnt	= dev->caps.num_ports;
+	ibdev->ib_dev.num_comp_vectors	= 1;
+	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
+
+	ibdev->ib_dev.uverbs_abi_ver	= MLX4_IB_UVERBS_ABI_VERSION;
+	ibdev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
+	ibdev->ib_dev.query_port	= mlx4_ib_query_port;
+	ibdev->ib_dev.query_gid		= mlx4_ib_query_gid;
+	ibdev->ib_dev.query_pkey	= mlx4_ib_query_pkey;
+	ibdev->ib_dev.modify_device	= mlx4_ib_modify_device;
+	ibdev->ib_dev.modify_port	= mlx4_ib_modify_port;
+	ibdev->ib_dev.alloc_ucontext	= mlx4_ib_alloc_ucontext;
+	ibdev->ib_dev.dealloc_ucontext	= mlx4_ib_dealloc_ucontext;
+	ibdev->ib_dev.mmap		= mlx4_ib_mmap;
+	ibdev->ib_dev.alloc_pd		= mlx4_ib_alloc_pd;
+	ibdev->ib_dev.dealloc_pd	= mlx4_ib_dealloc_pd;
+	ibdev->ib_dev.create_ah		= mlx4_ib_create_ah;
+	ibdev->ib_dev.query_ah		= mlx4_ib_query_ah;
+	ibdev->ib_dev.destroy_ah	= mlx4_ib_destroy_ah;
+	ibdev->ib_dev.create_srq	= mlx4_ib_create_srq;
+	ibdev->ib_dev.modify_srq	= mlx4_ib_modify_srq;
+	ibdev->ib_dev.destroy_srq	= mlx4_ib_destroy_srq;
+	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
+	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
+	ibdev->ib_dev.modify_qp		= mlx4_ib_modify_qp;
+	ibdev->ib_dev.destroy_qp	= mlx4_ib_destroy_qp;
+	ibdev->ib_dev.post_send		= mlx4_ib_post_send;
+	ibdev->ib_dev.post_recv		= mlx4_ib_post_recv;
+	ibdev->ib_dev.create_cq		= mlx4_ib_create_cq;
+	ibdev->ib_dev.destroy_cq	= mlx4_ib_destroy_cq;
+	ibdev->ib_dev.poll_cq		= mlx4_ib_poll_cq;
+	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
+	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
+	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
+	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
+	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
+	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
+	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+
+	if (init_node_data(ibdev))
+		goto err_map;
+
+	spin_lock_init(&ibdev->sm_lock);
+	mutex_init(&ibdev->cap_mask_mutex);
+
+	if (ib_register_device(&ibdev->ib_dev))
+		goto err_map;
+
+	if (mlx4_ib_mad_init(ibdev))
+		goto err_reg;
+
+	return ibdev;
+
+err_reg:
+	ib_unregister_device(&ibdev->ib_dev);
+
+err_map:
+	iounmap(ibdev->uar_map);
+
+err_uar:
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+	ib_dealloc_device(&ibdev->ib_dev);
+
+	return NULL;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+	struct mlx4_ib_dev *ibdev = ibdev_ptr;
+	int p;
+
+	for (p = 1; p <= dev->caps.num_ports; ++p)
+		mlx4_CLOSE_PORT(dev, p);
+
+	mlx4_ib_mad_cleanup(ibdev);
+	ib_unregister_device(&ibdev->ib_dev);
+	iounmap(ibdev->uar_map);
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+	ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+			  enum mlx4_dev_event event, int subtype,
+			  int port)
+{
+	struct ib_event ibev;
+
+	switch (event) {
+	case MLX4_EVENT_TYPE_PORT_CHANGE:
+		ibev.event = subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE ?
+			IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+		break;
+
+	case MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR:
+		ibev.event = IB_EVENT_DEVICE_FATAL;
+		break;
+
+	default:
+		return;
+	}
+
+	ibev.device	      = ibdev_ptr;
+	ibev.element.port_num = port;
+
+	ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+	.add	= mlx4_ib_add,
+	.remove	= mlx4_ib_remove,
+	.event	= mlx4_ib_event
+};
+
+static int __init mlx4_ib_init(void)
+{
+	return mlx4_register_interface(&mlx4_ib_interface);
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_ib_interface);
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 000000000000..93dac71f3230
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+enum {
+	MLX4_IB_DB_PER_PAGE	= PAGE_SIZE / 4
+};
+
+struct mlx4_ib_db_pgdir;
+struct mlx4_ib_user_db_page;
+
+struct mlx4_ib_db {
+	__be32		       *db;
+	union {
+		struct mlx4_ib_db_pgdir	       *pgdir;
+		struct mlx4_ib_user_db_page    *user_page;
+	}			u;
+	dma_addr_t		dma;
+	int			index;
+	int			order;
+};
+
+struct mlx4_ib_ucontext {
+	struct ib_ucontext	ibucontext;
+	struct mlx4_uar		uar;
+	struct list_head	db_page_list;
+	struct mutex		db_page_mutex;
+};
+
+struct mlx4_ib_pd {
+	struct ib_pd		ibpd;
+	u32			pdn;
+};
+
+struct mlx4_ib_cq_buf {
+	struct mlx4_buf		buf;
+	struct mlx4_mtt		mtt;
+};
+
+struct mlx4_ib_cq {
+	struct ib_cq		ibcq;
+	struct mlx4_cq		mcq;
+	struct mlx4_ib_cq_buf	buf;
+	struct mlx4_ib_db	db;
+	spinlock_t		lock;
+	struct ib_umem	       *umem;
+};
+
+struct mlx4_ib_mr {
+	struct ib_mr		ibmr;
+	struct mlx4_mr		mmr;
+	struct ib_umem	       *umem;
+};
+
+struct mlx4_ib_wq {
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			max;
+	int			max_gs;
+	int			offset;
+	int			wqe_shift;
+	unsigned		head;
+	unsigned		tail;
+};
+
+struct mlx4_ib_qp {
+	struct ib_qp		ibqp;
+	struct mlx4_qp		mqp;
+	struct mlx4_buf		buf;
+
+	struct mlx4_ib_db	db;
+	struct mlx4_ib_wq	rq;
+
+	u32			doorbell_qpn;
+	__be32			sq_signal_bits;
+	struct mlx4_ib_wq	sq;
+
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	int			buf_size;
+	struct mutex		mutex;
+	u8			port;
+	u8			alt_port;
+	u8			atomic_rd_en;
+	u8			resp_depth;
+	u8			state;
+};
+
+struct mlx4_ib_srq {
+	struct ib_srq		ibsrq;
+	struct mlx4_srq		msrq;
+	struct mlx4_buf		buf;
+	struct mlx4_ib_db	db;
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	struct mutex		mutex;
+};
+
+struct mlx4_ib_ah {
+	struct ib_ah		ibah;
+	struct mlx4_av		av;
+};
+
+struct mlx4_ib_dev {
+	struct ib_device	ib_dev;
+	struct mlx4_dev	       *dev;
+	void __iomem	       *uar_map;
+
+	struct list_head	pgdir_list;
+	struct mutex		pgdir_mutex;
+
+	struct mlx4_uar		priv_uar;
+	u32			priv_pdn;
+	MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+	struct ib_mad_agent    *send_agent[MLX4_MAX_PORTS][2];
+	struct ib_ah	       *sm_ah[MLX4_MAX_PORTS];
+	spinlock_t		sm_lock;
+
+	struct mutex		cap_mask_mutex;
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+	return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+	return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+	return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+int mlx4_ib_db_alloc(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db, int order);
+void mlx4_ib_db_free(struct mlx4_ib_dev *dev, struct mlx4_ib_db *db);
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_ib_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_ib_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+	return !!(ah->av.g_slid & 0x80);
+}
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 000000000000..85ae906f1d12
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+static u32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
+	       MLX4_PERM_LOCAL_READ;
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mlx4_ib_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+	if (err)
+		goto err_free;
+
+	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+	if (err)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_mr:
+	mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem)
+{
+	u64 *pages;
+	struct ib_umem_chunk *chunk;
+	int i, j, k;
+	int n;
+	int len;
+	int err = 0;
+
+	pages = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	i = n = 0;
+
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (j = 0; j < chunk->nmap; ++j) {
+			len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift;
+			for (k = 0; k < len; ++k) {
+				pages[i++] = sg_dma_address(&chunk->page_list[j]) +
+					umem->page_size * k;
+				/*
+				 * Be friendly to WRITE_MTT firmware
+				 * command, and pass it chunks of
+				 * appropriate size.
+				 */
+				if (i == PAGE_SIZE / sizeof (u64) - 2) {
+					err = mlx4_write_mtt(dev->dev, mtt, n,
+							     i, pages);
+					if (err)
+						goto out;
+					n += i;
+					i = 0;
+				}
+			}
+		}
+
+	if (i)
+		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+out:
+	free_page((unsigned long) pages);
+	return err;
+}
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_mr *mr;
+	int shift;
+	int err;
+	int n;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->umem = ib_umem_get(pd->uobject->context, start, length, access_flags);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		goto err_free;
+	}
+
+	n = ib_umem_page_count(mr->umem);
+	shift = ilog2(mr->umem->page_size);
+
+	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+			    convert_access(access_flags), n, shift, &mr->mmr);
+	if (err)
+		goto err_umem;
+
+	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+	if (err)
+		goto err_mr;
+
+	err = mlx4_mr_enable(dev->dev, &mr->mmr);
+	if (err)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+
+	return &mr->ibmr;
+
+err_mr:
+	mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+	ib_umem_release(mr->umem);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx4_ib_mr *mr = to_mmr(ibmr);
+
+	mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+	kfree(mr);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 000000000000..5cd706908450
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,1294 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+enum {
+	MLX4_IB_ACK_REQ_FREQ	= 8,
+};
+
+enum {
+	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
+	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f
+};
+
+enum {
+	/*
+	 * Largest possible UD header: send with GRH and immediate data.
+	 */
+	MLX4_IB_UD_HEADER_SIZE		= 72
+};
+
+struct mlx4_ib_sqp {
+	struct mlx4_ib_qp	qp;
+	int			pkey_index;
+	u32			qkey;
+	u32			send_psn;
+	struct ib_ud_header	ud_header;
+	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+static const __be32 mlx4_ib_opcode[] = {
+	[IB_WR_SEND]			= __constant_cpu_to_be32(MLX4_OPCODE_SEND),
+	[IB_WR_SEND_WITH_IMM]		= __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+	[IB_WR_RDMA_WRITE]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+	[IB_WR_RDMA_WRITE_WITH_IMM]	= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+	[IB_WR_RDMA_READ]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+	[IB_WR_ATOMIC_CMP_AND_SWP]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+	[IB_WR_ATOMIC_FETCH_AND_ADD]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+};
+
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+	return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+		qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+}
+
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+		qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+	if (qp->buf.nbufs == 1)
+		return qp->buf.u.direct.buf + offset;
+	else
+		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+
+	if (type == MLX4_EVENT_TYPE_PATH_MIG)
+		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+	if (ibqp->event_handler) {
+		event.device     = ibqp->device;
+		event.element.qp = ibqp;
+		switch (type) {
+		case MLX4_EVENT_TYPE_PATH_MIG:
+			event.event = IB_EVENT_PATH_MIG;
+			break;
+		case MLX4_EVENT_TYPE_COMM_EST:
+			event.event = IB_EVENT_COMM_EST;
+			break;
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+			event.event = IB_EVENT_SQ_DRAINED;
+			break;
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			break;
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+			event.event = IB_EVENT_QP_FATAL;
+			break;
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+			event.event = IB_EVENT_PATH_MIG_ERR;
+			break;
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			event.event = IB_EVENT_QP_REQ_ERR;
+			break;
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			break;
+		default:
+			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+			       "on QP %06x\n", type, qp->qpn);
+			return;
+		}
+
+		ibqp->event_handler(&event, ibqp->qp_context);
+	}
+}
+
+static int send_wqe_overhead(enum ib_qp_type type)
+{
+	/*
+	 * UD WQEs must have a datagram segment.
+	 * RC and UC WQEs might have a remote address segment.
+	 * MLX WQEs need two extra inline data segments (for the UD
+	 * header and space for the ICRC).
+	 */
+	switch (type) {
+	case IB_QPT_UD:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg);
+	case IB_QPT_UC:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_raddr_seg);
+	case IB_QPT_RC:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_atomic_seg) +
+			sizeof (struct mlx4_wqe_raddr_seg);
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			ALIGN(MLX4_IB_UD_HEADER_SIZE +
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg)) +
+			ALIGN(4 +
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg));
+	default:
+		return sizeof (struct mlx4_wqe_ctrl_seg);
+	}
+}
+
+static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+		       enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+	/* Sanity check QP size before proceeding */
+	if (cap->max_send_wr	 > dev->dev->caps.max_wqes  ||
+	    cap->max_recv_wr	 > dev->dev->caps.max_wqes  ||
+	    cap->max_send_sge	 > dev->dev->caps.max_sq_sg ||
+	    cap->max_recv_sge	 > dev->dev->caps.max_rq_sg ||
+	    cap->max_inline_data + send_wqe_overhead(type) +
+	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+		return -EINVAL;
+
+	/*
+	 * For MLX transport we need 2 extra S/G entries:
+	 * one for the header and one for the checksum at the end
+	 */
+	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+		return -EINVAL;
+
+	qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
+	qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 0;
+
+	qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
+						    sizeof (struct mlx4_wqe_data_seg)));
+	qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+
+	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
+							sizeof (struct mlx4_wqe_data_seg),
+							cap->max_inline_data +
+							sizeof (struct mlx4_wqe_inline_seg)) +
+						    send_wqe_overhead(type)));
+	qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
+		sizeof (struct mlx4_wqe_data_seg);
+
+	qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
+		(qp->sq.max << qp->sq.wqe_shift);
+	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+		qp->rq.offset = 0;
+		qp->sq.offset = qp->rq.max << qp->rq.wqe_shift;
+	} else {
+		qp->rq.offset = qp->sq.max << qp->sq.wqe_shift;
+		qp->sq.offset = 0;
+	}
+
+	cap->max_send_wr  = qp->sq.max;
+	cap->max_recv_wr  = qp->rq.max;
+	cap->max_send_sge = qp->sq.max_gs;
+	cap->max_recv_sge = qp->rq.max_gs;
+	cap->max_inline_data = (1 << qp->sq.wqe_shift) - send_wqe_overhead(type) -
+		sizeof (struct mlx4_wqe_inline_seg);
+
+	return 0;
+}
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	int err;
+	int i;
+
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	qp->state	 = IB_QPS_RESET;
+	qp->atomic_rd_en = 0;
+	qp->resp_depth   = 0;
+
+	qp->rq.head	    = 0;
+	qp->rq.tail	    = 0;
+	qp->sq.head	    = 0;
+	qp->sq.tail	    = 0;
+
+	err = set_qp_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+	if (err)
+		goto err;
+
+	if (pd->uobject) {
+		struct mlx4_ib_create_qp ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err;
+		}
+
+		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+				       qp->buf_size, 0);
+		if (IS_ERR(qp->umem)) {
+			err = PTR_ERR(qp->umem);
+			goto err;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+				    ilog2(qp->umem->page_size), &qp->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+		if (err)
+			goto err_mtt;
+
+		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+					  ucmd.db_addr, &qp->db);
+		if (err)
+			goto err_mtt;
+	} else {
+		err = mlx4_ib_db_alloc(dev, &qp->db, 0);
+		if (err)
+			goto err;
+
+		*qp->db.db = 0;
+
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+				    &qp->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+		if (err)
+			goto err_mtt;
+
+		for (i = 0; i < qp->sq.max; ++i) {
+			ctrl = get_send_wqe(qp, i);
+			ctrl->owner_opcode = cpu_to_be32(1 << 31);
+		}
+
+		qp->sq.wrid  = kmalloc(qp->sq.max * sizeof (u64), GFP_KERNEL);
+		qp->rq.wrid  = kmalloc(qp->rq.max * sizeof (u64), GFP_KERNEL);
+
+		if (!qp->sq.wrid || !qp->rq.wrid) {
+			err = -ENOMEM;
+			goto err_wrid;
+		}
+
+		/* We don't support inline sends for kernel QPs (yet) */
+		init_attr->cap.max_inline_data = 0;
+	}
+
+	err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+	if (err)
+		goto err_wrid;
+
+	/*
+	 * Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+	else
+		qp->sq_signal_bits = 0;
+
+	qp->mqp.event = mlx4_ib_qp_event;
+
+	return 0;
+
+err_wrid:
+	if (pd->uobject)
+		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+	else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+	}
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(qp->umem);
+	else
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+	if (!pd->uobject)
+		mlx4_ib_db_free(dev, &qp->db);
+
+err:
+	return err;
+}
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+	switch (state) {
+	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
+	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
+	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
+	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
+	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
+	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
+	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
+	default:		return -1;
+	}
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_lock_irq(&send_cq->lock);
+	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		spin_lock_irq(&send_cq->lock);
+		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock_irq(&recv_cq->lock);
+		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_unlock_irq(&send_cq->lock);
+	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		spin_unlock(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else {
+		spin_unlock(&send_cq->lock);
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			      int is_user)
+{
+	struct mlx4_ib_cq *send_cq, *recv_cq;
+
+	if (qp->state != IB_QPS_RESET)
+		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+			       qp->mqp.qpn);
+
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+	if (!is_user) {
+		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+		if (send_cq != recv_cq)
+			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+	}
+
+	mlx4_qp_remove(dev->dev, &qp->mqp);
+
+	mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+	mlx4_qp_free(dev->dev, &qp->mqp);
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+	if (is_user) {
+		mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+				      &qp->db);
+		ib_umem_release(qp->umem);
+	} else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+		mlx4_ib_db_free(dev, &qp->db);
+	}
+}
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_sqp *sqp;
+	struct mlx4_ib_qp *qp;
+	int err;
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	{
+		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+		if (err) {
+			kfree(qp);
+			return ERR_PTR(err);
+		}
+
+		qp->ibqp.qp_num = qp->mqp.qpn;
+
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		/* Userspace is not allowed to create special QPs: */
+		if (pd->uobject)
+			return ERR_PTR(-EINVAL);
+
+		sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+		if (!sqp)
+			return ERR_PTR(-ENOMEM);
+
+		qp = &sqp->qp;
+
+		err = create_qp_common(dev, pd, init_attr, udata,
+				       dev->dev->caps.sqp_start +
+				       (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+				       init_attr->port_num - 1,
+				       qp);
+		if (err) {
+			kfree(sqp);
+			return ERR_PTR(err);
+		}
+
+		qp->port	= init_attr->port_num;
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &qp->ibqp;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx4_ib_dev *dev = to_mdev(qp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+	if (is_qp0(dev, mqp))
+		mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+	destroy_qp_common(dev, mqp, !!qp->pd->uobject);
+
+	if (is_sqp(dev, mqp))
+		kfree(to_msqp(mqp));
+	else
+		kfree(mqp);
+
+	return 0;
+}
+
+static void init_port(struct mlx4_ib_dev *dev, int port)
+{
+	struct mlx4_init_port_param param;
+	int err;
+
+	memset(&param, 0, sizeof param);
+
+	param.port_width_cap = dev->dev->caps.port_width_cap;
+	param.vl_cap	     = dev->dev->caps.vl_cap;
+	param.mtu	     = ib_mtu_enum_to_int(dev->dev->caps.mtu_cap);
+	param.max_gid	     = dev->dev->caps.gid_table_len;
+	param.max_pkey	     = dev->dev->caps.pkey_table_len;
+
+	err = mlx4_INIT_PORT(dev->dev, &param, port);
+	if (err)
+		printk(KERN_WARNING "INIT_PORT failed, return code %d.\n", err);
+}
+
+static int to_mlx4_st(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_RC:		return MLX4_QP_ST_RC;
+	case IB_QPT_UC:		return MLX4_QP_ST_UC;
+	case IB_QPT_UD:		return MLX4_QP_ST_UD;
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:	return MLX4_QP_ST_MLX;
+	default:		return -1;
+	}
+}
+
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+				   int attr_mask)
+{
+	u8 dest_rd_atomic;
+	u32 access_flags;
+	u32 hw_access_flags = 0;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MLX4_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= MLX4_QP_BIT_RAE;
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MLX4_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+			    int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+			 struct mlx4_qp_path *path, u8 port)
+{
+	path->grh_mylmc     = ah->src_path_bits & 0x7f;
+	path->rlid	    = cpu_to_be16(ah->dlid);
+	if (ah->static_rate) {
+		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+			--path->static_rate;
+	} else
+		path->static_rate = 0;
+	path->counter_index = 0xff;
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len) {
+			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+			       ah->grh.sgid_index, dev->dev->caps.gid_table_len - 1);
+			return -1;
+		}
+
+		path->grh_mylmc |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+		((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+
+	return 0;
+}
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_qp_context *context;
+	enum mlx4_qp_optpar optpar = 0;
+	enum ib_qp_state cur_state, new_state;
+	int sqd_event;
+	int err = -EINVAL;
+
+	context = kzalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+		goto out;
+
+	if ((attr_mask & IB_QP_PKEY_INDEX) &&
+	     attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
+		goto out;
+	}
+
+	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+				     (to_mlx4_st(ibqp->qp_type) << 16));
+	context->flags     |= cpu_to_be32(1 << 8); /* DE? */
+
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+	else {
+		optpar |= MLX4_QP_OPTPAR_PM_STATE;
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+	    ibqp->qp_type == IB_QPT_UD)
+		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+	else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+			printk(KERN_ERR "path MTU (%u) is invalid\n",
+			       attr->path_mtu);
+			return -EINVAL;
+		}
+		context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+	}
+
+	if (qp->rq.max)
+		context->rq_size_stride = ilog2(qp->rq.max) << 3;
+	context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+	if (qp->sq.max)
+		context->sq_size_stride = ilog2(qp->sq.max) << 3;
+	context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+	if (qp->ibqp.uobject)
+		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+	else
+		context->usr_page = cpu_to_be32(dev->priv_uar.index);
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+	if (attr_mask & IB_QP_PORT) {
+		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+		    !(attr_mask & IB_QP_AV)) {
+			mlx4_set_sched(&context->pri_path, attr->port_num);
+			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		context->pri_path.pkey_index = attr->pkey_index;
+		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
+				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+			   MLX4_QP_OPTPAR_SCHED_QUEUE);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		context->pri_path.ackto = attr->timeout << 3;
+		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_pkey_index >= dev->dev->caps.pkey_table_len)
+			return -EINVAL;
+
+		if (attr->alt_port_num == 0 ||
+		    attr->alt_port_num > dev->dev->caps.num_ports)
+			return -EINVAL;
+
+		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+				  attr->alt_port_num))
+			return -EINVAL;
+
+		context->alt_path.pkey_index = attr->alt_pkey_index;
+		context->alt_path.ackto = attr->alt_timeout << 3;
+		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+	}
+
+	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
+	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic)
+			context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+	}
+
+	if (ibqp->srq)
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
+
+	if (attr_mask & IB_QP_QKEY) {
+		context->qkey = cpu_to_be32(attr->qkey);
+		optpar |= MLX4_QP_OPTPAR_Q_KEY;
+	}
+
+	if (ibqp->srq)
+		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	if (cur_state == IB_QPS_INIT &&
+	    new_state == IB_QPS_RTR  &&
+	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+	     ibqp->qp_type == IB_QPT_UD)) {
+		context->pri_path.sched_queue = (qp->port - 1) << 6;
+		if (is_qp0(dev, qp))
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+		else
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+	}
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+		sqd_event = 1;
+	else
+		sqd_event = 0;
+
+	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+			     to_mlx4_state(new_state), context, optpar,
+			     sqd_event, &qp->mqp);
+	if (err)
+		goto out;
+
+	qp->state = new_state;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT)
+		qp->port = attr->port_num;
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	if (is_sqp(dev, qp))
+		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we moved QP0 to RTR, bring the IB link up; if we moved
+	 * QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+			init_port(dev, qp->port);
+
+		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+			mlx4_CLOSE_PORT(dev->dev, qp->port);
+	}
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
+				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
+		if (ibqp->send_cq != ibqp->recv_cq)
+			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+
+		qp->rq.head = 0;
+		qp->rq.tail = 0;
+		qp->sq.head = 0;
+		qp->sq.tail = 0;
+		*qp->db.db  = 0;
+	}
+
+out:
+	mutex_unlock(&qp->mutex);
+	kfree(context);
+	return err;
+}
+
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			    void *wqe)
+{
+	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	u16 pkey;
+	int send_size;
+	int header_size;
+	int i;
+
+	send_size = 0;
+	for (i = 0; i < wr->num_sge; ++i)
+		send_size += wr->sg_list[i].length;
+
+	ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);
+
+	sqp->ud_header.lrh.service_level   =
+		be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
+	sqp->ud_header.lrh.destination_lid = ah->av.dlid;
+	sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
+	if (mlx4_ib_ah_grh_present(ah)) {
+		sqp->ud_header.grh.traffic_class =
+			(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->ud_header.grh.flow_label    =
+			ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
+				  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
+		memcpy(sqp->ud_header.grh.destination_gid.raw,
+		       ah->av.dgid, 16);
+	}
+
+	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+				  (sqp->ud_header.lrh.destination_lid ==
+				   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+				  (sqp->ud_header.lrh.service_level << 8));
+	mlx->rlid   = sqp->ud_header.lrh.destination_lid;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data    = wr->imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+	if (0) {
+		printk(KERN_ERR "built UD header of size %d:\n", header_size);
+		for (i = 0; i < header_size / 4; ++i) {
+			if (i % 8 == 0)
+				printk("  [%02x] ", i * 4);
+			printk(" %08x",
+			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+			if ((i + 1) % 8 == 0)
+				printk("\n");
+		}
+		printk("\n");
+	}
+
+	inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+	memcpy(inl + 1, sqp->header_buf, header_size);
+
+	return ALIGN(sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+}
+
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+	unsigned cur;
+	struct mlx4_ib_cq *cq;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max;
+}
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	unsigned long flags;
+	int nreq;
+	int err = 0;
+	int ind;
+	int size;
+	int i;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->sq.head;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.max - 1));
+		qp->sq.wrid[ind & (qp->sq.max - 1)] = wr->wr_id;
+
+		ctrl->srcrb_flags =
+			(wr->send_flags & IB_SEND_SIGNALED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+			(wr->send_flags & IB_SEND_SOLICITED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+			qp->sq_signal_bits;
+
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			ctrl->imm = wr->imm_data;
+		else
+			ctrl->imm = 0;
+
+		wqe += sizeof *ctrl;
+		size = sizeof *ctrl / 16;
+
+		switch (ibqp->qp_type) {
+		case IB_QPT_RC:
+		case IB_QPT_UC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+					cpu_to_be64(wr->wr.atomic.remote_addr);
+				((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+					cpu_to_be32(wr->wr.atomic.rkey);
+				((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+					((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+						cpu_to_be64(wr->wr.atomic.swap);
+					((struct mlx4_wqe_atomic_seg *) wqe)->compare =
+						cpu_to_be64(wr->wr.atomic.compare_add);
+				} else {
+					((struct mlx4_wqe_atomic_seg *) wqe)->swap_add =
+						cpu_to_be64(wr->wr.atomic.compare_add);
+					((struct mlx4_wqe_atomic_seg *) wqe)->compare = 0;
+				}
+
+				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+				break;
+
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				((struct mlx4_wqe_raddr_seg *) wqe)->raddr =
+					cpu_to_be64(wr->wr.rdma.remote_addr);
+				((struct mlx4_wqe_raddr_seg *) wqe)->rkey =
+					cpu_to_be32(wr->wr.rdma.rkey);
+				((struct mlx4_wqe_raddr_seg *) wqe)->reserved = 0;
+
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+			memcpy(((struct mlx4_wqe_datagram_seg *) wqe)->av,
+			       &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+			((struct mlx4_wqe_datagram_seg *) wqe)->dqpn =
+				cpu_to_be32(wr->wr.ud.remote_qpn);
+			((struct mlx4_wqe_datagram_seg *) wqe)->qkey =
+				cpu_to_be32(wr->wr.ud.remote_qkey);
+
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+			break;
+
+		case IB_QPT_SMI:
+		case IB_QPT_GSI:
+			err = build_mlx_header(to_msqp(qp), wr, ctrl);
+			if (err < 0) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += err;
+			size += err / 16;
+
+			err = 0;
+			break;
+
+		default:
+			break;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mlx4_wqe_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mlx4_wqe_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mlx4_wqe_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+
+			wqe  += sizeof (struct mlx4_wqe_data_seg);
+			size += sizeof (struct mlx4_wqe_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC for MLX sends */
+		if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
+			((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe  += sizeof (struct mlx4_wqe_data_seg);
+			size += sizeof (struct mlx4_wqe_data_seg) / 16;
+		}
+
+		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+				    MLX4_WQE_CTRL_FENCE : 0) | size;
+
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		wmb();
+
+		if (wr->opcode < 0 || wr->opcode > ARRAY_SIZE(mlx4_ib_opcode)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+			(ind & qp->sq.max ? cpu_to_be32(1 << 31) : 0);
+
+		++ind;
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		writel(qp->doorbell_qpn,
+		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.send_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		scat = get_recv_wqe(qp, ind);
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < qp->rq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.max - 1);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 000000000000..42ab4a801d6a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+	int offset = n << srq->msrq.wqe_shift;
+
+	if (srq->buf.nbufs == 1)
+		return srq->buf.u.direct.buf + offset;
+	else
+		return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (type) {
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+			       "on SRQ %06x\n", type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_srq *srq;
+	struct mlx4_wqe_srq_next_seg *next;
+	int desc_size;
+	int buf_size;
+	int err;
+	int i;
+
+	/* Sanity check SRQ size before proceeding */
+	if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
+	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
+		return ERR_PTR(-EINVAL);
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->msrq.max_gs = init_attr->attr.max_sge;
+
+	desc_size = max(32UL,
+			roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+					   srq->msrq.max_gs *
+					   sizeof (struct mlx4_wqe_data_seg)));
+	srq->msrq.wqe_shift = ilog2(desc_size);
+
+	buf_size = srq->msrq.max * desc_size;
+
+	if (pd->uobject) {
+		struct mlx4_ib_create_srq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_srq;
+		}
+
+		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+					buf_size, 0);
+		if (IS_ERR(srq->umem)) {
+			err = PTR_ERR(srq->umem);
+			goto err_srq;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+				    ilog2(srq->umem->page_size), &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+		if (err)
+			goto err_mtt;
+
+		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+					  ucmd.db_addr, &srq->db);
+		if (err)
+			goto err_mtt;
+	} else {
+		err = mlx4_ib_db_alloc(dev, &srq->db, 0);
+		if (err)
+			goto err_srq;
+
+		*srq->db.db = 0;
+
+		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		srq->head    = 0;
+		srq->tail    = srq->msrq.max - 1;
+		srq->wqe_ctr = 0;
+
+		for (i = 0; i < srq->msrq.max; ++i) {
+			next = get_wqe(srq, i);
+			next->next_wqe_index =
+				cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+		}
+
+		err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+				    &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+		if (err)
+			goto err_mtt;
+
+		srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+		if (!srq->wrid) {
+			err = -ENOMEM;
+			goto err_mtt;
+		}
+	}
+
+	err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt,
+			     srq->db.dma, &srq->msrq);
+	if (err)
+		goto err_wrid;
+
+	srq->msrq.event = mlx4_ib_srq_event;
+
+	if (pd->uobject)
+		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+			err = -EFAULT;
+			goto err_wrid;
+		}
+
+	init_attr->attr.max_wr = srq->msrq.max - 1;
+
+	return &srq->ibsrq;
+
+err_wrid:
+	if (pd->uobject)
+		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	else
+		kfree(srq->wrid);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(srq->umem);
+	else
+		mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+	if (!pd->uobject)
+		mlx4_ib_db_free(dev, &srq->db);
+
+err_srq:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+
+	/* We don't support resizing SRQs (yet?) */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attr->srq_limit >= srq->msrq.max)
+			return -EINVAL;
+
+		mutex_lock(&srq->mutex);
+		ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+		mutex_unlock(&srq->mutex);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+	struct mlx4_ib_dev *dev = to_mdev(srq->device);
+	struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+	mlx4_srq_free(dev->dev, &msrq->msrq);
+	mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+	if (srq->uobject) {
+		mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		ib_umem_release(msrq->umem);
+	} else {
+		kfree(msrq->wrid);
+		mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+			      &msrq->buf);
+		mlx4_ib_db_free(dev, &msrq->db);
+	}
+
+	kfree(msrq);
+
+	return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+	struct mlx4_wqe_srq_next_seg *next;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = cpu_to_be16(wqe_index);
+	srq->tail = wqe_index;
+
+	spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+
+		next      = get_wqe(srq, srq->head);
+		srq->head = be16_to_cpu(next->next_wqe_index);
+		scat      = (struct mlx4_wqe_data_seg *) (next + 1);
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->msrq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+	}
+
+	if (likely(nreq)) {
+		srq->wqe_ctr += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 000000000000..5b8eddc9fa83
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX4_IB_UVERBS_ABI_VERSION	1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp {
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+	__u32	pdn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_cq {
+	__u64	buf_addr;
+	__u64	db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_resize_cq {
+	__u64	buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+	__u64	buf_addr;
+	__u64	db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_qp {
+	__u64	buf_addr;
+	__u64	db_addr;
+};
+
+#endif /* MLX4_IB_USER_H */
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b86ccd2ecd5b..6db12d0265da 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2493,6 +2493,20 @@ config PASEMI_MAC
 	  This driver supports the on-chip 1/10Gbit Ethernet controller on
 	  PA Semi's PWRficient line of chips.
 
+config MLX4_CORE
+	tristate
+	depends on PCI
+	default n
+
+config MLX4_DEBUG
+	bool "Verbose debugging output" if (MLX4_CORE && EMBEDDED)
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mlx4_core driver.  The output can be turned on via the
+	  debug_level module parameter (which can also be set after
+	  the driver is loaded through sysfs).
+
 endmenu
 
 source "drivers/net/tokenring/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 59c0459a037c..7faeeeabc838 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -197,6 +197,7 @@ obj-$(CONFIG_SMC911X) += smc911x.o
 obj-$(CONFIG_DM9000) += dm9000.o
 obj-$(CONFIG_FEC_8XX) += fec_8xx/
 obj-$(CONFIG_PASEMI_MAC) += pasemi_mac.o
+obj-$(CONFIG_MLX4_CORE) += mlx4/
 
 obj-$(CONFIG_MACB) += macb.o
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
new file mode 100644
index 000000000000..0952a6528f58
--- /dev/null
+++ b/drivers/net/mlx4/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
+
+mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
+		mr.o pd.o profile.o qp.o reset.o srq.o
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
new file mode 100644
index 000000000000..9ffdb9d29da9
--- /dev/null
+++ b/drivers/net/mlx4/alloc.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mlx4.h"
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+	u32 obj;
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+		obj = find_first_zero_bit(bitmap->table, bitmap->max);
+	}
+
+	if (obj < bitmap->max) {
+		set_bit(obj, bitmap->table);
+		obj |= bitmap->top;
+		bitmap->last = obj + 1;
+	} else
+		obj = -1;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
+{
+	obj &= bitmap->max - 1;
+
+	spin_lock(&bitmap->lock);
+	clear_bit(obj, bitmap->table);
+	bitmap->last = min(bitmap->last, obj);
+	bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+	spin_unlock(&bitmap->lock);
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved)
+{
+	int i;
+
+	/* num must be a power of 2 */
+	if (num != roundup_pow_of_two(num))
+		return -EINVAL;
+
+	bitmap->last = 0;
+	bitmap->top  = 0;
+	bitmap->max  = num;
+	bitmap->mask = mask;
+	spin_lock_init(&bitmap->lock);
+	bitmap->table = kzalloc(BITS_TO_LONGS(num) * sizeof (long), GFP_KERNEL);
+	if (!bitmap->table)
+		return -ENOMEM;
+
+	for (i = 0; i < reserved; ++i)
+		set_bit(i, bitmap->table);
+
+	return 0;
+}
+
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+	kfree(bitmap->table);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf)
+{
+	dma_addr_t t;
+
+	if (size <= max_direct) {
+		buf->nbufs        = 1;
+		buf->npages       = 1;
+		buf->page_shift   = get_order(size) + PAGE_SHIFT;
+		buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+						       size, &t, GFP_KERNEL);
+		if (!buf->u.direct.buf)
+			return -ENOMEM;
+
+		buf->u.direct.map = t;
+
+		while (t & ((1 << buf->page_shift) - 1)) {
+			--buf->page_shift;
+			buf->npages *= 2;
+		}
+
+		memset(buf->u.direct.buf, 0, size);
+	} else {
+		int i;
+
+		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages      = buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list,
+					   GFP_KERNEL);
+		if (!buf->u.page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; ++i) {
+			buf->u.page_list[i].buf =
+				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						   &t, GFP_KERNEL);
+			if (!buf->u.page_list[i].buf)
+				goto err_free;
+
+			buf->u.page_list[i].map = t;
+
+			memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx4_buf_free(dev, size, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+	int i;
+
+	if (buf->nbufs == 1)
+		dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
+				  buf->u.direct.map);
+	else {
+		for (i = 0; i < buf->nbufs; ++i)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  buf->u.page_list[i].buf,
+					  buf->u.page_list[i].map);
+		kfree(buf->u.page_list);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c
new file mode 100644
index 000000000000..1bb088aeaf71
--- /dev/null
+++ b/drivers/net/mlx4/catas.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+void mlx4_handle_catas_err(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	int i;
+
+	mlx4_err(dev, "Catastrophic error detected:\n");
+	for (i = 0; i < priv->fw.catas_size; ++i)
+		mlx4_err(dev, "  buf[%02x]: %08x\n",
+			 i, swab32(readl(priv->catas_err.map + i)));
+
+	mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR, 0, 0);
+}
+
+void mlx4_map_catas_buf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long addr;
+
+	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
+		priv->fw.catas_offset;
+
+	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
+	if (!priv->catas_err.map)
+		mlx4_warn(dev, "Failed to map catastrophic error buffer at 0x%lx\n",
+			  addr);
+
+}
+
+void mlx4_unmap_catas_buf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (priv->catas_err.map)
+		iounmap(priv->catas_err.map);
+}
diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c
new file mode 100644
index 000000000000..c1f81a993f5d
--- /dev/null
+++ b/drivers/net/mlx4/cmd.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	/* command completed successfully: */
+	CMD_STAT_OK		= 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	CMD_STAT_INTERNAL_ERR	= 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	CMD_STAT_BAD_OP		= 0x02,
+	/* Parameter not supported or parameter out of range: */
+	CMD_STAT_BAD_PARAM	= 0x03,
+	/* System not enabled or bad system state: */
+	CMD_STAT_BAD_SYS_STATE	= 0x04,
+	/* Attempt to access reserved or unallocaterd resource: */
+	CMD_STAT_BAD_RESOURCE	= 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	CMD_STAT_RESOURCE_BUSY	= 0x06,
+	/* Required capability exceeds device limits: */
+	CMD_STAT_EXCEED_LIM	= 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	CMD_STAT_BAD_RES_STATE	= 0x09,
+	/* Index out of range: */
+	CMD_STAT_BAD_INDEX	= 0x0a,
+	/* FW image corrupted: */
+	CMD_STAT_BAD_NVMEM	= 0x0b,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	CMD_STAT_BAD_QP_STATE   = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	CMD_STAT_BAD_SEG_PARAM	= 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	CMD_STAT_REG_BOUND	= 0x21,
+	/* HCA local attached memory not present: */
+	CMD_STAT_LAM_NOT_PRE	= 0x22,
+	/* Bad management packet (silently discarded): */
+	CMD_STAT_BAD_PKT	= 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	CMD_STAT_BAD_SIZE	= 0x40
+};
+
+enum {
+	HCR_IN_PARAM_OFFSET	= 0x00,
+	HCR_IN_MODIFIER_OFFSET	= 0x08,
+	HCR_OUT_PARAM_OFFSET	= 0x0c,
+	HCR_TOKEN_OFFSET	= 0x14,
+	HCR_STATUS_OFFSET	= 0x18,
+
+	HCR_OPMOD_SHIFT		= 12,
+	HCR_T_BIT		= 21,
+	HCR_E_BIT		= 22,
+	HCR_GO_BIT		= 23
+};
+
+enum {
+	GO_BIT_TIMEOUT		= 10000
+};
+
+struct mlx4_cmd_context {
+	struct completion	done;
+	int			result;
+	int			next;
+	u64			out_param;
+	u16			token;
+};
+
+static int mlx4_status_to_errno(u8 status) {
+	static const int trans_table[] = {
+		[CMD_STAT_INTERNAL_ERR]	  = -EIO,
+		[CMD_STAT_BAD_OP]	  = -EPERM,
+		[CMD_STAT_BAD_PARAM]	  = -EINVAL,
+		[CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+		[CMD_STAT_BAD_RESOURCE]	  = -EBADF,
+		[CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+		[CMD_STAT_EXCEED_LIM]	  = -ENOMEM,
+		[CMD_STAT_BAD_RES_STATE]  = -EBADF,
+		[CMD_STAT_BAD_INDEX]	  = -EBADF,
+		[CMD_STAT_BAD_NVMEM]	  = -EFAULT,
+		[CMD_STAT_BAD_QP_STATE]   = -EINVAL,
+		[CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+		[CMD_STAT_REG_BOUND]	  = -EBUSY,
+		[CMD_STAT_LAM_NOT_PRE]	  = -EAGAIN,
+		[CMD_STAT_BAD_PKT]	  = -EINVAL,
+		[CMD_STAT_BAD_SIZE]	  = -ENOMEM,
+	};
+
+	if (status >= ARRAY_SIZE(trans_table) ||
+	    (status != CMD_STAT_OK && trans_table[status] == 0))
+		return -EIO;
+
+	return trans_table[status];
+}
+
+static int cmd_pending(struct mlx4_dev *dev)
+{
+	u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+
+	return (status & swab32(1 << HCR_GO_BIT)) ||
+		(mlx4_priv(dev)->cmd.toggle ==
+		 !!(status & swab32(1 << HCR_T_BIT)));
+}
+
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			 u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+			 int event)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	u32 __iomem *hcr = cmd->hcr;
+	int ret = -EAGAIN;
+	unsigned long end;
+
+	mutex_lock(&cmd->hcr_mutex);
+
+	end = jiffies;
+	if (event)
+		end += HZ * 10;
+
+	while (cmd_pending(dev)) {
+		if (time_after_eq(jiffies, end))
+			goto out;
+		cond_resched();
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),		  hcr + 0);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  hcr + 1);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),		  hcr + 2);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),	  hcr + 3);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),		  hcr + 5);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)		|
+					       (cmd->toggle << HCR_T_BIT)	|
+					       (event ? (1 << HCR_E_BIT) : 0)	|
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op),			  hcr + 6);
+	cmd->toggle = cmd->toggle ^ 1;
+
+	ret = 0;
+
+out:
+	mutex_unlock(&cmd->hcr_mutex);
+	return ret;
+}
+
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	void __iomem *hcr = priv->cmd.hcr;
+	int err = 0;
+	unsigned long end;
+
+	down(&priv->cmd.poll_sem);
+
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (cmd_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+
+	if (cmd_pending(dev)) {
+		err = -ETIMEDOUT;
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param =
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+	err = mlx4_status_to_errno(be32_to_cpu((__force __be32)
+					       __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24);
+
+out:
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_context *context =
+		&priv->cmd.context[token & priv->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->result    = mlx4_status_to_errno(status);
+	context->out_param = out_param;
+
+	context->token += priv->cmd.token_mask + 1;
+
+	complete(&context->done);
+}
+
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	init_completion(&context->done);
+
+	mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+		      in_modifier, op_modifier, op, context->token, 1);
+
+	if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = context->result;
+	if (err)
+		goto out;
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout)
+{
+	if (mlx4_priv(dev)->cmd.use_events)
+		return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm,
+				     in_modifier, op_modifier, op, timeout);
+	else
+		return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm,
+				     in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_init(&priv->cmd.hcr_mutex);
+	sema_init(&priv->cmd.poll_sem, 1);
+	priv->cmd.use_events = 0;
+	priv->cmd.toggle     = 1;
+
+	priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE,
+				MLX4_HCR_SIZE);
+	if (!priv->cmd.hcr) {
+		mlx4_err(dev, "Couldn't map command register.");
+		return -ENOMEM;
+	}
+
+	priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
+					 MLX4_MAILBOX_SIZE,
+					 MLX4_MAILBOX_SIZE, 0);
+	if (!priv->cmd.pool) {
+		iounmap(priv->cmd.hcr);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void mlx4_cmd_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	pci_pool_destroy(priv->cmd.pool);
+	iounmap(priv->cmd.hcr);
+}
+
+/*
+ * Switch to using events to issue FW commands (can only be called
+ * after event queue for command events has been initialized).
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	priv->cmd.context = kmalloc(priv->cmd.max_cmds *
+				   sizeof (struct mlx4_cmd_context),
+				   GFP_KERNEL);
+	if (!priv->cmd.context)
+		return -ENOMEM;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i) {
+		priv->cmd.context[i].token = i;
+		priv->cmd.context[i].next  = i + 1;
+	}
+
+	priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
+	priv->cmd.free_head = 0;
+
+	sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds);
+	spin_lock_init(&priv->cmd.context_lock);
+
+	for (priv->cmd.token_mask = 1;
+	     priv->cmd.token_mask < priv->cmd.max_cmds;
+	     priv->cmd.token_mask <<= 1)
+		; /* nothing */
+	--priv->cmd.token_mask;
+
+	priv->cmd.use_events = 1;
+
+	down(&priv->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	priv->cmd.use_events = 0;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i)
+		down(&priv->cmd.event_sem);
+
+	kfree(priv->cmd.context);
+
+	up(&priv->cmd.poll_sem);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof *mailbox, GFP_KERNEL);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+				      &mailbox->dma);
+	if (!mailbox->buf) {
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return mailbox;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox)
+{
+	if (!mailbox)
+		return;
+
+	pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c
new file mode 100644
index 000000000000..437d78ad0912
--- /dev/null
+++ b/drivers/net/mlx4/cq.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_cq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	__be32			logsize_usrpage;
+	u8			reserved2;
+	u8			cq_period;
+	u8			reserved3;
+	u8			cq_max_count;
+	u8			reserved4[3];
+	u8			comp_eqn;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_index;
+	__be32			producer_index;
+	u8			reserved6[2];
+	__be64			db_rec_addr;
+};
+
+#define MLX4_CQ_STATUS_OK		( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW		( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL	(10 << 28)
+#define MLX4_CQ_FLAG_CC			( 1 << 18)
+#define MLX4_CQ_FLAG_OI			( 1 << 17)
+#define MLX4_CQ_STATE_ARMED		( 9 <<  8)
+#define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
+#define MLX4_EQ_STATE_FIRED		(10 <<  8)
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+	struct mlx4_cq *cq;
+
+	cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
+			       cqn & (dev->caps.num_cqs - 1));
+	if (!cq) {
+		mlx4_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->comp(cq);
+}
+
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	struct mlx4_cq *cq;
+
+	spin_lock(&cq_table->lock);
+
+	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+
+	spin_unlock(&cq_table->lock);
+
+	if (!cq) {
+		mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, cq_num,
+			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+	if (cq->cqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &cq_table->table, cq->cqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn);
+	if (err)
+		goto err_put;
+
+	spin_lock_irq(&cq_table->lock);
+	err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+	spin_unlock_irq(&cq_table->lock);
+	if (err)
+		goto err_cmpt_put;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	cq_context = mailbox->buf;
+	memset(cq_context, 0, sizeof *cq_context);
+
+	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+	cq_context->comp_eqn        = priv->eq_table.eq[MLX4_EQ_COMP].eqn;
+	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+	cq_context->db_rec_addr     = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	cq->cons_index = 0;
+	cq->arm_sn     = 1;
+	cq->uar        = uar;
+	atomic_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+
+	return 0;
+
+err_radix:
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+err_cmpt_put:
+	mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+
+err_put:
+	mlx4_table_put(dev, &cq_table->table, cq->cqn);
+
+err_out:
+	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+	synchronize_irq(priv->eq_table.eq[MLX4_EQ_COMP].irq);
+
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	mlx4_table_put(dev, &cq_table->table, cq->cqn);
+	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+int __devinit mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	int err;
+
+	spin_lock_init(&cq_table->lock);
+	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
+	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+	/* Nothing to do to clean up radix_tree */
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap);
+}
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
new file mode 100644
index 000000000000..acf1c801a1b8
--- /dev/null
+++ b/drivers/net/mlx4/eq.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_NUM_ASYNC_EQE	= 0x100,
+	MLX4_NUM_SPARE_EQE	= 0x80,
+	MLX4_EQ_ENTRY_SIZE	= 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_eq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	u8			log_eq_size;
+	u8			reserved2[4];
+	u8			eq_period;
+	u8			reserved3;
+	u8			eq_max_count;
+	u8			reserved4[3];
+	u8			intr;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved6[2];
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved7[4];
+};
+
+#define MLX4_EQ_STATUS_OK	   ( 0 << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MLX4_EQ_OWNER_SW	   ( 0 << 24)
+#define MLX4_EQ_OWNER_HW	   ( 1 << 24)
+#define MLX4_EQ_FLAG_EC		   ( 1 << 18)
+#define MLX4_EQ_FLAG_OI		   ( 1 << 17)
+#define MLX4_EQ_STATE_ARMED	   ( 9 <<  8)
+#define MLX4_EQ_STATE_FIRED	   (10 <<  8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11 <<  8)
+
+#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+			       (1ull << MLX4_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX4_EVENT_TYPE_ECC_DETECT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CMD))
+#define MLX4_CATAS_EVENT_MASK  (1ull << MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR)
+
+struct mlx4_eqe {
+	u8			reserved1;
+	u8			type;
+	u8			reserved2;
+	u8			subtype;
+	union {
+		u32		raw[6];
+		struct {
+			__be32	cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16	reserved1;
+			__be16	token;
+			u32	reserved2;
+			u8	reserved3[3];
+			u8	status;
+			__be64	out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			__be32	qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			__be32	srqn;
+		} __attribute__((packed)) srq;
+		struct {
+			__be32	cqn;
+			u32	reserved1;
+			u8	reserved2[3];
+			u8	syndrome;
+		} __attribute__((packed)) cq_err;
+		struct {
+			u32	reserved1[2];
+			__be32	port;
+		} __attribute__((packed)) port_change;
+	}			event;
+	u8			reserved3[3];
+	u8			owner;
+} __attribute__((packed));
+
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+	__raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) |
+					       req_not << 31),
+		     eq->doorbell);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE;
+	return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq)
+{
+	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index);
+	return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
+}
+
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_eqe *eqe;
+	int cqn;
+	int eqes_found = 0;
+	int set_ci = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MLX4_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			mlx4_cq_completion(dev, cqn);
+			break;
+
+		case MLX4_EVENT_TYPE_PATH_MIG:
+		case MLX4_EVENT_TYPE_COMM_EST:
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_CMD:
+			mlx4_cmd_event(dev,
+				       be16_to_cpu(eqe->event.cmd.token),
+				       eqe->event.cmd.status,
+				       be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_CHANGE:
+			mlx4_dispatch_event(dev, eqe->type, eqe->subtype,
+					    be32_to_cpu(eqe->event.port_change.port) >> 28);
+			break;
+
+		case MLX4_EVENT_TYPE_CQ_ERROR:
+			mlx4_warn(dev, "CQ %s on CQN %06x\n",
+				  eqe->event.cq_err.syndrome == 1 ?
+				  "overrun" : "access violation",
+				  be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+			mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+			mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_ECC_DETECT:
+		default:
+			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n",
+				  eqe->type, eqe->subtype, eq->eqn, eq->cons_index);
+			break;
+		};
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX4_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+			/*
+			 * Conditional on hca_type is OK here because
+			 * this is a rare case, not the fast path.
+			 */
+			eq_set_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_set_ci(eq, 1);
+
+	return eqes_found;
+}
+
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+	struct mlx4_dev *dev = dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int work = 0;
+	int i;
+
+	writel(priv->eq_table.clr_mask, priv->eq_table.clr_int);
+
+	for (i = 0; i < MLX4_EQ_CATAS; ++i)
+		work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]);
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mlx4_eq  *eq  = eq_ptr;
+	struct mlx4_dev *dev = eq->dev;
+
+	mlx4_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mlx4_catas_interrupt(int irq, void *dev_ptr)
+{
+	mlx4_handle_catas_err(dev_ptr);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+			int eq_num)
+{
+	return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B);
+}
+
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+static void __devinit __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev,
+					       struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int index;
+
+	index = eq->eqn / 4 - dev->caps.reserved_eqs / 4;
+
+	if (!priv->eq_table.uar_map[index]) {
+		priv->eq_table.uar_map[index] =
+			ioremap(pci_resource_start(dev->pdev, 2) +
+				((eq->eqn / 4) << PAGE_SHIFT),
+				PAGE_SIZE);
+		if (!priv->eq_table.uar_map[index]) {
+			mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+				 eq->eqn);
+			return NULL;
+		}
+	}
+
+	return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+static int __devinit mlx4_create_eq(struct mlx4_dev *dev, int nent,
+				    u8 intr, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_eq_context *eq_context;
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	u64 mtt_addr;
+	int err = -ENOMEM;
+	int i;
+
+	eq->dev   = dev;
+	eq->nent  = roundup_pow_of_two(max(nent, 2));
+	npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto err_out_free;
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+							  PAGE_SIZE, &t, GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		eq->page_list[i].map = t;
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	eq->doorbell = mlx4_get_eq_uar(dev, eq);
+	if (!eq->doorbell) {
+		err = -ENOMEM;
+		goto err_out_free_eq;
+	}
+
+	err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+	if (err)
+		goto err_out_free_eq;
+
+	err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+	if (err)
+		goto err_out_free_mtt;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags	  = cpu_to_be32(MLX4_EQ_STATUS_OK   |
+						MLX4_EQ_STATE_ARMED);
+	eq_context->log_eq_size	  = ilog2(eq->nent);
+	eq_context->intr	  = intr;
+	eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+	eq_context->mtt_base_addr_h = mtt_addr >> 32;
+	eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+	if (err) {
+		mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mtt;
+	}
+
+	kfree(dma_list);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	eq->cons_index = 0;
+
+	return err;
+
+err_out_free_mtt:
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+
+err_out_free_pages:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  eq->page_list[i].map);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+err_out:
+	return err;
+}
+
+static void mlx4_free_eq(struct mlx4_dev *dev,
+			 struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return;
+
+	err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+	if (0) {
+		mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+	for (i = 0; i < npages; ++i)
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    eq->page_list[i].map);
+
+	kfree(eq->page_list);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int i;
+
+	if (eq_table->have_irq)
+		free_irq(dev->pdev->irq, dev);
+	for (i = 0; i < MLX4_NUM_EQ; ++i)
+		if (eq_table->eq[i].have_irq)
+			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+}
+
+static int __devinit mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) +
+				 priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+	if (!priv->clr_base) {
+		mlx4_err(dev, "Couldn't map interrupt clear register, aborting.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	iounmap(priv->clr_base);
+}
+
+int __devinit mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int ret;
+
+	/*
+	 * We assume that mapping one page is enough for the whole EQ
+	 * context table.  This is fine with all current HCAs, because
+	 * we only use 32 EQs and each EQ uses 64 bytes of context
+	 * memory, or 1 KB total.
+	 */
+	priv->eq_table.icm_virt = icm_virt;
+	priv->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+	if (!priv->eq_table.icm_page)
+		return -ENOMEM;
+	priv->eq_table.icm_dma  = pci_map_page(dev->pdev, priv->eq_table.icm_page, 0,
+					       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(priv->eq_table.icm_dma)) {
+		__free_page(priv->eq_table.icm_page);
+		return -ENOMEM;
+	}
+
+	ret = mlx4_MAP_ICM_page(dev, priv->eq_table.icm_dma, icm_virt);
+	if (ret) {
+		pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE,
+			       PCI_DMA_BIDIRECTIONAL);
+		__free_page(priv->eq_table.icm_page);
+	}
+
+	return ret;
+}
+
+void mlx4_unmap_eq_icm(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_UNMAP_ICM(dev, priv->eq_table.icm_virt, 1);
+	pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE,
+		       PCI_DMA_BIDIRECTIONAL);
+	__free_page(priv->eq_table.icm_page);
+}
+
+int __devinit mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int i;
+
+	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
+			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+		priv->eq_table.uar_map[i] = NULL;
+
+	err = mlx4_map_clr_int(dev);
+	if (err)
+		goto err_out_free;
+
+	priv->eq_table.clr_mask =
+		swab32(1 << (priv->eq_table.inta_pin & 31));
+	priv->eq_table.clr_int  = priv->clr_base +
+		(priv->eq_table.inta_pin < 32 ? 4 : 0);
+
+	err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+			     (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_COMP : 0,
+			     &priv->eq_table.eq[MLX4_EQ_COMP]);
+	if (err)
+		goto err_out_unmap;
+
+	err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+			     (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_ASYNC : 0,
+			     &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+	if (err)
+		goto err_out_comp;
+
+	if (dev->flags & MLX4_FLAG_MSI_X) {
+		static const char *eq_name[] = {
+			[MLX4_EQ_COMP]  = DRV_NAME " (comp)",
+			[MLX4_EQ_ASYNC] = DRV_NAME " (async)",
+			[MLX4_EQ_CATAS] = DRV_NAME " (catas)"
+		};
+
+		err = mlx4_create_eq(dev, 1, MLX4_EQ_CATAS,
+				     &priv->eq_table.eq[MLX4_EQ_CATAS]);
+		if (err)
+			goto err_out_async;
+
+		for (i = 0; i < MLX4_EQ_CATAS; ++i) {
+			err = request_irq(priv->eq_table.eq[i].irq,
+					  mlx4_msi_x_interrupt,
+					  0, eq_name[i], priv->eq_table.eq + i);
+			if (err)
+				goto err_out_catas;
+
+			priv->eq_table.eq[i].have_irq = 1;
+		}
+
+		err = request_irq(priv->eq_table.eq[MLX4_EQ_CATAS].irq,
+				  mlx4_catas_interrupt, 0,
+				  eq_name[MLX4_EQ_CATAS], dev);
+		if (err)
+			goto err_out_catas;
+
+		priv->eq_table.eq[MLX4_EQ_CATAS].have_irq = 1;
+	} else {
+		err = request_irq(dev->pdev->irq, mlx4_interrupt,
+				  SA_SHIRQ, DRV_NAME, dev);
+		if (err)
+			goto err_out_async;
+
+		priv->eq_table.have_irq = 1;
+	}
+
+	err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+			  priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+	if (err)
+		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err);
+
+	for (i = 0; i < MLX4_EQ_CATAS; ++i)
+		eq_set_ci(&priv->eq_table.eq[i], 1);
+
+	if (dev->flags & MLX4_FLAG_MSI_X) {
+		err = mlx4_MAP_EQ(dev, MLX4_CATAS_EVENT_MASK, 0,
+				  priv->eq_table.eq[MLX4_EQ_CATAS].eqn);
+		if (err)
+			mlx4_warn(dev, "MAP_EQ for catas EQ %d failed (%d)\n",
+				  priv->eq_table.eq[MLX4_EQ_CATAS].eqn, err);
+	}
+
+	return 0;
+
+err_out_catas:
+	mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_CATAS]);
+
+err_out_async:
+	mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+
+err_out_comp:
+	mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_COMP]);
+
+err_out_unmap:
+	mlx4_unmap_clr_int(dev);
+	mlx4_free_irqs(dev);
+
+err_out_free:
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+	return err;
+}
+
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		mlx4_MAP_EQ(dev, MLX4_CATAS_EVENT_MASK, 1,
+			    priv->eq_table.eq[MLX4_EQ_CATAS].eqn);
+
+	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
+		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+
+	mlx4_free_irqs(dev);
+
+	for (i = 0; i < MLX4_EQ_CATAS; ++i)
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_CATAS]);
+
+	mlx4_unmap_clr_int(dev);
+
+	for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+		if (priv->eq_table.uar_map[i])
+			iounmap(priv->eq_table.uar_map[i]);
+
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+}
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
new file mode 100644
index 000000000000..c42717313663
--- /dev/null
+++ b/drivers/net/mlx4/fw.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cmd.h>
+
+#include "fw.h"
+#include "icm.h"
+
+extern void __buggy_use_of_MLX4_GET(void);
+extern void __buggy_use_of_MLX4_PUT(void);
+
+#define MLX4_GET(dest, source, offset)				      \
+	do {							      \
+		void *__p = (char *) (source) + (offset);	      \
+		switch (sizeof (dest)) {			      \
+		case 1: (dest) = *(u8 *) __p;	    break;	      \
+		case 2: (dest) = be16_to_cpup(__p); break;	      \
+		case 4: (dest) = be32_to_cpup(__p); break;	      \
+		case 8: (dest) = be64_to_cpup(__p); break;	      \
+		default: __buggy_use_of_MLX4_GET();		      \
+		}						      \
+	} while (0)
+
+#define MLX4_PUT(dest, source, offset)				      \
+	do {							      \
+		void *__d = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {			      \
+		case 1: *(u8 *) __d = (source);		       break; \
+		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
+		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
+		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
+		default: __buggy_use_of_MLX4_PUT();		      \
+		}						      \
+	} while (0)
+
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
+{
+	static const char *fname[] = {
+		[ 0] = "RC transport",
+		[ 1] = "UC transport",
+		[ 2] = "UD transport",
+		[ 3] = "SRC transport",
+		[ 4] = "reliable multicast",
+		[ 5] = "FCoIB support",
+		[ 6] = "SRQ support",
+		[ 7] = "IPoIB checksum offload",
+		[ 8] = "P_Key violation counter",
+		[ 9] = "Q_Key violation counter",
+		[10] = "VMM",
+		[16] = "MW support",
+		[17] = "APM support",
+		[18] = "Atomic ops support",
+		[19] = "Raw multicast support",
+		[20] = "Address vector port checking support",
+		[21] = "UD multicast support",
+		[24] = "Demand paging support",
+		[25] = "Router support"
+	};
+	int i;
+
+	mlx4_dbg(dev, "DEV_CAP flags:\n");
+	for (i = 0; i < 32; ++i)
+		if (fname[i] && (flags & (1 << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u16 size;
+	u16 stat_rate;
+	int err;
+
+#define QUERY_DEV_CAP_OUT_SIZE		       0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET		0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET		0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET		0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET		0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET		0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET		0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET		0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET		0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET		0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET		0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET		0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET		0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET		0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET		0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET		0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET		0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET		0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET	0x23
+#define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET		0x37
+#define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
+#define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET		0x4b
+#define QUERY_DEV_CAP_BF_OFFSET			0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET	0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET	0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET	0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET		0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET	0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET		0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET	0x56
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET		0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET		0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET		0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET		0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET	0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET	0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET	0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET	0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET	0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x97
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A);
+
+	if (err)
+		goto out;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+	dev_cap->reserved_qps = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+	dev_cap->max_qps = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+	dev_cap->reserved_srqs = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+	dev_cap->max_srqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+	dev_cap->max_cq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+	dev_cap->reserved_cqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+	dev_cap->max_cqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+	dev_cap->max_mpts = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+	dev_cap->reserved_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+	dev_cap->max_eqs = 1 << (field & 0x7);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+	dev_cap->reserved_mtts = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);
+	dev_cap->max_mrw_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+	dev_cap->reserved_mrws = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
+	dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+	dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+	dev_cap->max_rdma_global = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+	dev_cap->local_ca_ack_delay = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+	dev_cap->max_mtu	= field >> 4;
+	dev_cap->max_port_width = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	dev_cap->max_vl    = field >> 4;
+	dev_cap->num_ports = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+	dev_cap->max_gids = 1 << (field & 0xf);
+	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+	dev_cap->stat_rate_support = stat_rate;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+	dev_cap->max_pkeys = 1 << (field & 0xf);
+	MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+	dev_cap->reserved_uars = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+	dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+	dev_cap->min_page_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+	if (field & 0x80) {
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+		dev_cap->bf_reg_size = 1 << (field & 0x1f);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+		dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+		mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+			 dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+	} else {
+		dev_cap->bf_reg_size = 0;
+		mlx4_dbg(dev, "BlueFlame not available\n");
+	}
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+	dev_cap->max_sq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+	dev_cap->max_sq_desc_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+	dev_cap->max_qp_per_mcg = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+	dev_cap->reserved_mgms = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+	dev_cap->max_mcgs = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+	dev_cap->reserved_pds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+	dev_cap->max_pds = 1 << (field & 0x3f);
+
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+	dev_cap->rdmarc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+	dev_cap->qpc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+	dev_cap->aux_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+	dev_cap->altc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+	dev_cap->eqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+	dev_cap->cqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+	dev_cap->srq_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->cmpt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+	dev_cap->mtt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->dmpt_entry_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+	dev_cap->max_srq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+	dev_cap->max_qp_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+	dev_cap->resize_srq = field & 1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+	dev_cap->max_rq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+	dev_cap->max_rq_desc_sz = size;
+
+	MLX4_GET(dev_cap->bmme_flags, outbox,
+		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	MLX4_GET(dev_cap->reserved_lkey, outbox,
+		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(dev_cap->max_icm_sz, outbox,
+		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+
+	if (dev_cap->bmme_flags & 1)
+		mlx4_dbg(dev, "Base MM extensions: yes "
+			 "(flags %d, rsvd L_Key %08x)\n",
+			 dev_cap->bmme_flags, dev_cap->reserved_lkey);
+	else
+		mlx4_dbg(dev, "Base MM extensions: no\n");
+
+	/*
+	 * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+	 * we can't use any EQs whose doorbell falls on that page,
+	 * even if the EQ itself isn't reserved.
+	 */
+	dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+				    dev_cap->reserved_eqs);
+
+	mlx4_dbg(dev, "Max ICM size %lld MB\n",
+		 (unsigned long long) dev_cap->max_icm_sz >> 20);
+	mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		 dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+	mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		 dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+	mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		 dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+	mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		 dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+	mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		 dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+	mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+	mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_mgms);
+	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->max_mtu,
+		 dev_cap->max_port_width);
+	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+
+	dump_dev_cap_flags(dev, dev_cap->flags);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_icm_iter iter;
+	__be64 *pages;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+	pages = mailbox->buf;
+
+	for (mlx4_icm_first(icm, &iter);
+	     !mlx4_icm_last(&iter);
+	     mlx4_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+		if (lg < MLX4_ICM_PAGE_SHIFT) {
+			mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+				   MLX4_ICM_PAGE_SIZE,
+				   (unsigned long long) mlx4_icm_addr(&iter),
+				   mlx4_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+
+		for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {
+				pages[nent * 2] = cpu_to_be64(virt);
+				virt += 1 << lg;
+			}
+
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+					    (lg - MLX4_ICM_PAGE_SHIFT));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			if (++nent == MLX4_MAILBOX_SIZE / 16) {
+				err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+						MLX4_CMD_TIME_CLASS_B);
+				if (err)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)
+		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		goto out;
+
+	switch (op) {
+	case MLX4_CMD_MAP_FA:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM_AUX:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+			  tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B);
+}
+
+
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{
+	struct mlx4_fw  *fw  = &mlx4_priv(dev)->fw;
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err = 0;
+	u64 fw_ver;
+	u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+#define QUERY_FW_ERR_BAR_OFFSET        0x3c
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A);
+	if (err)
+		goto out;
+
+	MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more signifant bits than minor
+	 * version, so swap here.
+	 */
+	dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+		((fw_ver & 0xffff0000ull) >> 16) |
+		((fw_ver & 0x0000ffffull) << 16);
+
+	MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	cmd->max_cmds = 1 << lg;
+
+	mlx4_dbg(dev, "FW version %d.%d.%03d, max commands %d\n",
+		 (int) (dev->caps.fw_ver >> 32),
+		 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+		 (int) dev->caps.fw_ver & 0xffff,
+		 cmd->max_cmds);
+
+	MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+	MLX4_GET(fw->catas_size,   outbox, QUERY_FW_ERR_SIZE_OFFSET);
+	MLX4_GET(fw->catas_bar,    outbox, QUERY_FW_ERR_BAR_OFFSET);
+	fw->catas_bar = (fw->catas_bar >> 6) * 2;
+
+	mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+		 (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+	MLX4_GET(fw->fw_pages,     outbox, QUERY_FW_SIZE_OFFSET);
+	MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+	MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+	fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+	mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2);
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	fw->fw_pages =
+		ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+		 (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+	int i;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+		strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+	} else {
+		/*
+		 * The board ID is a string but the firmware byte
+		 * swaps each 4-byte word before passing it back to
+		 * us.  Therefore we need to swab it before printing.
+		 */
+		for (i = 0; i < 4; ++i)
+			((u32 *) board_id)[i] =
+				swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+	}
+}
+
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+			   MLX4_CMD_TIME_CLASS_A);
+	if (err)
+		goto out;
+
+	MLX4_GET(adapter->vendor_id, outbox,   QUERY_ADAPTER_VENDOR_ID_OFFSET);
+	MLX4_GET(adapter->device_id, outbox,   QUERY_ADAPTER_DEVICE_ID_OFFSET);
+	MLX4_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
+	MLX4_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+		     adapter->board_id);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+#define INIT_HCA_IN_SIZE		 0x200
+#define INIT_HCA_VERSION_OFFSET		 0x000
+#define	 INIT_HCA_VERSION		 2
+#define INIT_HCA_FLAGS_OFFSET		 0x014
+#define INIT_HCA_QPC_OFFSET		 0x020
+#define	 INIT_HCA_QPC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define	 INIT_HCA_LOG_QP_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x17)
+#define	 INIT_HCA_SRQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define	 INIT_HCA_LOG_SRQ_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define	 INIT_HCA_CQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define	 INIT_HCA_LOG_CQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x37)
+#define	 INIT_HCA_ALTC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define	 INIT_HCA_LOG_EQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x67)
+#define	 INIT_HCA_RDMARC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define	 INIT_HCA_LOG_RD_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET		 0x0c0
+#define	 INIT_HCA_MC_BASE_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x16)
+#define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET		 0x0f0
+#define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
+#define	 INIT_HCA_MTT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x10)
+#define	 INIT_HCA_CMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET		 0x120
+#define	 INIT_HCA_LOG_UAR_SZ_OFFSET	 (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+	*((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_qps,   INIT_HCA_LOG_QP_OFFSET);
+	MLX4_PUT(inbox, param->srqc_base,     INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_srqs,  INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_PUT(inbox, param->cqc_base,      INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_cqs,   INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_PUT(inbox, param->altc_base,     INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+	/* multicast attributes */
+
+	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 1000);
+
+	if (err)
+		mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, struct mlx4_init_port_param *param, int port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags;
+
+#define INIT_PORT_IN_SIZE          256
+#define INIT_PORT_FLAGS_OFFSET     0x00
+#define INIT_PORT_FLAG_SIG         (1 << 18)
+#define INIT_PORT_FLAG_NG          (1 << 17)
+#define INIT_PORT_FLAG_G0          (1 << 16)
+#define INIT_PORT_VL_SHIFT         4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET       0x04
+#define INIT_PORT_MAX_GID_OFFSET   0x06
+#define INIT_PORT_MAX_PKEY_OFFSET  0x0a
+#define INIT_PORT_GUID0_OFFSET     0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET   0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_PORT_IN_SIZE);
+
+	flags = 0;
+	flags |= param->set_guid0     ? INIT_PORT_FLAG_G0  : 0;
+	flags |= param->set_node_guid ? INIT_PORT_FLAG_NG  : 0;
+	flags |= param->set_si_guid   ? INIT_PORT_FLAG_SIG : 0;
+	flags |= (param->vl_cap & 0xf) << INIT_PORT_VL_SHIFT;
+	flags |= (param->port_width_cap & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+	MLX4_PUT(inbox, flags,            INIT_PORT_FLAGS_OFFSET);
+
+	MLX4_PUT(inbox, param->mtu,       INIT_PORT_MTU_OFFSET);
+	MLX4_PUT(inbox, param->max_gid,   INIT_PORT_MAX_GID_OFFSET);
+	MLX4_PUT(inbox, param->max_pkey,  INIT_PORT_MAX_PKEY_OFFSET);
+	MLX4_PUT(inbox, param->guid0,     INIT_PORT_GUID0_OFFSET);
+	MLX4_PUT(inbox, param->node_guid, INIT_PORT_NODE_GUID_OFFSET);
+	MLX4_PUT(inbox, param->si_guid,   INIT_PORT_SI_GUID_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+		       MLX4_CMD_TIME_CLASS_A);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{
+	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{
+	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000);
+}
+
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+	int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
+			       MLX4_CMD_SET_ICM_SIZE,
+			       MLX4_CMD_TIME_CLASS_A);
+	if (ret)
+		return ret;
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	return 0;
+}
+
+int mlx4_NOP(struct mlx4_dev *dev)
+{
+	/* Input modifier of 0x1f means "finish as soon as possible." */
+	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100);
+}
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
new file mode 100644
index 000000000000..2616fa53d4d0
--- /dev/null
+++ b/drivers/net/mlx4/fw.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_dev_cap {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int max_mtu;
+	int max_port_width;
+	int max_vl;
+	int num_ports;
+	int max_gids;
+	u16 stat_rate_support;
+	int max_pkeys;
+	u32 flags;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int bf_reg_size;
+	int bf_regs_per_page;
+	int max_sq_sg;
+	int max_sq_desc_sz;
+	int max_rq_sg;
+	int max_rq_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int qpc_entry_sz;
+	int rdmarc_entry_sz;
+	int altc_entry_sz;
+	int aux_entry_sz;
+	int srq_entry_sz;
+	int cqc_entry_sz;
+	int eqc_entry_sz;
+	int dmpt_entry_sz;
+	int cmpt_entry_sz;
+	int mtt_entry_sz;
+	int resize_srq;
+	u8  bmme_flags;
+	u32 reserved_lkey;
+	u64 max_icm_sz;
+};
+
+struct mlx4_adapter {
+	u32  vendor_id;
+	u32  device_id;
+	u32  revision_id;
+	char board_id[MLX4_BOARD_ID_LEN];
+	u8   inta_pin;
+};
+
+struct mlx4_init_hca_param {
+	u64 qpc_base;
+	u64 rdmarc_base;
+	u64 auxc_base;
+	u64 altc_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqc_base;
+	u64 mc_base;
+	u64 dmpt_base;
+	u64 cmpt_base;
+	u64 mtt_base;
+	u16 log_mc_entry_sz;
+	u16 log_mc_hash_sz;
+	u8  log_num_qps;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_rd_per_qp;
+	u8  log_mc_table_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+};
+
+struct mlx4_init_ib_param {
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+struct mlx4_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+int mlx4_NOP(struct mlx4_dev *dev);
+
+#endif /* MLX4_FW_H */
diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c
new file mode 100644
index 000000000000..e96feaed6ed4
--- /dev/null
+++ b/drivers/net/mlx4/icm.c
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
+};
+
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	struct mlx4_icm_chunk *chunk, *tmp;
+	int i;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (chunk->nsg > 0)
+			pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+				     PCI_DMA_BIDIRECTIONAL);
+
+		for (i = 0; i < chunk->npages; ++i)
+			__free_pages(chunk->mem[i].page,
+				     get_order(chunk->mem[i].length));
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask)
+{
+	struct mlx4_icm *icm;
+	struct mlx4_icm_chunk *chunk = NULL;
+	int cur_order;
+
+	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return icm;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kmalloc(sizeof *chunk,
+					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
+		if (chunk->mem[chunk->npages].page) {
+			chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
+			chunk->mem[chunk->npages].offset = 0;
+
+			if (++chunk->npages == MLX4_ICM_CHUNK_LEN) {
+				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+							chunk->npages,
+							PCI_DMA_BIDIRECTIONAL);
+
+				if (chunk->nsg <= 0)
+					goto fail;
+
+				chunk = NULL;
+			}
+
+			npages -= 1 << cur_order;
+		} else {
+			--cur_order;
+			if (cur_order < 0)
+				goto fail;
+		}
+	}
+
+	if (chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mlx4_free_icm(dev, icm);
+	return NULL;
+}
+
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+	return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be64 *inbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	inbox[0] = cpu_to_be64(virt);
+	inbox[1] = cpu_to_be64(dma_addr);
+
+	err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	if (!err)
+		mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+			  (unsigned long long) dma_addr, (unsigned long long) virt);
+
+	return err;
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+	int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+	int ret = 0;
+
+	mutex_lock(&table->mutex);
+
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+				       (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+				       __GFP_NOWARN);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
+			 (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+		mlx4_free_icm(dev, table->icm[i]);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+	int i;
+
+	i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+	mutex_lock(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+		mlx4_free_icm(dev, table->icm[i]);
+		table->icm[i] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj)
+{
+	int idx, offset, i;
+	struct mlx4_icm_chunk *chunk;
+	struct mlx4_icm *icm;
+	struct page *page = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	idx = obj & (table->num_obj - 1);
+	icm = table->icm[idx / (MLX4_TABLE_CHUNK_SIZE / table->obj_size)];
+	offset = idx % (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+	if (!icm)
+		goto out;
+
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			if (chunk->mem[i].length > offset) {
+				page = chunk->mem[i].page;
+				goto out;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+out:
+	mutex_unlock(&table->mutex);
+	return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 int start, int end)
+{
+	int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+	int i, err;
+
+	for (i = start; i <= end; i += inc) {
+		err = mlx4_table_get(dev, table, i);
+		if (err)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (i > start) {
+		i -= inc;
+		mlx4_table_put(dev, table, i);
+	}
+
+	return err;
+}
+
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  int start, int end)
+{
+	int i;
+
+	for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size)
+		mlx4_table_put(dev, table, i);
+}
+
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	int nobj, int reserved,
+			int use_lowmem)
+{
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	int i;
+
+	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+	table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+	if (!table->icm)
+		return -ENOMEM;
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	mutex_init(&table->mutex);
+
+	for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = MLX4_TABLE_CHUNK_SIZE;
+		if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > nobj * obj_size)
+			chunk_size = PAGE_ALIGN(nobj * obj_size - i * MLX4_TABLE_CHUNK_SIZE);
+
+		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					       __GFP_NOWARN);
+		if (!table->icm[i])
+			goto err;
+		if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
+			mlx4_free_icm(dev, table->icm[i]);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i]);
+		}
+
+	return -ENOMEM;
+}
+
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i]);
+		}
+
+	kfree(table->icm);
+}
diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h
new file mode 100644
index 000000000000..bea223d879a5
--- /dev/null
+++ b/drivers/net/mlx4/icm.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+#define MLX4_ICM_CHUNK_LEN						\
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+enum {
+	MLX4_ICM_PAGE_SHIFT	= 12,
+	MLX4_ICM_PAGE_SIZE	= 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+struct mlx4_icm_chunk {
+	struct list_head	list;
+	int			npages;
+	int			nsg;
+	struct scatterlist	mem[MLX4_ICM_CHUNK_LEN];
+};
+
+struct mlx4_icm {
+	struct list_head	chunk_list;
+	int			refcount;
+};
+
+struct mlx4_icm_iter {
+	struct mlx4_icm	       *icm;
+	struct mlx4_icm_chunk  *chunk;
+	int			page_idx;
+};
+
+struct mlx4_dev;
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm);
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 int start, int end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  int start, int end);
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	int nobj, int reserved,
+			int use_lowmem);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 int start, int end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  int start, int end);
+
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+				  struct mlx4_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->chunk    = list_empty(&icm->chunk_list) ?
+		NULL : list_entry(icm->chunk_list.next,
+				  struct mlx4_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct mlx4_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count);
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/drivers/net/mlx4/intf.c b/drivers/net/mlx4/intf.c
new file mode 100644
index 000000000000..65854f9e9c76
--- /dev/null
+++ b/drivers/net/mlx4/intf.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/driver.h>
+
+#include "mlx4.h"
+
+struct mlx4_device_context {
+	struct list_head	list;
+	struct mlx4_interface  *intf;
+	void		       *context;
+};
+
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL);
+	if (!dev_ctx)
+		return;
+
+	dev_ctx->intf    = intf;
+	dev_ctx->context = intf->add(&priv->dev);
+
+	if (dev_ctx->context) {
+		spin_lock_irq(&priv->ctx_lock);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+		spin_unlock_irq(&priv->ctx_lock);
+	} else
+		kfree(dev_ctx);
+}
+
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf == intf) {
+			spin_lock_irq(&priv->ctx_lock);
+			list_del(&dev_ctx->list);
+			spin_unlock_irq(&priv->ctx_lock);
+
+			intf->remove(&priv->dev, dev_ctx->context);
+			kfree(dev_ctx);
+			return;
+		}
+}
+
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&intf->list);
+
+	mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_event type,
+			 int subtype, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event)
+			dev_ctx->intf->event(dev, dev_ctx->context, type,
+					     subtype, port);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&priv->dev_list, &dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&priv->dev_list);
+
+	mutex_unlock(&intf_mutex);
+}
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
new file mode 100644
index 000000000000..4debb024eaf9
--- /dev/null
+++ b/drivers/net/mlx4/main.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static const char mlx4_version[] __devinitdata =
+	DRV_NAME ": Mellanox ConnectX core driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static struct mlx4_profile default_profile = {
+	.num_qp		= 1 << 16,
+	.num_srq	= 1 << 16,
+	.rdmarc_per_qp	= 4,
+	.num_cq		= 1 << 16,
+	.num_mcg	= 1 << 13,
+	.num_mpt	= 1 << 17,
+	.num_mtt	= 1 << 20,
+};
+
+static int __devinit mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err;
+
+	err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+		return err;
+	}
+
+	if (dev_cap->min_page_sz > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than "
+			 "kernel PAGE_SIZE of %ld, aborting.\n",
+			 dev_cap->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, "
+			 "aborting.\n",
+			 dev_cap->num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than "
+			 "PCI resource 2 size of 0x%llx, aborting.\n",
+			 dev_cap->uar_size,
+			 (unsigned long long) pci_resource_len(dev->pdev, 2));
+		return -ENODEV;
+	}
+
+	dev->caps.num_ports	     = dev_cap->num_ports;
+	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
+	dev->caps.vl_cap	     = dev_cap->max_vl;
+	dev->caps.mtu_cap	     = dev_cap->max_mtu;
+	dev->caps.gid_table_len	     = dev_cap->max_gids;
+	dev->caps.pkey_table_len     = dev_cap->max_pkeys;
+	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+	dev->caps.bf_reg_size	     = dev_cap->bf_reg_size;
+	dev->caps.bf_regs_per_page   = dev_cap->bf_regs_per_page;
+	dev->caps.max_sq_sg	     = dev_cap->max_sq_sg;
+	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
+	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
+	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
+	dev->caps.reserved_qps	     = dev_cap->reserved_qps;
+	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
+	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
+	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
+	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
+	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
+	dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
+	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
+	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
+	dev->caps.reserved_mtts	     = dev_cap->reserved_mtts;
+	dev->caps.reserved_mrws	     = dev_cap->reserved_mrws;
+	dev->caps.reserved_uars	     = dev_cap->reserved_uars;
+	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
+	dev->caps.port_width_cap     = dev_cap->max_port_width;
+	dev->caps.mtt_entry_sz	     = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz;
+	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
+	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+
+	return 0;
+}
+
+static int __devinit mlx4_load_fw(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+					 GFP_HIGHUSER | __GFP_NOWARN);
+	if (!priv->fw.fw_icm) {
+		mlx4_err(dev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_FA command failed, aborting.\n");
+		goto err_free;
+	}
+
+	err = mlx4_RUN_FW(dev);
+	if (err) {
+		mlx4_err(dev, "RUN_FW command failed, aborting.\n");
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mlx4_UNMAP_FA(dev);
+
+err_free:
+	mlx4_free_icm(dev, priv->fw.fw_icm);
+	return err;
+}
+
+static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+					  int cmpt_entry_sz)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_QP *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_qps,
+				  dev->caps.reserved_qps, 0);
+	if (err)
+		goto err;
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_SRQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0);
+	if (err)
+		goto err_qp;
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_CQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0);
+	if (err)
+		goto err_srq;
+
+	err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_EQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz,
+				  roundup_pow_of_two(MLX4_NUM_EQ +
+						     dev->caps.reserved_eqs),
+				  MLX4_NUM_EQ + dev->caps.reserved_eqs, 0);
+	if (err)
+		goto err_cq;
+
+	return 0;
+
+err_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+err_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+err_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err:
+	return err;
+}
+
+static int __devinit mlx4_init_icm(struct mlx4_dev *dev,
+				   struct mlx4_dev_cap *dev_cap,
+				   struct mlx4_init_hca_param *init_hca,
+				   u64 icm_size)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 aux_pages;
+	int err;
+
+	err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+	if (err) {
+		mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n");
+		return err;
+	}
+
+	mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+		 (unsigned long long) icm_size >> 10,
+		 (unsigned long long) aux_pages << 2);
+
+	priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+					  GFP_HIGHUSER | __GFP_NOWARN);
+	if (!priv->fw.aux_icm) {
+		mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n");
+		goto err_free_aux;
+	}
+
+	err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+	if (err) {
+		mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n");
+		goto err_unmap_aux;
+	}
+
+	err = mlx4_map_eq_icm(dev, init_hca->eqc_base);
+	if (err) {
+		mlx4_err(dev, "Failed to map EQ context memory, aborting.\n");
+		goto err_unmap_cmpt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+				  init_hca->mtt_base,
+				  dev->caps.mtt_entry_sz,
+				  dev->caps.num_mtt_segs,
+				  dev->caps.reserved_mtts, 1);
+	if (err) {
+		mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
+		goto err_unmap_eq;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+				  init_hca->dmpt_base,
+				  dev_cap->dmpt_entry_sz,
+				  dev->caps.num_mpts,
+				  dev->caps.reserved_mrws, 1);
+	if (err) {
+		mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n");
+		goto err_unmap_mtt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+				  init_hca->qpc_base,
+				  dev_cap->qpc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
+		goto err_unmap_dmpt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+				  init_hca->auxc_base,
+				  dev_cap->aux_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
+		goto err_unmap_qp;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+				  init_hca->altc_base,
+				  dev_cap->altc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
+		goto err_unmap_auxc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+				  init_hca->rdmarc_base,
+				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+		goto err_unmap_altc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+				  init_hca->cqc_base,
+				  dev_cap->cqc_entry_sz,
+				  dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map CQ context memory, aborting.\n");
+		goto err_unmap_rdmarc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+				  init_hca->srqc_base,
+				  dev_cap->srq_entry_sz,
+				  dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n");
+		goto err_unmap_cq;
+	}
+
+	/*
+	 * It's not strictly required, but for simplicity just map the
+	 * whole multicast group table now.  The table isn't very big
+	 * and it's a lot easier than trying to track ref counts.
+	 */
+	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+				  init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MCG context memory, aborting.\n");
+		goto err_unmap_srq;
+	}
+
+	return 0;
+
+err_unmap_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+	mlx4_unmap_eq_icm(dev);
+
+err_unmap_cmpt:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+	mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+	mlx4_free_icm(dev, priv->fw.aux_icm);
+
+	return err;
+}
+
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+	mlx4_unmap_eq_icm(dev);
+
+	mlx4_UNMAP_ICM_AUX(dev);
+	mlx4_free_icm(dev, priv->fw.aux_icm);
+}
+
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+	mlx4_CLOSE_HCA(dev, 0);
+	mlx4_free_icms(dev);
+	mlx4_UNMAP_FA(dev);
+	mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm);
+}
+
+static int __devinit mlx4_init_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv	  *priv = mlx4_priv(dev);
+	struct mlx4_adapter	   adapter;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_profile	   profile;
+	struct mlx4_init_hca_param init_hca;
+	u64 icm_size;
+	int err;
+
+	err = mlx4_QUERY_FW(dev);
+	if (err) {
+		mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_load_fw(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to start FW, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_dev_cap(dev, &dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+		goto err_stop_fw;
+	}
+
+	profile = default_profile;
+
+	icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca);
+	if ((long long) icm_size < 0) {
+		err = icm_size;
+		goto err_stop_fw;
+	}
+
+	init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+
+	err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+	if (err)
+		goto err_stop_fw;
+
+	err = mlx4_INIT_HCA(dev, &init_hca);
+	if (err) {
+		mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
+		goto err_free_icm;
+	}
+
+	err = mlx4_QUERY_ADAPTER(dev, &adapter);
+	if (err) {
+		mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n");
+		goto err_close;
+	}
+
+	priv->eq_table.inta_pin = adapter.inta_pin;
+	priv->rev_id		= adapter.revision_id;
+	memcpy(priv->board_id, adapter.board_id, sizeof priv->board_id);
+
+	return 0;
+
+err_close:
+	mlx4_close_hca(dev);
+
+err_free_icm:
+	mlx4_free_icms(dev);
+
+err_stop_fw:
+	mlx4_UNMAP_FA(dev);
+	mlx4_free_icm(dev, priv->fw.fw_icm);
+
+	return err;
+}
+
+static int __devinit mlx4_setup_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	MLX4_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
+
+	err = mlx4_init_uar_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "user access region table, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_uar_alloc(dev, &priv->driver_uar);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate driver access region, "
+			 "aborting.\n");
+		goto err_uar_table_free;
+	}
+
+	priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!priv->kar) {
+		mlx4_err(dev, "Couldn't map kernel access region, "
+			 "aborting.\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mlx4_init_pd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "protection domain table, aborting.\n");
+		goto err_kar_unmap;
+	}
+
+	err = mlx4_init_mr_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "memory region table, aborting.\n");
+		goto err_pd_table_free;
+	}
+
+	mlx4_map_catas_buf(dev);
+
+	err = mlx4_init_eq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "event queue table, aborting.\n");
+		goto err_catas_buf;
+	}
+
+	err = mlx4_cmd_use_events(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to switch to event-driven "
+			 "firmware commands, aborting.\n");
+		goto err_eq_table_free;
+	}
+
+	err = mlx4_NOP(dev);
+	if (err) {
+		mlx4_err(dev, "NOP command failed to generate interrupt "
+			 "(IRQ %d), aborting.\n",
+			 priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+		if (dev->flags & MLX4_FLAG_MSI_X)
+			mlx4_err(dev, "Try again with MSI-X disabled.\n");
+		else
+			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+
+		goto err_cmd_poll;
+	}
+
+	mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mlx4_init_cq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "completion queue table, aborting.\n");
+		goto err_cmd_poll;
+	}
+
+	err = mlx4_init_srq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "shared receive queue table, aborting.\n");
+		goto err_cq_table_free;
+	}
+
+	err = mlx4_init_qp_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "queue pair table, aborting.\n");
+		goto err_srq_table_free;
+	}
+
+	err = mlx4_init_mcg_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "multicast group table, aborting.\n");
+		goto err_qp_table_free;
+	}
+
+	return 0;
+
+err_qp_table_free:
+	mlx4_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mlx4_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mlx4_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mlx4_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mlx4_cleanup_eq_table(dev);
+
+err_catas_buf:
+	mlx4_unmap_catas_buf(dev);
+	mlx4_cleanup_mr_table(dev);
+
+err_pd_table_free:
+	mlx4_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(priv->kar);
+
+err_uar_free:
+	mlx4_uar_free(dev, &priv->driver_uar);
+
+err_uar_table_free:
+	mlx4_cleanup_uar_table(dev);
+	return err;
+}
+
+static void __devinit mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct msix_entry entries[MLX4_NUM_EQ];
+	int err;
+	int i;
+
+	if (msi_x) {
+		for (i = 0; i < MLX4_NUM_EQ; ++i)
+			entries[i].entry = i;
+
+		err = pci_enable_msix(dev->pdev, entries, ARRAY_SIZE(entries));
+		if (err) {
+			if (err > 0)
+				mlx4_info(dev, "Only %d MSI-X vectors available, "
+					  "not using MSI-X\n", err);
+			goto no_msi;
+		}
+
+		for (i = 0; i < MLX4_NUM_EQ; ++i)
+			priv->eq_table.eq[i].irq = entries[i].vector;
+
+		dev->flags |= MLX4_FLAG_MSI_X;
+		return;
+	}
+
+no_msi:
+	for (i = 0; i < MLX4_NUM_EQ; ++i)
+		priv->eq_table.eq[i].irq = dev->pdev->irq;
+}
+
+static int __devinit mlx4_init_one(struct pci_dev *pdev,
+				   const struct pci_device_id *id)
+{
+	static int mlx4_version_printed;
+	struct mlx4_priv *priv;
+	struct mlx4_dev *dev;
+	int err;
+
+	if (!mlx4_version_printed) {
+		printk(KERN_INFO "%s", mlx4_version);
+		++mlx4_version_printed;
+	}
+
+	printk(KERN_INFO PFX "Initializing %s\n",
+	       pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+	 * be present)
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_region(pdev, 0, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot request control region, aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_region(pdev, 2, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n");
+		goto err_release_bar0;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_release_bar2;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_release_bar2;
+		}
+	}
+
+	priv = kzalloc(sizeof *priv, GFP_KERNEL);
+	if (!priv) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_release_bar2;
+	}
+
+	dev       = &priv->dev;
+	dev->pdev = pdev;
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mlx4_reset(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+		goto err_free_dev;
+	}
+
+	mlx4_enable_msi_x(dev);
+
+	if (mlx4_cmd_init(dev)) {
+		mlx4_err(dev, "Failed to init command interface, aborting.\n");
+		goto err_free_dev;
+	}
+
+	err = mlx4_init_hca(dev);
+	if (err)
+		goto err_cmd;
+
+	err = mlx4_setup_hca(dev);
+	if (err)
+		goto err_close;
+
+	err = mlx4_register_device(dev);
+	if (err)
+		goto err_cleanup;
+
+	pci_set_drvdata(pdev, dev);
+
+	return 0;
+
+err_cleanup:
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+
+	mlx4_unmap_catas_buf(dev);
+
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_pd_table(dev);
+	mlx4_cleanup_uar_table(dev);
+
+err_close:
+	mlx4_close_hca(dev);
+
+err_cmd:
+	mlx4_cmd_cleanup(dev);
+
+err_free_dev:
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+	kfree(priv);
+
+err_release_bar2:
+	pci_release_region(pdev, 2);
+
+err_release_bar0:
+	pci_release_region(pdev, 0);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void __devexit mlx4_remove_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int p;
+
+	if (dev) {
+		mlx4_unregister_device(dev);
+
+		for (p = 1; p <= dev->caps.num_ports; ++p)
+			mlx4_CLOSE_PORT(dev, p);
+
+		mlx4_cleanup_mcg_table(dev);
+		mlx4_cleanup_qp_table(dev);
+		mlx4_cleanup_srq_table(dev);
+		mlx4_cleanup_cq_table(dev);
+		mlx4_cmd_use_polling(dev);
+		mlx4_cleanup_eq_table(dev);
+
+		mlx4_unmap_catas_buf(dev);
+
+		mlx4_cleanup_mr_table(dev);
+		mlx4_cleanup_pd_table(dev);
+
+		iounmap(priv->kar);
+		mlx4_uar_free(dev, &priv->driver_uar);
+		mlx4_cleanup_uar_table(dev);
+		mlx4_close_hca(dev);
+		mlx4_cmd_cleanup(dev);
+
+		if (dev->flags & MLX4_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+
+		kfree(priv);
+		pci_release_region(pdev, 2);
+		pci_release_region(pdev, 0);
+		pci_disable_device(pdev);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+static struct pci_device_id mlx4_pci_table[] = {
+	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
+	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
+
+static struct pci_driver mlx4_driver = {
+	.name		= DRV_NAME,
+	.id_table	= mlx4_pci_table,
+	.probe		= mlx4_init_one,
+	.remove		= __devexit_p(mlx4_remove_one)
+};
+
+static int __init mlx4_init(void)
+{
+	int ret;
+
+	ret = pci_register_driver(&mlx4_driver);
+	return ret < 0 ? ret : 0;
+}
+
+static void __exit mlx4_cleanup(void)
+{
+	pci_unregister_driver(&mlx4_driver);
+}
+
+module_init(mlx4_init);
+module_exit(mlx4_cleanup);
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
new file mode 100644
index 000000000000..672024a0ee71
--- /dev/null
+++ b/drivers/net/mlx4/mcg.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+struct mlx4_mgm {
+	__be32			next_gid_index;
+	__be32			members_count;
+	u32			reserved[2];
+	u8			gid[16];
+	__be32			qp[MLX4_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+static int mlx4_READ_MCG(struct mlx4_dev *dev, int index,
+			 struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index,
+			  struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  u16 *hash)
+{
+	u64 imm;
+	int err;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, 0, MLX4_CMD_MGID_HASH,
+			   MLX4_CMD_TIME_CLASS_A);
+
+	if (!err)
+		*hash = imm;
+
+	return err;
+}
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ *  Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mlx4_dev *dev,
+		    u8 *gid, struct mlx4_cmd_mailbox *mgm_mailbox,
+		    u16 *hash, int *prev, int *index)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mlx4_MGID_HASH(dev, mailbox, hash);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		return err;
+
+	if (0)
+		mlx4_dbg(dev, "Hash for %04x:%04x:%04x:%04x:"
+			  "%04x:%04x:%04x:%04x is %04x\n",
+			  be16_to_cpu(((__be16 *) gid)[0]),
+			  be16_to_cpu(((__be16 *) gid)[1]),
+			  be16_to_cpu(((__be16 *) gid)[2]),
+			  be16_to_cpu(((__be16 *) gid)[3]),
+			  be16_to_cpu(((__be16 *) gid)[4]),
+			  be16_to_cpu(((__be16 *) gid)[5]),
+			  be16_to_cpu(((__be16 *) gid)[6]),
+			  be16_to_cpu(((__be16 *) gid)[7]),
+			  *hash);
+
+	*index = *hash;
+	*prev  = -1;
+
+	do {
+		err = mlx4_READ_MCG(dev, *index, mgm_mailbox);
+		if (err)
+			return err;
+
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			if (*index != *hash) {
+				mlx4_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			return err;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16))
+			return err;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	*index = -1;
+	return err;
+}
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	u16 hash;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_mgm(dev, gid, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid, 16);
+	} else {
+		link = 1;
+
+		index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap);
+		if (index == -1) {
+			mlx4_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+		index += dev->caps.num_mgms;
+
+		err = mlx4_READ_MCG(dev, index, mailbox);
+		if (err)
+			goto out;
+
+		memset(mgm, 0, sizeof *mgm);
+		memcpy(mgm->gid, gid, 16);
+	}
+
+	members_count = be32_to_cpu(mgm->members_count);
+	if (members_count == MLX4_QP_PER_MGM) {
+		mlx4_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < members_count; ++i)
+		if (mgm->qp[i] == cpu_to_be32(qp->qpn)) {
+			mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
+			err = 0;
+			goto out;
+		}
+
+	mgm->qp[members_count++] = cpu_to_be32(qp->qpn);
+	mgm->members_count       = cpu_to_be32(members_count);
+
+	err = mlx4_WRITE_MCG(dev, index, mailbox);
+	if (err)
+		goto out;
+
+	if (!link)
+		goto out;
+
+	err = mlx4_READ_MCG(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mlx4_WRITE_MCG(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+out:
+	if (err && link && index != -1) {
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "Got AMGM index %d < %d",
+				  index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms);
+	}
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
+
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_mgm(dev, gid, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mlx4_err(dev, "MGID %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
+			  "not found\n",
+			  be16_to_cpu(((__be16 *) gid)[0]),
+			  be16_to_cpu(((__be16 *) gid)[1]),
+			  be16_to_cpu(((__be16 *) gid)[2]),
+			  be16_to_cpu(((__be16 *) gid)[3]),
+			  be16_to_cpu(((__be16 *) gid)[4]),
+			  be16_to_cpu(((__be16 *) gid)[5]),
+			  be16_to_cpu(((__be16 *) gid)[6]),
+			  be16_to_cpu(((__be16 *) gid)[7]));
+		err = -EINVAL;
+		goto out;
+	}
+
+	members_count = be32_to_cpu(mgm->members_count);
+	for (loc = -1, i = 0; i < members_count; ++i)
+		if (mgm->qp[i] == cpu_to_be32(qp->qpn))
+			loc = i;
+
+	if (loc == -1) {
+		mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
+		err = -EINVAL;
+		goto out;
+	}
+
+
+	mgm->members_count = cpu_to_be32(--members_count);
+	mgm->qp[loc]       = mgm->qp[i - 1];
+	mgm->qp[i - 1]     = 0;
+
+	err = mlx4_WRITE_MCG(dev, index, mailbox);
+	if (err)
+		goto out;
+
+	if (i != 1)
+		goto out;
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index) {
+			err = mlx4_READ_MCG(dev, amgm_index, mailbox);
+			if (err)
+				goto out;
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mlx4_WRITE_MCG(dev, index, mailbox);
+		if (err)
+			goto out;
+
+		if (amgm_index) {
+			if (amgm_index < dev->caps.num_mgms)
+				mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d",
+					  index, amgm_index, dev->caps.num_mgms);
+			else
+				mlx4_bitmap_free(&priv->mcg_table.bitmap,
+						 amgm_index - dev->caps.num_mgms);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mlx4_READ_MCG(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
+
+		err = mlx4_WRITE_MCG(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "entry %d had next AMGM index %d < %d",
+				  prev, index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms);
+	}
+
+out:
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
+
+int __devinit mlx4_init_mcg_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	err = mlx4_bitmap_init(&priv->mcg_table.bitmap,
+			       dev->caps.num_amgms, dev->caps.num_amgms - 1, 0);
+	if (err)
+		return err;
+
+	mutex_init(&priv->mcg_table.mutex);
+
+	return 0;
+}
+
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+}
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
new file mode 100644
index 000000000000..9befbae3d196
--- /dev/null
+++ b/drivers/net/mlx4/mlx4.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <linux/radix-tree.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#define DRV_NAME	"mlx4_core"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"0.01"
+#define DRV_RELDATE	"May 1, 2007"
+
+enum {
+	MLX4_HCR_BASE		= 0x80680,
+	MLX4_HCR_SIZE		= 0x0001c,
+	MLX4_CLR_INT_SIZE	= 0x00008
+};
+
+enum {
+	MLX4_BOARD_ID_LEN	= 64
+};
+
+enum {
+	MLX4_MGM_ENTRY_SIZE	=  0x40,
+	MLX4_QP_PER_MGM		= 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2),
+	MLX4_MTT_ENTRY_PER_SEG	= 8
+};
+
+enum {
+	MLX4_EQ_ASYNC,
+	MLX4_EQ_COMP,
+	MLX4_EQ_CATAS,
+	MLX4_NUM_EQ
+};
+
+enum {
+	MLX4_NUM_PDS		= 1 << 15
+};
+
+enum {
+	MLX4_CMPT_TYPE_QP	= 0,
+	MLX4_CMPT_TYPE_SRQ	= 1,
+	MLX4_CMPT_TYPE_CQ	= 2,
+	MLX4_CMPT_TYPE_EQ	= 3,
+	MLX4_CMPT_NUM_TYPE
+};
+
+enum {
+	MLX4_CMPT_SHIFT		= 24,
+	MLX4_NUM_CMPTS		= MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
+};
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_debug_level;
+
+#define mlx4_dbg(mdev, format, arg...)					\
+	do {								\
+		if (mlx4_debug_level)					\
+			dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
+	} while (0)
+
+#else /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_dbg(mdev, format, arg...) do { (void) mdev; } while (0)
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_err(mdev, format, arg...) \
+	dev_err(&mdev->pdev->dev, format, ## arg)
+#define mlx4_info(mdev, format, arg...) \
+	dev_info(&mdev->pdev->dev, format, ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+	dev_warn(&mdev->pdev->dev, format, ## arg)
+
+struct mlx4_bitmap {
+	u32			last;
+	u32			top;
+	u32			max;
+	u32			mask;
+	spinlock_t		lock;
+	unsigned long	       *table;
+};
+
+struct mlx4_buddy {
+	unsigned long	      **bits;
+	int			max_order;
+	spinlock_t		lock;
+};
+
+struct mlx4_icm;
+
+struct mlx4_icm_table {
+	u64			virt;
+	int			num_icm;
+	int			num_obj;
+	int			obj_size;
+	int			lowmem;
+	struct mutex		mutex;
+	struct mlx4_icm	      **icm;
+};
+
+struct mlx4_eq {
+	struct mlx4_dev	       *dev;
+	void __iomem	       *doorbell;
+	int			eqn;
+	u32			cons_index;
+	u16			irq;
+	u16			have_irq;
+	int			nent;
+	struct mlx4_buf_list   *page_list;
+	struct mlx4_mtt		mtt;
+};
+
+struct mlx4_profile {
+	int			num_qp;
+	int			rdmarc_per_qp;
+	int			num_srq;
+	int			num_cq;
+	int			num_mcg;
+	int			num_mpt;
+	int			num_mtt;
+};
+
+struct mlx4_fw {
+	u64			clr_int_base;
+	u64			catas_offset;
+	struct mlx4_icm	       *fw_icm;
+	struct mlx4_icm	       *aux_icm;
+	u32			catas_size;
+	u16			fw_pages;
+	u8			clr_int_bar;
+	u8			catas_bar;
+};
+
+struct mlx4_cmd {
+	struct pci_pool	       *pool;
+	void __iomem	       *hcr;
+	struct mutex		hcr_mutex;
+	struct semaphore	poll_sem;
+	struct semaphore	event_sem;
+	int			max_cmds;
+	spinlock_t		context_lock;
+	int			free_head;
+	struct mlx4_cmd_context *context;
+	u16			token_mask;
+	u8			use_events;
+	u8			toggle;
+};
+
+struct mlx4_uar_table {
+	struct mlx4_bitmap	bitmap;
+};
+
+struct mlx4_mr_table {
+	struct mlx4_bitmap	mpt_bitmap;
+	struct mlx4_buddy	mtt_buddy;
+	u64			mtt_base;
+	u64			mpt_base;
+	struct mlx4_icm_table	mtt_table;
+	struct mlx4_icm_table	dmpt_table;
+};
+
+struct mlx4_cq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_eq_table {
+	struct mlx4_bitmap	bitmap;
+	void __iomem	       *clr_int;
+	void __iomem	       *uar_map[(MLX4_NUM_EQ + 6) / 4];
+	u32			clr_mask;
+	struct mlx4_eq		eq[MLX4_NUM_EQ];
+	u64			icm_virt;
+	struct page	       *icm_page;
+	dma_addr_t		icm_dma;
+	struct mlx4_icm_table	cmpt_table;
+	int			have_irq;
+	u8			inta_pin;
+};
+
+struct mlx4_srq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_qp_table {
+	struct mlx4_bitmap	bitmap;
+	u32			rdmarc_base;
+	int			rdmarc_shift;
+	spinlock_t		lock;
+	struct mlx4_icm_table	qp_table;
+	struct mlx4_icm_table	auxc_table;
+	struct mlx4_icm_table	altc_table;
+	struct mlx4_icm_table	rdmarc_table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_mcg_table {
+	struct mutex		mutex;
+	struct mlx4_bitmap	bitmap;
+	struct mlx4_icm_table	table;
+};
+
+struct mlx4_catas_err {
+	u32 __iomem	       *map;
+	int			size;
+};
+
+struct mlx4_priv {
+	struct mlx4_dev		dev;
+
+	struct list_head	dev_list;
+	struct list_head	ctx_list;
+	spinlock_t		ctx_lock;
+
+	struct mlx4_fw		fw;
+	struct mlx4_cmd		cmd;
+
+	struct mlx4_bitmap	pd_bitmap;
+	struct mlx4_uar_table	uar_table;
+	struct mlx4_mr_table	mr_table;
+	struct mlx4_cq_table	cq_table;
+	struct mlx4_eq_table	eq_table;
+	struct mlx4_srq_table	srq_table;
+	struct mlx4_qp_table	qp_table;
+	struct mlx4_mcg_table	mcg_table;
+
+	struct mlx4_catas_err	catas_err;
+
+	void __iomem	       *clr_base;
+
+	struct mlx4_uar		driver_uar;
+	void __iomem	       *kar;
+	MLX4_DECLARE_DOORBELL_LOCK(doorbell_lock)
+
+	u32			rev_id;
+	char			board_id[MLX4_BOARD_ID_LEN];
+};
+
+static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
+{
+	return container_of(dev, struct mlx4_priv, dev);
+}
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved);
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
+
+int mlx4_reset(struct mlx4_dev *dev);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev);
+int mlx4_init_uar_table(struct mlx4_dev *dev);
+int mlx4_init_mr_table(struct mlx4_dev *dev);
+int mlx4_init_eq_table(struct mlx4_dev *dev);
+int mlx4_init_cq_table(struct mlx4_dev *dev);
+int mlx4_init_qp_table(struct mlx4_dev *dev);
+int mlx4_init_srq_table(struct mlx4_dev *dev);
+int mlx4_init_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_map_catas_buf(struct mlx4_dev *dev);
+void mlx4_unmap_catas_buf(struct mlx4_dev *dev);
+
+int mlx4_register_device(struct mlx4_dev *dev);
+void mlx4_unregister_device(struct mlx4_dev *dev);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_event type,
+			 int subtype, int port);
+
+struct mlx4_dev_cap;
+struct mlx4_init_hca_param;
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca);
+
+int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt);
+void mlx4_unmap_eq_icm(struct mlx4_dev *dev);
+
+int mlx4_cmd_init(struct mlx4_dev *dev);
+void mlx4_cmd_cleanup(struct mlx4_dev *dev);
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
+int mlx4_cmd_use_events(struct mlx4_dev *dev);
+void mlx4_cmd_use_polling(struct mlx4_dev *dev);
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
+
+void mlx4_handle_catas_err(struct mlx4_dev *dev);
+
+#endif /* MLX4_H */
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
new file mode 100644
index 000000000000..b33864dab179
--- /dev/null
+++ b/drivers/net/mlx4/mr.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+	__be32 flags;
+	__be32 qpn;
+	__be32 key;
+	__be32 pd;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 win_cnt;
+	u8	reserved1[3];
+	u8	mtt_rep;
+	__be64 mtt_seg;
+	__be32 mtt_sz;
+	__be32 entity_size;
+	__be32 first_byte_offset;
+} __attribute__((packed));
+
+#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_MIO	    (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
+#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
+
+#define MLX4_MTT_FLAG_PRESENT		1
+
+static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	for (o = order; o <= buddy->max_order; ++o) {
+		m = 1 << (buddy->max_order - o);
+		seg = find_first_bit(buddy->bits[o], m);
+		if (seg < m)
+			goto found;
+	}
+
+	spin_unlock(&buddy->lock);
+	return -1;
+
+ found:
+	clear_bit(seg, buddy->bits[o]);
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[o]);
+	}
+
+	spin_unlock(&buddy->lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&buddy->lock);
+
+	while (test_bit(seg ^ 1, buddy->bits[order])) {
+		clear_bit(seg ^ 1, buddy->bits[order]);
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, buddy->bits[order]);
+
+	spin_unlock(&buddy->lock);
+}
+
+static int __devinit mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+			      GFP_KERNEL);
+	if (!buddy->bits)
+		goto err_out;
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+		if (!buddy->bits[i])
+			goto err_out_free;
+		bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+
+	return 0;
+
+err_out_free:
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+
+err_out:
+	return -ENOMEM;
+}
+
+static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy)
+{
+	int i;
+
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+}
+
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	u32 seg;
+
+	seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, order);
+	if (seg == -1)
+		return -1;
+
+	if (mlx4_table_get_range(dev, &mr_table->mtt_table, seg,
+				 seg + (1 << order) - 1)) {
+		mlx4_buddy_free(&mr_table->mtt_buddy, seg, order);
+		return -1;
+	}
+
+	return seg;
+}
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt)
+{
+	int i;
+
+	if (!npages) {
+		mtt->order      = -1;
+		mtt->page_shift = MLX4_ICM_PAGE_SHIFT;
+		return 0;
+	} else
+		mtt->page_shift = page_shift;
+
+	for (mtt->order = 0, i = MLX4_MTT_ENTRY_PER_SEG; i < npages; i <<= 1)
+		++mtt->order;
+
+	mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order);
+	if (mtt->first_seg == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_init);
+
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	if (mtt->order < 0)
+		return;
+
+	mlx4_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, mtt->order);
+	mlx4_table_put_range(dev, &mr_table->mtt_table, mtt->first_seg,
+			     mtt->first_seg + (1 << mtt->order) - 1);
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
+
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	return (u64) mtt->first_seg * dev->caps.mtt_entry_sz;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
+
+static u32 hw_index_to_key(u32 ind)
+{
+	return (ind >> 24) | (ind << 8);
+}
+
+static u32 key_to_hw_index(u32 key)
+{
+	return (key << 24) | (key >> 8);
+}
+
+static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+			    !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 index;
+	int err;
+
+	index = mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+	if (index == -1) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	mr->iova       = iova;
+	mr->size       = size;
+	mr->pd	       = pd;
+	mr->access     = access;
+	mr->enabled    = 0;
+	mr->key	       = hw_index_to_key(index);
+
+	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+	if (err)
+		goto err_index;
+
+	return 0;
+
+err_index:
+	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index);
+
+err:
+	kfree(mr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
+
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	if (mr->enabled) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mr->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err)
+			mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+	}
+
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, key_to_hw_index(mr->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free);
+
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	memset(mpt_entry, 0, sizeof *mpt_entry);
+
+	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS	 |
+				       MLX4_MPT_FLAG_MIO	 |
+				       MLX4_MPT_FLAG_REGION	 |
+				       mr->access);
+	if (mr->mtt.order < 0)
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mr->key));
+	mpt_entry->pd	       = cpu_to_be32(mr->pd);
+	mpt_entry->start       = cpu_to_be64(mr->iova);
+	mpt_entry->length      = cpu_to_be64(mr->size);
+	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+	mpt_entry->mtt_seg     = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt));
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+
+	mr->enabled = 1;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_enable);
+
+static int mlx4_WRITE_MTT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int num_mtt)
+{
+	return mlx4_cmd(dev, mailbox->dma, num_mtt, 0, MLX4_CMD_WRITE_MTT,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be64 *mtt_entry;
+	int i;
+	int err = 0;
+
+	if (mtt->order < 0)
+		return -EINVAL;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	mtt_entry = mailbox->buf;
+
+	while (npages > 0) {
+		mtt_entry[0] = cpu_to_be64(mlx4_mtt_addr(dev, mtt) + start_index * 8);
+		mtt_entry[1] = 0;
+
+		for (i = 0; i < npages && i < MLX4_MAILBOX_SIZE / 8 - 2; ++i)
+			mtt_entry[i + 2] = cpu_to_be64(page_list[i] |
+						       MLX4_MTT_FLAG_PRESENT);
+
+		/*
+		 * If we have an odd number of entries to write, add
+		 * one more dummy entry for firmware efficiency.
+		 */
+		if (i & 1)
+			mtt_entry[i + 2] = 0;
+
+		err = mlx4_WRITE_MTT(dev, mailbox, (i + 1) & ~1);
+		if (err)
+			goto out;
+
+		npages      -= i;
+		start_index += i;
+		page_list   += i;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_write_mtt);
+
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf)
+{
+	u64 *page_list;
+	int err;
+	int i;
+
+	page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->npages; ++i)
+		if (buf->nbufs == 1)
+			page_list[i] = buf->u.direct.map + (i << buf->page_shift);
+		else
+			page_list[i] = buf->u.page_list[i].map;
+
+	err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
+
+	kfree(page_list);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
+
+int __devinit mlx4_init_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	int err;
+
+	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
+			       ~0, dev->caps.reserved_mrws);
+	if (err)
+		return err;
+
+	err = mlx4_buddy_init(&mr_table->mtt_buddy,
+			      ilog2(dev->caps.num_mtt_segs));
+	if (err)
+		goto err_buddy;
+
+	if (dev->caps.reserved_mtts) {
+		if (mlx4_alloc_mtt_range(dev, ilog2(dev->caps.reserved_mtts)) == -1) {
+			mlx4_warn(dev, "MTT table of order %d is too small.\n",
+				  mr_table->mtt_buddy.max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
+
+	return 0;
+
+err_reserve_mtts:
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+
+err_buddy:
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+
+	return err;
+}
+
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+}
diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c
new file mode 100644
index 000000000000..23dea1ee7750
--- /dev/null
+++ b/drivers/net/mlx4/pd.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <asm/page.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+	if (*pdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int __devinit mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+				(1 << 24) - 1, dev->caps.reserved_pds);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+				dev->caps.num_uars, dev->caps.num_uars - 1,
+				max(128, dev->caps.reserved_uars));
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c
new file mode 100644
index 000000000000..9ca42b213d54
--- /dev/null
+++ b/drivers/net/mlx4/profile.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_RES_QP,
+	MLX4_RES_RDMARC,
+	MLX4_RES_ALTC,
+	MLX4_RES_AUXC,
+	MLX4_RES_SRQ,
+	MLX4_RES_CQ,
+	MLX4_RES_EQ,
+	MLX4_RES_DMPT,
+	MLX4_RES_CMPT,
+	MLX4_RES_MTT,
+	MLX4_RES_MCG,
+	MLX4_RES_NUM
+};
+
+static const char *res_name[] = {
+	[MLX4_RES_QP]		= "QP",
+	[MLX4_RES_RDMARC]	= "RDMARC",
+	[MLX4_RES_ALTC]		= "ALTC",
+	[MLX4_RES_AUXC]		= "AUXC",
+	[MLX4_RES_SRQ]		= "SRQ",
+	[MLX4_RES_CQ]		= "CQ",
+	[MLX4_RES_EQ]		= "EQ",
+	[MLX4_RES_DMPT]		= "DMPT",
+	[MLX4_RES_CMPT]		= "CMPT",
+	[MLX4_RES_MTT]		= "MTT",
+	[MLX4_RES_MCG]		= "MCG",
+};
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 total_size = 0;
+	struct mlx4_resource *profile;
+	struct mlx4_resource tmp;
+	int i, j;
+
+	profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	profile[MLX4_RES_QP].size     = dev_cap->qpc_entry_sz;
+	profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz;
+	profile[MLX4_RES_ALTC].size   = dev_cap->altc_entry_sz;
+	profile[MLX4_RES_AUXC].size   = dev_cap->aux_entry_sz;
+	profile[MLX4_RES_SRQ].size    = dev_cap->srq_entry_sz;
+	profile[MLX4_RES_CQ].size     = dev_cap->cqc_entry_sz;
+	profile[MLX4_RES_EQ].size     = dev_cap->eqc_entry_sz;
+	profile[MLX4_RES_DMPT].size   = dev_cap->dmpt_entry_sz;
+	profile[MLX4_RES_CMPT].size   = dev_cap->cmpt_entry_sz;
+	profile[MLX4_RES_MTT].size    = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz;
+	profile[MLX4_RES_MCG].size    = MLX4_MGM_ENTRY_SIZE;
+
+	profile[MLX4_RES_QP].num      = request->num_qp;
+	profile[MLX4_RES_RDMARC].num  = request->num_qp * request->rdmarc_per_qp;
+	profile[MLX4_RES_ALTC].num    = request->num_qp;
+	profile[MLX4_RES_AUXC].num    = request->num_qp;
+	profile[MLX4_RES_SRQ].num     = request->num_srq;
+	profile[MLX4_RES_CQ].num      = request->num_cq;
+	profile[MLX4_RES_EQ].num      = MLX4_NUM_EQ + dev_cap->reserved_eqs;
+	profile[MLX4_RES_DMPT].num    = request->num_mpt;
+	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
+	profile[MLX4_RES_MTT].num     = request->num_mtt;
+	profile[MLX4_RES_MCG].num     = request->num_mcg;
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].num      = roundup_pow_of_two(profile[i].num);
+		profile[i].log_num  = ilog2(profile[i].num);
+		profile[i].size    *= profile[i].num;
+		profile[i].size     = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MLX4_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp	       = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = total_size;
+			total_size	+= profile[i].size;
+		}
+
+		if (total_size > dev_cap->max_icm_sz) {
+			mlx4_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't fit in 0x%llx bytes of context memory.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) dev_cap->max_icm_sz);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mlx4_dbg(dev, "  profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, "
+				  "size 0x%10llx\n",
+				 i, res_name[profile[i].type], profile[i].log_num,
+				 (unsigned long long) profile[i].start,
+				 (unsigned long long) profile[i].size);
+	}
+
+	mlx4_dbg(dev, "HCA context memory: reserving %d KB\n",
+		 (int) (total_size >> 10));
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MLX4_RES_QP:
+			dev->caps.num_qps     = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MLX4_RES_RDMARC:
+			for (priv->qp_table.rdmarc_shift = 0;
+			     request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num;
+			     ++priv->qp_table.rdmarc_shift)
+				; /* nothing */
+			dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift;
+			priv->qp_table.rdmarc_base   = (u32) profile[i].start;
+			init_hca->rdmarc_base	     = profile[i].start;
+			init_hca->log_rd_per_qp	     = priv->qp_table.rdmarc_shift;
+			break;
+		case MLX4_RES_ALTC:
+			init_hca->altc_base = profile[i].start;
+			break;
+		case MLX4_RES_AUXC:
+			init_hca->auxc_base = profile[i].start;
+			break;
+		case MLX4_RES_SRQ:
+			dev->caps.num_srqs     = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MLX4_RES_CQ:
+			dev->caps.num_cqs     = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MLX4_RES_EQ:
+			dev->caps.num_eqs     = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MLX4_RES_DMPT:
+			dev->caps.num_mpts	= profile[i].num;
+			priv->mr_table.mpt_base = profile[i].start;
+			init_hca->dmpt_base	= profile[i].start;
+			init_hca->log_mpt_sz	= profile[i].log_num;
+			break;
+		case MLX4_RES_CMPT:
+			init_hca->cmpt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MTT:
+			dev->caps.num_mtt_segs	 = profile[i].num;
+			priv->mr_table.mtt_base	 = profile[i].start;
+			init_hca->mtt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MCG:
+			dev->caps.num_mgms	  = profile[i].num >> 1;
+			dev->caps.num_amgms	  = profile[i].num >> 1;
+			init_hca->mc_base	  = profile[i].start;
+			init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE);
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->log_mc_hash_sz  = profile[i].log_num - 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->caps.num_pds = MLX4_NUM_PDS;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
new file mode 100644
index 000000000000..7f8b7d55b6e1
--- /dev/null
+++ b/drivers/net/mlx4/qp.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_qp *qp;
+
+	spin_lock(&qp_table->lock);
+
+	qp = __mlx4_qp_lookup(dev, qpn);
+	if (qp)
+		atomic_inc(&qp->refcount);
+
+	spin_unlock(&qp_table->lock);
+
+	if (!qp) {
+		mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn);
+		return;
+	}
+
+	qp->event(qp, event_type);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+}
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp)
+{
+	static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
+		[MLX4_QP_STATE_RST] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_RST2INIT_QP,
+		},
+		[MLX4_QP_STATE_INIT]  = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_INIT2INIT_QP,
+			[MLX4_QP_STATE_RTR]	= MLX4_CMD_INIT2RTR_QP,
+		},
+		[MLX4_QP_STATE_RTR]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTR2RTS_QP,
+		},
+		[MLX4_QP_STATE_RTS]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTS2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_RTS2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQD] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQD2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_SQD2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQER] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQERR2RTS_QP,
+		},
+		[MLX4_QP_STATE_ERR] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+		}
+	};
+
+	struct mlx4_cmd_mailbox *mailbox;
+	int ret = 0;
+
+	if (cur_state < 0 || cur_state >= MLX4_QP_NUM_STATE ||
+	    new_state < 0 || cur_state >= MLX4_QP_NUM_STATE ||
+	    !op[cur_state][new_state])
+		return -EINVAL;
+
+	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP)
+		return mlx4_cmd(dev, 0, qp->qpn, 2,
+				MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A);
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) {
+		u64 mtt_addr = mlx4_mtt_addr(dev, mtt);
+		context->mtt_base_addr_h = mtt_addr >> 32;
+		context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+		context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+	}
+
+	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
+	memcpy(mailbox->buf + 8, context, sizeof *context);
+
+	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
+		cpu_to_be32(qp->qpn);
+
+	ret = mlx4_cmd(dev, mailbox->dma, qp->qpn | (!!sqd_event << 31),
+		       new_state == MLX4_QP_STATE_RST ? 2 : 0,
+		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_modify);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	if (sqpn)
+		qp->qpn = sqpn;
+	else {
+		qp->qpn = mlx4_bitmap_alloc(&qp_table->bitmap);
+		if (qp->qpn == -1)
+			return -ENOMEM;
+	}
+
+	err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &qp_table->auxc_table, qp->qpn);
+	if (err)
+		goto err_put_qp;
+
+	err = mlx4_table_get(dev, &qp_table->altc_table, qp->qpn);
+	if (err)
+		goto err_put_auxc;
+
+	err = mlx4_table_get(dev, &qp_table->rdmarc_table, qp->qpn);
+	if (err)
+		goto err_put_altc;
+
+	err = mlx4_table_get(dev, &qp_table->cmpt_table, qp->qpn);
+	if (err)
+		goto err_put_rdmarc;
+
+	spin_lock_irq(&qp_table->lock);
+	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp);
+	spin_unlock_irq(&qp_table->lock);
+	if (err)
+		goto err_put_cmpt;
+
+	atomic_set(&qp->refcount, 1);
+	init_completion(&qp->free);
+
+	return 0;
+
+err_put_cmpt:
+	mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
+
+err_put_rdmarc:
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
+
+err_put_altc:
+	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
+
+err_put_auxc:
+	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
+
+err_put_qp:
+	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+
+err_out:
+	if (!sqpn)
+		mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_table->lock, flags);
+	radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1));
+	spin_unlock_irqrestore(&qp_table->lock, flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+	wait_for_completion(&qp->free);
+
+	mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
+	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+
+	mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_free);
+
+static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
+{
+	return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+int __devinit mlx4_init_qp_table(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	int err;
+
+	spin_lock_init(&qp_table->lock);
+	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * block of special QPs must be aligned to a multiple of 8, so
+	 * round up.
+	 */
+	dev->caps.sqp_start = ALIGN(dev->caps.reserved_qps, 8);
+	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
+			       (1 << 24) - 1, dev->caps.sqp_start + 8);
+	if (err)
+		return err;
+
+	return mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start);
+}
+
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
+{
+	mlx4_CONF_SPECIAL_QP(dev, 0);
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
+}
diff --git a/drivers/net/mlx4/reset.c b/drivers/net/mlx4/reset.c
new file mode 100644
index 000000000000..51eef8492e93
--- /dev/null
+++ b/drivers/net/mlx4/reset.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "mlx4.h"
+
+int mlx4_reset(struct mlx4_dev *dev)
+{
+	void __iomem *reset;
+	u32 *hca_header = NULL;
+	int pcie_cap;
+	u16 devctl;
+	u16 linkctl;
+	u16 vendor;
+	unsigned long end;
+	u32 sem;
+	int i;
+	int err = 0;
+
+#define MLX4_RESET_BASE		0xf0000
+#define MLX4_RESET_SIZE		  0x400
+#define MLX4_SEM_OFFSET		  0x3fc
+#define MLX4_RESET_OFFSET	   0x10
+#define MLX4_RESET_VALUE	swab32(1)
+
+#define MLX4_SEM_TIMEOUT_JIFFIES	(10 * HZ)
+#define MLX4_RESET_TIMEOUT_JIFFIES	(2 * HZ)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 */
+
+	/* Do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	pcie_cap = pci_find_capability(dev->pdev, PCI_CAP_ID_EXP);
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE,
+			MLX4_RESET_SIZE);
+	if (!reset) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't map HCA reset register, aborting.\n");
+		goto out;
+	}
+
+	/* grab HW semaphore to lock out flash updates */
+	end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES;
+	do {
+		sem = readl(reset + MLX4_SEM_OFFSET);
+		if (!sem)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (sem) {
+		mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n");
+		err = -EAGAIN;
+		iounmap(reset);
+		goto out;
+	}
+
+	/* actually hit reset */
+	writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
+	iounmap(reset);
+
+	end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
+	do {
+		if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) &&
+		    vendor != 0xffff)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (vendor == 0xffff) {
+		err = -ENODEV;
+		mlx4_err(dev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+	/* Now restore the PCI headers */
+	if (pcie_cap) {
+		devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
+		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_DEVCTL,
+					   devctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express "
+				 "Device Control register, aborting.\n");
+			goto out;
+		}
+		linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
+		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_LNKCTL,
+					   linkctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express "
+				 "Link control register, aborting.\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(dev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mlx4_err(dev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c
new file mode 100644
index 000000000000..2134f83aed87
--- /dev/null
+++ b/drivers/net/mlx4/srq.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_srq_context {
+	__be32			state_logsize_srqn;
+	u8			logstride;
+	u8			reserved1[3];
+	u8			pg_offset;
+	u8			reserved2[3];
+	u32			reserved3;
+	u8			log_page_size;
+	u8			reserved4[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			pd;
+	__be16			limit_watermark;
+	__be16			wqe_cnt;
+	u16			reserved5;
+	__be16			wqe_counter;
+	u32			reserved6;
+	__be64			db_rec_addr;
+};
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+
+	spin_lock(&srq_table->lock);
+
+	srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1));
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&srq_table->lock);
+
+	if (!srq) {
+		mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, srq_num, 0, MLX4_CMD_SW2HW_SRQ,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
+{
+	return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
+		   u64 db_rec, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	u64 mtt_addr;
+	int err;
+
+	srq->srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
+	if (srq->srqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &srq_table->table, srq->srqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &srq_table->cmpt_table, srq->srqn);
+	if (err)
+		goto err_put;
+
+	spin_lock_irq(&srq_table->lock);
+	err = radix_tree_insert(&srq_table->tree, srq->srqn, srq);
+	spin_unlock_irq(&srq_table->lock);
+	if (err)
+		goto err_cmpt_put;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	srq_context = mailbox->buf;
+	memset(srq_context, 0, sizeof *srq_context);
+
+	srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
+						      srq->srqn);
+	srq_context->logstride          = srq->wqe_shift - 4;
+	srq_context->log_page_size      = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	srq_context->mtt_base_addr_h    = mtt_addr >> 32;
+	srq_context->mtt_base_addr_l    = cpu_to_be32(mtt_addr & 0xffffffff);
+	srq_context->pd			= cpu_to_be32(pdn);
+	srq_context->db_rec_addr        = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	return 0;
+
+err_radix:
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+err_cmpt_put:
+	mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn);
+
+err_put:
+	mlx4_table_put(dev, &srq_table->table, srq->srqn);
+
+err_out:
+	mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
+
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn);
+
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	mlx4_table_put(dev, &srq_table->table, srq->srqn);
+	mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_free);
+
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark)
+{
+	return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_arm);
+
+int __devinit mlx4_init_srq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	spin_lock_init(&srq_table->lock);
+	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
+
+	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
+			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
+}
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
new file mode 100644
index 000000000000..4fb552d12f7a
--- /dev/null
+++ b/include/linux/mlx4/cmd.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CMD_H
+#define MLX4_CMD_H
+
+#include <linux/dma-mapping.h>
+
+enum {
+	/* initialization and general commands */
+	MLX4_CMD_SYS_EN		 = 0x1,
+	MLX4_CMD_SYS_DIS	 = 0x2,
+	MLX4_CMD_MAP_FA		 = 0xfff,
+	MLX4_CMD_UNMAP_FA	 = 0xffe,
+	MLX4_CMD_RUN_FW		 = 0xff6,
+	MLX4_CMD_MOD_STAT_CFG	 = 0x34,
+	MLX4_CMD_QUERY_DEV_CAP	 = 0x3,
+	MLX4_CMD_QUERY_FW	 = 0x4,
+	MLX4_CMD_ENABLE_LAM	 = 0xff8,
+	MLX4_CMD_DISABLE_LAM	 = 0xff7,
+	MLX4_CMD_QUERY_DDR	 = 0x5,
+	MLX4_CMD_QUERY_ADAPTER	 = 0x6,
+	MLX4_CMD_INIT_HCA	 = 0x7,
+	MLX4_CMD_CLOSE_HCA	 = 0x8,
+	MLX4_CMD_INIT_PORT	 = 0x9,
+	MLX4_CMD_CLOSE_PORT	 = 0xa,
+	MLX4_CMD_QUERY_HCA	 = 0xb,
+	MLX4_CMD_SET_PORT	 = 0xc,
+	MLX4_CMD_ACCESS_DDR	 = 0x2e,
+	MLX4_CMD_MAP_ICM	 = 0xffa,
+	MLX4_CMD_UNMAP_ICM	 = 0xff9,
+	MLX4_CMD_MAP_ICM_AUX	 = 0xffc,
+	MLX4_CMD_UNMAP_ICM_AUX	 = 0xffb,
+	MLX4_CMD_SET_ICM_SIZE	 = 0xffd,
+
+	/* TPT commands */
+	MLX4_CMD_SW2HW_MPT	 = 0xd,
+	MLX4_CMD_QUERY_MPT	 = 0xe,
+	MLX4_CMD_HW2SW_MPT	 = 0xf,
+	MLX4_CMD_READ_MTT	 = 0x10,
+	MLX4_CMD_WRITE_MTT	 = 0x11,
+	MLX4_CMD_SYNC_TPT	 = 0x2f,
+
+	/* EQ commands */
+	MLX4_CMD_MAP_EQ		 = 0x12,
+	MLX4_CMD_SW2HW_EQ	 = 0x13,
+	MLX4_CMD_HW2SW_EQ	 = 0x14,
+	MLX4_CMD_QUERY_EQ	 = 0x15,
+
+	/* CQ commands */
+	MLX4_CMD_SW2HW_CQ	 = 0x16,
+	MLX4_CMD_HW2SW_CQ	 = 0x17,
+	MLX4_CMD_QUERY_CQ	 = 0x18,
+	MLX4_CMD_RESIZE_CQ	 = 0x2c,
+
+	/* SRQ commands */
+	MLX4_CMD_SW2HW_SRQ	 = 0x35,
+	MLX4_CMD_HW2SW_SRQ	 = 0x36,
+	MLX4_CMD_QUERY_SRQ	 = 0x37,
+	MLX4_CMD_ARM_SRQ	 = 0x40,
+
+	/* QP/EE commands */
+	MLX4_CMD_RST2INIT_QP	 = 0x19,
+	MLX4_CMD_INIT2RTR_QP	 = 0x1a,
+	MLX4_CMD_RTR2RTS_QP	 = 0x1b,
+	MLX4_CMD_RTS2RTS_QP	 = 0x1c,
+	MLX4_CMD_SQERR2RTS_QP	 = 0x1d,
+	MLX4_CMD_2ERR_QP	 = 0x1e,
+	MLX4_CMD_RTS2SQD_QP	 = 0x1f,
+	MLX4_CMD_SQD2SQD_QP	 = 0x38,
+	MLX4_CMD_SQD2RTS_QP	 = 0x20,
+	MLX4_CMD_2RST_QP	 = 0x21,
+	MLX4_CMD_QUERY_QP	 = 0x22,
+	MLX4_CMD_INIT2INIT_QP	 = 0x2d,
+	MLX4_CMD_SUSPEND_QP	 = 0x32,
+	MLX4_CMD_UNSUSPEND_QP	 = 0x33,
+	/* special QP and management commands */
+	MLX4_CMD_CONF_SPECIAL_QP = 0x23,
+	MLX4_CMD_MAD_IFC	 = 0x24,
+
+	/* multicast commands */
+	MLX4_CMD_READ_MCG	 = 0x25,
+	MLX4_CMD_WRITE_MCG	 = 0x26,
+	MLX4_CMD_MGID_HASH	 = 0x27,
+
+	/* miscellaneous commands */
+	MLX4_CMD_DIAG_RPRT	 = 0x30,
+	MLX4_CMD_NOP		 = 0x31,
+
+	/* debug commands */
+	MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
+	MLX4_CMD_SET_DEBUG_MSG	 = 0x2b,
+};
+
+enum {
+	MLX4_CMD_TIME_CLASS_A	= 10000,
+	MLX4_CMD_TIME_CLASS_B	= 10000,
+	MLX4_CMD_TIME_CLASS_C	= 10000,
+};
+
+enum {
+	MLX4_MAILBOX_SIZE	=  4096
+};
+
+struct mlx4_dev;
+
+struct mlx4_cmd_mailbox {
+	void		       *buf;
+	dma_addr_t		dma;
+};
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout);
+
+/* Invoke a command with no output parameter */
+static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier,
+			   u8 op_modifier, u16 op, unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+/* Invoke a command with an output mailbox */
+static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
+
+#endif /* MLX4_CMD_H */
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
new file mode 100644
index 000000000000..0181e0a57cbf
--- /dev/null
+++ b/include/linux/mlx4/cq.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CQ_H
+#define MLX4_CQ_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+struct mlx4_cqe {
+	__be32			my_qpn;
+	__be32			immed_rss_invalid;
+	__be32			g_mlpath_rqpn;
+	u8			sl;
+	u8			reserved1;
+	__be16			rlid;
+	u32			reserved2;
+	__be32			byte_cnt;
+	__be16			wqe_index;
+	__be16			checksum;
+	u8			reserved3[3];
+	u8			owner_sr_opcode;
+};
+
+struct mlx4_err_cqe {
+	__be32			my_qpn;
+	u32			reserved1[5];
+	__be16			wqe_index;
+	u8			vendor_err_syndrome;
+	u8			syndrome;
+	u8			reserved2[3];
+	u8			owner_sr_opcode;
+};
+
+enum {
+	MLX4_CQE_OWNER_MASK	= 0x80,
+	MLX4_CQE_IS_SEND_MASK	= 0x40,
+	MLX4_CQE_OPCODE_MASK	= 0x1f
+};
+
+enum {
+	MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR		= 0x01,
+	MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR		= 0x02,
+	MLX4_CQE_SYNDROME_LOCAL_PROT_ERR		= 0x04,
+	MLX4_CQE_SYNDROME_WR_FLUSH_ERR			= 0x05,
+	MLX4_CQE_SYNDROME_MW_BIND_ERR			= 0x06,
+	MLX4_CQE_SYNDROME_BAD_RESP_ERR			= 0x10,
+	MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR		= 0x11,
+	MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR		= 0x12,
+	MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR		= 0x13,
+	MLX4_CQE_SYNDROME_REMOTE_OP_ERR			= 0x14,
+	MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR	= 0x15,
+	MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR		= 0x16,
+	MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR		= 0x22,
+};
+
+static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd,
+			       void __iomem *uar_page,
+			       spinlock_t *doorbell_lock)
+{
+	__be32 doorbell[2];
+	u32 sn;
+	u32 ci;
+
+	sn = cq->arm_sn & 3;
+	ci = cq->cons_index & 0xffffff;
+
+	*cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci);
+
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn);
+	doorbell[1] = cpu_to_be32(ci);
+
+	mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock);
+}
+
+static inline void mlx4_cq_set_ci(struct mlx4_cq *cq)
+{
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);
+}
+
+enum {
+	MLX4_CQ_DB_REQ_NOT_SOL		= 1 << 24,
+	MLX4_CQ_DB_REQ_NOT		= 2 << 24
+};
+
+#endif /* MLX4_CQ_H */
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
new file mode 100644
index 000000000000..8c5f8fd86841
--- /dev/null
+++ b/include/linux/mlx4/device.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DEVICE_H
+#define MLX4_DEVICE_H
+
+#include <linux/pci.h>
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+
+#include <asm/atomic.h>
+
+enum {
+	MLX4_FLAG_MSI_X		= 1 << 0,
+};
+
+enum {
+	MLX4_MAX_PORTS		= 2
+};
+
+enum {
+	MLX4_DEV_CAP_FLAG_RC		= 1 <<  0,
+	MLX4_DEV_CAP_FLAG_UC		= 1 <<  1,
+	MLX4_DEV_CAP_FLAG_UD		= 1 <<  2,
+	MLX4_DEV_CAP_FLAG_SRQ		= 1 <<  6,
+	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
+	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
+	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
+	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
+	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
+	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
+	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1 << 19,
+	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1 << 20,
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21
+};
+
+enum mlx4_event {
+	MLX4_EVENT_TYPE_COMP		   = 0x00,
+	MLX4_EVENT_TYPE_PATH_MIG	   = 0x01,
+	MLX4_EVENT_TYPE_COMM_EST	   = 0x02,
+	MLX4_EVENT_TYPE_SQ_DRAINED	   = 0x03,
+	MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE	   = 0x13,
+	MLX4_EVENT_TYPE_SRQ_LIMIT	   = 0x14,
+	MLX4_EVENT_TYPE_CQ_ERROR	   = 0x04,
+	MLX4_EVENT_TYPE_WQ_CATAS_ERROR	   = 0x05,
+	MLX4_EVENT_TYPE_EEC_CATAS_ERROR	   = 0x06,
+	MLX4_EVENT_TYPE_PATH_MIG_FAILED	   = 0x07,
+	MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MLX4_EVENT_TYPE_WQ_ACCESS_ERROR	   = 0x11,
+	MLX4_EVENT_TYPE_SRQ_CATAS_ERROR	   = 0x12,
+	MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MLX4_EVENT_TYPE_PORT_CHANGE	   = 0x09,
+	MLX4_EVENT_TYPE_EQ_OVERFLOW	   = 0x0f,
+	MLX4_EVENT_TYPE_ECC_DETECT	   = 0x0e,
+	MLX4_EVENT_TYPE_CMD		   = 0x0a
+};
+
+enum {
+	MLX4_PORT_CHANGE_SUBTYPE_DOWN	= 1,
+	MLX4_PORT_CHANGE_SUBTYPE_ACTIVE	= 4
+};
+
+enum {
+	MLX4_PERM_LOCAL_READ	= 1 << 10,
+	MLX4_PERM_LOCAL_WRITE	= 1 << 11,
+	MLX4_PERM_REMOTE_READ	= 1 << 12,
+	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
+	MLX4_PERM_ATOMIC	= 1 << 14
+};
+
+enum {
+	MLX4_OPCODE_NOP			= 0x00,
+	MLX4_OPCODE_SEND_INVAL		= 0x01,
+	MLX4_OPCODE_RDMA_WRITE		= 0x08,
+	MLX4_OPCODE_RDMA_WRITE_IMM	= 0x09,
+	MLX4_OPCODE_SEND		= 0x0a,
+	MLX4_OPCODE_SEND_IMM		= 0x0b,
+	MLX4_OPCODE_LSO			= 0x0e,
+	MLX4_OPCODE_RDMA_READ		= 0x10,
+	MLX4_OPCODE_ATOMIC_CS		= 0x11,
+	MLX4_OPCODE_ATOMIC_FA		= 0x12,
+	MLX4_OPCODE_ATOMIC_MASK_CS	= 0x14,
+	MLX4_OPCODE_ATOMIC_MASK_FA	= 0x15,
+	MLX4_OPCODE_BIND_MW		= 0x18,
+	MLX4_OPCODE_FMR			= 0x19,
+	MLX4_OPCODE_LOCAL_INVAL		= 0x1b,
+	MLX4_OPCODE_CONFIG_CMD		= 0x1f,
+
+	MLX4_RECV_OPCODE_RDMA_WRITE_IMM	= 0x00,
+	MLX4_RECV_OPCODE_SEND		= 0x01,
+	MLX4_RECV_OPCODE_SEND_IMM	= 0x02,
+	MLX4_RECV_OPCODE_SEND_INVAL	= 0x03,
+
+	MLX4_CQE_OPCODE_ERROR		= 0x1e,
+	MLX4_CQE_OPCODE_RESIZE		= 0x16,
+};
+
+enum {
+	MLX4_STAT_RATE_OFFSET	= 5
+};
+
+struct mlx4_caps {
+	u64			fw_ver;
+	int			num_ports;
+	int			vl_cap;
+	int			mtu_cap;
+	int			gid_table_len;
+	int			pkey_table_len;
+	int			local_ca_ack_delay;
+	int			num_uars;
+	int			bf_reg_size;
+	int			bf_regs_per_page;
+	int			max_sq_sg;
+	int			max_rq_sg;
+	int			num_qps;
+	int			max_wqes;
+	int			max_sq_desc_sz;
+	int			max_rq_desc_sz;
+	int			max_qp_init_rdma;
+	int			max_qp_dest_rdma;
+	int			reserved_qps;
+	int			sqp_start;
+	int			num_srqs;
+	int			max_srq_wqes;
+	int			max_srq_sge;
+	int			reserved_srqs;
+	int			num_cqs;
+	int			max_cqes;
+	int			reserved_cqs;
+	int			num_eqs;
+	int			reserved_eqs;
+	int			num_mpts;
+	int			num_mtt_segs;
+	int			fmr_reserved_mtts;
+	int			reserved_mtts;
+	int			reserved_mrws;
+	int			reserved_uars;
+	int			num_mgms;
+	int			num_amgms;
+	int			reserved_mcgs;
+	int			num_qp_per_mgm;
+	int			num_pds;
+	int			reserved_pds;
+	int			mtt_entry_sz;
+	u32			page_size_cap;
+	u32			flags;
+	u16			stat_rate_support;
+	u8			port_width_cap;
+};
+
+struct mlx4_buf_list {
+	void		       *buf;
+	dma_addr_t		map;
+};
+
+struct mlx4_buf {
+	union {
+		struct mlx4_buf_list	direct;
+		struct mlx4_buf_list   *page_list;
+	} u;
+	int			nbufs;
+	int			npages;
+	int			page_shift;
+};
+
+struct mlx4_mtt {
+	u32			first_seg;
+	int			order;
+	int			page_shift;
+};
+
+struct mlx4_mr {
+	struct mlx4_mtt		mtt;
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+	u32			access;
+	int			enabled;
+};
+
+struct mlx4_uar {
+	unsigned long		pfn;
+	int			index;
+};
+
+struct mlx4_cq {
+	void (*comp)		(struct mlx4_cq *);
+	void (*event)		(struct mlx4_cq *, enum mlx4_event);
+
+	struct mlx4_uar	       *uar;
+
+	u32			cons_index;
+
+	__be32		       *set_ci_db;
+	__be32		       *arm_db;
+	int			arm_sn;
+
+	int			cqn;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_qp {
+	void (*event)		(struct mlx4_qp *, enum mlx4_event);
+
+	int			qpn;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_srq {
+	void (*event)		(struct mlx4_srq *, enum mlx4_event);
+
+	int			srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_av {
+	__be32			port_pd;
+	u8			reserved1;
+	u8			g_slid;
+	__be16			dlid;
+	u8			reserved2;
+	u8			gid_index;
+	u8			stat_rate;
+	u8			hop_limit;
+	__be32			sl_tclass_flowlabel;
+	u8			dgid[16];
+};
+
+struct mlx4_dev {
+	struct pci_dev	       *pdev;
+	unsigned long		flags;
+	struct mlx4_caps	caps;
+	struct radix_tree_root	qp_table_tree;
+};
+
+struct mlx4_init_port_param {
+	int			set_guid0;
+	int			set_node_guid;
+	int			set_si_guid;
+	u16			mtu;
+	int			port_width_cap;
+	u16			vl_cap;
+	u16			max_gid;
+	u16			max_pkey;
+	u64			guid0;
+	u64			node_guid;
+	u64			si_guid;
+};
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf);
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar);
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar);
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt);
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr);
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list);
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf);
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq);
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp);
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
+		   u64 db_rec, struct mlx4_srq *srq);
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, struct mlx4_init_port_param *param, int port);
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
+
+#endif /* MLX4_DEVICE_H */
diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h
new file mode 100644
index 000000000000..3f2da442d7cb
--- /dev/null
+++ b/include/linux/mlx4/doorbell.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DOORBELL_H
+#define MLX4_DOORBELL_H
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+#define MLX4_SEND_DOORBELL    0x14
+#define MLX4_CQ_DOORBELL      0x20
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name)
+#define MLX4_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mlx4_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writeq((__force u64) val, dest);
+}
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX4_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mlx4_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writel(((__force u32 *) &val)[0], dest);
+	__raw_writel(((__force u32 *) &val)[1], dest + 4);
+}
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel((__force u32) val[0], dest);
+	__raw_writel((__force u32) val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX4_DOORBELL_H */
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
new file mode 100644
index 000000000000..1b835ca49df1
--- /dev/null
+++ b/include/linux/mlx4/driver.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DRIVER_H
+#define MLX4_DRIVER_H
+
+#include <linux/device.h>
+
+struct mlx4_dev;
+
+enum mlx4_dev_event {
+	MLX4_DEV_EVENT_CATASTROPHIC_ERROR,
+	MLX4_DEV_EVENT_PORT_UP,
+	MLX4_DEV_EVENT_PORT_DOWN,
+	MLX4_DEV_EVENT_PORT_REINIT,
+};
+
+struct mlx4_interface {
+	void *			(*add)	 (struct mlx4_dev *dev);
+	void			(*remove)(struct mlx4_dev *dev, void *context);
+	void			(*event) (struct mlx4_dev *dev, void *context,
+					  enum mlx4_dev_event event, int subtype,
+					  int port);
+	struct list_head	list;
+};
+
+int mlx4_register_interface(struct mlx4_interface *intf);
+void mlx4_unregister_interface(struct mlx4_interface *intf);
+
+#endif /* MLX4_DRIVER_H */
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
new file mode 100644
index 000000000000..9eeb61adf6a3
--- /dev/null
+++ b/include/linux/mlx4/qp.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_QP_H
+#define MLX4_QP_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+
+#define MLX4_INVALID_LKEY	0x100
+
+enum mlx4_qp_optpar {
+	MLX4_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
+	MLX4_QP_OPTPAR_RRE			= 1 << 1,
+	MLX4_QP_OPTPAR_RAE			= 1 << 2,
+	MLX4_QP_OPTPAR_RWE			= 1 << 3,
+	MLX4_QP_OPTPAR_PKEY_INDEX		= 1 << 4,
+	MLX4_QP_OPTPAR_Q_KEY			= 1 << 5,
+	MLX4_QP_OPTPAR_RNR_TIMEOUT		= 1 << 6,
+	MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH	= 1 << 7,
+	MLX4_QP_OPTPAR_SRA_MAX			= 1 << 8,
+	MLX4_QP_OPTPAR_RRA_MAX			= 1 << 9,
+	MLX4_QP_OPTPAR_PM_STATE			= 1 << 10,
+	MLX4_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
+	MLX4_QP_OPTPAR_RNR_RETRY		= 1 << 13,
+	MLX4_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
+	MLX4_QP_OPTPAR_SCHED_QUEUE		= 1 << 16
+};
+
+enum mlx4_qp_state {
+	MLX4_QP_STATE_RST			= 0,
+	MLX4_QP_STATE_INIT			= 1,
+	MLX4_QP_STATE_RTR			= 2,
+	MLX4_QP_STATE_RTS			= 3,
+	MLX4_QP_STATE_SQER			= 4,
+	MLX4_QP_STATE_SQD			= 5,
+	MLX4_QP_STATE_ERR			= 6,
+	MLX4_QP_STATE_SQ_DRAINING		= 7,
+	MLX4_QP_NUM_STATE
+};
+
+enum {
+	MLX4_QP_ST_RC				= 0x0,
+	MLX4_QP_ST_UC				= 0x1,
+	MLX4_QP_ST_RD				= 0x2,
+	MLX4_QP_ST_UD				= 0x3,
+	MLX4_QP_ST_MLX				= 0x7
+};
+
+enum {
+	MLX4_QP_PM_MIGRATED			= 0x3,
+	MLX4_QP_PM_ARMED			= 0x0,
+	MLX4_QP_PM_REARM			= 0x1
+};
+
+enum {
+	/* params1 */
+	MLX4_QP_BIT_SRE				= 1 << 15,
+	MLX4_QP_BIT_SWE				= 1 << 14,
+	MLX4_QP_BIT_SAE				= 1 << 13,
+	/* params2 */
+	MLX4_QP_BIT_RRE				= 1 << 15,
+	MLX4_QP_BIT_RWE				= 1 << 14,
+	MLX4_QP_BIT_RAE				= 1 << 13,
+	MLX4_QP_BIT_RIC				= 1 <<	4,
+};
+
+struct mlx4_qp_path {
+	u8			fl;
+	u8			reserved1[2];
+	u8			pkey_index;
+	u8			reserved2;
+	u8			grh_mylmc;
+	__be16			rlid;
+	u8			ackto;
+	u8			mgid_index;
+	u8			static_rate;
+	u8			hop_limit;
+	__be32			tclass_flowlabel;
+	u8			rgid[16];
+	u8			sched_queue;
+	u8			snooper_flags;
+	u8			reserved3[2];
+	u8			counter_index;
+	u8			reserved4[7];
+};
+
+struct mlx4_qp_context {
+	__be32			flags;
+	__be32			pd;
+	u8			mtu_msgmax;
+	u8			rq_size_stride;
+	u8			sq_size_stride;
+	u8			rlkey;
+	__be32			usr_page;
+	__be32			local_qpn;
+	__be32			remote_qpn;
+	struct			mlx4_qp_path pri_path;
+	struct			mlx4_qp_path alt_path;
+	__be32			params1;
+	u32			reserved1;
+	__be32			next_send_psn;
+	__be32			cqn_send;
+	u32			reserved2[2];
+	__be32			last_acked_psn;
+	__be32			ssn;
+	__be32			params2;
+	__be32			rnr_nextrecvpsn;
+	__be32			srcd;
+	__be32			cqn_recv;
+	__be64			db_rec_addr;
+	__be32			qkey;
+	__be32			srqn;
+	__be32			msn;
+	__be16			rq_wqe_counter;
+	__be16			sq_wqe_counter;
+	u32			reserved3[2];
+	__be32			param3;
+	__be32			nummmcpeers_basemkey;
+	u8			log_page_size;
+	u8			reserved4[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved5[10];
+};
+
+enum {
+	MLX4_WQE_CTRL_FENCE	= 1 << 6,
+	MLX4_WQE_CTRL_CQ_UPDATE	= 3 << 2,
+	MLX4_WQE_CTRL_SOLICITED	= 1 << 1,
+};
+
+struct mlx4_wqe_ctrl_seg {
+	__be32			owner_opcode;
+	u8			reserved2[3];
+	u8			fence_size;
+	/*
+	 * High 24 bits are SRC remote buffer; low 8 bits are flags:
+	 * [7]   SO (strong ordering)
+	 * [5]   TCP/UDP checksum
+	 * [4]   IP checksum
+	 * [3:2] C (generate completion queue entry)
+	 * [1]   SE (solicited event)
+	 */
+	__be32			srcrb_flags;
+	/*
+	 * imm is immediate data for send/RDMA write w/ immediate;
+	 * also invalidation key for send with invalidate; input
+	 * modifier for WQEs on CCQs.
+	 */
+	__be32			imm;
+};
+
+enum {
+	MLX4_WQE_MLX_VL15	= 1 << 17,
+	MLX4_WQE_MLX_SLR	= 1 << 16
+};
+
+struct mlx4_wqe_mlx_seg {
+	u8			owner;
+	u8			reserved1[2];
+	u8			opcode;
+	u8			reserved2[3];
+	u8			size;
+	/*
+	 * [17]    VL15
+	 * [16]    SLR
+	 * [15:12] static rate
+	 * [11:8]  SL
+	 * [4]     ICRC
+	 * [3:2]   C
+	 * [0]     FL (force loopback)
+	 */
+	__be32			flags;
+	__be16			rlid;
+	u16			reserved3;
+};
+
+struct mlx4_wqe_datagram_seg {
+	__be32			av[8];
+	__be32			dqpn;
+	__be32			qkey;
+	__be32			reservd[2];
+};
+
+struct mlx4_wqe_bind_seg {
+	__be32			flags1;
+	__be32			flags2;
+	__be32			new_rkey;
+	__be32			lkey;
+	__be64			addr;
+	__be64			length;
+};
+
+struct mlx4_wqe_fmr_seg {
+	__be32			flags;
+	__be32			mem_key;
+	__be64			buf_list;
+	__be64			start_addr;
+	__be64			reg_len;
+	__be32			offset;
+	__be32			page_size;
+	u32			reserved[2];
+};
+
+struct mlx4_wqe_fmr_ext_seg {
+	u8			flags;
+	u8			reserved;
+	__be16			app_mask;
+	__be16			wire_app_tag;
+	__be16			mem_app_tag;
+	__be32			wire_ref_tag_base;
+	__be32			mem_ref_tag_base;
+};
+
+struct mlx4_wqe_local_inval_seg {
+	u8			flags;
+	u8			reserved1[3];
+	__be32			mem_key;
+	u8			reserved2[3];
+	u8			guest_id;
+	__be64			pa;
+};
+
+struct mlx4_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	u32			reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+struct mlx4_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+struct mlx4_wqe_inline_seg {
+	__be32			byte_count;
+};
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp);
+
+static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
+{
+	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+}
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+#endif /* MLX4_QP_H */
diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h
new file mode 100644
index 000000000000..799a0697a383
--- /dev/null
+++ b/include/linux/mlx4/srq.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_SRQ_H
+#define MLX4_SRQ_H
+
+struct mlx4_wqe_srq_next_seg {
+	u16			reserved1;
+	__be16			next_wqe_index;
+	u32			reserved2[3];
+};
+
+#endif /* MLX4_SRQ_H */
-- 
cgit v1.2.3


From beb7dd86a101263bf63a78c7c6d4da3849b35bd6 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Wed, 9 May 2007 07:14:03 +0200
Subject: Fix misspellings collected by members of KJ list.

Fix the misspellings of "propogate", "writting" and (oh, the shame
:-) "kenrel" in the source tree.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/sysctl/kernel.txt           | 2 +-
 arch/powerpc/oprofile/op_model_cell.c     | 2 +-
 arch/x86_64/kernel/io_apic.c              | 2 +-
 drivers/char/tty_io.c                     | 8 ++++----
 drivers/media/dvb/frontends/dib7000m.c    | 2 +-
 drivers/media/dvb/frontends/dib7000p.c    | 2 +-
 drivers/media/video/em28xx/em28xx-i2c.c   | 2 +-
 drivers/net/irda/donauboe.h               | 2 +-
 drivers/net/wireless/prism54/islpci_dev.c | 2 +-
 drivers/net/wireless/wavelan_cs.c         | 4 ++--
 drivers/net/wireless/wavelan_cs.p.h       | 2 +-
 drivers/sbus/char/bpp.c                   | 2 +-
 drivers/usb/serial/aircable.c             | 4 ++--
 drivers/usb/serial/io_edgeport.c          | 4 ++--
 drivers/video/matrox/matroxfb_Ti3026.c    | 2 +-
 drivers/video/matrox/matroxfb_accel.c     | 2 +-
 drivers/video/matrox/matroxfb_base.c      | 2 +-
 drivers/video/matrox/matroxfb_misc.c      | 2 +-
 fs/direct-io.c                            | 2 +-
 fs/reiserfs/journal.c                     | 4 ++--
 include/asm-generic/bitops/atomic.h       | 2 +-
 include/asm-i386/bitops.h                 | 2 +-
 include/asm-i386/boot.h                   | 2 +-
 include/asm-i386/sync_bitops.h            | 2 +-
 include/asm-m32r/system.h                 | 2 +-
 include/asm-mips/bootinfo.h               | 2 +-
 include/linux/mount.h                     | 2 +-
 sound/pci/ice1712/delta.h                 | 4 ++--
 sound/sparc/dbri.c                        | 2 +-
 29 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 5922e84d9133..d3d3c255ddb7 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -228,7 +228,7 @@ Controls the kernel's behaviour when an oops or BUG is encountered.
 
 pid_max:
 
-PID allocation wrap value.  When the kenrel's next PID value
+PID allocation wrap value.  When the kernel's next PID value
 reaches this value, it wraps back to a minimum PID value.
 PIDs of value pid_max or larger are not allocated.
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 626b29f38304..c29293befba9 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -747,7 +747,7 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
 		 * this to work as desired, the permormance monitor needs to
-		 * be disabled while writting to the latches.  This is a
+		 * be disabled while writing to the latches.  This is a
 		 * HW design issue.
 		 */
 		cbe_enable_pm(cpu);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 4d582589fa89..d8bfe315471c 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1413,7 +1413,7 @@ static void ack_apic_level(unsigned int irq)
 
 	/*
 	 * We must acknowledge the irq before we move it or the acknowledge will
-	 * not propogate properly.
+	 * not propagate properly.
 	 */
 	ack_APIC_irq();
 
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 7710a6a77d97..f6ac1d316ea4 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1573,11 +1573,11 @@ void no_tty(void)
 
 
 /**
- *	stop_tty	-	propogate flow control
+ *	stop_tty	-	propagate flow control
  *	@tty: tty to stop
  *
  *	Perform flow control to the driver. For PTY/TTY pairs we
- *	must also propogate the TIOCKPKT status. May be called
+ *	must also propagate the TIOCKPKT status. May be called
  *	on an already stopped device and will not re-call the driver
  *	method.
  *
@@ -1607,11 +1607,11 @@ void stop_tty(struct tty_struct *tty)
 EXPORT_SYMBOL(stop_tty);
 
 /**
- *	start_tty	-	propogate flow control
+ *	start_tty	-	propagate flow control
  *	@tty: tty to start
  *
  *	Start a tty that has been stopped if at all possible. Perform
- *	any neccessary wakeups and propogate the TIOCPKT status. If this
+ *	any neccessary wakeups and propagate the TIOCPKT status. If this
  *	is the tty was previous stopped and is being started then the
  *	driver start method is invoked and the line discipline woken.
  *
diff --git a/drivers/media/dvb/frontends/dib7000m.c b/drivers/media/dvb/frontends/dib7000m.c
index f5d40aa3d27f..f64546c6aeb5 100644
--- a/drivers/media/dvb/frontends/dib7000m.c
+++ b/drivers/media/dvb/frontends/dib7000m.c
@@ -266,7 +266,7 @@ static int dib7000m_sad_calib(struct dib7000m_state *state)
 {
 
 /* internal */
-//	dib7000m_write_word(state, 928, (3 << 14) | (1 << 12) | (524 << 0)); // sampling clock of the SAD is writting in set_bandwidth
+//	dib7000m_write_word(state, 928, (3 << 14) | (1 << 12) | (524 << 0)); // sampling clock of the SAD is written in set_bandwidth
 	dib7000m_write_word(state, 929, (0 << 1) | (0 << 0));
 	dib7000m_write_word(state, 930, 776); // 0.625*3.3 / 4096
 
diff --git a/drivers/media/dvb/frontends/dib7000p.c b/drivers/media/dvb/frontends/dib7000p.c
index 0349a4b5da3f..aece458cfe12 100644
--- a/drivers/media/dvb/frontends/dib7000p.c
+++ b/drivers/media/dvb/frontends/dib7000p.c
@@ -223,7 +223,7 @@ static int dib7000p_set_bandwidth(struct dvb_frontend *demod, u8 BW_Idx)
 static int dib7000p_sad_calib(struct dib7000p_state *state)
 {
 /* internal */
-//	dib7000p_write_word(state, 72, (3 << 14) | (1 << 12) | (524 << 0)); // sampling clock of the SAD is writting in set_bandwidth
+//	dib7000p_write_word(state, 72, (3 << 14) | (1 << 12) | (524 << 0)); // sampling clock of the SAD is written in set_bandwidth
 	dib7000p_write_word(state, 73, (0 << 1) | (0 << 0));
 	dib7000p_write_word(state, 74, 776); // 0.625*3.3 / 4096
 
diff --git a/drivers/media/video/em28xx/em28xx-i2c.c b/drivers/media/video/em28xx/em28xx-i2c.c
index 563a8319e608..54ccc6e1f92e 100644
--- a/drivers/media/video/em28xx/em28xx-i2c.c
+++ b/drivers/media/video/em28xx/em28xx-i2c.c
@@ -70,7 +70,7 @@ static int em2800_i2c_send_max4(struct em28xx *dev, unsigned char addr,
 
 	ret = dev->em28xx_write_regs(dev, 4 - len, &b2[4 - len], 2 + len);
 	if (ret != 2 + len) {
-		em28xx_warn("writting to i2c device failed (error=%i)\n", ret);
+		em28xx_warn("writing to i2c device failed (error=%i)\n", ret);
 		return -EIO;
 	}
 	for (write_timeout = EM2800_I2C_WRITE_TIMEOUT; write_timeout > 0;
diff --git a/drivers/net/irda/donauboe.h b/drivers/net/irda/donauboe.h
index 2ab173d9a0e4..1e67720f1066 100644
--- a/drivers/net/irda/donauboe.h
+++ b/drivers/net/irda/donauboe.h
@@ -113,7 +113,7 @@
 /* RxOver  overflow in Recv FIFO                                */
 /* SipRcv  received serial gap  (or other condition you set)    */
 /* Interrupts are enabled by writing a one to the IER register  */
-/* Interrupts are cleared by writting a one to the ISR register */
+/* Interrupts are cleared by writing a one to the ISR register */
 /*                                                              */
 /* 6. The remaining registers: 0x6 and 0x3 appear to be         */
 /*    reserved parts of 16 or 32 bit registersthe remainder     */
diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c
index a037b11dac9d..084795355b74 100644
--- a/drivers/net/wireless/prism54/islpci_dev.c
+++ b/drivers/net/wireless/prism54/islpci_dev.c
@@ -115,7 +115,7 @@ isl_upload_firmware(islpci_private *priv)
 			    ISL38XX_MEMORY_WINDOW_SIZE : fw_len;
 			u32 __iomem *dev_fw_ptr = device_base + ISL38XX_DIRECT_MEM_WIN;
 
-			/* set the cards base address for writting the data */
+			/* set the card's base address for writing the data */
 			isl38xx_w32_flush(device_base, reg,
 					  ISL38XX_DIR_MEM_BASE_REG);
 			wmb();	/* be paranoid */
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index 67b867f837ca..5740d4d4267c 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -176,7 +176,7 @@ psa_write(struct net_device *	dev,
   volatile u_char __iomem *verify = lp->mem + PSA_ADDR +
     (psaoff(0, psa_comp_number) << 1);
 
-  /* Authorize writting to PSA */
+  /* Authorize writing to PSA */
   hacr_write(base, HACR_PWR_STAT | HACR_ROM_WEN);
 
   while(n-- > 0)
@@ -1676,7 +1676,7 @@ wv_set_frequency(u_long		base,	/* i/o port of the card */
       fee_write(base, 0x60,
 		dac, 2);
 
-      /* We now should verify here that the EEprom writting was ok */
+      /* We now should verify here that the EEprom writing was ok */
 
       /* ReRead the first area */
       fee_read(base, 0x00,
diff --git a/drivers/net/wireless/wavelan_cs.p.h b/drivers/net/wireless/wavelan_cs.p.h
index 4d1c4905c749..4b9de0093a7b 100644
--- a/drivers/net/wireless/wavelan_cs.p.h
+++ b/drivers/net/wireless/wavelan_cs.p.h
@@ -120,7 +120,7 @@
  * the Wavelan itself (NCR -> AT&T -> Lucent).
  *
  * All started with Anders Klemets <klemets@paul.rutgers.edu>,
- * writting a Wavelan ISA driver for the MACH microkernel. Girish
+ * writing a Wavelan ISA driver for the MACH microkernel. Girish
  * Welling <welling@paul.rutgers.edu> had also worked on it.
  * Keith Moore modify this for the Pcmcia hardware.
  * 
diff --git a/drivers/sbus/char/bpp.c b/drivers/sbus/char/bpp.c
index 74b999d77bbf..4fab0c23814c 100644
--- a/drivers/sbus/char/bpp.c
+++ b/drivers/sbus/char/bpp.c
@@ -156,7 +156,7 @@ static unsigned short get_pins(unsigned minor)
 #define BPP_ICR      0x18
 #define BPP_SIZE     0x1A
 
-/* BPP_CSR.  Bits of type RW1 are cleared with writting '1'. */
+/* BPP_CSR.  Bits of type RW1 are cleared with writing '1'. */
 #define P_DEV_ID_MASK   0xf0000000      /* R   */
 #define P_DEV_ID_ZEBRA  0x40000000
 #define P_DEV_ID_L64854 0xa0000000      /*      == NCR 89C100+89C105. Pity. */
diff --git a/drivers/usb/serial/aircable.c b/drivers/usb/serial/aircable.c
index b675735bfbee..fbc8c27d5d99 100644
--- a/drivers/usb/serial/aircable.c
+++ b/drivers/usb/serial/aircable.c
@@ -9,7 +9,7 @@
  * The device works as an standard CDC device, it has 2 interfaces, the first
  * one is for firmware access and the second is the serial one.
  * The protocol is very simply, there are two posibilities reading or writing.
- * When writting the first urb must have a Header that starts with 0x20 0x29 the
+ * When writing the first urb must have a Header that starts with 0x20 0x29 the
  * next two bytes must say how much data will be sended.
  * When reading the process is almost equal except that the header starts with
  * 0x00 0x20.
@@ -18,7 +18,7 @@
  * buffer: The First and Second byte is used for a Header, the Third and Fourth
  * tells the  device the amount of information the package holds.
  * Packages are 60 bytes long Header Stuff.
- * When writting to the device the first two bytes of the header are 0x20 0x29
+ * When writing to the device the first two bytes of the header are 0x20 0x29
  * When reading the bytes are 0x00 0x20, or 0x00 0x10, there is an strange
  * situation, when too much data arrives to the device because it sends the data
  * but with out the header. I will use a simply hack to override this situation,
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index 18f74ac76565..4807f960150b 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -2465,7 +2465,7 @@ static int send_cmd_write_uart_register (struct edgeport_port *edge_port, __u8 r
 	    ((edge_serial->is_epic) &&
 	     (!edge_serial->epic_descriptor.Supports.IOSPWriteMCR) &&
 	     (regNum == MCR))) {
-		dbg("SendCmdWriteUartReg - Not writting to MCR Register");
+		dbg("SendCmdWriteUartReg - Not writing to MCR Register");
 		return 0;
 	}
 
@@ -2473,7 +2473,7 @@ static int send_cmd_write_uart_register (struct edgeport_port *edge_port, __u8 r
 	    ((edge_serial->is_epic) &&
 	     (!edge_serial->epic_descriptor.Supports.IOSPWriteLCR) &&
 	     (regNum == LCR))) {
-		dbg ("SendCmdWriteUartReg - Not writting to LCR Register");
+		dbg ("SendCmdWriteUartReg - Not writing to LCR Register");
 		return 0;
 	}
 
diff --git a/drivers/video/matrox/matroxfb_Ti3026.c b/drivers/video/matrox/matroxfb_Ti3026.c
index a5690a5f29d5..9445cdb759b1 100644
--- a/drivers/video/matrox/matroxfb_Ti3026.c
+++ b/drivers/video/matrox/matroxfb_Ti3026.c
@@ -72,7 +72,7 @@
  *     (c) 1998 Gerd Knorr <kraxel@cs.tu-berlin.de>
  *
  * (following author is not in any relation with this code, but his ideas
- *  were used when writting this driver)
+ *  were used when writing this driver)
  *
  *		 FreeVBE/AF (Matrox), "Shawn Hargreaves" <shawn@talula.demon.co.uk>
  *
diff --git a/drivers/video/matrox/matroxfb_accel.c b/drivers/video/matrox/matroxfb_accel.c
index a5c825d99466..c57aaadf410c 100644
--- a/drivers/video/matrox/matroxfb_accel.c
+++ b/drivers/video/matrox/matroxfb_accel.c
@@ -70,7 +70,7 @@
  *     (c) 1998 Gerd Knorr <kraxel@cs.tu-berlin.de>
  *
  * (following author is not in any relation with this code, but his ideas
- *  were used when writting this driver)
+ *  were used when writing this driver)
  *
  *		 FreeVBE/AF (Matrox), "Shawn Hargreaves" <shawn@talula.demon.co.uk>
  *
diff --git a/drivers/video/matrox/matroxfb_base.c b/drivers/video/matrox/matroxfb_base.c
index cb2aa402ddfd..c8559a756b75 100644
--- a/drivers/video/matrox/matroxfb_base.c
+++ b/drivers/video/matrox/matroxfb_base.c
@@ -93,7 +93,7 @@
  *     (c) 1998 Gerd Knorr <kraxel@cs.tu-berlin.de>
  *
  * (following author is not in any relation with this code, but his ideas
- *  were used when writting this driver)
+ *  were used when writing this driver)
  *
  *		 FreeVBE/AF (Matrox), "Shawn Hargreaves" <shawn@talula.demon.co.uk>
  *
diff --git a/drivers/video/matrox/matroxfb_misc.c b/drivers/video/matrox/matroxfb_misc.c
index 18886b629cb1..5948e54b9ef9 100644
--- a/drivers/video/matrox/matroxfb_misc.c
+++ b/drivers/video/matrox/matroxfb_misc.c
@@ -78,7 +78,7 @@
  *     (c) 1998 Gerd Knorr <kraxel@cs.tu-berlin.de>
  *
  * (following author is not in any relation with this code, but his ideas
- *  were used when writting this driver)
+ *  were used when writing this driver)
  *
  *		 FreeVBE/AF (Matrox), "Shawn Hargreaves" <shawn@talula.demon.co.uk>
  *
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d9d0833444f5..1e88d8d1d2a9 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -439,7 +439,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
  * Wait on and process all in-flight BIOs.  This must only be called once
  * all bios have been issued so that the refcount can only decrease.
  * This just waits for all bios to make it through dio_bio_complete.  IO
- * errors are propogated through dio->io_error and should be propogated via
+ * errors are propagated through dio->io_error and should be propagated via
  * dio_complete().
  */
 static void dio_await_completion(struct dio *dio)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e073fd86cf60..f25086aeef5f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1110,7 +1110,7 @@ static int flush_commit_list(struct super_block *s,
 	if (!barrier) {
 		/* If there was a write error in the journal - we can't commit
 		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propogating the write error out to
+		 * will just end up propagating the write error out to
 		 * the file system. */
 		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
 			if (buffer_dirty(jl->j_commit_bh))
@@ -1125,7 +1125,7 @@ static int flush_commit_list(struct super_block *s,
 
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
-	 * up propogating the write error out to the filesystem. */
+	 * up propagating the write error out to the filesystem. */
 	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
 		reiserfs_warning(s, "journal-615: buffer write failed");
diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h
index 78339319ba02..cd8a9641bd66 100644
--- a/include/asm-generic/bitops/atomic.h
+++ b/include/asm-generic/bitops/atomic.h
@@ -58,7 +58,7 @@ extern raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned;
  * if you do not require the atomic guarantees.
  *
  * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writting portable code,
+ * on non x86 architectures, so if you are writing portable code,
  * make sure not to rely on its reordering guarantees.
  *
  * Note that @nr may be almost arbitrarily large; this function is not
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index 273b50629357..a20fe9822f60 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -27,7 +27,7 @@
  * if you do not require the atomic guarantees.
  *
  * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writting portable code,
+ * on non x86 architectures, so if you are writing portable code,
  * make sure not to rely on its reordering guarantees.
  *
  * Note that @nr may be almost arbitrarily large; this function is not
diff --git a/include/asm-i386/boot.h b/include/asm-i386/boot.h
index e7686d0a8413..bd024ab4fe53 100644
--- a/include/asm-i386/boot.h
+++ b/include/asm-i386/boot.h
@@ -12,7 +12,7 @@
 #define EXTENDED_VGA	0xfffe		/* 80x50 mode */
 #define ASK_VGA		0xfffd		/* ask for it at bootup */
 
-/* Physical address where kenrel should be loaded. */
+/* Physical address where kernel should be loaded. */
 #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
 				+ (CONFIG_PHYSICAL_ALIGN - 1)) \
 				& ~(CONFIG_PHYSICAL_ALIGN - 1))
diff --git a/include/asm-i386/sync_bitops.h b/include/asm-i386/sync_bitops.h
index 7d72351bea75..cbce08a2d135 100644
--- a/include/asm-i386/sync_bitops.h
+++ b/include/asm-i386/sync_bitops.h
@@ -24,7 +24,7 @@
  * if you do not require the atomic guarantees.
  *
  * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writting portable code,
+ * on non-x86 architectures, so if you are writing portable code,
  * make sure not to rely on its reordering guarantees.
  *
  * Note that @nr may be almost arbitrarily large; this function is not
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 06cdece35865..f62f5c9abba6 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -136,7 +136,7 @@ extern void  __xchg_called_with_bad_pointer(void);
 	"add3	"reg0", "addr", #0x2000;		\n\t"	\
 	"ld	"reg0", @"reg0";			\n\t"	\
 	"unlock	"reg0", @"reg1";			\n\t"
-	/* FIXME: This workaround code cannot handle kenrel modules
+	/* FIXME: This workaround code cannot handle kernel modules
 	 * correctly under SMP environment.
 	 */
 #else	/* CONFIG_CHIP_M32700_TS1 */
diff --git a/include/asm-mips/bootinfo.h b/include/asm-mips/bootinfo.h
index c7c945baf1ee..dbf834f4dac4 100644
--- a/include/asm-mips/bootinfo.h
+++ b/include/asm-mips/bootinfo.h
@@ -254,7 +254,7 @@ extern void free_init_pages(const char *what,
 extern char arcs_cmdline[CL_SIZE];
 
 /*
- * Registers a0, a1, a3 and a4 as passed to the kenrel entry by firmware
+ * Registers a0, a1, a3 and a4 as passed to the kernel entry by firmware
  */
 extern unsigned long fw_arg0, fw_arg1, fw_arg2, fw_arg3;
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dab69afee2fa..6d3047d8c91c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -33,7 +33,7 @@ struct mnt_namespace;
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
-#define MNT_PNODE_MASK	0x3000	/* propogation flag mask */
+#define MNT_PNODE_MASK	0x3000	/* propagation flag mask */
 
 struct vfsmount {
 	struct list_head mnt_hash;
diff --git a/sound/pci/ice1712/delta.h b/sound/pci/ice1712/delta.h
index e65d669af639..e47861ccd6e7 100644
--- a/sound/pci/ice1712/delta.h
+++ b/sound/pci/ice1712/delta.h
@@ -63,7 +63,7 @@ extern const struct snd_ice1712_card_info snd_ice1712_delta_cards[];
 					/* look to CS8414 datasheet */
 #define ICE1712_DELTA_SPDIF_OUT_STAT_CLOCK 0x04
 					/* S/PDIF output status clock */
-					/* (writting on rising edge - 0->1) */
+					/* (writing on rising edge - 0->1) */
 					/* all except Delta44 */
 					/* look to CS8404A datasheet */
 #define ICE1712_DELTA_SPDIF_OUT_STAT_DATA 0x08
@@ -100,7 +100,7 @@ extern const struct snd_ice1712_card_info snd_ice1712_delta_cards[];
 					/* AKM4524 serial data */
 #define ICE1712_DELTA_CODEC_SERIAL_CLOCK 0x20
 					/* AKM4524 serial clock */
-					/* (writting on rising edge - 0->1 */
+					/* (writing on rising edge - 0->1 */
 #define ICE1712_DELTA_CODEC_CHIP_A	0x40
 #define ICE1712_DELTA_CODEC_CHIP_B	0x80
 					/* 1 - select chip A or B */
diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c
index 25a2a7333006..e07085a7cfc3 100644
--- a/sound/sparc/dbri.c
+++ b/sound/sparc/dbri.c
@@ -673,7 +673,7 @@ static s32 *dbri_cmdlock(struct snd_dbri * dbri, int len)
 }
 
 /*
- * Send prepared cmd string. It works by writting a JUMP cmd into
+ * Send prepared cmd string. It works by writing a JUMP cmd into
  * the last WAIT cmd and force DBRI to reread the cmd.
  * The JUMP cmd points to the new cmd string.
  * It also releases the cmdlock spinlock.
-- 
cgit v1.2.3


From 59c51591a0ac7568824f541f57de967e88adaa07 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael@free-electrons.com>
Date: Wed, 9 May 2007 08:57:56 +0200
Subject: Fix occurrences of "the the "

Signed-off-by: Michael Opdenacker <michael@free-electrons.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/ABI/removed/devfs              | 2 +-
 Documentation/driver-model/platform.txt      | 2 +-
 Documentation/netlabel/introduction.txt      | 2 +-
 Documentation/pci.txt                        | 2 +-
 Documentation/powerpc/booting-without-of.txt | 2 +-
 arch/mips/Makefile                           | 2 +-
 arch/mips/pci/fixup-sb1250.c                 | 4 ++--
 arch/powerpc/platforms/cell/io-workarounds.c | 2 +-
 arch/v850/kernel/entry.S                     | 2 +-
 block/ll_rw_blk.c                            | 2 +-
 drivers/block/rd.c                           | 2 +-
 drivers/char/drm/drm_dma.c                   | 2 +-
 drivers/char/drm/drm_vm.c                    | 2 +-
 drivers/char/drm/r300_reg.h                  | 2 +-
 drivers/char/pcmcia/cm4000_cs.c              | 2 +-
 drivers/ide/pci/siimage.c                    | 2 +-
 drivers/ieee1394/nodemgr.c                   | 2 +-
 drivers/isdn/hardware/eicon/divasync.h       | 2 +-
 drivers/isdn/hisax/hfc_usb.c                 | 4 ++--
 drivers/media/dvb/dvb-usb/dvb-usb-remote.c   | 2 +-
 drivers/media/dvb/frontends/tda10021.c       | 2 +-
 drivers/media/dvb/frontends/ves1x93.c        | 2 +-
 drivers/media/video/em28xx/em28xx-video.c    | 2 +-
 drivers/media/video/usbvideo/vicam.c         | 2 +-
 drivers/message/fusion/mptbase.c             | 2 +-
 drivers/mtd/maps/nettel.c                    | 2 +-
 drivers/mtd/onenand/onenand_base.c           | 2 +-
 drivers/net/bonding/bond_main.c              | 2 +-
 drivers/net/eepro.c                          | 2 +-
 drivers/net/ixgb/ixgb_ee.c                   | 2 +-
 drivers/net/meth.h                           | 2 +-
 drivers/net/tulip/interrupt.c                | 2 +-
 drivers/net/tulip/winbond-840.c              | 2 +-
 drivers/net/tulip/xircom_cb.c                | 2 +-
 drivers/net/typhoon.c                        | 2 +-
 drivers/net/wireless/airport.c               | 2 +-
 drivers/s390/char/sclp_rw.c                  | 2 +-
 drivers/s390/net/qeth_main.c                 | 2 +-
 drivers/s390/scsi/zfcp_qdio.c                | 2 +-
 drivers/scsi/aic7xxx/aic79xx_pci.c           | 2 +-
 drivers/scsi/aic94xx/Makefile                | 2 +-
 drivers/scsi/dc395x.c                        | 6 +++---
 drivers/scsi/scsi_lib.c                      | 2 +-
 drivers/usb/misc/auerswald.c                 | 2 +-
 drivers/usb/net/usbnet.h                     | 2 +-
 drivers/video/i810/i810_main.c               | 2 +-
 drivers/video/skeletonfb.c                   | 2 +-
 fs/jfs/jfs_dmap.c                            | 2 +-
 fs/jfs/jfs_imap.c                            | 4 ++--
 fs/jfs/jfs_logmgr.c                          | 2 +-
 fs/xfs/xfs_itable.c                          | 2 +-
 include/asm-arm/dma-mapping.h                | 2 +-
 include/asm-powerpc/ppc-pci.h                | 2 +-
 include/linux/ext3_fs_i.h                    | 2 +-
 include/linux/ext4_fs_i.h                    | 2 +-
 include/linux/radix-tree.h                   | 4 ++--
 include/linux/security.h                     | 2 +-
 include/linux/usb.h                          | 2 +-
 kernel/relay.c                               | 2 +-
 kernel/wait.c                                | 2 +-
 mm/mmap.c                                    | 2 +-
 net/decnet/af_decnet.c                       | 2 +-
 net/ipv4/cipso_ipv4.c                        | 2 +-
 net/ipv4/ipvs/ip_vs_sed.c                    | 2 +-
 net/ipv4/udp.c                               | 2 +-
 net/llc/af_llc.c                             | 2 +-
 net/netfilter/nf_conntrack_expect.c          | 2 +-
 net/sctp/chunk.c                             | 2 +-
 net/sctp/socket.c                            | 4 ++--
 net/sunrpc/svcauth.c                         | 2 +-
 scripts/basic/docproc.c                      | 2 +-
 sound/pci/ac97/ac97_codec.c                  | 2 +-
 72 files changed, 79 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/removed/devfs b/Documentation/ABI/removed/devfs
index 8195c4e0d0a1..8ffd28bf6598 100644
--- a/Documentation/ABI/removed/devfs
+++ b/Documentation/ABI/removed/devfs
@@ -6,7 +6,7 @@ Description:
 	races, contains a naming policy within the kernel that is
 	against the LSB, and can be replaced by using udev.
 	The files fs/devfs/*, include/linux/devfs_fs*.h were removed,
-	along with the the assorted devfs function calls throughout the
+	along with the assorted devfs function calls throughout the
 	kernel tree.
 
 Users:
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
index f7c9262b2dc8..c5491c22086c 100644
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@@ -125,7 +125,7 @@ three different ways to find such a match:
       usually register later during booting, or by module loading.
 
     - Registering a driver using platform_driver_probe() works just like
-      using platform_driver_register(), except that the the driver won't
+      using platform_driver_register(), except that the driver won't
       be probed later if another device registers.  (Which is OK, since
       this interface is only for use with non-hotpluggable devices.)
 
diff --git a/Documentation/netlabel/introduction.txt b/Documentation/netlabel/introduction.txt
index a4ffba1694c8..5ecd8d1dcf4e 100644
--- a/Documentation/netlabel/introduction.txt
+++ b/Documentation/netlabel/introduction.txt
@@ -30,7 +30,7 @@ The communication layer exists to allow NetLabel configuration and monitoring
 from user space.  The NetLabel communication layer uses a message based
 protocol built on top of the Generic NETLINK transport mechanism.  The exact
 formatting of these NetLabel messages as well as the Generic NETLINK family
-names can be found in the the 'net/netlabel/' directory as comments in the
+names can be found in the 'net/netlabel/' directory as comments in the
 header files as well as in 'include/net/netlabel.h'.
 
  * Security Module API
diff --git a/Documentation/pci.txt b/Documentation/pci.txt
index e2c9d0a0c43d..d38261b67905 100644
--- a/Documentation/pci.txt
+++ b/Documentation/pci.txt
@@ -373,7 +373,7 @@ E.g. clearing pending interrupts.
 
 3.6 Register IRQ handler
 ~~~~~~~~~~~~~~~~~~~~~~~~
-While calling request_irq() is the the last step described here,
+While calling request_irq() is the last step described here,
 this is often just another intermediate step to initialize a device.
 This step can often be deferred until the device is opened for use.
 
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index d4bfae75c946..c141f2388474 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1444,7 +1444,7 @@ platforms are moved over to use the flattened-device-tree model.
    Basically, it is a bus of devices, that could act more or less
    as a complete entity (UCC, USB etc ). All of them should be siblings on
    the "root" qe node, using the common properties from there.
-   The description below applies to the the qe of MPC8360 and
+   The description below applies to the qe of MPC8360 and
    more nodes and properties would be extended in the future.
 
    i) Root QE device
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index f2f742df32c7..4892db88a86a 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -92,7 +92,7 @@ cflags-y += -ffreestanding
 # when fed the toolchain default!
 #
 # Certain gcc versions upto gcc 4.1.1 (probably 4.2-subversion as of
-# 2006-10-10 don't properly change the the predefined symbols if -EB / -EL
+# 2006-10-10 don't properly change the predefined symbols if -EB / -EL
 # are used, so we kludge that here.  A bug has been filed at
 # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29413.
 #
diff --git a/arch/mips/pci/fixup-sb1250.c b/arch/mips/pci/fixup-sb1250.c
index 7a7444874e80..0ad39e53f7b1 100644
--- a/arch/mips/pci/fixup-sb1250.c
+++ b/arch/mips/pci/fixup-sb1250.c
@@ -14,7 +14,7 @@
 #include <linux/pci.h>
 
 /*
- * Set the the BCM1250, etc. PCI host bridge's TRDY timeout
+ * Set the BCM1250, etc. PCI host bridge's TRDY timeout
  * to the finite max.
  */
 static void __init quirk_sb1250_pci(struct pci_dev *dev)
@@ -35,7 +35,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_HT,
 			quirk_sb1250_ht);
 
 /*
- * Set the the SP1011 HT/PCI bridge's TRDY timeout to the finite max.
+ * Set the SP1011 HT/PCI bridge's TRDY timeout to the finite max.
  */
 static void __init quirk_sp1011(struct pci_dev *dev)
 {
diff --git a/arch/powerpc/platforms/cell/io-workarounds.c b/arch/powerpc/platforms/cell/io-workarounds.c
index d68d920eb2c4..7fb92f23f380 100644
--- a/arch/powerpc/platforms/cell/io-workarounds.c
+++ b/arch/powerpc/platforms/cell/io-workarounds.c
@@ -74,7 +74,7 @@ static void spider_io_flush(const volatile void __iomem *addr)
 	/* Fast path if we have a non-0 token, it indicates which bus we
 	 * are on.
 	 *
-	 * If the token is 0, that means either the the ioremap was done
+	 * If the token is 0, that means either that the ioremap was done
 	 * before we initialized this layer, or it's a PIO operation. We
 	 * fallback to a low path in this case. Hopefully, internal devices
 	 * which are ioremap'ed early should use in_XX/out_XX functions
diff --git a/arch/v850/kernel/entry.S b/arch/v850/kernel/entry.S
index 8bc521ca081f..e4327a8d6bcd 100644
--- a/arch/v850/kernel/entry.S
+++ b/arch/v850/kernel/entry.S
@@ -523,7 +523,7 @@ END(ret_from_trap)
 
 
 /* This the initial entry point for a new child thread, with an appropriate
-   stack in place that makes it look the the child is in the middle of an
+   stack in place that makes it look that the child is in the middle of an
    syscall.  This function is actually `returned to' from switch_thread
    (copy_thread makes ret_from_fork the return address in each new thread's
    saved context).  */
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d99d402953a3..f294f1538f1e 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1704,7 +1704,7 @@ EXPORT_SYMBOL(blk_stop_queue);
  *     on a queue, such as calling the unplug function after a timeout.
  *     A block device may call blk_sync_queue to ensure that any
  *     such activity is cancelled, thus allowing it to release resources
- *     the the callbacks might use. The caller must already have made sure
+ *     that the callbacks might use. The caller must already have made sure
  *     that its ->make_request_fn will not re-add plugging prior to calling
  *     this function.
  *
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 43d4ebcb3b44..a1512da32410 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -151,7 +151,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 }
 
 /*
- * ->writepage to the the blockdev's mapping has to redirty the page so that the
+ * ->writepage to the blockdev's mapping has to redirty the page so that the
  * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
diff --git a/drivers/char/drm/drm_dma.c b/drivers/char/drm/drm_dma.c
index 892db7096986..32ed19c9ec1c 100644
--- a/drivers/char/drm/drm_dma.c
+++ b/drivers/char/drm/drm_dma.c
@@ -65,7 +65,7 @@ int drm_dma_setup(drm_device_t * dev)
  * \param dev DRM device.
  *
  * Free all pages associated with DMA buffers, the buffers and pages lists, and
- * finally the the drm_device::dma structure itself.
+ * finally the drm_device::dma structure itself.
  */
 void drm_dma_takedown(drm_device_t * dev)
 {
diff --git a/drivers/char/drm/drm_vm.c b/drivers/char/drm/drm_vm.c
index 35540cfb43dd..b5c5b9fa84c3 100644
--- a/drivers/char/drm/drm_vm.c
+++ b/drivers/char/drm/drm_vm.c
@@ -157,7 +157,7 @@ static __inline__ struct page *drm_do_vm_nopage(struct vm_area_struct *vma,
  * \param address access address.
  * \return pointer to the page structure.
  *
- * Get the the mapping, find the real physical page to map, get the page, and
+ * Get the mapping, find the real physical page to map, get the page, and
  * return it.
  */
 static __inline__ struct page *drm_do_vm_shm_nopage(struct vm_area_struct *vma,
diff --git a/drivers/char/drm/r300_reg.h b/drivers/char/drm/r300_reg.h
index a881f96c983e..ecda760ae8c0 100644
--- a/drivers/char/drm/r300_reg.h
+++ b/drivers/char/drm/r300_reg.h
@@ -293,7 +293,7 @@ I am fairly certain that they are correct unless stated otherwise in comments.
 #       define R300_PVS_CNTL_1_PROGRAM_START_SHIFT   0
 #       define R300_PVS_CNTL_1_POS_END_SHIFT         10
 #       define R300_PVS_CNTL_1_PROGRAM_END_SHIFT     20
-/* Addresses are relative the the vertex program parameters area. */
+/* Addresses are relative to the vertex program parameters area. */
 #define R300_VAP_PVS_CNTL_2                 0x22D4
 #       define R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT 0
 #       define R300_PVS_CNTL_2_PARAM_COUNT_SHIFT  16
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index e91b43a014b0..4ea587983aef 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1114,7 +1114,7 @@ static ssize_t cmm_write(struct file *filp, const char __user *buf,
 	/*
 	 * wait for atr to become valid.
 	 * note: it is important to lock this code. if we dont, the monitor
-	 * could be run between test_bit and the the call the sleep on the
+	 * could be run between test_bit and the call to sleep on the
 	 * atr-queue.  if *then* the monitor detects atr valid, it will wake up
 	 * any process on the atr-queue, *but* since we have been interrupted,
 	 * we do not yet sleep on this queue. this would result in a missed
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index c0188de3cc66..79cec50a242f 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -831,7 +831,7 @@ static void __devinit init_mmio_iops_siimage(ide_hwif_t *hwif)
 
 	/*
 	 *	Now set up the hw. We have to do this ourselves as
-	 *	the MMIO layout isnt the same as the the standard port
+	 *	the MMIO layout isnt the same as the standard port
 	 *	based I/O
 	 */
 
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 6a1a0572275e..835937e38529 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1702,7 +1702,7 @@ static int nodemgr_host_thread(void *__hi)
 			generation = get_hpsb_generation(host);
 
 			/* If we get a reset before we are done waiting, then
-			 * start the the waiting over again */
+			 * start the waiting over again */
 			if (generation != g)
 				g = generation, i = 0;
 		}
diff --git a/drivers/isdn/hardware/eicon/divasync.h b/drivers/isdn/hardware/eicon/divasync.h
index af3eb9e795b5..85784a7ffb25 100644
--- a/drivers/isdn/hardware/eicon/divasync.h
+++ b/drivers/isdn/hardware/eicon/divasync.h
@@ -216,7 +216,7 @@ typedef struct
 #define SERIAL_HOOK_RING 0x85
 #define SERIAL_HOOK_DETACH 0x8f
  unsigned char Flags;           /* function refinements   */
- /* parameters passed by the the ATTACH request      */
+ /* parameters passed by the ATTACH request      */
  SERIAL_INT_CB InterruptHandler; /* called on each interrupt  */
  SERIAL_DPC_CB DeferredHandler; /* called on hook state changes */
  void   *HandlerContext; /* context for both handlers */
diff --git a/drivers/isdn/hisax/hfc_usb.c b/drivers/isdn/hisax/hfc_usb.c
index 99e70d4103b6..1f18f1993387 100644
--- a/drivers/isdn/hisax/hfc_usb.c
+++ b/drivers/isdn/hisax/hfc_usb.c
@@ -1217,11 +1217,11 @@ usb_init(hfcusb_data * hfc)
 	/* aux = output, reset off */
 	write_usb(hfc, HFCUSB_CIRM, 0x10);
 
-	/* set USB_SIZE to match the the wMaxPacketSize for INT or BULK transfers */
+	/* set USB_SIZE to match the wMaxPacketSize for INT or BULK transfers */
 	write_usb(hfc, HFCUSB_USB_SIZE,
 		  (hfc->packet_size / 8) | ((hfc->packet_size / 8) << 4));
 
-	/* set USB_SIZE_I to match the the wMaxPacketSize for ISO transfers */
+	/* set USB_SIZE_I to match the wMaxPacketSize for ISO transfers */
 	write_usb(hfc, HFCUSB_USB_SIZE_I, hfc->iso_packet_size);
 
 	/* enable PCM/GCI master mode */
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
index 68ed3a788083..9200a30dd1b9 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
+++ b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@desy.de)
  * see dvb-usb-init.c for copyright information.
  *
- * This file contains functions for initializing the the input-device and for handling remote-control-queries.
+ * This file contains functions for initializing the input-device and for handling remote-control-queries.
  */
 #include "dvb-usb-common.h"
 #include <linux/usb/input.h>
diff --git a/drivers/media/dvb/frontends/tda10021.c b/drivers/media/dvb/frontends/tda10021.c
index 110536843e8e..e725f612a6b7 100644
--- a/drivers/media/dvb/frontends/tda10021.c
+++ b/drivers/media/dvb/frontends/tda10021.c
@@ -1,6 +1,6 @@
 /*
     TDA10021  - Single Chip Cable Channel Receiver driver module
-	       used on the the Siemens DVB-C cards
+	       used on the Siemens DVB-C cards
 
     Copyright (C) 1999 Convergence Integrated Media GmbH <ralph@convergence.de>
     Copyright (C) 2004 Markus Schulz <msc@antzsystem.de>
diff --git a/drivers/media/dvb/frontends/ves1x93.c b/drivers/media/dvb/frontends/ves1x93.c
index 54d7b07571b8..23fd0303c91b 100644
--- a/drivers/media/dvb/frontends/ves1x93.c
+++ b/drivers/media/dvb/frontends/ves1x93.c
@@ -306,7 +306,7 @@ static int ves1x93_read_status(struct dvb_frontend* fe, fe_status_t* status)
 	 * The ves1893 sometimes returns sync values that make no sense,
 	 * because, e.g., the SIGNAL bit is 0, while some of the higher
 	 * bits are 1 (and how can there be a CARRIER w/o a SIGNAL?).
-	 * Tests showed that the the VITERBI and SYNC bits are returned
+	 * Tests showed that the VITERBI and SYNC bits are returned
 	 * reliably, while the SIGNAL and CARRIER bits ar sometimes wrong.
 	 * If such a case occurs, we read the value again, until we get a
 	 * valid value.
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index bec67609500f..2c7b158ce7e1 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1729,7 +1729,7 @@ static int em28xx_usb_probe(struct usb_interface *interface,
 
 	endpoint = &interface->cur_altsetting->endpoint[1].desc;
 
-	/* check if the the device has the iso in endpoint at the correct place */
+	/* check if the device has the iso in endpoint at the correct place */
 	if ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) !=
 	    USB_ENDPOINT_XFER_ISOC) {
 		em28xx_err(DRIVER_NAME " probing error: endpoint is non-ISO endpoint!\n");
diff --git a/drivers/media/video/usbvideo/vicam.c b/drivers/media/video/usbvideo/vicam.c
index 876fd2768242..982b115193f8 100644
--- a/drivers/media/video/usbvideo/vicam.c
+++ b/drivers/media/video/usbvideo/vicam.c
@@ -28,7 +28,7 @@
  *
  * Portions of this code were also copied from usbvideo.c
  *
- * Special thanks to the the whole team at Sourceforge for help making
+ * Special thanks to the whole team at Sourceforge for help making
  * this driver become a reality.  Notably:
  * Andy Armstrong who reverse engineered the color encoding and
  * Pavel Machek and Chris Cheney who worked on reverse engineering the
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 97471af4309c..5021d1a2a1d4 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -3585,7 +3585,7 @@ initChainBuffers(MPT_ADAPTER *ioc)
 	 * index = chain_idx
 	 *
 	 * Calculate the number of chain buffers needed(plus 1) per I/O
-	 * then multiply the the maximum number of simultaneous cmds
+	 * then multiply the maximum number of simultaneous cmds
 	 *
 	 * num_sge = num sge in request frame + last chain buffer
 	 * scale = num sge per chain buffer if no chain element
diff --git a/drivers/mtd/maps/nettel.c b/drivers/mtd/maps/nettel.c
index 9f53c655af3a..7b96cd02f82b 100644
--- a/drivers/mtd/maps/nettel.c
+++ b/drivers/mtd/maps/nettel.c
@@ -358,7 +358,7 @@ int __init nettel_init(void)
 	/* Turn other PAR off so the first probe doesn't find it */
 	*intel1par = 0;
 
-	/* Probe for the the size of the first Intel flash */
+	/* Probe for the size of the first Intel flash */
 	nettel_intel_map.size = maxsize;
 	nettel_intel_map.phys = intel0addr;
 	nettel_intel_map.virt = ioremap_nocache(intel0addr, maxsize);
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 000794c6caf5..0537fac8de74 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -2192,7 +2192,7 @@ static int onenand_check_maf(int manuf)
  * @param mtd		MTD device structure
  *
  * OneNAND detection method:
- *   Compare the the values from command with ones from register
+ *   Compare the values from command with ones from register
  */
 static int onenand_probe(struct mtd_info *mtd)
 {
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 724bce51f936..223517dcbcfd 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3461,7 +3461,7 @@ void bond_unregister_arp(struct bonding *bond)
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
- * Hash for the the output device based upon layer 3 and layer 4 data. If
+ * Hash for the output device based upon layer 3 and layer 4 data. If
  * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
  * altogether not IP, mimic bond_xmit_hash_policy_l2()
  */
diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c
index 39654e1e2bed..47680237f783 100644
--- a/drivers/net/eepro.c
+++ b/drivers/net/eepro.c
@@ -1126,7 +1126,7 @@ static void eepro_tx_timeout (struct net_device *dev)
 	printk (KERN_ERR "%s: transmit timed out, %s?\n", dev->name,
 		"network cable problem");
 	/* This is not a duplicate. One message for the console,
-	   one for the the log file  */
+	   one for the log file  */
 	printk (KERN_DEBUG "%s: transmit timed out, %s?\n", dev->name,
 		"network cable problem");
 	eepro_complete_selreset(ioaddr);
diff --git a/drivers/net/ixgb/ixgb_ee.c b/drivers/net/ixgb/ixgb_ee.c
index f15aebde7b90..52c99d01d568 100644
--- a/drivers/net/ixgb/ixgb_ee.c
+++ b/drivers/net/ixgb/ixgb_ee.c
@@ -315,7 +315,7 @@ ixgb_wait_eeprom_command(struct ixgb_hw *hw)
  * hw - Struct containing variables accessed by shared code
  *
  * Reads the first 64 16 bit words of the EEPROM and sums the values read.
- * If the the sum of the 64 16 bit words is 0xBABA, the EEPROM's checksum is
+ * If the sum of the 64 16 bit words is 0xBABA, the EEPROM's checksum is
  * valid.
  *
  * Returns:
diff --git a/drivers/net/meth.h b/drivers/net/meth.h
index 84960dae2a22..ea3b8fc86d1e 100644
--- a/drivers/net/meth.h
+++ b/drivers/net/meth.h
@@ -126,7 +126,7 @@ typedef struct rx_packet {
 				       /*   Note: when loopback is set this bit becomes collision control.  Setting this bit will */
 				       /*         cause a collision to be reported. */
 
-				       /* Bits 5 and 6 are used to determine the the Destination address filter mode */
+				       /* Bits 5 and 6 are used to determine the Destination address filter mode */
 #define METH_ACCEPT_MY 0			/* 00: Accept PHY address only */
 #define METH_ACCEPT_MCAST 0x20	/* 01: Accept physical, broadcast, and multicast filter matches only */
 #define METH_ACCEPT_AMCAST 0x40	/* 10: Accept physical, broadcast, and all multicast packets */
diff --git a/drivers/net/tulip/interrupt.c b/drivers/net/tulip/interrupt.c
index 9b08afbd1f65..ea896777bcaf 100644
--- a/drivers/net/tulip/interrupt.c
+++ b/drivers/net/tulip/interrupt.c
@@ -269,7 +269,7 @@ done:
             This would turn on IM for devices that is not contributing
             to backlog congestion with unnecessary latency.
 
-             We monitor the the device RX-ring and have:
+             We monitor the device RX-ring and have:
 
              HW Interrupt Mitigation either ON or OFF.
 
diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c
index fa440706fb4a..38f3b99716b8 100644
--- a/drivers/net/tulip/winbond-840.c
+++ b/drivers/net/tulip/winbond-840.c
@@ -1021,7 +1021,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev)
 		np->tx_ring[entry].length |= DescEndRing;
 
 	/* Now acquire the irq spinlock.
-	 * The difficult race is the the ordering between
+	 * The difficult race is the ordering between
 	 * increasing np->cur_tx and setting DescOwned:
 	 * - if np->cur_tx is increased first the interrupt
 	 *   handler could consider the packet as transmitted
diff --git a/drivers/net/tulip/xircom_cb.c b/drivers/net/tulip/xircom_cb.c
index 985a1810ca59..2470b1ee33c0 100644
--- a/drivers/net/tulip/xircom_cb.c
+++ b/drivers/net/tulip/xircom_cb.c
@@ -1043,7 +1043,7 @@ static int enable_promisc(struct xircom_private *card)
 
 
 /*
-link_status() checks the the links status and will return 0 for no link, 10 for 10mbit link and 100 for.. guess what.
+link_status() checks the links status and will return 0 for no link, 10 for 10mbit link and 100 for.. guess what.
 
 Must be called in locked state with interrupts disabled
 */
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index f2dd7763cd0b..f72573594121 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -639,7 +639,7 @@ typhoon_issue_command(struct typhoon *tp, int num_cmd, struct cmd_desc *cmd,
 
 	typhoon_inc_cmd_index(&ring->lastWrite, num_cmd);
 
-	/* "I feel a presence... another warrior is on the the mesa."
+	/* "I feel a presence... another warrior is on the mesa."
 	 */
 	wmb();
 	iowrite32(ring->lastWrite, tp->ioaddr + TYPHOON_REG_CMD_READY);
diff --git a/drivers/net/wireless/airport.c b/drivers/net/wireless/airport.c
index 38fac3bbcd82..7d5b8c2cc614 100644
--- a/drivers/net/wireless/airport.c
+++ b/drivers/net/wireless/airport.c
@@ -149,7 +149,7 @@ static int airport_hard_reset(struct orinoco_private *priv)
 	/* Vitally important.  If we don't do this it seems we get an
 	 * interrupt somewhere during the power cycle, since
 	 * hw_unavailable is already set it doesn't get ACKed, we get
-	 * into an interrupt loop and the the PMU decides to turn us
+	 * into an interrupt loop and the PMU decides to turn us
 	 * off. */
 	disable_irq(dev->irq);
 
diff --git a/drivers/s390/char/sclp_rw.c b/drivers/s390/char/sclp_rw.c
index bbd5b8b66f42..d6b06ab81188 100644
--- a/drivers/s390/char/sclp_rw.c
+++ b/drivers/s390/char/sclp_rw.c
@@ -23,7 +23,7 @@
 
 /*
  * The room for the SCCB (only for writing) is not equal to a pages size
- * (as it is specified as the maximum size in the the SCLP documentation)
+ * (as it is specified as the maximum size in the SCLP documentation)
  * because of the additional data structure described above.
  */
 #define MAX_SCCB_ROOM (PAGE_SIZE - sizeof(struct sclp_buffer))
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index 29d176036e5c..0b96d49dd636 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -2860,7 +2860,7 @@ qeth_flush_buffers(struct qeth_qdio_out_q *queue, int under_int,
 			if (!atomic_read(&queue->set_pci_flags_count)){
 				/*
 				 * there's no outstanding PCI any more, so we
-				 * have to request a PCI to be sure the the PCI
+				 * have to request a PCI to be sure that the PCI
 				 * will wake at some time in the future then we
 				 * can flush packed buffers that might still be
 				 * hanging around, which can happen if no
diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c
index cb08ca3cc0f9..bdf5782b8a7a 100644
--- a/drivers/s390/scsi/zfcp_qdio.c
+++ b/drivers/s390/scsi/zfcp_qdio.c
@@ -222,7 +222,7 @@ zfcp_qdio_handler_error_check(struct zfcp_adapter *adapter, unsigned int status,
                 * Since we have been using this adapter, it is save to assume
                 * that it is not failed but recoverable. The card seems to
                 * report link-up events by self-initiated queue shutdown.
-                * That is why we need to clear the the link-down flag
+                * That is why we need to clear the link-down flag
                 * which is set again in case we have missed by a mile.
                 */
                zfcp_erp_adapter_reopen(
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 8d72bbae96ad..0bada0028aa0 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -966,7 +966,7 @@ ahd_aic790X_setup(struct ahd_softc *ahd)
 			      |  AHD_BUSFREEREV_BUG;
 		ahd->bugs |= AHD_LQOOVERRUN_BUG|AHD_EARLY_REQ_BUG;
 
-		/* If the user requested the the SLOWCRC bit to be set. */
+		/* If the user requested that the SLOWCRC bit to be set. */
 		if (aic79xx_slowcrc)
 			ahd->features |= AHD_AIC79XXB_SLOWCRC;
 
diff --git a/drivers/scsi/aic94xx/Makefile b/drivers/scsi/aic94xx/Makefile
index e6b70123940c..e78ce0fa44d2 100644
--- a/drivers/scsi/aic94xx/Makefile
+++ b/drivers/scsi/aic94xx/Makefile
@@ -6,7 +6,7 @@
 #
 # This file is licensed under GPLv2.
 #
-# This file is part of the the aic94xx driver.
+# This file is part of the aic94xx driver.
 #
 # The aic94xx driver is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index a965ed3548d5..564ea90ed3a0 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -541,7 +541,7 @@ static struct ParameterData __devinitdata cfg_data[] = {
 
 
 /*
- * Safe settings. If set to zero the the BIOS/default values with
+ * Safe settings. If set to zero the BIOS/default values with
  * command line overrides will be used. If set to 1 then safe and
  * slow settings will be used.
  */
@@ -617,7 +617,7 @@ static void __devinit fix_settings(void)
 
 /*
  * Mapping from the eeprom delay index value (index into this array)
- * to the the number of actual seconds that the delay should be for.
+ * to the number of actual seconds that the delay should be for.
  */
 static char __devinitdata eeprom_index_to_delay_map[] = 
 	{ 1, 3, 5, 10, 16, 30, 60, 120 };
@@ -4136,7 +4136,7 @@ static void __devinit trms1040_write_all(struct NvRamType *eeprom, unsigned long
  * @io_port:	base I/O address
  * @addr:	offset into SEEPROM
  *
- * Returns the the byte read.
+ * Returns the byte read.
  **/
 static u8 __devinit trms1040_get_data(unsigned long io_port, u8 addr)
 {
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 61fbcdcbb009..1f5a07bf2a75 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -173,7 +173,7 @@ int scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  * @retries:	number of times to retry request
  * @flags:	or into request flags;
  *
- * returns the req->errors value which is the the scsi_cmnd result
+ * returns the req->errors value which is the scsi_cmnd result
  * field.
  **/
 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
diff --git a/drivers/usb/misc/auerswald.c b/drivers/usb/misc/auerswald.c
index b5332e679c46..88fb56d5db8f 100644
--- a/drivers/usb/misc/auerswald.c
+++ b/drivers/usb/misc/auerswald.c
@@ -1307,7 +1307,7 @@ static int auerswald_addservice (pauerswald_t cp, pauerscon_t scp)
 }
 
 
-/* remove a service from the the device
+/* remove a service from the device
    scp->id must be set! */
 static void auerswald_removeservice (pauerswald_t cp, pauerscon_t scp)
 {
diff --git a/drivers/usb/net/usbnet.h b/drivers/usb/net/usbnet.h
index cbb53e065d6c..82db5a8e528e 100644
--- a/drivers/usb/net/usbnet.h
+++ b/drivers/usb/net/usbnet.h
@@ -129,7 +129,7 @@ extern void usbnet_disconnect(struct usb_interface *);
 
 
 /* Drivers that reuse some of the standard USB CDC infrastructure
- * (notably, using multiple interfaces according to the the CDC
+ * (notably, using multiple interfaces according to the CDC
  * union descriptor) get some helper code.
  */
 struct cdc_state {
diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c
index 7e760197cf29..1a7d7789d877 100644
--- a/drivers/video/i810/i810_main.c
+++ b/drivers/video/i810/i810_main.c
@@ -1717,7 +1717,7 @@ static int __devinit i810_alloc_agp_mem(struct fb_info *info)
  * @info: pointer to device specific info structure
  *
  * DESCRIPTION:
- * Sets the the user monitor's horizontal and vertical
+ * Sets the user monitor's horizontal and vertical
  * frequency limits
  */
 static void __devinit i810_init_monspecs(struct fb_info *info)
diff --git a/drivers/video/skeletonfb.c b/drivers/video/skeletonfb.c
index 842b5cd054c6..836a612af977 100644
--- a/drivers/video/skeletonfb.c
+++ b/drivers/video/skeletonfb.c
@@ -14,7 +14,7 @@
  *  of it. 
  *
  *  First the roles of struct fb_info and struct display have changed. Struct
- *  display will go away. The way the the new framebuffer console code will
+ *  display will go away. The way the new framebuffer console code will
  *  work is that it will act to translate data about the tty/console in 
  *  struct vc_data to data in a device independent way in struct fb_info. Then
  *  various functions in struct fb_ops will be called to store the device 
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 82b0544bd76d..f3b1ebb22280 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1507,7 +1507,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 		if (l2nb < budmin) {
 
 			/* search the lower level dmap control pages to get
-			 * the starting block number of the the dmap that
+			 * the starting block number of the dmap that
 			 * contains or starts off the free space.
 			 */
 			if ((rc =
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index c465607be991..c6530227cda6 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
 		return -EIO;
 	}
 
-	/* locate the the disk inode requested */
+	/* locate the disk inode requested */
 	dp = (struct dinode *) mp->data;
 	dp += rel_inode;
 
@@ -1407,7 +1407,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 	inum = pip->i_ino + 1;
 	ino = inum & (INOSPERIAG - 1);
 
-	/* back off the the hint if it is outside of the iag */
+	/* back off the hint if it is outside of the iag */
 	if (ino == 0)
 		inum = pip->i_ino;
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 6a3f00dc8c83..44a2f33cb98d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1960,7 +1960,7 @@ static void lbmfree(struct lbuf * bp)
 /*
  * NAME:	lbmRedrive
  *
- * FUNCTION:	add a log buffer to the the log redrive list
+ * FUNCTION:	add a log buffer to the log redrive list
  *
  * PARAMETER:
  *     bp	- log buffer
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7775ddc0b3c6..e725ddd3de5f 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -809,7 +809,7 @@ xfs_inumbers(
 				xfs_buf_relse(agbp);
 				agbp = NULL;
 				/*
-				 * Move up the the last inode in the current
+				 * Move up the last inode in the current
 				 * chunk.  The lookup_ge will always get
 				 * us the first inode in the next chunk.
 				 */
diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h
index abfb75b654c7..c8b5d0db0cf0 100644
--- a/include/asm-arm/dma-mapping.h
+++ b/include/asm-arm/dma-mapping.h
@@ -445,7 +445,7 @@ extern void dmabounce_unregister_dev(struct device *);
  *
  * The dmabounce routines call this function whenever a dma-mapping
  * is requested to determine whether a given buffer needs to be bounced
- * or not. The function must return 0 if the the buffer is OK for
+ * or not. The function must return 0 if the buffer is OK for
  * DMA access and 1 if the buffer needs to be bounced.
  *
  */
diff --git a/include/asm-powerpc/ppc-pci.h b/include/asm-powerpc/ppc-pci.h
index d74b2965bb82..6dcd7a811fe1 100644
--- a/include/asm-powerpc/ppc-pci.h
+++ b/include/asm-powerpc/ppc-pci.h
@@ -64,7 +64,7 @@ struct pci_dev *pci_get_device_by_addr(unsigned long addr);
  * eeh_slot_error_detail -- record and EEH error condition to the log
  * @severity: 1 if temporary, 2 if permanent failure.
  *
- * Obtains the the EEH error details from the RTAS subsystem,
+ * Obtains the EEH error details from the RTAS subsystem,
  * and then logs these details with the RTAS error log system.
  */
 void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 4395e5206746..7894dd0f3b77 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -54,7 +54,7 @@ struct ext3_block_alloc_info {
 	/*
 	 * Was i_next_alloc_goal in ext3_inode_info
 	 * is the *physical* companion to i_next_alloc_block.
-	 * it the the physical block number of the block which was most-recentl
+	 * it the physical block number of the block which was most-recentl
 	 * allocated to this file.  This give us the goal (target) for the next
 	 * allocation when we detect linearly ascending requests.
 	 */
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
index bb42379cb7fd..d5b177e5b395 100644
--- a/include/linux/ext4_fs_i.h
+++ b/include/linux/ext4_fs_i.h
@@ -52,7 +52,7 @@ struct ext4_block_alloc_info {
 	/*
 	 * Was i_next_alloc_goal in ext4_inode_info
 	 * is the *physical* companion to i_next_alloc_block.
-	 * it the the physical block number of the block which was most-recentl
+	 * it the physical block number of the block which was most-recentl
 	 * allocated to this file.  This give us the goal (target) for the next
 	 * allocation when we detect linearly ascending requests.
 	 */
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 0deb842541ac..f9e77d2ee320 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -87,10 +87,10 @@ do {									\
  * management of their lifetimes must be completely managed by API users.
  *
  * For API usage, in general,
- * - any function _modifying_ the the tree or tags (inserting or deleting
+ * - any function _modifying_ the tree or tags (inserting or deleting
  *   items, setting or clearing tags must exclude other modifications, and
  *   exclude any functions reading the tree.
- * - any function _reading_ the the tree or tags (looking up items or tags,
+ * - any function _reading_ the tree or tags (looking up items or tags,
  *   gang lookups) must exclude modifications to the tree, but may occur
  *   concurrently with other readers.
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 47e82c120f9a..9eb9e0fe0331 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -322,7 +322,7 @@ struct request_sock;
  *	@dir contains the inode structure of parent of the new file.
  *	@dentry contains the dentry structure of the new file.
  *	@mode contains the mode of the new file.
- *	@dev contains the the device number.
+ *	@dev contains the device number.
  *	Return 0 if permission is granted.
  * @inode_rename:
  *	Check for permission to rename a file or directory.
diff --git a/include/linux/usb.h b/include/linux/usb.h
index cfbd2bb8fa2c..94bd38a6d947 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -126,7 +126,7 @@ enum usb_interface_condition {
  * Each interface may have alternate settings.  The initial configuration
  * of a device sets altsetting 0, but the device driver can change
  * that setting using usb_set_interface().  Alternate settings are often
- * used to control the the use of periodic endpoints, such as by having
+ * used to control the use of periodic endpoints, such as by having
  * different endpoints use different amounts of reserved USB bandwidth.
  * All standards-conformant USB devices that use isochronous endpoints
  * will use them in non-default settings.
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251c7e28..d24395e8b6e5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,7 +310,7 @@ static struct rchan_callbacks default_channel_callbacks = {
 
 /**
  *	wakeup_readers - wake up readers waiting on a channel
- *	@work: work struct that contains the the channel buffer
+ *	@work: work struct that contains the channel buffer
  *
  *	This is the work function used to defer reader waking.  The
  *	reason waking is deferred is that calling directly from write
diff --git a/kernel/wait.c b/kernel/wait.c
index 59a82f63275d..444ddbfaefc4 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);
  * The spin_unlock() itself is semi-permeable and only protects
  * one way (it only protects stuff inside the critical region and
  * stops them from bleeding out - it would still allow subsequent
- * loads to move into the the critical region).
+ * loads to move into the critical region).
  */
 void fastcall
 prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
diff --git a/mm/mmap.c b/mm/mmap.c
index cc1f543eb1b8..68b9ad2ef1d6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1720,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the the tail.
+ * either for the first part or the tail.
  */
 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	      unsigned long addr, int new_below)
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 9fbe87c93802..bfa910b6ad25 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1839,7 +1839,7 @@ static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *que
 }
 
 /*
- * The DECnet spec requires the the "routing layer" accepts packets which
+ * The DECnet spec requires that the "routing layer" accepts packets which
  * are at least 230 bytes in size. This excludes any headers which the NSP
  * layer might add, so we always assume that we'll be using the maximal
  * length header on data packets. The variation in length is due to the
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index e1f18489db1d..86a2b52aad38 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -629,7 +629,7 @@ doi_walk_return:
  * @domain: the domain to add
  *
  * Description:
- * Adds the @domain to the the DOI specified by @doi_def, this function
+ * Adds the @domain to the DOI specified by @doi_def, this function
  * should only be called by external functions (i.e. NetLabel).  This function
  * does allocate memory.  Returns zero on success, negative values on failure.
  *
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index ff366f7390d9..dd7c128f9db3 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -18,7 +18,7 @@
  * The SED algorithm attempts to minimize each job's expected delay until
  * completion. The expected delay that the job will experience is
  * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
- * jobs on the the ith server and Ui is the fixed service rate (weight) of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
  * the ith server. The SED algorithm adopts a greedy policy that each does
  * what is in its own best interest, i.e. to join the queue which would
  * minimize its expected delay of completion.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 113e0c4c8a92..66026df1cc76 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -983,7 +983,7 @@ int udp_disconnect(struct sock *sk, int flags)
 }
 
 /* return:
- * 	1  if the the UDP system should process it
+ * 	1  if the UDP system should process it
  *	0  if we should drop this packet
  * 	-1 if it should get processed by xfrm4_rcv_encap
  */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 7d9fa38b6a7d..6b8a103cf9e6 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -324,7 +324,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 		memset(&laddr, 0, sizeof(laddr));
 		memset(&daddr, 0, sizeof(daddr));
 		/*
-		 * FIXME: check if the the address is multicast,
+		 * FIXME: check if the address is multicast,
 		 * 	  only SOCK_DGRAM can do this.
 		 */
 		memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index c31af29a4439..117cbfdb910c 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -177,7 +177,7 @@ void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
 	struct nf_conntrack_expect *i;
 
 	write_lock_bh(&nf_conntrack_lock);
-	/* choose the the oldest expectation to evict */
+	/* choose the oldest expectation to evict */
 	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
 			nf_ct_unlink_expect(i);
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 83ef411772f4..77fb7b06a9c4 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -3,7 +3,7 @@
  *
  * This file is part of the SCTP kernel reference Implementation
  *
- * This file contains the code relating the the chunk abstraction.
+ * This file contains the code relating the chunk abstraction.
  *
  * The SCTP reference implementation is free software;
  * you can redistribute it and/or modify it under the terms of
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9f1a908776de..83a76ba9d7b3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2586,7 +2586,7 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, int opt
  *
  * 7.1.2 SCTP_ASSOCINFO
  *
- * This option is used to tune the the maximum retransmission attempts
+ * This option is used to tune the maximum retransmission attempts
  * of the association.
  * Returns an error if the new association retransmission value is
  * greater than the sum of the retransmission value  of the peer.
@@ -4547,7 +4547,7 @@ static int sctp_getsockopt_rtoinfo(struct sock *sk, int len,
  *
  * 7.1.2 SCTP_ASSOCINFO
  *
- * This option is used to tune the the maximum retransmission attempts
+ * This option is used to tune the maximum retransmission attempts
  * of the association.
  * Returns an error if the new association retransmission value is
  * greater than the sum of the retransmission value  of the peer.
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index f5c3808bf85a..af7c5f05c6e1 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -64,7 +64,7 @@ int svc_set_client(struct svc_rqst *rqstp)
 }
 
 /* A request, which was authenticated, has now executed.
- * Time to finalise the the credentials and verifier
+ * Time to finalise the credentials and verifier
  * and release and resources
  */
 int svc_authorise(struct svc_rqst *rqstp)
diff --git a/scripts/basic/docproc.c b/scripts/basic/docproc.c
index d6071cbf13d7..f4d2f68452ba 100644
--- a/scripts/basic/docproc.c
+++ b/scripts/basic/docproc.c
@@ -211,7 +211,7 @@ void find_export_symbols(char * filename)
  * Document all external or internal functions in a file.
  * Call kernel-doc with following parameters:
  * kernel-doc -docbook -nofunction function_name1 filename
- * function names are obtained from all the the src files
+ * function names are obtained from all the src files
  * by find_export_symbols.
  * intfunc uses -nofunction
  * extfunc uses -function
diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c
index a9eec2a2357d..3bfb2102fc5d 100644
--- a/sound/pci/ac97/ac97_codec.c
+++ b/sound/pci/ac97/ac97_codec.c
@@ -1083,7 +1083,7 @@ static void check_volume_resolution(struct snd_ac97 *ac97, int reg, unsigned cha
 		unsigned short val;
 		snd_ac97_write(ac97, reg, 0x8080 | cbit[i] | (cbit[i] << 8));
 		/* Do the read twice due to buffers on some ac97 codecs.
-		 * e.g. The STAC9704 returns exactly what you wrote the the register
+		 * e.g. The STAC9704 returns exactly what you wrote to the register
 		 * if you read it immediately. This causes the detect routine to fail.
 		 */
 		val = snd_ac97_read(ac97, reg);
-- 
cgit v1.2.3


From 5886269962f94fa9185c32db3ec936c612503235 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
Date: Wed, 9 May 2007 07:51:49 +0200
Subject: fix file specification in comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many files include the filename at the beginning, serveral used a wrong one.

Signed-off-by: Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 arch/arm/mach-s3c2410/sleep.S                     | 2 +-
 arch/arm/plat-s3c24xx/sleep.S                     | 2 +-
 arch/i386/kernel/cpu/mcheck/therm_throt.c         | 2 +-
 arch/powerpc/platforms/8xx/mpc86xads_setup.c      | 2 +-
 arch/powerpc/platforms/8xx/mpc885ads_setup.c      | 2 +-
 arch/ppc/platforms/mpc866ads_setup.c              | 2 +-
 arch/ppc/syslib/ipic.c                            | 2 +-
 arch/sh/boards/se/7751/setup.c                    | 2 +-
 arch/sh/kernel/cpu/sh3/entry.S                    | 2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh73180.c           | 2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7343.c            | 2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7770.c            | 2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c            | 2 +-
 arch/sh/kernel/vsyscall/vsyscall.c                | 2 +-
 arch/um/include/sysdep-i386/archsetjmp.h          | 2 +-
 arch/um/include/sysdep-x86_64/archsetjmp.h        | 2 +-
 arch/xtensa/kernel/pci-dma.c                      | 2 +-
 drivers/leds/leds-h1940.c                         | 2 +-
 drivers/video/console/softcursor.c                | 2 +-
 fs/jbd/checkpoint.c                               | 2 +-
 fs/jbd/recovery.c                                 | 2 +-
 fs/jbd/revoke.c                                   | 2 +-
 fs/jbd/transaction.c                              | 2 +-
 fs/jbd2/checkpoint.c                              | 2 +-
 fs/jbd2/recovery.c                                | 2 +-
 fs/jbd2/revoke.c                                  | 2 +-
 fs/jbd2/transaction.c                             | 2 +-
 include/asm-arm/arch-iop32x/glantank.h            | 2 +-
 include/asm-arm/arch-iop32x/n2100.h               | 2 +-
 include/asm-arm/arch-s3c2410/regs-power.h         | 2 +-
 include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h | 2 +-
 include/asm-arm/arch-s3c2410/regs-watchdog.h      | 2 +-
 include/asm-arm/arch-s3c2410/udc.h                | 2 +-
 include/asm-sh/edosk7705.h                        | 2 +-
 include/asm-sh/snapgear.h                         | 2 +-
 include/asm-xtensa/platform-iss/simcall.h         | 2 +-
 include/linux/generic_acl.h                       | 2 +-
 sound/soc/pxa/pxa2xx-ac97.h                       | 2 +-
 sound/soc/pxa/pxa2xx-i2s.h                        | 2 +-
 39 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-s3c2410/sleep.S b/arch/arm/mach-s3c2410/sleep.S
index 637aaba65390..d1eeed2ad47c 100644
--- a/arch/arm/mach-s3c2410/sleep.S
+++ b/arch/arm/mach-s3c2410/sleep.S
@@ -1,4 +1,4 @@
-/* linux/arch/arm/mach-s3c2410/s3c2410-sleep.S
+/* linux/arch/arm/mach-s3c2410/sleep.S
  *
  * Copyright (c) 2004 Simtec Electronics
  *	Ben Dooks <ben@simtec.co.uk>
diff --git a/arch/arm/plat-s3c24xx/sleep.S b/arch/arm/plat-s3c24xx/sleep.S
index 435349dc3243..7b7ae790b00d 100644
--- a/arch/arm/plat-s3c24xx/sleep.S
+++ b/arch/arm/plat-s3c24xx/sleep.S
@@ -1,4 +1,4 @@
-/* linux/arch/arm/mach-s3c2410/sleep.S
+/* linux/arch/arm/plat-s3c24xx/sleep.S
  *
  * Copyright (c) 2004 Simtec Electronics
  *	Ben Dooks <ben@simtec.co.uk>
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 065005c3f168..2f28540caae2 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -1,5 +1,5 @@
 /*
- * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c
+ * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c
  *
  * Thermal throttle event support code (such as syslog messaging and rate
  * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
diff --git a/arch/powerpc/platforms/8xx/mpc86xads_setup.c b/arch/powerpc/platforms/8xx/mpc86xads_setup.c
index a35315af5c53..cf0e7bc8c2e7 100644
--- a/arch/powerpc/platforms/8xx/mpc86xads_setup.c
+++ b/arch/powerpc/platforms/8xx/mpc86xads_setup.c
@@ -1,4 +1,4 @@
-/*arch/ppc/platforms/mpc86xads-setup.c
+/*arch/powerpc/platforms/8xx/mpc86xads_setup.c
  *
  * Platform setup for the Freescale mpc86xads board
  *
diff --git a/arch/powerpc/platforms/8xx/mpc885ads_setup.c b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
index a57b57785acd..c36e475d93dc 100644
--- a/arch/powerpc/platforms/8xx/mpc885ads_setup.c
+++ b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
@@ -1,4 +1,4 @@
-/*arch/ppc/platforms/mpc885ads-setup.c
+/*arch/powerpc/platforms/8xx/mpc885ads_setup.c
  *
  * Platform setup for the Freescale mpc885ads board
  *
diff --git a/arch/ppc/platforms/mpc866ads_setup.c b/arch/ppc/platforms/mpc866ads_setup.c
index 7ce5364fdb3b..bf72204125c5 100644
--- a/arch/ppc/platforms/mpc866ads_setup.c
+++ b/arch/ppc/platforms/mpc866ads_setup.c
@@ -1,4 +1,4 @@
-/*arch/ppc/platforms/mpc866ads-setup.c
+/*arch/ppc/platforms/mpc866ads_setup.c
  *
  * Platform setup for the Freescale mpc866ads board
  *
diff --git a/arch/ppc/syslib/ipic.c b/arch/ppc/syslib/ipic.c
index 10659c24b1be..9192777d0f78 100644
--- a/arch/ppc/syslib/ipic.c
+++ b/arch/ppc/syslib/ipic.c
@@ -1,5 +1,5 @@
 /*
- * include/asm-ppc/ipic.c
+ * arch/ppc/syslib/ipic.c
  *
  * IPIC routines implementations.
  *
diff --git a/arch/sh/boards/se/7751/setup.c b/arch/sh/boards/se/7751/setup.c
index 770defed9c4a..52c7bfa57c2c 100644
--- a/arch/sh/boards/se/7751/setup.c
+++ b/arch/sh/boards/se/7751/setup.c
@@ -1,5 +1,5 @@
 /*
- * linux/arch/sh/kernel/setup_7751se.c
+ * linux/arch/sh/boards/se/7751/setup.c
  *
  * Copyright (C) 2000  Kazumoto Kojima
  *
diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S
index f3e827f29a46..832c0b4a1e6c 100644
--- a/arch/sh/kernel/cpu/sh3/entry.S
+++ b/arch/sh/kernel/cpu/sh3/entry.S
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/entry.S
+ * arch/sh/kernel/cpu/sh3/entry.S
  *
  *  Copyright (C) 1999, 2000, 2002  Niibe Yutaka
  *  Copyright (C) 2003 - 2006  Paul Mundt
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh73180.c b/arch/sh/kernel/cpu/sh4a/clock-sh73180.c
index 2fa5cb2ae68d..6d5ba373a75e 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh73180.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh73180.c
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/cpu/sh4/clock-sh73180.c
+ * arch/sh/kernel/cpu/sh4a/clock-sh73180.c
  *
  * SH73180 support for the clock framework
  *
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
index 1707a213f0cf..7adc4f16e95a 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/cpu/sh4/clock-sh7343.c
+ * arch/sh/kernel/cpu/sh4a/clock-sh7343.c
  *
  * SH7343/SH7722 support for the clock framework
  *
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
index c8694bac6477..8e236062c721 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/cpu/sh4/clock-sh7770.c
+ * arch/sh/kernel/cpu/sh4a/clock-sh7770.c
  *
  * SH7770 support for the clock framework
  *
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 9e6a216750c8..01f3da619d3d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/cpu/sh4/clock-sh7780.c
+ * arch/sh/kernel/cpu/sh4a/clock-sh7780.c
  *
  * SH7780 support for the clock framework
  *
diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
index 7b0f66f03319..e146bafcd14f 100644
--- a/arch/sh/kernel/vsyscall/vsyscall.c
+++ b/arch/sh/kernel/vsyscall/vsyscall.c
@@ -1,5 +1,5 @@
 /*
- * arch/sh/kernel/vsyscall.c
+ * arch/sh/kernel/vsyscall/vsyscall.c
  *
  *  Copyright (C) 2006 Paul Mundt
  *
diff --git a/arch/um/include/sysdep-i386/archsetjmp.h b/arch/um/include/sysdep-i386/archsetjmp.h
index 11bafab669e9..0f312085ce1d 100644
--- a/arch/um/include/sysdep-i386/archsetjmp.h
+++ b/arch/um/include/sysdep-i386/archsetjmp.h
@@ -1,5 +1,5 @@
 /*
- * arch/i386/include/klibc/archsetjmp.h
+ * arch/um/include/sysdep-i386/archsetjmp.h
  */
 
 #ifndef _KLIBC_ARCHSETJMP_H
diff --git a/arch/um/include/sysdep-x86_64/archsetjmp.h b/arch/um/include/sysdep-x86_64/archsetjmp.h
index 9a5e1a6ec800..2af8f12ca161 100644
--- a/arch/um/include/sysdep-x86_64/archsetjmp.h
+++ b/arch/um/include/sysdep-x86_64/archsetjmp.h
@@ -1,5 +1,5 @@
 /*
- * arch/x86_64/include/klibc/archsetjmp.h
+ * arch/um/include/sysdep-x86_64/archsetjmp.h
  */
 
 #ifndef _KLIBC_ARCHSETJMP_H
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index ca76f071666e..f5319d78c876 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -1,5 +1,5 @@
 /*
- * arch/xtensa/pci-dma.c
+ * arch/xtensa/kernel/pci-dma.c
  *
  * DMA coherent memory allocation.
  *
diff --git a/drivers/leds/leds-h1940.c b/drivers/leds/leds-h1940.c
index 1d49d2ade557..677c99325be5 100644
--- a/drivers/leds/leds-h1940.c
+++ b/drivers/leds/leds-h1940.c
@@ -1,5 +1,5 @@
 /*
- * drivers/leds/h1940-leds.c
+ * drivers/leds/leds-h1940.c
  * Copyright (c) Arnaud Patard <arnaud.patard@rtp-net.org>
  *
  * This file is subject to the terms and conditions of the GNU General Public
diff --git a/drivers/video/console/softcursor.c b/drivers/video/console/softcursor.c
index f577bd80e020..03cfb7ac5733 100644
--- a/drivers/video/console/softcursor.c
+++ b/drivers/video/console/softcursor.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/video/softcursor.c
+ * linux/drivers/video/console/softcursor.c
  *
  * Generic software cursor for frame buffer devices
  *
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 0208cc7ac5d0..47552d4a6324 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/checkpoint.c
+ * linux/fs/jbd/checkpoint.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 11563fe2a52b..2a5f4b833e35 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/recovery.c
+ * linux/fs/jbd/recovery.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index a68cbb605022..824e3b7d4ec1 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/revoke.c
+ * linux/fs/jbd/revoke.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
  *
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index f9822fc07851..772b6531a2a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/transaction.c
+ * linux/fs/jbd/transaction.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 68039fa9a566..3fccde7ba008 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/checkpoint.c
+ * linux/fs/jbd2/checkpoint.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9f10acafaf70..395c92a04ac9 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/recovery.c
+ * linux/fs/jbd2/recovery.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 1e864dcc49ea..9246e763da78 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/revoke.c
+ * linux/fs/jbd2/revoke.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
  *
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e347d8c078bc..7946ff43fc40 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/transaction.c
+ * linux/fs/jbd2/transaction.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
diff --git a/include/asm-arm/arch-iop32x/glantank.h b/include/asm-arm/arch-iop32x/glantank.h
index 3b065618dd00..bf0665acc1c1 100644
--- a/include/asm-arm/arch-iop32x/glantank.h
+++ b/include/asm-arm/arch-iop32x/glantank.h
@@ -1,5 +1,5 @@
 /*
- * include/asm/arch-iop32x/glantank.h
+ * include/asm-arm/arch-iop32x/glantank.h
  *
  * IO-Data GLAN Tank board registers
  */
diff --git a/include/asm-arm/arch-iop32x/n2100.h b/include/asm-arm/arch-iop32x/n2100.h
index fed31a648425..77a8af476629 100644
--- a/include/asm-arm/arch-iop32x/n2100.h
+++ b/include/asm-arm/arch-iop32x/n2100.h
@@ -1,5 +1,5 @@
 /*
- * include/asm/arch-iop32x/n2100.h
+ * include/asm-arm/arch-iop32x/n2100.h
  *
  * Thecus N2100 board registers
  */
diff --git a/include/asm-arm/arch-s3c2410/regs-power.h b/include/asm-arm/arch-s3c2410/regs-power.h
index 6c319ea2afac..94ff96505b6a 100644
--- a/include/asm-arm/arch-s3c2410/regs-power.h
+++ b/include/asm-arm/arch-s3c2410/regs-power.h
@@ -1,4 +1,4 @@
-/* linux/include/asm/arch-s3c2410/regs-power.h
+/* linux/include/asm-arm/arch-s3c2410/regs-power.h
  *
  * Copyright (c) 2003,2004,2005,2006 Simtec Electronics <linux@simtec.co.uk>
  *		      http://armlinux.simtec.co.uk/
diff --git a/include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h b/include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h
index ff0536d2de42..cd9e26568f85 100644
--- a/include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h
+++ b/include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h
@@ -1,4 +1,4 @@
-/* linux/include/asm-arm/arch-s3c2410/regs-clock.h
+/* linux/include/asm-arm/arch-s3c2410/regs-s3c2443-clock.h
  *
  * Copyright (c) 2007 Simtec Electronics
  *	Ben Dooks <ben@simtec.co.uk>
diff --git a/include/asm-arm/arch-s3c2410/regs-watchdog.h b/include/asm-arm/arch-s3c2410/regs-watchdog.h
index f4fff448c7bd..a9c5d491bdb6 100644
--- a/include/asm-arm/arch-s3c2410/regs-watchdog.h
+++ b/include/asm-arm/arch-s3c2410/regs-watchdog.h
@@ -1,4 +1,4 @@
-/* linux/include/asm/arch-s3c2410/regs-watchdog.h
+/* linux/include/asm-arm/arch-s3c2410/regs-watchdog.h
  *
  * Copyright (c) 2003 Simtec Electronics <linux@simtec.co.uk>
  *		      http://www.simtec.co.uk/products/SWLINUX/
diff --git a/include/asm-arm/arch-s3c2410/udc.h b/include/asm-arm/arch-s3c2410/udc.h
index e59ec339d614..b8aa6cb69b58 100644
--- a/include/asm-arm/arch-s3c2410/udc.h
+++ b/include/asm-arm/arch-s3c2410/udc.h
@@ -1,4 +1,4 @@
-/* linux/include/asm/arch-s3c2410/udc.h
+/* linux/include/asm-arm/arch-s3c2410/udc.h
  *
  * Copyright (c) 2005 Arnaud Patard <arnaud.patard@rtp-net.org>
  *
diff --git a/include/asm-sh/edosk7705.h b/include/asm-sh/edosk7705.h
index a1089a65bc36..5bdc9d9be3de 100644
--- a/include/asm-sh/edosk7705.h
+++ b/include/asm-sh/edosk7705.h
@@ -1,5 +1,5 @@
 /*
- * include/asm-sh/edosk7705/io.h
+ * include/asm-sh/edosk7705.h
  *
  * Modified version of io_se.h for the EDOSK7705 specific functions.
  *
diff --git a/include/asm-sh/snapgear.h b/include/asm-sh/snapgear.h
index 6b5e4ddc073a..2d712e72c9e5 100644
--- a/include/asm-sh/snapgear.h
+++ b/include/asm-sh/snapgear.h
@@ -1,5 +1,5 @@
 /*
- * include/asm-sh/snapgear/io.h
+ * include/asm-sh/snapgear.h
  *
  * Modified version of io_se.h for the snapgear-specific functions.
  *
diff --git a/include/asm-xtensa/platform-iss/simcall.h b/include/asm-xtensa/platform-iss/simcall.h
index 6acb572759a6..b7952c06a2b7 100644
--- a/include/asm-xtensa/platform-iss/simcall.h
+++ b/include/asm-xtensa/platform-iss/simcall.h
@@ -1,5 +1,5 @@
 /*
- * include/asm-xtensa/platform-iss/hardware.h
+ * include/asm-xtensa/platform-iss/simcall.h
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h
index 80764f40be75..886f5faa08cb 100644
--- a/include/linux/generic_acl.h
+++ b/include/linux/generic_acl.h
@@ -1,5 +1,5 @@
 /*
- * fs/generic_acl.c
+ * include/linux/generic_acl.h
  *
  * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
  *
diff --git a/sound/soc/pxa/pxa2xx-ac97.h b/sound/soc/pxa/pxa2xx-ac97.h
index 4c4b882316ac..b8ccfee095c4 100644
--- a/sound/soc/pxa/pxa2xx-ac97.h
+++ b/sound/soc/pxa/pxa2xx-ac97.h
@@ -1,5 +1,5 @@
 /*
- * linux/sound/arm/pxa2xx-ac97.h
+ * linux/sound/soc/pxa/pxa2xx-ac97.h
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/sound/soc/pxa/pxa2xx-i2s.h b/sound/soc/pxa/pxa2xx-i2s.h
index a2484f0881f1..4435bd9f884f 100644
--- a/sound/soc/pxa/pxa2xx-i2s.h
+++ b/sound/soc/pxa/pxa2xx-i2s.h
@@ -1,5 +1,5 @@
 /*
- * linux/sound/arm/pxa2xx-i2s.h
+ * linux/sound/soc/pxa/pxa2xx-i2s.h
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
-- 
cgit v1.2.3


From 121e70b69aef898a3c02fa90d0a2108381bcf975 Mon Sep 17 00:00:00 2001
From: John Anthony Kazos Jr <jakj@j-a-k-j.com>
Date: Wed, 9 May 2007 08:30:57 +0200
Subject: include files: convert "include" subdirectory to UTF-8

Convert the "include" subdirectory to UTF-8.

Signed-off-by: John Anthony Kazos Jr. <jakj@j-a-k-j.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 include/asm-arm/arch-imx/imx-regs.h        | 2 +-
 include/asm-arm/arch-integrator/platform.h | 2 +-
 include/asm-arm/arch-omap/board.h          | 2 +-
 include/asm-arm/arch-omap/dma.h            | 2 +-
 include/asm-arm/arch-omap/gpio.h           | 2 +-
 include/asm-m68k/atarihw.h                 | 2 +-
 include/asm-m68k/atariints.h               | 2 +-
 include/asm-ppc/hydra.h                    | 2 +-
 include/linux/i2c-algo-bit.h               | 2 +-
 include/linux/i2c-algo-pcf.h               | 2 +-
 include/linux/irda.h                       | 2 +-
 include/linux/meye.h                       | 2 +-
 include/linux/sonypi.h                     | 2 +-
 include/net/irda/af_irda.h                 | 2 +-
 include/net/irda/irda.h                    | 2 +-
 include/net/irda/iriap.h                   | 2 +-
 include/net/irda/iriap_event.h             | 2 +-
 include/net/irda/irias_object.h            | 2 +-
 include/net/irda/irlan_client.h            | 2 +-
 include/net/irda/irlan_common.h            | 2 +-
 include/net/irda/irlan_eth.h               | 2 +-
 include/net/irda/irlan_event.h             | 2 +-
 include/net/irda/irlan_filter.h            | 2 +-
 include/net/irda/irlan_provider.h          | 2 +-
 include/net/irda/irlap.h                   | 2 +-
 include/net/irda/irlmp.h                   | 2 +-
 include/net/irda/irlmp_event.h             | 2 +-
 include/net/irda/irlmp_frame.h             | 2 +-
 include/net/irda/irmod.h                   | 2 +-
 include/net/irda/irqueue.h                 | 2 +-
 include/net/irda/irttp.h                   | 2 +-
 include/net/irda/parameters.h              | 2 +-
 include/net/irda/timer.h                   | 2 +-
 include/net/irda/wrapper.h                 | 2 +-
 34 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-arm/arch-imx/imx-regs.h b/include/asm-arm/arch-imx/imx-regs.h
index de6494a4dc6b..30de404c61f5 100644
--- a/include/asm-arm/arch-imx/imx-regs.h
+++ b/include/asm-arm/arch-imx/imx-regs.h
@@ -297,7 +297,7 @@
 #define SAR(x)  __REG2( IMX_DMAC_BASE + 0x80, (x) << 6)	/* Source Address Registers */
 #define DAR(x)  __REG2( IMX_DMAC_BASE + 0x84, (x) << 6)	/* Destination Address Registers */
 #define CNTR(x) __REG2( IMX_DMAC_BASE + 0x88, (x) << 6)	/* Count Registers */
-#define CCR(x)  __REG2( IMX_DMAC_BASE + 0x8c, (x) << 6)	/*�Control Registers */
+#define CCR(x)  __REG2( IMX_DMAC_BASE + 0x8c, (x) << 6)	/* Control Registers */
 #define RSSR(x) __REG2( IMX_DMAC_BASE + 0x90, (x) << 6)	/* Request source select Registers */
 #define BLR(x)  __REG2( IMX_DMAC_BASE + 0x94, (x) << 6)	/* Burst length Registers */
 #define RTOR(x) __REG2( IMX_DMAC_BASE + 0x98, (x) << 6)	/* Request timeout Registers */
diff --git a/include/asm-arm/arch-integrator/platform.h b/include/asm-arm/arch-integrator/platform.h
index 96ad3d2a66d1..83c4c1ceb411 100644
--- a/include/asm-arm/arch-integrator/platform.h
+++ b/include/asm-arm/arch-integrator/platform.h
@@ -17,7 +17,7 @@
  *                 from .s file by awk -f s2h.awk
  */
 /**************************************************************************
- * * Copyright � ARM Limited 1998.  All rights reserved.
+ * * Copyright © ARM Limited 1998.  All rights reserved.
  * ***********************************************************************/
 /* ************************************************************************
  *
diff --git a/include/asm-arm/arch-omap/board.h b/include/asm-arm/arch-omap/board.h
index edf1dc6ad919..4fd717e626a2 100644
--- a/include/asm-arm/arch-omap/board.h
+++ b/include/asm-arm/arch-omap/board.h
@@ -4,7 +4,7 @@
  *  Information structures for board-specific data
  *
  *  Copyright (C) 2004	Nokia Corporation
- *  Written by Juha Yrj�l� <juha.yrjola@nokia.com>
+ *  Written by Juha Yrjölä <juha.yrjola@nokia.com>
  */
 
 #ifndef _OMAP_BOARD_H
diff --git a/include/asm-arm/arch-omap/dma.h b/include/asm-arm/arch-omap/dma.h
index d591d0585bba..f7774192a41e 100644
--- a/include/asm-arm/arch-omap/dma.h
+++ b/include/asm-arm/arch-omap/dma.h
@@ -2,7 +2,7 @@
  *  linux/include/asm-arm/arch-omap/dma.h
  *
  *  Copyright (C) 2003 Nokia Corporation
- *  Author: Juha Yrj�l� <juha.yrjola@nokia.com>
+ *  Author: Juha Yrjölä <juha.yrjola@nokia.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/include/asm-arm/arch-omap/gpio.h b/include/asm-arm/arch-omap/gpio.h
index 590917efc94a..97b397dd7e87 100644
--- a/include/asm-arm/arch-omap/gpio.h
+++ b/include/asm-arm/arch-omap/gpio.h
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2003-2005 Nokia Corporation
  *
- * Written by Juha Yrj�l� <juha.yrjola@nokia.com>
+ * Written by Juha Yrjölä <juha.yrjola@nokia.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/include/asm-m68k/atarihw.h b/include/asm-m68k/atarihw.h
index f28acd0fd689..6211363a345f 100644
--- a/include/asm-m68k/atarihw.h
+++ b/include/asm-m68k/atarihw.h
@@ -2,7 +2,7 @@
 ** linux/atarihw.h -- This header defines some macros and pointers for
 **                    the various Atari custom hardware registers.
 **
-** Copyright 1994 by Bj�rn Brauel
+** Copyright 1994 by Bjrn Brauel
 **
 ** 5/1/94 Roman Hodek:
 **   Added definitions for TT specific chips.
diff --git a/include/asm-m68k/atariints.h b/include/asm-m68k/atariints.h
index 0ed454fc24bb..ce6c445805bd 100644
--- a/include/asm-m68k/atariints.h
+++ b/include/asm-m68k/atariints.h
@@ -1,7 +1,7 @@
 /*
 ** atariints.h -- Atari Linux interrupt handling structs and prototypes
 **
-** Copyright 1994 by Bj�rn Brauel
+** Copyright 1994 by Bjrn Brauel
 **
 ** 5/2/94 Roman Hodek:
 **   TT interrupt definitions added.
diff --git a/include/asm-ppc/hydra.h b/include/asm-ppc/hydra.h
index 833a8aff2a80..1ad4eed07fbe 100644
--- a/include/asm-ppc/hydra.h
+++ b/include/asm-ppc/hydra.h
@@ -8,7 +8,7 @@
  *	Macintosh Technology in the Common Hardware Reference Platform
  *	Apple Computer, Inc.
  *
- *	� Copyright 1995 Apple Computer, Inc. All rights reserved.
+ *	© Copyright 1995 Apple Computer, Inc. All rights reserved.
  *
  *  It's available online from http://chrp.apple.com/MacTech.pdf.
  *  You can obtain paper copies of this book from computer bookstores or by
diff --git a/include/linux/i2c-algo-bit.h b/include/linux/i2c-algo-bit.h
index 9ee0f800592f..111334f5b922 100644
--- a/include/linux/i2c-algo-bit.h
+++ b/include/linux/i2c-algo-bit.h
@@ -18,7 +18,7 @@
     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                */
 /* ------------------------------------------------------------------------- */
 
-/* With some changes from Ky�sti M�lkki <kmalkki@cc.hut.fi> and even
+/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and even
    Frodo Looijaard <frodol@dds.nl> */
 
 #ifndef _LINUX_I2C_ALGO_BIT_H
diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h
index 994eb86f882c..77afbb60fd11 100644
--- a/include/linux/i2c-algo-pcf.h
+++ b/include/linux/i2c-algo-pcf.h
@@ -19,7 +19,7 @@
     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                */
 /* ------------------------------------------------------------------------- */
 
-/* With some changes from Ky�sti M�lkki <kmalkki@cc.hut.fi> and even
+/* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and even
    Frodo Looijaard <frodol@dds.nl> */
 
 #ifndef _LINUX_I2C_ALGO_PCF_H
diff --git a/include/linux/irda.h b/include/linux/irda.h
index 09d8f105a5a8..945ba3110874 100644
--- a/include/linux/irda.h
+++ b/include/linux/irda.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/linux/meye.h b/include/linux/meye.h
index 11ec45e9a132..39fd9c8ddd4b 100644
--- a/include/linux/meye.h
+++ b/include/linux/meye.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2001-2003 Stelian Pop <stelian@popies.net>
  *
- * Copyright (C) 2001-2002 Alc�ve <www.alcove.com>
+ * Copyright (C) 2001-2002 Alcôve <www.alcove.com>
  *
  * Copyright (C) 2000 Andrew Tridgell <tridge@valinux.com>
  *
diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h
index f56d24734950..34d4b075f7b8 100644
--- a/include/linux/sonypi.h
+++ b/include/linux/sonypi.h
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2005 Narayanan R S <nars@kadamba.org>
 
- * Copyright (C) 2001-2002 Alc�ve <www.alcove.com>
+ * Copyright (C) 2001-2002 Alcôve <www.alcove.com>
  *
  * Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
  *
diff --git a/include/net/irda/af_irda.h b/include/net/irda/af_irda.h
index 7a209f61c482..0df574931522 100644
--- a/include/net/irda/af_irda.h
+++ b/include/net/irda/af_irda.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/irda.h b/include/net/irda/irda.h
index 89fe534045f1..36bee441aa56 100644
--- a/include/net/irda/irda.h
+++ b/include/net/irda/irda.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/iriap.h b/include/net/irda/iriap.h
index 2007c5a0a43f..fcc896491a95 100644
--- a/include/net/irda/iriap.h
+++ b/include/net/irda/iriap.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/iriap_event.h b/include/net/irda/iriap_event.h
index 4ca3d2071b03..89747f06d9eb 100644
--- a/include/net/irda/iriap_event.h
+++ b/include/net/irda/iriap_event.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irias_object.h b/include/net/irda/irias_object.h
index c41196b87955..83f78081799c 100644
--- a/include/net/irda/irias_object.h
+++ b/include/net/irda/irias_object.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/irlan_client.h b/include/net/irda/irlan_client.h
index 736dabe211e3..fa8455eda280 100644
--- a/include/net/irda/irlan_client.h
+++ b/include/net/irda/irlan_client.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlan_common.h b/include/net/irda/irlan_common.h
index 9592c374b41d..73cacb3ac16c 100644
--- a/include/net/irda/irlan_common.h
+++ b/include/net/irda/irlan_common.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlan_eth.h b/include/net/irda/irlan_eth.h
index 9a9b3619d305..0062347600b9 100644
--- a/include/net/irda/irlan_eth.h
+++ b/include/net/irda/irlan_eth.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/irlan_event.h b/include/net/irda/irlan_event.h
index b9baac9eb8b6..6d9539f05806 100644
--- a/include/net/irda/irlan_event.h
+++ b/include/net/irda/irlan_event.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlan_filter.h b/include/net/irda/irlan_filter.h
index 1720539ac2c1..a5a2539485bd 100644
--- a/include/net/irda/irlan_filter.h
+++ b/include/net/irda/irlan_filter.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/irlan_provider.h b/include/net/irda/irlan_provider.h
index ca51d5b7c999..92f3b0e1029b 100644
--- a/include/net/irda/irlan_provider.h
+++ b/include/net/irda/irlan_provider.h
@@ -16,7 +16,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlap.h b/include/net/irda/irlap.h
index e77eb88d9226..f0248fb8e196 100644
--- a/include/net/irda/irlap.h
+++ b/include/net/irda/irlap.h
@@ -18,7 +18,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlmp.h b/include/net/irda/irlmp.h
index e212b9bc2503..3ffc1d0f93d6 100644
--- a/include/net/irda/irlmp.h
+++ b/include/net/irda/irlmp.h
@@ -18,7 +18,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlmp_event.h b/include/net/irda/irlmp_event.h
index 03c6f81a502a..e03ae4ae3963 100644
--- a/include/net/irda/irlmp_event.h
+++ b/include/net/irda/irlmp_event.h
@@ -18,7 +18,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irlmp_frame.h b/include/net/irda/irlmp_frame.h
index c463f8bca856..1906eb71422e 100644
--- a/include/net/irda/irlmp_frame.h
+++ b/include/net/irda/irlmp_frame.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/irmod.h b/include/net/irda/irmod.h
index 72b446c1e22c..86f0dbb8ee5d 100644
--- a/include/net/irda/irmod.h
+++ b/include/net/irda/irmod.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charg.
  *     
diff --git a/include/net/irda/irqueue.h b/include/net/irda/irqueue.h
index 335b0ace9665..37f512bd6733 100644
--- a/include/net/irda/irqueue.h
+++ b/include/net/irda/irqueue.h
@@ -21,7 +21,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *  
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *     
diff --git a/include/net/irda/irttp.h b/include/net/irda/irttp.h
index a899e5837be8..cf80c1af5854 100644
--- a/include/net/irda/irttp.h
+++ b/include/net/irda/irttp.h
@@ -18,7 +18,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/parameters.h b/include/net/irda/parameters.h
index 3a605d37ddbf..c0d938847bd3 100644
--- a/include/net/irda/parameters.h
+++ b/include/net/irda/parameters.h
@@ -26,7 +26,7 @@
  *     Foundation, Inc., 59 Temple Place, Suite 330, Boston, 
  *     MA 02111-1307 USA
  *
- *     Michel D�nzer <daenzer@debian.org>, 10/2001
+ *     Michel Dänzer <daenzer@debian.org>, 10/2001
  *     - simplify irda_pv_t to avoid endianness issues
  *     
  ********************************************************************/
diff --git a/include/net/irda/timer.h b/include/net/irda/timer.h
index cb61568547d1..cb2615ccf761 100644
--- a/include/net/irda/timer.h
+++ b/include/net/irda/timer.h
@@ -18,7 +18,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
diff --git a/include/net/irda/wrapper.h b/include/net/irda/wrapper.h
index 98768b3f9e31..2942ad6ab932 100644
--- a/include/net/irda/wrapper.h
+++ b/include/net/irda/wrapper.h
@@ -17,7 +17,7 @@
  *     published by the Free Software Foundation; either version 2 of 
  *     the License, or (at your option) any later version.
  *
- *     Neither Dag Brattli nor University of Troms� admit liability nor
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
  *     provide warranty for any of this software. This material is 
  *     provided "AS-IS" and at no charge.
  *
-- 
cgit v1.2.3


From 36200b76008d52d16b170d4f7dae9cfe00f5eb2b Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Thu, 3 May 2007 15:58:49 -0400
Subject: [MTD] Remove unnecessary user space check from mtd.h.

Since the header file include/linux/mtd/mtd.h is not exported to user
space, remove the user space check and error.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/mtd/mtd.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 45d482ce8397..12a9a18f6e16 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -9,10 +9,6 @@
 #ifndef __MTD_MTD_H__
 #define __MTD_MTD_H__
 
-#ifndef __KERNEL__
-#error This is a kernel header. Perhaps include mtd-user.h instead?
-#endif
-
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/uio.h>
-- 
cgit v1.2.3


From 42f209d3c94516affeb5e578fae62925f531a2d9 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Fri, 4 May 2007 15:49:38 -0400
Subject: [MTD] Delete allegedly obsolete "bank_size" field of mtd_info.

Delete the allegedly obsolete "bank_size" member of struct mtd_info.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 drivers/mtd/mtdpart.c   | 1 -
 include/linux/mtd/mtd.h | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 1af989023c66..9c6236852942 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -347,7 +347,6 @@ int add_mtd_partitions(struct mtd_info *master,
 		slave->mtd.subpage_sft = master->subpage_sft;
 
 		slave->mtd.name = parts[i].name;
-		slave->mtd.bank_size = master->bank_size;
 		slave->mtd.owner = master->owner;
 
 		slave->mtd.read = part_read;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 12a9a18f6e16..fd64ccfbce02 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -133,9 +133,6 @@ struct mtd_info {
 	int numeraseregions;
 	struct mtd_erase_region_info *eraseregions;
 
-	/* This really shouldn't be here. It can go away in 2.5 */
-	u_int32_t bank_size;
-
 	int (*erase) (struct mtd_info *mtd, struct erase_info *instr);
 
 	/* This stuff for eXecute-In-Place */
-- 
cgit v1.2.3


From 97416ce82e20a9511ec369822098a8d20998398a Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 9 May 2007 02:32:35 -0700
Subject: Declare {compat_}sys_utimensat

This is needed before Powerpc can wire up the syscall.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h   | 3 +++
 include/linux/syscalls.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index ccd863dd77fa..70a157a130bb 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -253,5 +253,8 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 			const compat_sigset_t __user *sigmask,
 			compat_size_t sigsetsize);
 
+asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
+				struct compat_timespec __user *t, int flags);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6cbef55..3139f4412297 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -576,6 +576,8 @@ asmlinkage long sys_fstatat64(int dfd, char __user *filename,
 			       struct stat64 __user *statbuf, int flag);
 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
 			       int bufsiz);
+asmlinkage long sys_utimensat(int dfd, char __user *filename,
+				struct timespec __user *utimes, int flags);
 asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
 				     struct compat_timeval __user *t);
 asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
-- 
cgit v1.2.3


From a3d25c275d383975504dc53c25b691df59bd3c48 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 9 May 2007 02:33:18 -0700
Subject: PM: Separate hibernation code from suspend code

[ With Johannes Berg <johannes@sipsolutions.net> ]

Separate the hibernation (aka suspend to disk code) from the other suspend
code.  In particular:

 * Remove the definitions related to hibernation from include/linux/pm.h
 * Introduce struct hibernation_ops and a new hibernate() function to hibernate
   the system, defined in include/linux/suspend.h
 * Separate suspend code in kernel/power/main.c from hibernation-related code
   in kernel/power/disk.c and kernel/power/user.c (with the help of
   hibernation_ops)
 * Switch ACPI (the only user of pm_ops.pm_disk_mode) to hibernation_ops

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Greg KH <greg@kroah.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/power/userland-swsusp.txt |  26 +++--
 drivers/acpi/sleep/main.c               |  67 ++++++++---
 drivers/acpi/sleep/proc.c               |   2 +-
 drivers/i2c/chips/tps65010.c            |   2 +-
 include/linux/pm.h                      |  31 +----
 include/linux/suspend.h                 |  24 ++++
 kernel/power/disk.c                     | 195 ++++++++++++++++++--------------
 kernel/power/main.c                     |  42 +++----
 kernel/power/power.h                    |   7 +-
 kernel/power/user.c                     |  13 +--
 kernel/sys.c                            |   2 +-
 11 files changed, 226 insertions(+), 185 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 000556c932e9..e00c6cf09e85 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -93,21 +93,23 @@ SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
 	to resume the system from RAM if there's enough battery power or restore
 	its state on the basis of the saved suspend image otherwise)
 
-SNAPSHOT_PMOPS - enable the usage of the pmops->prepare, pmops->enter and
-	pmops->finish methods (the in-kernel swsusp knows these as the "platform
-	method") which are needed on many machines to (among others) speed up
-	the resume by letting the BIOS skip some steps or to let the system
-	recognise the correct state of the hardware after the resume (in
-	particular on many machines this ensures that unplugged AC
-	adapters get correctly detected and that kacpid does not run wild after
-	the resume).  The last ioctl() argument can take one of the three
-	values, defined in kernel/power/power.h:
+SNAPSHOT_PMOPS - enable the usage of the hibernation_ops->prepare,
+	hibernate_ops->enter and hibernation_ops->finish methods (the in-kernel
+	swsusp knows these as the "platform method") which are needed on many
+	machines to (among others) speed up the resume by letting the BIOS skip
+	some steps or to let the system recognise the correct state of the
+	hardware after the resume (in particular on many machines this ensures
+	that unplugged AC adapters get correctly detected and that kacpid does
+	not run wild after the resume).  The last ioctl() argument can take one
+	of the three values, defined in kernel/power/power.h:
 	PMOPS_PREPARE - make the kernel carry out the
-		pm_ops->prepare(PM_SUSPEND_DISK) operation
+		hibernation_ops->prepare() operation
 	PMOPS_ENTER - make the kernel power off the system by calling
-		pm_ops->enter(PM_SUSPEND_DISK)
+		hibernation_ops->enter()
 	PMOPS_FINISH - make the kernel carry out the
-		pm_ops->finish(PM_SUSPEND_DISK) operation
+		hibernation_ops->finish() operation
+	Note that the actual constants are misnamed because they surface
+	internal kernel implementation details that have changed.
 
 The device's read() operation can be used to transfer the snapshot image from
 the kernel.  It has the following limitations:
diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index f8c63410bcbf..52b23471dd69 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -29,7 +29,6 @@ static u32 acpi_suspend_states[] = {
 	[PM_SUSPEND_ON] = ACPI_STATE_S0,
 	[PM_SUSPEND_STANDBY] = ACPI_STATE_S1,
 	[PM_SUSPEND_MEM] = ACPI_STATE_S3,
-	[PM_SUSPEND_DISK] = ACPI_STATE_S4,
 	[PM_SUSPEND_MAX] = ACPI_STATE_S5
 };
 
@@ -94,14 +93,6 @@ static int acpi_pm_enter(suspend_state_t pm_state)
 		do_suspend_lowlevel();
 		break;
 
-	case PM_SUSPEND_DISK:
-		if (acpi_pm_ops.pm_disk_mode == PM_DISK_PLATFORM)
-			status = acpi_enter_sleep_state(acpi_state);
-		break;
-	case PM_SUSPEND_MAX:
-		acpi_power_off();
-		break;
-
 	default:
 		return -EINVAL;
 	}
@@ -157,12 +148,13 @@ int acpi_suspend(u32 acpi_state)
 	suspend_state_t states[] = {
 		[1] = PM_SUSPEND_STANDBY,
 		[3] = PM_SUSPEND_MEM,
-		[4] = PM_SUSPEND_DISK,
 		[5] = PM_SUSPEND_MAX
 	};
 
 	if (acpi_state < 6 && states[acpi_state])
 		return pm_suspend(states[acpi_state]);
+	if (acpi_state == 4)
+		return hibernate();
 	return -EINVAL;
 }
 
@@ -189,6 +181,49 @@ static struct pm_ops acpi_pm_ops = {
 	.finish = acpi_pm_finish,
 };
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+static int acpi_hibernation_prepare(void)
+{
+	return acpi_sleep_prepare(ACPI_STATE_S4);
+}
+
+static int acpi_hibernation_enter(void)
+{
+	acpi_status status = AE_OK;
+	unsigned long flags = 0;
+
+	ACPI_FLUSH_CPU_CACHE();
+
+	local_irq_save(flags);
+	acpi_enable_wakeup_device(ACPI_STATE_S4);
+	/* This shouldn't return.  If it returns, we have a problem */
+	status = acpi_enter_sleep_state(ACPI_STATE_S4);
+	local_irq_restore(flags);
+
+	return ACPI_SUCCESS(status) ? 0 : -EFAULT;
+}
+
+static void acpi_hibernation_finish(void)
+{
+	acpi_leave_sleep_state(ACPI_STATE_S4);
+	acpi_disable_wakeup_device(ACPI_STATE_S4);
+
+	/* reset firmware waking vector */
+	acpi_set_firmware_waking_vector((acpi_physical_address) 0);
+
+	if (init_8259A_after_S1) {
+		printk("Broken toshiba laptop -> kicking interrupts\n");
+		init_8259A(0);
+	}
+}
+
+static struct hibernation_ops acpi_hibernation_ops = {
+	.prepare = acpi_hibernation_prepare,
+	.enter = acpi_hibernation_enter,
+	.finish = acpi_hibernation_finish,
+};
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
 /*
  * Toshiba fails to preserve interrupts over S1, reinitialization
  * of 8259 is needed after S1 resume.
@@ -227,14 +262,18 @@ int __init acpi_sleep_init(void)
 			sleep_states[i] = 1;
 			printk(" S%d", i);
 		}
-		if (i == ACPI_STATE_S4) {
-			if (sleep_states[i])
-				acpi_pm_ops.pm_disk_mode = PM_DISK_PLATFORM;
-		}
 	}
 	printk(")\n");
 
 	pm_set_ops(&acpi_pm_ops);
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+	if (sleep_states[ACPI_STATE_S4])
+		hibernation_set_ops(&acpi_hibernation_ops);
+#else
+	sleep_states[ACPI_STATE_S4] = 0;
+#endif
+
 	return 0;
 }
 
diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c
index 5a76e5be61d5..76b45f0b8341 100644
--- a/drivers/acpi/sleep/proc.c
+++ b/drivers/acpi/sleep/proc.c
@@ -60,7 +60,7 @@ acpi_system_write_sleep(struct file *file,
 	state = simple_strtoul(str, NULL, 0);
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	if (state == 4) {
-		error = pm_suspend(PM_SUSPEND_DISK);
+		error = hibernate();
 		goto Done;
 	}
 #endif
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 7ed92dc3d833..3c3f2ebf3fc9 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -354,7 +354,7 @@ static void tps65010_interrupt(struct tps65010 *tps)
 			 * also needs to get error handling and probably
 			 * an #ifdef CONFIG_SOFTWARE_SUSPEND
 			 */
-			pm_suspend(PM_SUSPEND_DISK);
+			hibernate();
 #endif
 			poll = 1;
 		}
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 6e8fa3049e5d..87545e0f0b58 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -107,26 +107,11 @@ typedef int __bitwise suspend_state_t;
 #define PM_SUSPEND_ON		((__force suspend_state_t) 0)
 #define PM_SUSPEND_STANDBY	((__force suspend_state_t) 1)
 #define PM_SUSPEND_MEM		((__force suspend_state_t) 3)
-#define PM_SUSPEND_DISK		((__force suspend_state_t) 4)
-#define PM_SUSPEND_MAX		((__force suspend_state_t) 5)
-
-typedef int __bitwise suspend_disk_method_t;
-
-/* invalid must be 0 so struct pm_ops initialisers can leave it out */
-#define PM_DISK_INVALID		((__force suspend_disk_method_t) 0)
-#define	PM_DISK_PLATFORM	((__force suspend_disk_method_t) 1)
-#define	PM_DISK_SHUTDOWN	((__force suspend_disk_method_t) 2)
-#define	PM_DISK_REBOOT		((__force suspend_disk_method_t) 3)
-#define	PM_DISK_TEST		((__force suspend_disk_method_t) 4)
-#define	PM_DISK_TESTPROC	((__force suspend_disk_method_t) 5)
-#define	PM_DISK_MAX		((__force suspend_disk_method_t) 6)
+#define PM_SUSPEND_MAX		((__force suspend_state_t) 4)
 
 /**
  * struct pm_ops - Callbacks for managing platform dependent suspend states.
  * @valid: Callback to determine whether the given state can be entered.
- * 	If %CONFIG_SOFTWARE_SUSPEND is set then %PM_SUSPEND_DISK is
- *	always valid and never passed to this call. If not assigned,
- *	no suspend states are valid.
  *	Valid states are advertised in /sys/power/state but can still
  *	be rejected by prepare or enter if the conditions aren't right.
  *	There is a %pm_valid_only_mem function available that can be assigned
@@ -140,24 +125,12 @@ typedef int __bitwise suspend_disk_method_t;
  *
  * @finish: Called when the system has left the given state and all devices
  *	are resumed. The return value is ignored.
- *
- * @pm_disk_mode: The generic code always allows one of the shutdown methods
- *	%PM_DISK_SHUTDOWN, %PM_DISK_REBOOT, %PM_DISK_TEST and
- *	%PM_DISK_TESTPROC. If this variable is set, the mode it is set
- *	to is allowed in addition to those modes and is also made default.
- *	When this mode is sent selected, the @prepare call will be called
- *	before suspending to disk (if present), the @enter call should be
- *	present and will be called after all state has been saved and the
- *	machine is ready to be powered off; the @finish callback is called
- *	after state has been restored. All these calls are called with
- *	%PM_SUSPEND_DISK as the state.
  */
 struct pm_ops {
 	int (*valid)(suspend_state_t state);
 	int (*prepare)(suspend_state_t state);
 	int (*enter)(suspend_state_t state);
 	int (*finish)(suspend_state_t state);
-	suspend_disk_method_t pm_disk_mode;
 };
 
 /**
@@ -276,8 +249,6 @@ extern void device_power_up(void);
 extern void device_resume(void);
 
 #ifdef CONFIG_PM
-extern suspend_disk_method_t pm_disk_mode;
-
 extern int device_suspend(pm_message_t state);
 extern int device_prepare_suspend(pm_message_t state);
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 9d2aa1a12aa0..d74da9122b60 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -32,6 +32,24 @@ static inline int pm_prepare_console(void) { return 0; }
 static inline void pm_restore_console(void) {}
 #endif
 
+/**
+ * struct hibernation_ops - hibernation platform support
+ *
+ * The methods in this structure allow a platform to override the default
+ * mechanism of shutting down the machine during a hibernation transition.
+ *
+ * All three methods must be assigned.
+ *
+ * @prepare: prepare system for hibernation
+ * @enter: shut down system after state has been saved to disk
+ * @finish: finish/clean up after state has been reloaded
+ */
+struct hibernation_ops {
+	int (*prepare)(void);
+	int (*enter)(void);
+	void (*finish)(void);
+};
+
 #if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
 /* kernel/power/snapshot.c */
 extern void __init register_nosave_region(unsigned long, unsigned long);
@@ -39,11 +57,17 @@ extern int swsusp_page_is_forbidden(struct page *);
 extern void swsusp_set_page_free(struct page *);
 extern void swsusp_unset_page_free(struct page *);
 extern unsigned long get_safe_page(gfp_t gfp_mask);
+
+extern void hibernation_set_ops(struct hibernation_ops *ops);
+extern int hibernate(void);
 #else
 static inline void register_nosave_region(unsigned long b, unsigned long e) {}
 static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
 static inline void swsusp_set_page_free(struct page *p) {}
 static inline void swsusp_unset_page_free(struct page *p) {}
+
+static inline void hibernation_set_ops(struct hibernation_ops *ops) {}
+static inline int hibernate(void) { return -ENOSYS; }
 #endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */
 
 void save_processor_state(void);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d862..b5f0543ed84d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 
+enum {
+	HIBERNATION_INVALID,
+	HIBERNATION_PLATFORM,
+	HIBERNATION_TEST,
+	HIBERNATION_TESTPROC,
+	HIBERNATION_SHUTDOWN,
+	HIBERNATION_REBOOT,
+	/* keep last */
+	__HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+struct hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct hibernation_ops *ops)
+{
+	if (ops && !(ops->prepare && ops->enter && ops->finish)) {
+		WARN_ON(1);
+		return;
+	}
+	mutex_lock(&pm_mutex);
+	hibernation_ops = ops;
+	if (ops)
+		hibernation_mode = HIBERNATION_PLATFORM;
+	else if (hibernation_mode == HIBERNATION_PLATFORM)
+		hibernation_mode = HIBERNATION_SHUTDOWN;
+
+	mutex_unlock(&pm_mutex);
+}
+
+
 /**
  *	platform_prepare - prepare the machine for hibernation using the
  *	platform driver if so configured and return an error code if it fails
  */
 
-static inline int platform_prepare(void)
+static int platform_prepare(void)
 {
-	int error = 0;
+	return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
+		hibernation_ops->prepare() : 0;
+}
 
-	switch (pm_disk_mode) {
-	case PM_DISK_TEST:
-	case PM_DISK_TESTPROC:
-	case PM_DISK_SHUTDOWN:
-	case PM_DISK_REBOOT:
-		break;
-	default:
-		if (pm_ops && pm_ops->prepare)
-			error = pm_ops->prepare(PM_SUSPEND_DISK);
-	}
-	return error;
+/**
+ *	platform_finish - switch the machine to the normal mode of operation
+ *	using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(void)
+{
+	if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
+		hibernation_ops->finish();
 }
 
 /**
- *	power_down - Shut machine down for hibernate.
+ *	power_down - Shut the machine down for hibernation.
  *
  *	Use the platform driver, if configured so; otherwise try
  *	to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
 
 static void power_down(void)
 {
-	switch (pm_disk_mode) {
-	case PM_DISK_TEST:
-	case PM_DISK_TESTPROC:
+	switch (hibernation_mode) {
+	case HIBERNATION_TEST:
+	case HIBERNATION_TESTPROC:
 		break;
-	case PM_DISK_SHUTDOWN:
+	case HIBERNATION_SHUTDOWN:
 		kernel_power_off();
 		break;
-	case PM_DISK_REBOOT:
+	case HIBERNATION_REBOOT:
 		kernel_restart(NULL);
 		break;
-	default:
-		if (pm_ops && pm_ops->enter) {
+	case HIBERNATION_PLATFORM:
+		if (hibernation_ops) {
 			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
-			pm_ops->enter(PM_SUSPEND_DISK);
+			hibernation_ops->enter();
 			break;
 		}
 	}
@@ -87,20 +126,6 @@ static void power_down(void)
 	while(1);
 }
 
-static inline void platform_finish(void)
-{
-	switch (pm_disk_mode) {
-	case PM_DISK_TEST:
-	case PM_DISK_TESTPROC:
-	case PM_DISK_SHUTDOWN:
-	case PM_DISK_REBOOT:
-		break;
-	default:
-		if (pm_ops && pm_ops->finish)
-			pm_ops->finish(PM_SUSPEND_DISK);
-	}
-}
-
 static void unprepare_processes(void)
 {
 	thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
 }
 
 /**
- *	pm_suspend_disk - The granpappy of hibernation power management.
- *
- *	If not, then call swsusp to do its thing, then figure out how
- *	to power down the system.
+ *	hibernate - The granpappy of the built-in hibernation management
  */
 
-int pm_suspend_disk(void)
+int hibernate(void)
 {
 	int error;
 
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
 	if (error)
 		goto Finish;
 
-	if (pm_disk_mode == PM_DISK_TESTPROC) {
+	mutex_lock(&pm_mutex);
+	if (hibernation_mode == HIBERNATION_TESTPROC) {
 		printk("swsusp debug: Waiting for 5 seconds.\n");
 		mdelay(5000);
 		goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
 	if (error)
 		goto Enable_cpus;
 
-	if (pm_disk_mode == PM_DISK_TEST) {
+	if (hibernation_mode == HIBERNATION_TEST) {
 		printk("swsusp debug: Waiting for 5 seconds.\n");
 		mdelay(5000);
 		goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
 	device_resume();
 	resume_console();
  Thaw:
+	mutex_unlock(&pm_mutex);
 	unprepare_processes();
  Finish:
 	free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
  *	Called as a late_initcall (so all devices are discovered and
  *	initialized), we call swsusp to see if we have a saved image or not.
  *	If so, we quiesce devices, the restore the saved image. We will
- *	return above (in pm_suspend_disk() ) if everything goes well.
+ *	return above (in hibernate() ) if everything goes well.
  *	Otherwise, we fail gracefully and return to the normally
  *	scheduled program.
  *
@@ -315,25 +339,26 @@ static int software_resume(void)
 late_initcall(software_resume);
 
 
-static const char * const pm_disk_modes[] = {
-	[PM_DISK_PLATFORM]	= "platform",
-	[PM_DISK_SHUTDOWN]	= "shutdown",
-	[PM_DISK_REBOOT]	= "reboot",
-	[PM_DISK_TEST]		= "test",
-	[PM_DISK_TESTPROC]	= "testproc",
+static const char * const hibernation_modes[] = {
+	[HIBERNATION_PLATFORM]	= "platform",
+	[HIBERNATION_SHUTDOWN]	= "shutdown",
+	[HIBERNATION_REBOOT]	= "reboot",
+	[HIBERNATION_TEST]	= "test",
+	[HIBERNATION_TESTPROC]	= "testproc",
 };
 
 /**
- *	disk - Control suspend-to-disk mode
+ *	disk - Control hibernation mode
  *
  *	Suspend-to-disk can be handled in several ways. We have a few options
  *	for putting the system to sleep - using the platform driver (e.g. ACPI
- *	or other pm_ops), powering off the system or rebooting the system
- *	(for testing) as well as the two test modes.
+ *	or other hibernation_ops), powering off the system or rebooting the
+ *	system (for testing) as well as the two test modes.
  *
  *	The system can support 'platform', and that is known a priori (and
- *	encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
- *	as alternatives, as well as the test modes 'test' and 'testproc'.
+ *	encoded by the presence of hibernation_ops). However, the user may
+ *	choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
+ *	test modes, 'test' or 'testproc'.
  *
  *	show() will display what the mode is currently set to.
  *	store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
  *	'testproc'
  *
  *	It will only change to 'platform' if the system
- *	supports it (as determined from pm_ops->pm_disk_mode).
+ *	supports it (as determined by having hibernation_ops).
  */
 
 static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
 	int i;
 	char *start = buf;
 
-	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
-		if (!pm_disk_modes[i])
+	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+		if (!hibernation_modes[i])
 			continue;
 		switch (i) {
-		case PM_DISK_SHUTDOWN:
-		case PM_DISK_REBOOT:
-		case PM_DISK_TEST:
-		case PM_DISK_TESTPROC:
+		case HIBERNATION_SHUTDOWN:
+		case HIBERNATION_REBOOT:
+		case HIBERNATION_TEST:
+		case HIBERNATION_TESTPROC:
 			break;
-		default:
-			if (pm_ops && pm_ops->enter &&
-			    (i == pm_ops->pm_disk_mode))
+		case HIBERNATION_PLATFORM:
+			if (hibernation_ops)
 				break;
 			/* not a valid mode, continue with loop */
 			continue;
 		}
-		if (i == pm_disk_mode)
-			buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+		if (i == hibernation_mode)
+			buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
 		else
-			buf += sprintf(buf, "%s", pm_disk_modes[i]);
-		if (i+1 != PM_DISK_MAX)
-			buf += sprintf(buf, " ");
+			buf += sprintf(buf, "%s ", hibernation_modes[i]);
 	}
 	buf += sprintf(buf, "\n");
 	return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
 	int i;
 	int len;
 	char *p;
-	suspend_disk_method_t mode = 0;
+	int mode = HIBERNATION_INVALID;
 
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
 	mutex_lock(&pm_mutex);
-	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
-		if (!strncmp(buf, pm_disk_modes[i], len)) {
+	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+		if (!strncmp(buf, hibernation_modes[i], len)) {
 			mode = i;
 			break;
 		}
 	}
-	if (mode) {
+	if (mode != HIBERNATION_INVALID) {
 		switch (mode) {
-		case PM_DISK_SHUTDOWN:
-		case PM_DISK_REBOOT:
-		case PM_DISK_TEST:
-		case PM_DISK_TESTPROC:
-			pm_disk_mode = mode;
+		case HIBERNATION_SHUTDOWN:
+		case HIBERNATION_REBOOT:
+		case HIBERNATION_TEST:
+		case HIBERNATION_TESTPROC:
+			hibernation_mode = mode;
 			break;
-		default:
-			if (pm_ops && pm_ops->enter &&
-			    (mode == pm_ops->pm_disk_mode))
-				pm_disk_mode = mode;
+		case HIBERNATION_PLATFORM:
+			if (hibernation_ops)
+				hibernation_mode = mode;
 			else
 				error = -EINVAL;
 		}
-	} else {
+	} else
 		error = -EINVAL;
-	}
 
-	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
-		 pm_disk_modes[mode]);
+	if (!error)
+		pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+			 hibernation_modes[mode]);
 	mutex_unlock(&pm_mutex);
 	return error ? error : n;
 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e2..40d56a31245e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
 DEFINE_MUTEX(pm_mutex);
 
 struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
 
 /**
  *	pm_set_ops - Set the global power method table. 
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
 {
 	mutex_lock(&pm_mutex);
 	pm_ops = ops;
-	if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
-		pm_disk_mode = ops->pm_disk_mode;
-	} else
-		pm_disk_mode = PM_DISK_SHUTDOWN;
 	mutex_unlock(&pm_mutex);
 }
 
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
 static const char * const pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_STANDBY]	= "standby",
 	[PM_SUSPEND_MEM]	= "mem",
-	[PM_SUSPEND_DISK]	= "disk",
 };
 
 static inline int valid_state(suspend_state_t state)
 {
-	/* Suspend-to-disk does not really need low-level support.
-	 * It can work with shutdown/reboot if needed. If it isn't
-	 * configured, then it cannot be supported.
-	 */
-	if (state == PM_SUSPEND_DISK)
-#ifdef CONFIG_SOFTWARE_SUSPEND
-		return 1;
-#else
-		return 0;
-#endif
-
-	/* all other states need lowlevel support and need to be
-	 * valid to the lowlevel implementation, no valid callback
+	/* All states need lowlevel support and need to be valid
+	 * to the lowlevel implementation, no valid callback
 	 * implies that none are valid. */
 	if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
 		return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
 	if (!mutex_trylock(&pm_mutex))
 		return -EBUSY;
 
-	if (state == PM_SUSPEND_DISK) {
-		error = pm_suspend_disk();
-		goto Unlock;
-	}
-
 	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
 
 /**
  *	pm_suspend - Externally visible function for suspending system.
- *	@state:		Enumarted value of state to enter.
+ *	@state:		Enumerated value of state to enter.
  *
  *	Determine whether or not value is within range, get state 
  *	structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
 		if (pm_states[i] && valid_state(i))
 			s += sprintf(s,"%s ", pm_states[i]);
 	}
-	s += sprintf(s,"\n");
+#ifdef CONFIG_SOFTWARE_SUSPEND
+	s += sprintf(s, "%s\n", "disk");
+#else
+	if (s != buf)
+		/* convert the last space to a newline */
+		*(s-1) = '\n';
+#endif
 	return (s - buf);
 }
 
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
+	/* First, check if we are requested to hibernate */
+	if (!strncmp(buf, "disk", len)) {
+		error = hibernate();
+		return error ? error : n;
+	}
+
 	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
 		if (*s && !strncmp(buf, *s, len))
 			break;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785a..51381487103f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
  */
 #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT)
 
-extern int pm_suspend_disk(void);
-#else
-static inline int pm_suspend_disk(void)
-{
-	return -EPERM;
-}
+extern struct hibernation_ops *hibernation_ops;
 #endif
 
 extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c312..24d7d78e6f42 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
 {
 	int error = 0;
 
-	if (pm_ops && pm_ops->prepare)
-		error = pm_ops->prepare(PM_SUSPEND_DISK);
+	if (hibernation_ops)
+		error = hibernation_ops->prepare();
 
 	return error;
 }
 
 static inline void platform_finish(void)
 {
-	if (pm_ops && pm_ops->finish)
-		pm_ops->finish(PM_SUSPEND_DISK);
+	if (hibernation_ops)
+		hibernation_ops->finish();
 }
 
 static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		switch (arg) {
 
 		case PMOPS_PREPARE:
-			if (pm_ops && pm_ops->enter) {
+			if (hibernation_ops) {
 				data->platform_suspend = 1;
 				error = 0;
 			} else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		case PMOPS_ENTER:
 			if (data->platform_suspend) {
 				kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
-				error = pm_ops->enter(PM_SUSPEND_DISK);
-				error = 0;
+				error = hibernation_ops->enter();
 			}
 			break;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 926bf9d7ac45..8ecfe3473779 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
-			int ret = pm_suspend(PM_SUSPEND_DISK);
+			int ret = hibernate();
 			unlock_kernel();
 			return ret;
 		}
-- 
cgit v1.2.3


From dd2a345f8f002845636dbf5d2d768bb5cd8a5f59 Mon Sep 17 00:00:00 2001
From: Dave Gilbert <linux@treblig.org>
Date: Wed, 9 May 2007 02:33:24 -0700
Subject: Display all possible partitions when the root filesystem failed to
 mount

Display all possible partitions when the root filesystem is not mounted.
This helps to track spell'o's and missing drivers.

Updated to work with newer kernels.

Example output:

VFS: Cannot open root device "foobar" or unknown-block(0,0)
Please append a correct "root=" boot option; here are the available partitions:
0800    8388608 sda driver: sd
  0801     192748 sda1
  0802    8193150 sda2
0810    4194304 sdb driver: sd
Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0)

[akpm@linux-foundation.org: cleanups, fix printk warnings]
Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: Dave Gilbert <linux@treblig.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/genhd.c         | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/genhd.h |  1 +
 init/do_mounts.c      |  7 ++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index b5664440896c..93a2cf654597 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -213,6 +213,59 @@ struct gendisk *get_gendisk(dev_t dev, int *part)
 	return  kobj ? to_disk(kobj) : NULL;
 }
 
+/*
+ * print a full list of all partitions - intended for places where the root
+ * filesystem can't be mounted and thus to give the victim some idea of what
+ * went wrong
+ */
+void __init printk_all_partitions(void)
+{
+	int n;
+	struct gendisk *sgp;
+
+	mutex_lock(&block_subsys_lock);
+	/* For each block device... */
+	list_for_each_entry(sgp, &block_subsys.list, kobj.entry) {
+		char buf[BDEVNAME_SIZE];
+		/*
+		 * Don't show empty devices or things that have been surpressed
+		 */
+		if (get_capacity(sgp) == 0 ||
+		    (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+			continue;
+
+		/*
+		 * Note, unlike /proc/partitions, I am showing the numbers in
+		 * hex - the same format as the root= option takes.
+		 */
+		printk("%02x%02x %10llu %s",
+			sgp->major, sgp->first_minor,
+			(unsigned long long)get_capacity(sgp) >> 1,
+			disk_name(sgp, 0, buf));
+		if (sgp->driverfs_dev != NULL &&
+		    sgp->driverfs_dev->driver != NULL)
+			printk(" driver: %s\n",
+				sgp->driverfs_dev->driver->name);
+		else
+			printk(" (driver?)\n");
+
+		/* now show the partitions */
+		for (n = 0; n < sgp->minors - 1; ++n) {
+			if (sgp->part[n] == NULL)
+				continue;
+			if (sgp->part[n]->nr_sects == 0)
+				continue;
+			printk("  %02x%02x %10llu %s\n",
+				sgp->major, n + 1 + sgp->first_minor,
+				(unsigned long long)sgp->part[n]->nr_sects >> 1,
+				disk_name(sgp, n + 1, buf));
+		} /* partition subloop */
+	} /* Block device loop */
+
+	mutex_unlock(&block_subsys_lock);
+	return;
+}
+
 #ifdef CONFIG_PROC_FS
 /* iterator */
 static void *part_start(struct seq_file *part, loff_t *pos)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2c65da7cabb2..f589559cf070 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -413,6 +413,7 @@ char *disk_name (struct gendisk *hd, int part, char *buf);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern void add_partition(struct gendisk *, int, sector_t, sector_t, int);
 extern void delete_partition(struct gendisk *, int);
+extern void printk_all_partitions(void);
 
 extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 3f57ed4599d6..46fe407fb03e 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -7,6 +7,7 @@
 #include <linux/root_dev.h>
 #include <linux/security.h>
 #include <linux/delay.h>
+#include <linux/genhd.h>
 #include <linux/mount.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -308,17 +309,21 @@ retry:
 	        /*
 		 * Allow the user to distinguish between failed sys_open
 		 * and bad superblock on root device.
+		 * and give them a list of the available devices
 		 */
 #ifdef CONFIG_BLOCK
 		__bdevname(ROOT_DEV, b);
 #endif
 		printk("VFS: Cannot open root device \"%s\" or %s\n",
 				root_device_name, b);
-		printk("Please append a correct \"root=\" boot option\n");
+		printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
 
+		printk_all_partitions();
 		panic("VFS: Unable to mount root fs on %s", b);
 	}
 
+	printk("List of all partitions:\n");
+	printk_all_partitions();
 	printk("No filesystem could mount root, tried: ");
 	for (p = fs_names; *p; p += strlen(p)+1)
 		printk(" %s", p);
-- 
cgit v1.2.3


From 2f4dfe206a2fc07099dfad77a8ea2f4b4ae2140f Mon Sep 17 00:00:00 2001
From: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Date: Wed, 9 May 2007 02:33:25 -0700
Subject: Remove hardcoding of hard_smp_processor_id on UP systems

With the advent of kdump, the assumption that the boot CPU when booting an UP
kernel is always the CPU with a particular hardware ID (often 0) (usually
referred to as BSP on some architectures) is not valid anymore.  The reason
being that the dump capture kernel boots on the crashed CPU (the CPU that
invoked crash_kexec), which may be or may not be that particular CPU.

Move definition of hard_smp_processor_id for the UP case to
architecture-specific code ("asm/smp.h") where it belongs, so that each
architecture can provide its own implementation.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Cc: "Luck, Tony" <tony.luck@intel.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sparc64/kernel/traps.c | 1 +
 include/asm-alpha/smp.h     | 1 +
 include/asm-i386/smp.h      | 3 ++-
 include/asm-ia64/smp.h      | 3 ++-
 include/asm-m32r/smp.h      | 6 +++++-
 include/asm-powerpc/smp.h   | 1 +
 include/asm-s390/smp.h      | 1 +
 include/asm-sparc/smp.h     | 1 +
 include/asm-sparc64/smp.h   | 1 +
 include/asm-um/smp.h        | 4 ++++
 include/asm-x86_64/smp.h    | 4 +++-
 include/linux/smp.h         | 1 -
 12 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index dc652f210290..d0fde36395b4 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/kdebug.h>
 
+#include <asm/smp.h>
 #include <asm/delay.h>
 #include <asm/system.h>
 #include <asm/ptrace.h>
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index a1a1eca6be45..286e1d844f63 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -51,6 +51,7 @@ int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, in
 
 #else /* CONFIG_SMP */
 
+#define hard_smp_processor_id()		0
 #define smp_call_function_on_cpu(func,info,retry,wait,cpu)    ({ 0; })
 
 #endif /* CONFIG_SMP */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 090abc1da32a..3243fa6f455f 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -147,12 +147,13 @@ extern unsigned int num_processors;
 
 #else /* CONFIG_SMP */
 
+#define hard_smp_processor_id()		0
 #define safe_smp_processor_id()		0
 #define cpu_physical_id(cpu)		boot_cpu_physical_apicid
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
-#endif
+#endif /* CONFIG_SMP */
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 60fd4ae014f6..62014b643ecd 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -128,8 +128,9 @@ extern void unlock_ipi_calllock(void);
 extern void identify_siblings (struct cpuinfo_ia64 *);
 extern int is_multithreading_enabled(void);
 
-#else
+#else /* CONFIG_SMP */
 
+#define hard_smp_processor_id()		0
 #define cpu_logical_id(i)		0
 #define cpu_physical_id(i)		ia64_get_lid()
 
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index abd937ac5239..078e1a51a042 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -108,6 +108,10 @@ extern unsigned long send_IPI_mask_phys(cpumask_t, int, int);
 #define IPI_SHIFT	(0)
 #define NR_IPIS		(8)
 
-#endif	/* CONFIG_SMP */
+#else	/* CONFIG_SMP */
+
+#define hard_smp_processor_id()		0
+
+#endif /* CONFIG_SMP */
 
 #endif	/* _ASM_M32R_SMP_H */
diff --git a/include/asm-powerpc/smp.h b/include/asm-powerpc/smp.h
index 01717f266dc9..d037f50580e2 100644
--- a/include/asm-powerpc/smp.h
+++ b/include/asm-powerpc/smp.h
@@ -83,6 +83,7 @@ extern void __cpu_die(unsigned int cpu);
 
 #else
 /* for UP */
+#define hard_smp_processor_id()		0
 #define smp_setup_cpu_maps()
 
 #endif /* CONFIG_SMP */
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 0a28e6d6ef40..76e424f718c6 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -110,6 +110,7 @@ static inline void smp_send_stop(void)
 	__load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK);
 }
 
+#define hard_smp_processor_id()		0
 #define smp_cpu_not_running(cpu)	1
 #define smp_setup_cpu_possible_map()	do { } while (0)
 #endif
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index b9da9a600e35..b3f492208fd2 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -165,6 +165,7 @@ void smp_setup_cpu_possible_map(void);
 
 #else /* SMP */
 
+#define hard_smp_processor_id()		0
 #define smp_setup_cpu_possible_map() do { } while (0)
 
 #endif /* !(SMP) */
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index cca54804b722..869d16fb907b 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -48,6 +48,7 @@ extern unsigned char boot_cpu_id;
 
 #else
 
+#define hard_smp_processor_id()		0
 #define smp_setup_cpu_possible_map() do { } while (0)
 #define boot_cpu_id	(0)
 
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index ca552261ed1f..84f8cf29324e 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -24,6 +24,10 @@ extern inline void smp_cpus_done(unsigned int maxcpus)
 
 extern struct task_struct *idle_threads[NR_CPUS];
 
+#else
+
+#define hard_smp_processor_id()		0
+
 #endif
 
 #endif
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index d5704421456b..f62fda527439 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -71,7 +71,9 @@ extern unsigned __cpuinitdata disabled_cpus;
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
 
-#endif
+#else /* CONFIG_SMP */
+#define hard_smp_processor_id() 0
+#endif /* CONFIG_SMP */
 
 /*
  * Some lowlevel functions might want to know about
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ba23ec8211b..3f70149eabbb 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -83,7 +83,6 @@ void smp_prepare_boot_cpu(void);
  *	These macros fold the SMP functionality into a single CPU system
  */
 #define raw_smp_processor_id()			0
-#define hard_smp_processor_id()			0
 static inline int up_smp_call_function(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 8813d1c00ca923c1683da625ff85959be1db9a49 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Wed, 9 May 2007 02:33:30 -0700
Subject: mca: add integrated device bus matching

The MCA bus has a few "integrated" functions, which are effectively virtual
slots on the bus.  The problem is that these special functions don't have
dedicated pos IDs, so we have to manufacture ids for them outside the pos
space ...  and these ids can't be matched by the standard matching function,
so add a special registration that requests a list of pos ids or a particular
integrated function.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mca/mca-bus.c    | 26 ++++++++++++++++----------
 drivers/mca/mca-driver.c | 13 +++++++++++++
 include/linux/mca.h      |  2 ++
 3 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c
index a70fe00aea16..67b8e9453b19 100644
--- a/drivers/mca/mca-bus.c
+++ b/drivers/mca/mca-bus.c
@@ -48,18 +48,24 @@ static int mca_bus_match (struct device *dev, struct device_driver *drv)
 	struct mca_device *mca_dev = to_mca_device (dev);
 	struct mca_driver *mca_drv = to_mca_driver (drv);
 	const unsigned short *mca_ids = mca_drv->id_table;
-	int i;
-
-	if (!mca_ids)
-		return 0;
-
-	for(i = 0; mca_ids[i]; i++) {
-		if (mca_ids[i] == mca_dev->pos_id) {
-			mca_dev->index = i;
-			return 1;
+	int i = 0;
+
+	if (mca_ids) {
+		for(i = 0; mca_ids[i]; i++) {
+			if (mca_ids[i] == mca_dev->pos_id) {
+				mca_dev->index = i;
+				return 1;
+			}
 		}
 	}
-
+	/* If the integrated id is present, treat it as though it were an
+	 * additional id in the id_table (it can't be because by definition,
+	 * integrated id's overflow a short */
+	if (mca_drv->integrated_id && mca_dev->pos_id ==
+	    mca_drv->integrated_id) {
+		mca_dev->index = i;
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/drivers/mca/mca-driver.c b/drivers/mca/mca-driver.c
index 2223466b3d8a..32cd39bcc715 100644
--- a/drivers/mca/mca-driver.c
+++ b/drivers/mca/mca-driver.c
@@ -36,12 +36,25 @@ int mca_register_driver(struct mca_driver *mca_drv)
 		mca_drv->driver.bus = &mca_bus_type;
 		if ((r = driver_register(&mca_drv->driver)) < 0)
 			return r;
+		mca_drv->integrated_id = 0;
 	}
 
 	return 0;
 }
 EXPORT_SYMBOL(mca_register_driver);
 
+int mca_register_driver_integrated(struct mca_driver *mca_driver,
+				   int integrated_id)
+{
+	int r = mca_register_driver(mca_driver);
+
+	if (!r)
+		mca_driver->integrated_id = integrated_id;
+
+	return r;
+}
+EXPORT_SYMBOL(mca_register_driver_integrated);
+
 void mca_unregister_driver(struct mca_driver *mca_drv)
 {
 	if (MCA_bus)
diff --git a/include/linux/mca.h b/include/linux/mca.h
index 5cff2923092b..37972704617f 100644
--- a/include/linux/mca.h
+++ b/include/linux/mca.h
@@ -94,6 +94,7 @@ struct mca_bus {
 struct mca_driver {
 	const short		*id_table;
 	void			*driver_data;
+	int			integrated_id;
 	struct device_driver	driver;
 };
 #define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver)
@@ -125,6 +126,7 @@ extern enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev);
 extern struct bus_type mca_bus_type;
 
 extern int mca_register_driver(struct mca_driver *drv);
+extern int mca_register_driver_integrated(struct mca_driver *, int);
 extern void mca_unregister_driver(struct mca_driver *drv);
 
 /* WARNING: only called by the boot time device setup */
-- 
cgit v1.2.3


From 55c0d1f83e481dd6c77f52f7dcfeb043b8b740fa Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 9 May 2007 02:33:37 -0700
Subject: Move sig_kernel_* et al macros to linux/signal.h

This patch moves the sig_kernel_* and related macros from kernel/signal.c
to linux/signal.h, and cleans them up slightly.  I need the sig_kernel_*
macros for default signal behavior in the utrace code, and want to avoid
duplication or overhead to share the knowledge.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 125 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/signal.c        | 119 ----------------------------------------------
 2 files changed, 125 insertions(+), 119 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 14749056dd63..3fa0fab4a04b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -243,6 +243,131 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 
 extern struct kmem_cache *sighand_cachep;
 
+/*
+ * In POSIX a signal is sent either to a specific thread (Linux task)
+ * or to the process as a whole (Linux thread group).  How the signal
+ * is sent determines whether it's to one thread or the whole group,
+ * which determines which signal mask(s) are involved in blocking it
+ * from being delivered until later.  When the signal is delivered,
+ * either it's caught or ignored by a user handler or it has a default
+ * effect that applies to the whole thread group (POSIX process).
+ *
+ * The possible effects an unblocked signal set to SIG_DFL can have are:
+ *   ignore	- Nothing Happens
+ *   terminate	- kill the process, i.e. all threads in the group,
+ * 		  similar to exit_group.  The group leader (only) reports
+ *		  WIFSIGNALED status to its parent.
+ *   coredump	- write a core dump file describing all threads using
+ *		  the same mm and then kill all those threads
+ *   stop 	- stop all the threads in the group, i.e. TASK_STOPPED state
+ *
+ * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
+ * Other signals when not blocked and set to SIG_DFL behaves as follows.
+ * The job control signals also have other special effects.
+ *
+ *	+--------------------+------------------+
+ *	|  POSIX signal      |  default action  |
+ *	+--------------------+------------------+
+ *	|  SIGHUP            |  terminate	|
+ *	|  SIGINT            |	terminate	|
+ *	|  SIGQUIT           |	coredump 	|
+ *	|  SIGILL            |	coredump 	|
+ *	|  SIGTRAP           |	coredump 	|
+ *	|  SIGABRT/SIGIOT    |	coredump 	|
+ *	|  SIGBUS            |	coredump 	|
+ *	|  SIGFPE            |	coredump 	|
+ *	|  SIGKILL           |	terminate(+)	|
+ *	|  SIGUSR1           |	terminate	|
+ *	|  SIGSEGV           |	coredump 	|
+ *	|  SIGUSR2           |	terminate	|
+ *	|  SIGPIPE           |	terminate	|
+ *	|  SIGALRM           |	terminate	|
+ *	|  SIGTERM           |	terminate	|
+ *	|  SIGCHLD           |	ignore   	|
+ *	|  SIGCONT           |	ignore(*)	|
+ *	|  SIGSTOP           |	stop(*)(+)  	|
+ *	|  SIGTSTP           |	stop(*)  	|
+ *	|  SIGTTIN           |	stop(*)  	|
+ *	|  SIGTTOU           |	stop(*)  	|
+ *	|  SIGURG            |	ignore   	|
+ *	|  SIGXCPU           |	coredump 	|
+ *	|  SIGXFSZ           |	coredump 	|
+ *	|  SIGVTALRM         |	terminate	|
+ *	|  SIGPROF           |	terminate	|
+ *	|  SIGPOLL/SIGIO     |	terminate	|
+ *	|  SIGSYS/SIGUNUSED  |	coredump 	|
+ *	|  SIGSTKFLT         |	terminate	|
+ *	|  SIGWINCH          |	ignore   	|
+ *	|  SIGPWR            |	terminate	|
+ *	|  SIGRTMIN-SIGRTMAX |	terminate       |
+ *	+--------------------+------------------+
+ *	|  non-POSIX signal  |  default action  |
+ *	+--------------------+------------------+
+ *	|  SIGEMT            |  coredump	|
+ *	+--------------------+------------------+
+ *
+ * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
+ * (*) Special job control effects:
+ * When SIGCONT is sent, it resumes the process (all threads in the group)
+ * from TASK_STOPPED state and also clears any pending/queued stop signals
+ * (any of those marked with "stop(*)").  This happens regardless of blocking,
+ * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
+ * any pending/queued SIGCONT signals; this happens regardless of blocking,
+ * catching, or ignored the stop signal, though (except for SIGSTOP) the
+ * default action of stopping the process may happen later or never.
+ */
+
+#ifdef SIGEMT
+#define SIGEMT_MASK	rt_sigmask(SIGEMT)
+#else
+#define SIGEMT_MASK	0
+#endif
+
+#if SIGRTMIN > BITS_PER_LONG
+#define rt_sigmask(sig)	(1ULL << ((sig)-1))
+#else
+#define rt_sigmask(sig)	sigmask(sig)
+#endif
+#define siginmask(sig, mask) (rt_sigmask(sig) & (mask))
+
+#define SIG_KERNEL_ONLY_MASK (\
+	rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))
+
+#define SIG_KERNEL_STOP_MASK (\
+	rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
+	rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )
+
+#define SIG_KERNEL_COREDUMP_MASK (\
+        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
+	rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
+        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
+	rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
+        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
+	SIGEMT_MASK				       )
+
+#define SIG_KERNEL_IGNORE_MASK (\
+        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
+	rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )
+
+#define sig_kernel_only(sig) \
+	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
+#define sig_kernel_coredump(sig) \
+	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
+#define sig_kernel_ignore(sig) \
+	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK))
+#define sig_kernel_stop(sig) \
+	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
+
+#define sig_needs_tasklist(sig)	((sig) == SIGCONT)
+
+#define sig_user_defined(t, signr) \
+	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
+	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
+
+#define sig_fatal(t, signr) \
+	(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
+	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SIGNAL_H */
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c8482..4c8f49eadf7d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-/*
- * In POSIX a signal is sent either to a specific thread (Linux task)
- * or to the process as a whole (Linux thread group).  How the signal
- * is sent determines whether it's to one thread or the whole group,
- * which determines which signal mask(s) are involved in blocking it
- * from being delivered until later.  When the signal is delivered,
- * either it's caught or ignored by a user handler or it has a default
- * effect that applies to the whole thread group (POSIX process).
- *
- * The possible effects an unblocked signal set to SIG_DFL can have are:
- *   ignore	- Nothing Happens
- *   terminate	- kill the process, i.e. all threads in the group,
- * 		  similar to exit_group.  The group leader (only) reports
- *		  WIFSIGNALED status to its parent.
- *   coredump	- write a core dump file describing all threads using
- *		  the same mm and then kill all those threads
- *   stop 	- stop all the threads in the group, i.e. TASK_STOPPED state
- *
- * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
- * Other signals when not blocked and set to SIG_DFL behaves as follows.
- * The job control signals also have other special effects.
- *
- *	+--------------------+------------------+
- *	|  POSIX signal      |  default action  |
- *	+--------------------+------------------+
- *	|  SIGHUP            |  terminate	|
- *	|  SIGINT            |	terminate	|
- *	|  SIGQUIT           |	coredump 	|
- *	|  SIGILL            |	coredump 	|
- *	|  SIGTRAP           |	coredump 	|
- *	|  SIGABRT/SIGIOT    |	coredump 	|
- *	|  SIGBUS            |	coredump 	|
- *	|  SIGFPE            |	coredump 	|
- *	|  SIGKILL           |	terminate(+)	|
- *	|  SIGUSR1           |	terminate	|
- *	|  SIGSEGV           |	coredump 	|
- *	|  SIGUSR2           |	terminate	|
- *	|  SIGPIPE           |	terminate	|
- *	|  SIGALRM           |	terminate	|
- *	|  SIGTERM           |	terminate	|
- *	|  SIGCHLD           |	ignore   	|
- *	|  SIGCONT           |	ignore(*)	|
- *	|  SIGSTOP           |	stop(*)(+)  	|
- *	|  SIGTSTP           |	stop(*)  	|
- *	|  SIGTTIN           |	stop(*)  	|
- *	|  SIGTTOU           |	stop(*)  	|
- *	|  SIGURG            |	ignore   	|
- *	|  SIGXCPU           |	coredump 	|
- *	|  SIGXFSZ           |	coredump 	|
- *	|  SIGVTALRM         |	terminate	|
- *	|  SIGPROF           |	terminate	|
- *	|  SIGPOLL/SIGIO     |	terminate	|
- *	|  SIGSYS/SIGUNUSED  |	coredump 	|
- *	|  SIGSTKFLT         |	terminate	|
- *	|  SIGWINCH          |	ignore   	|
- *	|  SIGPWR            |	terminate	|
- *	|  SIGRTMIN-SIGRTMAX |	terminate       |
- *	+--------------------+------------------+
- *	|  non-POSIX signal  |  default action  |
- *	+--------------------+------------------+
- *	|  SIGEMT            |  coredump	|
- *	+--------------------+------------------+
- *
- * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
- * (*) Special job control effects:
- * When SIGCONT is sent, it resumes the process (all threads in the group)
- * from TASK_STOPPED state and also clears any pending/queued stop signals
- * (any of those marked with "stop(*)").  This happens regardless of blocking,
- * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
- * any pending/queued SIGCONT signals; this happens regardless of blocking,
- * catching, or ignored the stop signal, though (except for SIGSTOP) the
- * default action of stopping the process may happen later or never.
- */
-
-#ifdef SIGEMT
-#define M_SIGEMT	M(SIGEMT)
-#else
-#define M_SIGEMT	0
-#endif
-
-#if SIGRTMIN > BITS_PER_LONG
-#define M(sig) (1ULL << ((sig)-1))
-#else
-#define M(sig) (1UL << ((sig)-1))
-#endif
-#define T(sig, mask) (M(sig) & (mask))
-
-#define SIG_KERNEL_ONLY_MASK (\
-	M(SIGKILL)   |  M(SIGSTOP)                                   )
-
-#define SIG_KERNEL_STOP_MASK (\
-	M(SIGSTOP)   |  M(SIGTSTP)   |  M(SIGTTIN)   |  M(SIGTTOU)   )
-
-#define SIG_KERNEL_COREDUMP_MASK (\
-        M(SIGQUIT)   |  M(SIGILL)    |  M(SIGTRAP)   |  M(SIGABRT)   | \
-        M(SIGFPE)    |  M(SIGSEGV)   |  M(SIGBUS)    |  M(SIGSYS)    | \
-        M(SIGXCPU)   |  M(SIGXFSZ)   |  M_SIGEMT                     )
-
-#define SIG_KERNEL_IGNORE_MASK (\
-        M(SIGCONT)   |  M(SIGCHLD)   |  M(SIGWINCH)  |  M(SIGURG)    )
-
-#define sig_kernel_only(sig) \
-		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_ONLY_MASK))
-#define sig_kernel_coredump(sig) \
-		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_COREDUMP_MASK))
-#define sig_kernel_ignore(sig) \
-		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_IGNORE_MASK))
-#define sig_kernel_stop(sig) \
-		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK))
-
-#define sig_needs_tasklist(sig)	((sig) == SIGCONT)
-
-#define sig_user_defined(t, signr) \
-	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
-	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
-
-#define sig_fatal(t, signr) \
-	(!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
-	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
 
 static int sig_ignored(struct task_struct *t, int sig)
 {
-- 
cgit v1.2.3


From 18d8362d517cb2bd97761294924fe6c2a6ee5e3c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 9 May 2007 02:33:39 -0700
Subject: mutex_lock_interruptible(): add __must_check

It's not sane to use mutex_lock_interruptible() and to then ignore the result.

Ditto down_interruptible(), but I'm lazy.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mutex.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b81bc2adaeff..0d50ea3df689 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -121,11 +121,12 @@ static inline int fastcall mutex_is_locked(struct mutex *lock)
  * Also see Documentation/mutex-design.txt.
  */
 extern void fastcall mutex_lock(struct mutex *lock);
-extern int fastcall mutex_lock_interruptible(struct mutex *lock);
+extern int __must_check fastcall mutex_lock_interruptible(struct mutex *lock);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
-extern int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass);
+extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
+					unsigned int subclass);
 #else
 # define mutex_lock_nested(lock, subclass) mutex_lock(lock)
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
-- 
cgit v1.2.3


From b89deed32ccc96098bd6bc953c64bba6b847774f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:33:52 -0700
Subject: implement flush_work()

A basic problem with flush_scheduled_work() is that it blocks behind _all_
presently-queued works, rather than just the work whcih the caller wants to
flush.  If the caller holds some lock, and if one of the queued work happens
to want that lock as well then accidental deadlocks can occur.

One example of this is the phy layer: it wants to flush work while holding
rtnl_lock().  But if a linkwatch event happens to be queued, the phy code will
deadlock because the linkwatch callback function takes rtnl_lock.

So we implement a new function which will flush a *single* work - just the one
which the caller wants to free up.  Thus we avoid the accidental deadlocks
which can arise from unrelated subsystems' callbacks taking shared locks.

flush_work() non-blockingly dequeues the work_struct which we want to kill,
then it waits for its handler to complete on all CPUs.

Add ->current_work to the "struct cpu_workqueue_struct", it points to
currently running "struct work_struct". When flush_work(work) detects
->current_work == work, it inserts a barrier at the _head_ of ->worklist
(and thus right _after_ that work) and waits for completition. This means
that the next work fired on that CPU will be this barrier, or another
barrier queued by concurrent flush_work(), so the caller of flush_work()
will be woken before any "regular" work has a chance to run.

When wait_on_work() unlocks workqueue_mutex (or whatever we choose to protect
against CPU hotplug), CPU may go away. But in that case take_over_work() will
move a barrier we queued to another CPU, it will be fired sometime, and
wait_on_work() will be woken.

Actually, we are doing cleanup_workqueue_thread()->kthread_stop() before
take_over_work(), so cwq->thread should complete its ->worklist (and thus
the barrier), because currently we don't check kthread_should_stop() in
run_workqueue(). But even if we did, everything should be ok.

[akpm@osdl.org: cleanup]
[akpm@osdl.org: add flush_work_keventd() wrapper]
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h |  4 +-
 kernel/workqueue.c        | 95 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f16ba1e0687d..26a70992dec8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -178,6 +178,8 @@ extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delay
 extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct delayed_work *work, unsigned long delay);
 extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
+extern void flush_work(struct workqueue_struct *wq, struct work_struct *work);
+extern void flush_work_keventd(struct work_struct *work);
 
 extern int FASTCALL(schedule_work(struct work_struct *work));
 extern int FASTCALL(run_scheduled_work(struct work_struct *work));
@@ -199,7 +201,7 @@ int execute_in_process_context(work_func_t fn, struct execute_work *);
  * Kill off a pending schedule_delayed_work().  Note that the work callback
  * function may still be running on return from cancel_delayed_work(), unless
  * it returns 1 and the work doesn't re-arm itself. Run flush_workqueue() or
- * cancel_work_sync() to wait on it.
+ * flush_work() or cancel_work_sync() to wait on it.
  */
 static inline int cancel_delayed_work(struct delayed_work *work)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b7bb37ab03bc..918d55267a12 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -46,6 +46,7 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
+	struct work_struct *current_work;
 
 	int run_depth;		/* Detect run_workqueue() recursion depth */
 
@@ -120,6 +121,7 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work
 	    && work_pending(work)
 	    && !list_empty(&work->entry)) {
 		work_func_t f = work->func;
+		cwq->current_work = work;
 		list_del_init(&work->entry);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
@@ -128,6 +130,7 @@ static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work
 		f(work);
 
 		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->current_work = NULL;
 		ret = 1;
 	}
 	spin_unlock_irqrestore(&cwq->lock, flags);
@@ -166,6 +169,17 @@ int fastcall run_scheduled_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(run_scheduled_work);
 
+static void insert_work(struct cpu_workqueue_struct *cwq,
+				struct work_struct *work, int tail)
+{
+	set_wq_data(work, cwq);
+	if (tail)
+		list_add_tail(&work->entry, &cwq->worklist);
+	else
+		list_add(&work->entry, &cwq->worklist);
+	wake_up(&cwq->more_work);
+}
+
 /* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
@@ -173,9 +187,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 	unsigned long flags;
 
 	spin_lock_irqsave(&cwq->lock, flags);
-	set_wq_data(work, cwq);
-	list_add_tail(&work->entry, &cwq->worklist);
-	wake_up(&cwq->more_work);
+	insert_work(cwq, work, 1);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -305,6 +317,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 						struct work_struct, entry);
 		work_func_t f = work->func;
 
+		cwq->current_work = work;
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
@@ -325,6 +338,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		}
 
 		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->current_work = NULL;
 	}
 	cwq->run_depth--;
 	spin_unlock_irqrestore(&cwq->lock, flags);
@@ -449,6 +463,75 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
+static void wait_on_work(struct cpu_workqueue_struct *cwq,
+				struct work_struct *work)
+{
+	struct wq_barrier barr;
+	int running = 0;
+
+	spin_lock_irq(&cwq->lock);
+	if (unlikely(cwq->current_work == work)) {
+		init_wq_barrier(&barr);
+		insert_work(cwq, &barr.work, 0);
+		running = 1;
+	}
+	spin_unlock_irq(&cwq->lock);
+
+	if (unlikely(running)) {
+		mutex_unlock(&workqueue_mutex);
+		wait_for_completion(&barr.done);
+		mutex_lock(&workqueue_mutex);
+	}
+}
+
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @wq: the workqueue on which the work is queued
+ * @work: the work which is to be flushed
+ *
+ * flush_work() will attempt to cancel the work if it is queued.  If the work's
+ * callback appears to be running, flush_work() will block until it has
+ * completed.
+ *
+ * flush_work() is designed to be used when the caller is tearing down data
+ * structures which the callback function operates upon.  It is expected that,
+ * prior to calling flush_work(), the caller has arranged for the work to not
+ * be requeued.
+ */
+void flush_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+	struct cpu_workqueue_struct *cwq;
+
+	mutex_lock(&workqueue_mutex);
+	cwq = get_wq_data(work);
+	/* Was it ever queued ? */
+	if (!cwq)
+		goto out;
+
+	/*
+	 * This work can't be re-queued, and the lock above protects us
+	 * from take_over_work(), no need to re-check that get_wq_data()
+	 * is still the same when we take cwq->lock.
+	 */
+	spin_lock_irq(&cwq->lock);
+	list_del_init(&work->entry);
+	work_release(work);
+	spin_unlock_irq(&cwq->lock);
+
+	if (is_single_threaded(wq)) {
+		/* Always use first cpu's area. */
+		wait_on_work(per_cpu_ptr(wq->cpu_wq, singlethread_cpu), work);
+	} else {
+		int cpu;
+
+		for_each_online_cpu(cpu)
+			wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+	}
+out:
+	mutex_unlock(&workqueue_mutex);
+}
+EXPORT_SYMBOL_GPL(flush_work);
+
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 						   int cpu, int freezeable)
 {
@@ -650,6 +733,12 @@ void flush_scheduled_work(void)
 }
 EXPORT_SYMBOL(flush_scheduled_work);
 
+void flush_work_keventd(struct work_struct *work)
+{
+	flush_work(keventd_wq, work);
+}
+EXPORT_SYMBOL(flush_work_keventd);
+
 /**
  * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
  * @wq:   the controlling workqueue structure
-- 
cgit v1.2.3


From 19a75d83ffeab004cfcfac64024ad3997bac7220 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 9 May 2007 02:33:56 -0700
Subject: kblockd: use flush_work

Switch the kblockd flushing from a global flush to a more specific
flush_work().

(akpm: bypassed maintainers, sorry.  There are other patches which depend on
this)

Cc: "Maciej W. Rozycki" <macro@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Jens Axboe <axboe@suse.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/as-iosched.c     | 2 +-
 block/ll_rw_blk.c      | 7 +++----
 include/linux/blkdev.h | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 640aa839d63f..109e91b91ffa 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1306,7 +1306,7 @@ static void as_exit_queue(elevator_t *e)
 	struct as_data *ad = e->elevator_data;
 
 	del_timer_sync(&ad->antic_timer);
-	kblockd_flush();
+	kblockd_flush_work(&ad->antic_work);
 
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d99d402953a3..c059767c552c 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1712,7 +1712,6 @@ EXPORT_SYMBOL(blk_stop_queue);
 void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->unplug_timer);
-	kblockd_flush();
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -3632,11 +3631,11 @@ int kblockd_schedule_work(struct work_struct *work)
 
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-void kblockd_flush(void)
+void kblockd_flush_work(struct work_struct *work)
 {
-	flush_workqueue(kblockd_workqueue);
+	flush_work(kblockd_workqueue, work);
 }
-EXPORT_SYMBOL(kblockd_flush);
+EXPORT_SYMBOL(kblockd_flush_work);
 
 int __init blk_dev_init(void)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a686eabe22d6..db5b00a792f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -854,7 +854,7 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
-void kblockd_flush(void);
+void kblockd_flush_work(struct work_struct *work);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
-- 
cgit v1.2.3


From 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@comcast.net>
Date: Wed, 9 May 2007 02:34:01 -0700
Subject: relay: use plain timer instead of delayed work

relay doesn't need to use schedule_delayed_work() for waking readers
when a simple timer will do.

Signed-off-by: Tom Zanussi <zanussi@comcast.net>
Cc: Satyam Sharma <satyam.sharma@gmail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/relay.h |  3 ++-
 kernel/relay.c        | 35 ++++++++++++++++-------------------
 2 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/relay.h b/include/linux/relay.h
index 759a0f97bec2..6cd8c4425fc7 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -12,6 +12,7 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -38,7 +39,7 @@ struct rchan_buf
 	size_t subbufs_consumed;	/* count of sub-buffers consumed */
 	struct rchan *chan;		/* associated channel */
 	wait_queue_head_t read_wait;	/* reader wait queue */
-	struct delayed_work wake_readers; /* reader wake-up work struct */
+	struct timer_list timer; 	/* reader wake-up timer */
 	struct dentry *dentry;		/* channel file dentry */
 	struct kref kref;		/* channel buffer refcount */
 	struct page **page_array;	/* array of current buffer pages */
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251c7e28..e804589c863c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
 
 /**
  *	wakeup_readers - wake up readers waiting on a channel
- *	@work: work struct that contains the the channel buffer
+ *	@data: contains the the channel buffer
  *
- *	This is the work function used to defer reader waking.  The
- *	reason waking is deferred is that calling directly from write
- *	causes problems if you're writing from say the scheduler.
+ *	This is the timer function used to defer reader waking.
  */
-static void wakeup_readers(struct work_struct *work)
+static void wakeup_readers(unsigned long data)
 {
-	struct rchan_buf *buf =
-		container_of(work, struct rchan_buf, wake_readers.work);
+	struct rchan_buf *buf = (struct rchan_buf *)data;
 	wake_up_interruptible(&buf->read_wait);
 }
 
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	if (init) {
 		init_waitqueue_head(&buf->read_wait);
 		kref_init(&buf->kref);
-		INIT_DELAYED_WORK(&buf->wake_readers, NULL);
-	} else {
-		cancel_delayed_work(&buf->wake_readers);
-		flush_scheduled_work();
-	}
+		setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+	} else
+		del_timer_sync(&buf->timer);
 
 	buf->subbufs_produced = 0;
 	buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
 static void relay_close_buf(struct rchan_buf *buf)
 {
 	buf->finalized = 1;
-	cancel_delayed_work(&buf->wake_readers);
-	flush_scheduled_work();
+	del_timer_sync(&buf->timer);
 	kref_put(&buf->kref, relay_remove_buf);
 }
 
@@ -608,11 +602,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
 			buf->padding[old_subbuf];
 		smp_mb();
-		if (waitqueue_active(&buf->read_wait)) {
-			PREPARE_DELAYED_WORK(&buf->wake_readers,
-					     wakeup_readers);
-			schedule_delayed_work(&buf->wake_readers, 1);
-		}
+		if (waitqueue_active(&buf->read_wait))
+			/*
+			 * Calling wake_up_interruptible() from here
+			 * will deadlock if we happen to be logging
+			 * from the scheduler (trying to re-grab
+			 * rq->lock), so defer it.
+			 */
+			__mod_timer(&buf->timer, jiffies + 1);
 	}
 
 	old = buf->data;
-- 
cgit v1.2.3


From 6f7cc11aa6c7d5002e16096c7590944daece70ed Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 9 May 2007 02:34:02 -0700
Subject: Extend notifier_call_chain to count nr_calls made

Since 2.6.18-something, the community has been bugged by the problem to
provide a clean and a stable mechanism to postpone a cpu-hotplug event as
lock_cpu_hotplug was badly broken.

This is another proposal towards solving that problem.  This one is along the
lines of the solution provided in kernel/workqueue.c

Instead of having a global mechanism like lock_cpu_hotplug, we allow the
subsytems to define their own per-subsystem hot cpu mutexes.  These would be
taken(released) where ever we are currently calling
lock_cpu_hotplug(unlock_cpu_hotplug).

Also, in the per-subsystem hotcpu callback function,we take this mutex before
we handle any pre-cpu-hotplug events and release it once we finish handling
the post-cpu-hotplug events.  A standard means for doing this has been
provided in [PATCH 2/4] and demonstrated in [PATCH 3/4].

The ordering of these per-subsystem mutexes might still prove to be a
problem, but hopefully lockdep should help us get out of that muddle.

The patch set to be applied against linux-2.6.19-rc5 is as follows:

[PATCH 1/4] :	Extend notifier_call_chain with an option to specify the
		number of notifications to be sent and also count the
		number of notifications actually sent.

[PATCH 2/4] :	Define events CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE
		and send out notifications for these in _cpu_up and
		_cpu_down. This would help us standardise the acquire and
		release of the subsystem locks in the hotcpu
		callback functions of these subsystems.

[PATCH 3/4] :	Eliminate lock_cpu_hotplug from kernel/sched.c.

[PATCH 4/4] :	In workqueue_cpu_callback function, acquire(release) the
		workqueue_mutex while handling
		CPU_LOCK_ACQUIRE(CPU_LOCK_RELEASE).

If the per-subsystem-locking approach survives the test of time, we can expect
a slow phasing out of lock_cpu_hotplug, which has not yet been eliminated in
these patches :)

This patch:

Provide notifier_call_chain with an option to call only a specified number of
notifiers and also record the number of call to notifiers made.

The need for this enhancement was identified in the post entitled
"Slab - Eliminate lock_cpu_hotplug from slab"
(http://lkml.org/lkml/2006/10/28/92) by Ravikiran G Thirumalai and
Andrew Morton.

This patch adds two additional parameters to notifier_call_chain API namely
 - int nr_to_calls : Number of notifier_functions to be called.
 		     The don't care value is -1.

 - unsigned int *nr_calls : Records the total number of notifier_funtions
			    called by notifier_call_chain. The don't care
			    value is NULL.

[michal.k.k.piotrowski@gmail.com: build fix]
Credit: Andrew Morton <akpm@osdl.org>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Michal Piotrowski <michal.k.k.piotrowski@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/notifier.h | 52 +++++++++++++++------------
 kernel/sys.c             | 94 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 107 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 10a43ed0527e..e34221bf8946 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -112,32 +112,40 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 
 #ifdef __KERNEL__
 
-extern int atomic_notifier_chain_register(struct atomic_notifier_head *,
-		struct notifier_block *);
-extern int blocking_notifier_chain_register(struct blocking_notifier_head *,
-		struct notifier_block *);
-extern int raw_notifier_chain_register(struct raw_notifier_head *,
-		struct notifier_block *);
-extern int srcu_notifier_chain_register(struct srcu_notifier_head *,
-		struct notifier_block *);
-
-extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *,
-		struct notifier_block *);
-extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *,
-		struct notifier_block *);
-extern int raw_notifier_chain_unregister(struct raw_notifier_head *,
-		struct notifier_block *);
-extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *,
-		struct notifier_block *);
-
-extern int atomic_notifier_call_chain(struct atomic_notifier_head *,
+extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+		struct notifier_block *nb);
+extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+		struct notifier_block *nb);
+extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
+		struct notifier_block *nb);
+extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+		struct notifier_block *nb);
+
+extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+		struct notifier_block *nb);
+extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+		struct notifier_block *nb);
+extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+		struct notifier_block *nb);
+extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+		struct notifier_block *nb);
+
+extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
 		unsigned long val, void *v);
-extern int blocking_notifier_call_chain(struct blocking_notifier_head *,
+extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+	unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
 		unsigned long val, void *v);
-extern int raw_notifier_call_chain(struct raw_notifier_head *,
+extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+	unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
 		unsigned long val, void *v);
-extern int srcu_notifier_call_chain(struct srcu_notifier_head *,
+extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+	unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 		unsigned long val, void *v);
+extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+	unsigned long val, void *v, int nr_to_call, int *nr_calls);
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
diff --git a/kernel/sys.c b/kernel/sys.c
index 8ecfe3473779..d4985df21b60 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
 	return -ENOENT;
 }
 
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ *	@nl:		Pointer to head of the blocking notifier chain
+ *	@val:		Value passed unmodified to notifier function
+ *	@v:		Pointer passed unmodified to notifier function
+ *	@nr_to_call:	Number of notifier functions to be called. Don't care
+ *		     	value of this parameter is -1.
+ *	@nr_calls:	Records the number of notifications sent. Don't care
+ *		   	value of this field is NULL.
+ * 	@returns:	notifier_call_chain returns the value returned by the
+ *			last notifier function called.
+ */
+
 static int __kprobes notifier_call_chain(struct notifier_block **nl,
-		unsigned long val, void *v)
+					unsigned long val, void *v,
+					int nr_to_call,	int *nr_calls)
 {
 	int ret = NOTIFY_DONE;
 	struct notifier_block *nb, *next_nb;
 
 	nb = rcu_dereference(*nl);
-	while (nb) {
+
+	while (nb && nr_to_call) {
 		next_nb = rcu_dereference(nb->next);
 		ret = nb->notifier_call(nb, val, v);
+
+		if (nr_calls)
+			(*nr_calls)++;
+
 		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
 			break;
 		nb = next_nb;
+		nr_to_call--;
 	}
 	return ret;
 }
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
 EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
 
 /**
- *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ *	__atomic_notifier_call_chain - Call functions in an atomic notifier chain
  *	@nh: Pointer to head of the atomic notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
+ *	@nr_to_call: See the comment for notifier_call_chain.
+ *	@nr_calls: See the comment for notifier_call_chain.
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
  *	of the last notifier function called.
  */
  
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-		unsigned long val, void *v)
+int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+					unsigned long val, void *v,
+					int nr_to_call, int *nr_calls)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = notifier_call_chain(&nh->head, val, v);
+	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
 	rcu_read_unlock();
 	return ret;
 }
 
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
 
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
 /*
  *	Blocking notifier chain routines.  All access to the chain is
  *	synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
 EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
 
 /**
- *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ *	__blocking_notifier_call_chain - Call functions in a blocking notifier chain
  *	@nh: Pointer to head of the blocking notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
+ *	@nr_to_call: See comment for notifier_call_chain.
+ *	@nr_calls: See comment for notifier_call_chain.
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
  *	of the last notifier function called.
  */
  
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
-		unsigned long val, void *v)
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+				   unsigned long val, void *v,
+				   int nr_to_call, int *nr_calls)
 {
 	int ret = NOTIFY_DONE;
 
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
 	 */
 	if (rcu_dereference(nh->head)) {
 		down_read(&nh->rwsem);
-		ret = notifier_call_chain(&nh->head, val, v);
+		ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+					nr_calls);
 		up_read(&nh->rwsem);
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
 
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
 EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
 
 /*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
 EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
 
 /**
- *	raw_notifier_call_chain - Call functions in a raw notifier chain
+ *	__raw_notifier_call_chain - Call functions in a raw notifier chain
  *	@nh: Pointer to head of the raw notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
+ *	@nr_to_call: See comment for notifier_call_chain.
+ *	@nr_calls: See comment for notifier_call_chain
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
  *	of the last notifier function called.
  */
 
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+			      unsigned long val, void *v,
+			      int nr_to_call, int *nr_calls)
+{
+	return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
 int raw_notifier_call_chain(struct raw_notifier_head *nh,
 		unsigned long val, void *v)
 {
-	return notifier_call_chain(&nh->head, val, v);
+	return __raw_notifier_call_chain(nh, val, v, -1, NULL);
 }
 
 EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
 EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
 
 /**
- *	srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ *	__srcu_notifier_call_chain - Call functions in an SRCU notifier chain
  *	@nh: Pointer to head of the SRCU notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
+ *	@nr_to_call: See comment for notifier_call_chain.
+ *	@nr_calls: See comment for notifier_call_chain
  *
  *	Calls each function in a notifier chain in turn.  The functions
  *	run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
  *	of the last notifier function called.
  */
 
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
-		unsigned long val, void *v)
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+			       unsigned long val, void *v,
+			       int nr_to_call, int *nr_calls)
 {
 	int ret;
 	int idx;
 
 	idx = srcu_read_lock(&nh->srcu);
-	ret = notifier_call_chain(&nh->head, val, v);
+	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
 	srcu_read_unlock(&nh->srcu, idx);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
 
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
 EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
 
 /**
-- 
cgit v1.2.3


From baaca49f415b25fdbe2a8f3c22b39929e450fbfd Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 9 May 2007 02:34:03 -0700
Subject: Define and use new events,CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE

This is an attempt to provide an alternate mechanism for postponing
a hotplug event instead of using a global mechanism like lock_cpu_hotplug.

The proposal is to add two new events namely CPU_LOCK_ACQUIRE and
CPU_LOCK_RELEASE. The notification for these two events would be sent
out before and after a cpu_hotplug event respectively.

During the CPU_LOCK_ACQUIRE event, a cpu-hotplug-aware subsystem is
supposed to acquire any per-subsystem hotcpu mutex ( Eg. workqueue_mutex
in kernel/workqueue.c ).

During the CPU_LOCK_RELEASE release event the cpu-hotplug-aware subsystem
is supposed to release the per-subsystem hotcpu mutex.

The reasons for defining new events as opposed to reusing the existing events
like CPU_UP_PREPARE/CPU_UP_FAILED/CPU_ONLINE for locking/unlocking of
per-subsystem hotcpu mutexes are as follow:

	- CPU_LOCK_ACQUIRE: All hotcpu mutexes are taken before subsystems
	start handling pre-hotplug events like CPU_UP_PREPARE/CPU_DOWN_PREPARE
	etc, thus ensuring a clean handling of these events.

	- CPU_LOCK_RELEASE: The hotcpu mutexes will be released only after
	all subsystems have handled post-hotplug events like CPU_DOWN_FAILED,
	CPU_DEAD,CPU_ONLINE etc thereby ensuring that there are no subsequent
	clashes amongst the interdependent subsystems after a cpu hotplugs.

This patch also uses __raw_notifier_call chain in _cpu_up to take care
of the dependency between the two consequetive calls to
raw_notifier_call_chain.

[akpm@linux-foundation.org: fix a bug]
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/notifier.h |  2 ++
 kernel/cpu.c             | 19 ++++++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index e34221bf8946..1903e5490c04 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -194,6 +194,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DOWN_PREPARE	0x0005 /* CPU (unsigned)v going down */
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
+#define CPU_LOCK_ACQUIRE	0x0008 /* Acquire all hotcpu locks */
+#define CPU_LOCK_RELEASE	0x0009 /* Release all hotcpu locks */
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc3..48810498b355 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -132,12 +132,15 @@ static int _cpu_down(unsigned int cpu)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE,
+						(void *)(long)cpu);
 	err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
-		return -EINVAL;
+		err = -EINVAL;
+		goto out_release;
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
@@ -185,6 +188,9 @@ out_thread:
 	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
+out_release:
+	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE,
+						(void *)(long)cpu);
 	return err;
 }
 
@@ -206,13 +212,15 @@ int cpu_down(unsigned int cpu)
 /* Requires cpu_add_remove_lock to be held */
 static int __cpuinit _cpu_up(unsigned int cpu)
 {
-	int ret;
+	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
 
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
-	ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu,
+							-1, &nr_calls);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
 				__FUNCTION__, cpu);
@@ -233,8 +241,9 @@ static int __cpuinit _cpu_up(unsigned int cpu)
 
 out_notify:
 	if (ret != 0)
-		raw_notifier_call_chain(&cpu_chain,
-				CPU_UP_CANCELED, hcpu);
+		__raw_notifier_call_chain(&cpu_chain,
+				CPU_UP_CANCELED, hcpu, nr_calls, NULL);
+	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 7097a87afe937a5879528d52880c2d95f089e96c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:34:10 -0700
Subject: workqueue: kill run_scheduled_work()

Because it has no callers.

Actually, I think the whole idea of run_scheduled_work() was not right, not
good to mix "unqueue this work and execute its ->func()" in one function.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h |  1 -
 kernel/workqueue.c        | 73 -----------------------------------------------
 2 files changed, 74 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 26a70992dec8..2a58f16e1961 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -182,7 +182,6 @@ extern void flush_work(struct workqueue_struct *wq, struct work_struct *work);
 extern void flush_work_keventd(struct work_struct *work);
 
 extern int FASTCALL(schedule_work(struct work_struct *work));
-extern int FASTCALL(run_scheduled_work(struct work_struct *work));
 extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
 
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a981add58fb9..ea422254f8bf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -98,79 +98,6 @@ static inline void *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
-static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
-{
-	int ret = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cwq->lock, flags);
-	/*
-	 * We need to re-validate the work info after we've gotten
-	 * the cpu_workqueue lock. We can run the work now iff:
-	 *
-	 *  - the wq_data still matches the cpu_workqueue_struct
-	 *  - AND the work is still marked pending
-	 *  - AND the work is still on a list (which will be this
-	 *    workqueue_struct list)
-	 *
-	 * All these conditions are important, because we
-	 * need to protect against the work being run right
-	 * now on another CPU (all but the last one might be
-	 * true if it's currently running and has not been
-	 * released yet, for example).
-	 */
-	if (get_wq_data(work) == cwq
-	    && work_pending(work)
-	    && !list_empty(&work->entry)) {
-		work_func_t f = work->func;
-		cwq->current_work = work;
-		list_del_init(&work->entry);
-		spin_unlock_irqrestore(&cwq->lock, flags);
-
-		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
-			work_release(work);
-		f(work);
-
-		spin_lock_irqsave(&cwq->lock, flags);
-		cwq->current_work = NULL;
-		ret = 1;
-	}
-	spin_unlock_irqrestore(&cwq->lock, flags);
-	return ret;
-}
-
-/**
- * run_scheduled_work - run scheduled work synchronously
- * @work: work to run
- *
- * This checks if the work was pending, and runs it
- * synchronously if so. It returns a boolean to indicate
- * whether it had any scheduled work to run or not.
- *
- * NOTE! This _only_ works for normal work_structs. You
- * CANNOT use this for delayed work, because the wq data
- * for delayed work will not point properly to the per-
- * CPU workqueue struct, but will change!
- */
-int fastcall run_scheduled_work(struct work_struct *work)
-{
-	for (;;) {
-		struct cpu_workqueue_struct *cwq;
-
-		if (!work_pending(work))
-			return 0;
-		if (list_empty(&work->entry))
-			return 0;
-		/* NOTE! This depends intimately on __queue_work! */
-		cwq = get_wq_data(work);
-		if (!cwq)
-			return 0;
-		if (__run_work(cwq, work))
-			return 1;
-	}
-}
-EXPORT_SYMBOL(run_scheduled_work);
-
 static void insert_work(struct cpu_workqueue_struct *cwq,
 				struct work_struct *work, int tail)
 {
-- 
cgit v1.2.3


From 1634c48f8b85dcb05101f1eb2eab9af40b5976da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:34:18 -0700
Subject: make cancel_rearming_delayed_work() work on any workqueue, not just
 keventd_wq

cancel_rearming_delayed_workqueue(wq, dwork) doesn't need the first
parameter.  We don't hang on un-queued dwork any longer, and work->data
doesn't change its type.  This means we can always figure out "wq" from
dwork when it is needed.

Remove this parameter, and rename the function to
cancel_rearming_delayed_work().  Re-create an inline "obsolete"
cancel_rearming_delayed_workqueue(wq) which just calls
cancel_rearming_delayed_work().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h | 13 ++++++++++---
 kernel/workqueue.c        | 27 +++++++++------------------
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2a58f16e1961..27110c04f21e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -191,9 +191,6 @@ extern int current_is_keventd(void);
 extern int keventd_up(void);
 
 extern void init_workqueues(void);
-void cancel_rearming_delayed_work(struct delayed_work *work);
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
-				       struct delayed_work *);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 /*
@@ -212,4 +209,14 @@ static inline int cancel_delayed_work(struct delayed_work *work)
 	return ret;
 }
 
+extern void cancel_rearming_delayed_work(struct delayed_work *work);
+
+/* Obsolete. use cancel_rearming_delayed_work() */
+static inline
+void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
+					struct delayed_work *work)
+{
+	cancel_rearming_delayed_work(work);
+}
+
 #endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 985902e2e071..41eaffd125ca 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -555,32 +555,23 @@ void flush_work_keventd(struct work_struct *work)
 EXPORT_SYMBOL(flush_work_keventd);
 
 /**
- * cancel_rearming_delayed_workqueue - kill off a delayed work whose handler rearms the delayed work.
- * @wq:   the controlling workqueue structure
+ * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work.
  * @dwork: the delayed work struct
  *
  * Note that the work callback function may still be running on return from
  * cancel_delayed_work(). Run flush_workqueue() or flush_work() to wait on it.
  */
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
-				       struct delayed_work *dwork)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
 {
-	/* Was it ever queued ? */
-	if (!get_wq_data(&dwork->work))
-		return;
+	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
 
-	while (!cancel_delayed_work(dwork))
-		flush_workqueue(wq);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
+	/* Was it ever queued ? */
+	if (cwq != NULL) {
+		struct workqueue_struct *wq = cwq->wq;
 
-/**
- * cancel_rearming_delayed_work - kill off a delayed keventd work whose handler rearms the delayed work.
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
-{
-	cancel_rearming_delayed_workqueue(keventd_wq, dwork);
+		while (!cancel_delayed_work(dwork))
+			flush_workqueue(wq);
+	}
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
-- 
cgit v1.2.3


From 23b2e5991afde5af91a1a661d7f47ee56120759e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:34:19 -0700
Subject: workqueue: kill NOAUTOREL works

We don't have any users, and it is not so trivial to use NOAUTOREL works
correctly.  It is better to simplify API.

Delete NOAUTOREL support and rename work_release to work_clear_pending to
avoid a confusion.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h | 64 +++++++----------------------------------------
 kernel/workqueue.c        |  5 ++--
 2 files changed, 11 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 27110c04f21e..e1581dce5890 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -24,15 +24,13 @@ typedef void (*work_func_t)(struct work_struct *work);
 struct work_struct {
 	atomic_long_t data;
 #define WORK_STRUCT_PENDING 0		/* T if work item pending execution */
-#define WORK_STRUCT_NOAUTOREL 1		/* F if work item automatically released on exec */
 #define WORK_STRUCT_FLAG_MASK (3UL)
 #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
 	struct list_head entry;
 	work_func_t func;
 };
 
-#define WORK_DATA_INIT(autorelease) \
-	ATOMIC_LONG_INIT((autorelease) << WORK_STRUCT_NOAUTOREL)
+#define WORK_DATA_INIT()	ATOMIC_LONG_INIT(0)
 
 struct delayed_work {
 	struct work_struct work;
@@ -44,14 +42,8 @@ struct execute_work {
 };
 
 #define __WORK_INITIALIZER(n, f) {				\
-	.data = WORK_DATA_INIT(0),				\
-        .entry	= { &(n).entry, &(n).entry },			\
-	.func = (f),						\
-	}
-
-#define __WORK_INITIALIZER_NAR(n, f) {				\
-	.data = WORK_DATA_INIT(1),				\
-        .entry	= { &(n).entry, &(n).entry },			\
+	.data = WORK_DATA_INIT(),				\
+	.entry	= { &(n).entry, &(n).entry },			\
 	.func = (f),						\
 	}
 
@@ -60,23 +52,12 @@ struct execute_work {
 	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
 	}
 
-#define __DELAYED_WORK_INITIALIZER_NAR(n, f) {			\
-	.work = __WORK_INITIALIZER_NAR((n).work, (f)),		\
-	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
-	}
-
 #define DECLARE_WORK(n, f)					\
 	struct work_struct n = __WORK_INITIALIZER(n, f)
 
-#define DECLARE_WORK_NAR(n, f)					\
-	struct work_struct n = __WORK_INITIALIZER_NAR(n, f)
-
 #define DECLARE_DELAYED_WORK(n, f)				\
 	struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
 
-#define DECLARE_DELAYED_WORK_NAR(n, f)			\
-	struct dwork_struct n = __DELAYED_WORK_INITIALIZER_NAR(n, f)
-
 /*
  * initialize a work item's function pointer
  */
@@ -95,16 +76,9 @@ struct execute_work {
  * assignment of the work data initializer allows the compiler
  * to generate better code.
  */
-#define INIT_WORK(_work, _func)					\
-	do {							\
-		(_work)->data = (atomic_long_t) WORK_DATA_INIT(0);	\
-		INIT_LIST_HEAD(&(_work)->entry);		\
-		PREPARE_WORK((_work), (_func));			\
-	} while (0)
-
-#define INIT_WORK_NAR(_work, _func)					\
+#define INIT_WORK(_work, _func)						\
 	do {								\
-		(_work)->data = (atomic_long_t) WORK_DATA_INIT(1);	\
+		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		PREPARE_WORK((_work), (_func));				\
 	} while (0)
@@ -115,12 +89,6 @@ struct execute_work {
 		init_timer(&(_work)->timer);			\
 	} while (0)
 
-#define INIT_DELAYED_WORK_NAR(_work, _func)			\
-	do {							\
-		INIT_WORK_NAR(&(_work)->work, (_func));		\
-		init_timer(&(_work)->timer);			\
-	} while (0)
-
 #define INIT_DELAYED_WORK_DEFERRABLE(_work, _func)			\
 	do {							\
 		INIT_WORK(&(_work)->work, (_func));		\
@@ -143,24 +111,10 @@ struct execute_work {
 	work_pending(&(w)->work)
 
 /**
- * work_release - Release a work item under execution
- * @work: The work item to release
- *
- * This is used to release a work item that has been initialised with automatic
- * release mode disabled (WORK_STRUCT_NOAUTOREL is set).  This gives the work
- * function the opportunity to grab auxiliary data from the container of the
- * work_struct before clearing the pending bit as the work_struct may be
- * subject to deallocation the moment the pending bit is cleared.
- *
- * In such a case, this should be called in the work function after it has
- * fetched any data it may require from the containter of the work_struct.
- * After this function has been called, the work_struct may be scheduled for
- * further execution or it may be deallocated unless other precautions are
- * taken.
- *
- * This should also be used to release a delayed work item.
+ * work_clear_pending - for internal use only, mark a work item as not pending
+ * @work: The work item in question
  */
-#define work_release(work) \
+#define work_clear_pending(work) \
 	clear_bit(WORK_STRUCT_PENDING, work_data_bits(work))
 
 
@@ -205,7 +159,7 @@ static inline int cancel_delayed_work(struct delayed_work *work)
 
 	ret = del_timer(&work->timer);
 	if (ret)
-		work_release(&work->work);
+		work_clear_pending(&work->work);
 	return ret;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41eaffd125ca..0611de815a8f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -246,8 +246,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		spin_unlock_irq(&cwq->lock);
 
 		BUG_ON(get_wq_data(work) != cwq);
-		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
-			work_release(work);
+		work_clear_pending(work);
 		f(work);
 
 		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -453,7 +452,7 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work)
 	 */
 	spin_lock_irq(&cwq->lock);
 	list_del_init(&work->entry);
-	work_release(work);
+	work_clear_pending(work);
 	spin_unlock_irq(&cwq->lock);
 
 	for_each_cpu_mask(cpu, *cpu_map)
-- 
cgit v1.2.3


From 28e53bddf814485699a4142bc056fd37d4e11dd4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:34:22 -0700
Subject: unify flush_work/flush_work_keventd and rename it to cancel_work_sync

flush_work(wq, work) doesn't need the first parameter, we can use cwq->wq
(this was possible from the very beginnig, I missed this).  So we can unify
flush_work_keventd and flush_work.

Also, rename flush_work() to cancel_work_sync() and fix all callers.
Perhaps this is not the best name, but "flush_work" is really bad.

(akpm: this is why the earlier patches bypassed maintainers)

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Auke Kok <auke-jan.h.kok@intel.com>,
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/ll_rw_blk.c              |  2 +-
 drivers/ata/libata-core.c      |  8 ++++----
 drivers/net/e1000/e1000_main.c |  2 +-
 drivers/net/phy/phy.c          |  4 ++--
 drivers/net/tg3.c              |  2 +-
 fs/aio.c                       |  4 ++--
 include/linux/workqueue.h      | 21 ++++++++++++---------
 kernel/workqueue.c             | 36 +++++++++++++++++-------------------
 net/ipv4/ipvs/ip_vs_ctl.c      |  2 +-
 9 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c059767c552c..df506571ed60 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3633,7 +3633,7 @@ EXPORT_SYMBOL(kblockd_schedule_work);
 
 void kblockd_flush_work(struct work_struct *work)
 {
-	flush_work(kblockd_workqueue, work);
+	cancel_work_sync(work);
 }
 EXPORT_SYMBOL(kblockd_flush_work);
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index b74e56caba6f..fef87dd70d17 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1316,7 +1316,7 @@ void ata_port_flush_task(struct ata_port *ap)
 	spin_unlock_irqrestore(ap->lock, flags);
 
 	DPRINTK("flush #1\n");
-	flush_work(ata_wq, &ap->port_task.work); /* akpm: seems unneeded */
+	cancel_work_sync(&ap->port_task.work); /* akpm: seems unneeded */
 
 	/*
 	 * At this point, if a task is running, it's guaranteed to see
@@ -1327,7 +1327,7 @@ void ata_port_flush_task(struct ata_port *ap)
 		if (ata_msg_ctl(ap))
 			ata_port_printk(ap, KERN_DEBUG, "%s: flush #2\n",
 					__FUNCTION__);
-		flush_work(ata_wq, &ap->port_task.work);
+		cancel_work_sync(&ap->port_task.work);
 	}
 
 	spin_lock_irqsave(ap->lock, flags);
@@ -6475,9 +6475,9 @@ void ata_port_detach(struct ata_port *ap)
 	/* Flush hotplug task.  The sequence is similar to
 	 * ata_port_flush_task().
 	 */
-	flush_work(ata_aux_wq, &ap->hotplug_task.work); /* akpm: why? */
+	cancel_work_sync(&ap->hotplug_task.work); /* akpm: why? */
 	cancel_delayed_work(&ap->hotplug_task);
-	flush_work(ata_aux_wq, &ap->hotplug_task.work);
+	cancel_work_sync(&ap->hotplug_task.work);
 
  skip_eh:
 	/* remove the associated SCSI host */
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 397e25bdbfec..637ae8f68791 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1214,7 +1214,7 @@ e1000_remove(struct pci_dev *pdev)
 	int i;
 #endif
 
-	flush_work_keventd(&adapter->reset_task);
+	cancel_work_sync(&adapter->reset_task);
 
 	e1000_release_manageability(adapter);
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index f445c465b14e..f71dab347667 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -663,9 +663,9 @@ int phy_stop_interrupts(struct phy_device *phydev)
 
 	/*
 	 * Finish any pending work; we might have been scheduled to be called
-	 * from keventd ourselves, but flush_work_keventd() handles that.
+	 * from keventd ourselves, but cancel_work_sync() handles that.
 	 */
-	flush_work_keventd(&phydev->phy_queue);
+	cancel_work_sync(&phydev->phy_queue);
 
 	free_irq(phydev->irq, phydev);
 
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 0c0f9c817321..923b9c725cc3 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -7386,7 +7386,7 @@ static int tg3_close(struct net_device *dev)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
-	flush_work_keventd(&tp->reset_task);
+	cancel_work_sync(&tp->reset_task);
 
 	netif_stop_queue(dev);
 
diff --git a/fs/aio.c b/fs/aio.c
index d18690bb03e9..ac1c1587aa02 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -348,7 +348,7 @@ void fastcall exit_aio(struct mm_struct *mm)
 		/*
 		 * Ensure we don't leave the ctx on the aio_wq
 		 */
-		flush_work(aio_wq, &ctx->wq.work);
+		cancel_work_sync(&ctx->wq.work);
 
 		if (1 != atomic_read(&ctx->users))
 			printk(KERN_DEBUG
@@ -371,7 +371,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
 	BUG_ON(ctx->reqs_active);
 
 	cancel_delayed_work(&ctx->wq);
-	flush_work(aio_wq, &ctx->wq.work);
+	cancel_work_sync(&ctx->wq.work);
 	aio_free_ring(ctx);
 	mmdrop(ctx->mm);
 	ctx->mm = NULL;
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index e1581dce5890..d555f31c0746 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -128,30 +128,33 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
-extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay));
+extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
+			struct delayed_work *work, unsigned long delay));
 extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-	struct delayed_work *work, unsigned long delay);
+			struct delayed_work *work, unsigned long delay);
+
 extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
-extern void flush_work(struct workqueue_struct *wq, struct work_struct *work);
-extern void flush_work_keventd(struct work_struct *work);
+extern void flush_scheduled_work(void);
 
 extern int FASTCALL(schedule_work(struct work_struct *work));
-extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
-
-extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
+extern int FASTCALL(schedule_delayed_work(struct delayed_work *work,
+					unsigned long delay));
+extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
+					unsigned long delay);
 extern int schedule_on_each_cpu(work_func_t func);
-extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
 
 extern void init_workqueues(void);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
+extern void cancel_work_sync(struct work_struct *work);
+
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
  * function may still be running on return from cancel_delayed_work(), unless
  * it returns 1 and the work doesn't re-arm itself. Run flush_workqueue() or
- * flush_work() or cancel_work_sync() to wait on it.
+ * cancel_work_sync() to wait on it.
  */
 static inline int cancel_delayed_work(struct delayed_work *work)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 63885abf1ba0..c9ab4293904f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -413,23 +413,23 @@ static void wait_on_work(struct cpu_workqueue_struct *cwq,
 }
 
 /**
- * flush_work - block until a work_struct's callback has terminated
- * @wq: the workqueue on which the work is queued
+ * cancel_work_sync - block until a work_struct's callback has terminated
  * @work: the work which is to be flushed
  *
- * flush_work() will attempt to cancel the work if it is queued.  If the work's
- * callback appears to be running, flush_work() will block until it has
- * completed.
+ * cancel_work_sync() will attempt to cancel the work if it is queued. If the
+ * work's callback appears to be running, cancel_work_sync() will block until
+ * it has completed.
  *
- * flush_work() is designed to be used when the caller is tearing down data
- * structures which the callback function operates upon.  It is expected that,
- * prior to calling flush_work(), the caller has arranged for the work to not
- * be requeued.
+ * cancel_work_sync() is designed to be used when the caller is tearing down
+ * data structures which the callback function operates upon. It is expected
+ * that, prior to calling cancel_work_sync(), the caller has arranged for the
+ * work to not be requeued.
  */
-void flush_work(struct workqueue_struct *wq, struct work_struct *work)
+void cancel_work_sync(struct work_struct *work)
 {
-	const cpumask_t *cpu_map = wq_cpu_map(wq);
 	struct cpu_workqueue_struct *cwq;
+	struct workqueue_struct *wq;
+	const cpumask_t *cpu_map;
 	int cpu;
 
 	might_sleep();
@@ -448,10 +448,13 @@ void flush_work(struct workqueue_struct *wq, struct work_struct *work)
 	work_clear_pending(work);
 	spin_unlock_irq(&cwq->lock);
 
+	wq = cwq->wq;
+	cpu_map = wq_cpu_map(wq);
+
 	for_each_cpu_mask(cpu, *cpu_map)
 		wait_on_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
-EXPORT_SYMBOL_GPL(flush_work);
+EXPORT_SYMBOL_GPL(cancel_work_sync);
 
 
 static struct workqueue_struct *keventd_wq;
@@ -540,18 +543,13 @@ void flush_scheduled_work(void)
 }
 EXPORT_SYMBOL(flush_scheduled_work);
 
-void flush_work_keventd(struct work_struct *work)
-{
-	flush_work(keventd_wq, work);
-}
-EXPORT_SYMBOL(flush_work_keventd);
-
 /**
  * cancel_rearming_delayed_work - kill off a delayed work whose handler rearms the delayed work.
  * @dwork: the delayed work struct
  *
  * Note that the work callback function may still be running on return from
- * cancel_delayed_work(). Run flush_workqueue() or flush_work() to wait on it.
+ * cancel_delayed_work(). Run flush_workqueue() or cancel_work_sync() to wait
+ * on it.
  */
 void cancel_rearming_delayed_work(struct delayed_work *dwork)
 {
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 342e836677a1..68fe1d4d0210 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2387,7 +2387,7 @@ void ip_vs_control_cleanup(void)
 	EnterFunction(2);
 	ip_vs_trash_cleanup();
 	cancel_rearming_delayed_work(&defense_work);
-	flush_work_keventd(&defense_work.work);
+	cancel_work_sync(&defense_work.work);
 	ip_vs_kill_estimator(&ip_vs_stats);
 	unregister_sysctl_table(sysctl_header);
 	proc_net_remove("ip_vs_stats");
-- 
cgit v1.2.3


From 73c279927f89561ecb45b2dfdf9314bafcfd9f67 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 May 2007 02:34:32 -0700
Subject: kthread: don't depend on work queues

Currently there is a circular reference between work queue initialization
and kthread initialization.  This prevents the kthread infrastructure from
initializing until after work queues have been initialized.

We want the properties of tasks created with kthread_create to be as close
as possible to the init_task and to not be contaminated by user processes.
The later we start our kthreadd that creates these tasks the harder it is
to avoid contamination from user processes and the more of a mess we have
to clean up because the defaults have changed on us.

So this patch modifies the kthread support to not use work queues but to
instead use a simple list of structures, and to have kthreadd start from
init_task immediately after our kernel thread that execs /sbin/init.

By being a true child of init_task we only have to change those process
settings that we want to have different from init_task, such as our process
name, the cpus that are allowed, blocking all signals and setting SIGCHLD
to SIG_IGN so that all of our children are reaped automatically.

By being a true child of init_task we also naturally get our ppid set to 0
and do not wind up as a child of PID == 1.  Ensuring that tasks generated
by kthread_create will not slow down the functioning of the wait family of
functions.

[akpm@linux-foundation.org: use interruptible sleeps]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kthread.h |   3 ++
 init/main.c             |   5 ++
 kernel/kthread.c        | 124 ++++++++++++++++++++++++++----------------------
 3 files changed, 76 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 1c65e7a9f186..00dd957e245b 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -30,4 +30,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
 int kthread_should_stop(void);
 
+int kthreadd(void *unused);
+extern struct task_struct *kthreadd_task;
+
 #endif /* _LINUX_KTHREAD_H */
diff --git a/init/main.c b/init/main.c
index c1537e0ddceb..e8d080cab443 100644
--- a/init/main.c
+++ b/init/main.c
@@ -54,6 +54,7 @@
 #include <linux/lockdep.h>
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
+#include <linux/kthread.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -425,8 +426,12 @@ static void __init setup_command_line(char *command_line)
 static void noinline rest_init(void)
 	__releases(kernel_lock)
 {
+	int pid;
+
 	kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
+	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+	kthreadd_task = find_task_by_pid(pid);
 	unlock_kernel();
 
 	/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4e..0eb0070a3c57 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
 /* Kernel thread helper functions.
  *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
  *
- * Creation is done via keventd, so that we get a clean environment
+ * Creation is done via kthreadd, so that we get a clean environment
  * even if we're invoked from userspace (think modprobe, hotplug cpu,
  * etc.).
  */
@@ -15,24 +15,22 @@
 #include <linux/mutex.h>
 #include <asm/semaphore.h>
 
-/*
- * We dont want to execute off keventd since it might
- * hold a semaphore our callers hold too:
- */
-static struct workqueue_struct *helper_wq;
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
 
 struct kthread_create_info
 {
-	/* Information passed to kthread() from keventd. */
+	/* Information passed to kthread() from kthreadd. */
 	int (*threadfn)(void *data);
 	void *data;
 	struct completion started;
 
-	/* Result passed back to kthread_create() from keventd. */
+	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
 	struct completion done;
 
-	struct work_struct work;
+	struct list_head list;
 };
 
 struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
-static void kthread_exit_files(void)
-{
-	struct fs_struct *fs;
-	struct task_struct *tsk = current;
-
-	exit_fs(tsk);		/* current->fs->count--; */
-	fs = init_task.fs;
-	tsk->fs = fs;
-	atomic_inc(&fs->count);
- 	exit_files(tsk);
-	current->files = init_task.files;
-	atomic_inc(&tsk->files->count);
-}
-
 static int kthread(void *_create)
 {
 	struct kthread_create_info *create = _create;
 	int (*threadfn)(void *data);
 	void *data;
-	sigset_t blocked;
 	int ret = -EINTR;
 
-	kthread_exit_files();
-
-	/* Copy data: it's on keventd's stack */
+	/* Copy data: it's on kthread's stack */
 	threadfn = create->threadfn;
 	data = create->data;
 
-	/* Block and flush all signals (in case we're not from keventd). */
-	sigfillset(&blocked);
-	sigprocmask(SIG_BLOCK, &blocked, NULL);
-	flush_signals(current);
-
-	/* By default we can run anywhere, unlike keventd. */
-	set_cpus_allowed(current, CPU_MASK_ALL);
-
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_INTERRUPTIBLE);
 	complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
 	return 0;
 }
 
-/* We are keventd: create a thread. */
-static void keventd_create_kthread(struct work_struct *work)
+static void create_kthread(struct kthread_create_info *create)
 {
-	struct kthread_create_info *create =
-		container_of(work, struct kthread_create_info, work);
 	int pid;
 
 	/* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	create.data = data;
 	init_completion(&create.started);
 	init_completion(&create.done);
-	INIT_WORK(&create.work, keventd_create_kthread);
-
-	/*
-	 * The workqueue needs to start up first:
-	 */
-	if (!helper_wq)
-		create.work.func(&create.work);
-	else {
-		queue_work(helper_wq, &create.work);
-		wait_for_completion(&create.done);
-	}
+
+	spin_lock(&kthread_create_lock);
+	list_add_tail(&create.list, &kthread_create_list);
+	wake_up_process(kthreadd_task);
+	spin_unlock(&kthread_create_lock);
+
+	wait_for_completion(&create.done);
+
 	if (!IS_ERR(create.result)) {
 		va_list args;
 		va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 			  namefmt, args);
 		va_end(args);
 	}
-
 	return create.result;
 }
 EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,58 @@ int kthread_stop(struct task_struct *k)
 }
 EXPORT_SYMBOL(kthread_stop);
 
-static __init int helper_init(void)
+
+static __init void kthreadd_setup(void)
 {
-	helper_wq = create_singlethread_workqueue("kthread");
-	BUG_ON(!helper_wq);
+	struct task_struct *tsk = current;
+	struct k_sigaction sa;
+	sigset_t blocked;
 
-	return 0;
+	set_task_comm(tsk, "kthreadd");
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(tsk);
+
+	/* SIG_IGN makes children autoreap: see do_notify_parent(). */
+	sa.sa.sa_handler = SIG_IGN;
+	sa.sa.sa_flags = 0;
+	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+
+	set_user_nice(current, -5);
+	set_cpus_allowed(current, CPU_MASK_ALL);
 }
 
-core_initcall(helper_init);
+int kthreadd(void *unused)
+{
+	/* Setup a clean context for our children to inherit. */
+	kthreadd_setup();
+
+	current->flags |= PF_NOFREEZE;
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&kthread_create_list))
+			schedule();
+		__set_current_state(TASK_RUNNING);
+
+		spin_lock(&kthread_create_lock);
+		while (!list_empty(&kthread_create_list)) {
+			struct kthread_create_info *create;
+
+			create = list_entry(kthread_create_list.next,
+					    struct kthread_create_info, list);
+			list_del_init(&create->list);
+			spin_unlock(&kthread_create_lock);
+
+			create_kthread(create);
+
+			spin_lock(&kthread_create_lock);
+		}
+		spin_unlock(&kthread_create_lock);
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 10ab825bdef8df510f99c703a5a2d9b13a4e31a5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 9 May 2007 02:34:37 -0700
Subject: change kernel threads to ignore signals instead of blocking them

Currently kernel threads use sigprocmask(SIG_BLOCK) to protect against
signals.  This doesn't prevent the signal delivery, this only blocks
signal_wake_up().  Every "killall -33 kthreadd" means a "struct siginfo"
leak.

Change kthreadd_setup() to set all handlers to SIG_IGN instead of blocking
them (make a new helper ignore_signals() for that).  If the kernel thread
needs some signal, it should use allow_signal() anyway, and in that case it
should not use CLONE_SIGHAND.

Note that we can't change daemonize() (should die!) in the same way,
because it can be used along with CLONE_SIGHAND.  This means that
allow_signal() still should unblock the signal to work correctly with
daemonize()ed threads.

However, disallow_signal() doesn't block the signal any longer but ignores
it.

NOTE: with or without this patch the kernel threads are not protected from
handle_stop_signal(), this seems harmless, but not good.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 kernel/exit.c         |  2 +-
 kernel/kthread.c      | 17 +++--------------
 kernel/signal.c       | 10 ++++++++++
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d95c480f58d..28000b1658f9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1317,6 +1317,7 @@ extern int in_egroup_p(gid_t);
 
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
+extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index bc982cd72743..b0c6f0c3a2df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
 		return -EINVAL;
 
 	spin_lock_irq(&current->sighand->siglock);
-	sigaddset(&current->blocked, sig);
+	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 	return 0;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0eb0070a3c57..df8a8e8f6ca4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -215,24 +215,13 @@ EXPORT_SYMBOL(kthread_stop);
 static __init void kthreadd_setup(void)
 {
 	struct task_struct *tsk = current;
-	struct k_sigaction sa;
-	sigset_t blocked;
 
 	set_task_comm(tsk, "kthreadd");
 
-	/* Block and flush all signals */
-	sigfillset(&blocked);
-	sigprocmask(SIG_BLOCK, &blocked, NULL);
-	flush_signals(tsk);
+	ignore_signals(tsk);
 
-	/* SIG_IGN makes children autoreap: see do_notify_parent(). */
-	sa.sa.sa_handler = SIG_IGN;
-	sa.sa.sa_flags = 0;
-	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
-	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
-
-	set_user_nice(current, -5);
-	set_cpus_allowed(current, CPU_MASK_ALL);
+	set_user_nice(tsk, -5);
+	set_cpus_allowed(tsk, CPU_MASK_ALL);
 }
 
 int kthreadd(void *unused)
diff --git a/kernel/signal.c b/kernel/signal.c
index 23ae6d62fc41..2ac3a668d9dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -209,6 +209,16 @@ void flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
+void ignore_signals(struct task_struct *t)
+{
+	int i;
+
+	for (i = 0; i < _NSIG; ++i)
+		t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+	flush_signals(t);
+}
+
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3


From 8842c9655b2b7f0e8e6c50a773b649e5d8a57678 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 9 May 2007 02:34:46 -0700
Subject: remove nfs4_acl_add_ace()

nfs4_acl_add_ace() can now be removed.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Neil Brown <neilb@cse.unsw.edu.au>
Acked-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4acl.c        | 17 -----------------
 include/linux/nfs4_acl.h |  1 -
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 673a53c014a3..cc3b7badd486 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -137,7 +137,6 @@ struct ace_container {
 static short ace2type(struct nfs4_ace *);
 static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
 				unsigned int);
-void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
 
 struct nfs4_acl *
 nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
@@ -785,21 +784,6 @@ nfs4_acl_new(int n)
 	return acl;
 }
 
-void
-nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
-		int whotype, uid_t who)
-{
-	struct nfs4_ace *ace = acl->aces + acl->naces;
-
-	ace->type = type;
-	ace->flag = flag;
-	ace->access_mask = access_mask;
-	ace->whotype = whotype;
-	ace->who = who;
-
-	acl->naces++;
-}
-
 static struct {
 	char *string;
 	int   stringlen;
@@ -851,6 +835,5 @@ nfs4_acl_write_who(int who, char *p)
 }
 
 EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_add_ace);
 EXPORT_SYMBOL(nfs4_acl_get_whotype);
 EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h
index 409b6e02f337..c9c05a78e9bb 100644
--- a/include/linux/nfs4_acl.h
+++ b/include/linux/nfs4_acl.h
@@ -44,7 +44,6 @@
 #define NFS4_ACL_MAX 170
 
 struct nfs4_acl *nfs4_acl_new(int);
-void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
 int nfs4_acl_get_whotype(char *, u32);
 int nfs4_acl_write_who(int who, char *p);
 int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
-- 
cgit v1.2.3


From 7ac1bea5507218da03f6005d228789da5a831c3f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 9 May 2007 02:34:48 -0700
Subject: knfsd: rename sk_defer_lock to sk_lock

Now that sk_defer_lock protects two different things, make the name more
generic.

Also don't bother with disabling _bh as the lock is only ever taken from
process context.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sunrpc/svcsock.h |  3 ++-
 net/sunrpc/svcauth_unix.c      | 10 +++++-----
 net/sunrpc/svcsock.c           | 13 +++++++------
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 7909687557bf..e21dd93ac4b7 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -37,7 +37,8 @@ struct svc_sock {
 
 	atomic_t    	    	sk_reserved;	/* space on outq that is reserved */
 
-	spinlock_t		sk_defer_lock;	/* protects sk_deferred */
+	spinlock_t		sk_lock;	/* protects sk_deferred and
+						 * sk_info_authunix */
 	struct list_head	sk_deferred;	/* deferred requests that need to
 						 * be revisted */
 	struct mutex		sk_mutex;	/* to serialize sending data */
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2bd23ea2aa8b..07dcd20cbee4 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -385,7 +385,7 @@ ip_map_cached_get(struct svc_rqst *rqstp)
 {
 	struct ip_map *ipm;
 	struct svc_sock *svsk = rqstp->rq_sock;
-	spin_lock_bh(&svsk->sk_defer_lock);
+	spin_lock(&svsk->sk_lock);
 	ipm = svsk->sk_info_authunix;
 	if (ipm != NULL) {
 		if (!cache_valid(&ipm->h)) {
@@ -395,13 +395,13 @@ ip_map_cached_get(struct svc_rqst *rqstp)
 			 * same IP address.
 			 */
 			svsk->sk_info_authunix = NULL;
-			spin_unlock_bh(&svsk->sk_defer_lock);
+			spin_unlock(&svsk->sk_lock);
 			cache_put(&ipm->h, &ip_map_cache);
 			return NULL;
 		}
 		cache_get(&ipm->h);
 	}
-	spin_unlock_bh(&svsk->sk_defer_lock);
+	spin_unlock(&svsk->sk_lock);
 	return ipm;
 }
 
@@ -410,14 +410,14 @@ ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
 {
 	struct svc_sock *svsk = rqstp->rq_sock;
 
-	spin_lock_bh(&svsk->sk_defer_lock);
+	spin_lock(&svsk->sk_lock);
 	if (svsk->sk_sock->type == SOCK_STREAM &&
 	    svsk->sk_info_authunix == NULL) {
 		/* newly cached, keep the reference */
 		svsk->sk_info_authunix = ipm;
 		ipm = NULL;
 	}
-	spin_unlock_bh(&svsk->sk_defer_lock);
+	spin_unlock(&svsk->sk_lock);
 	if (ipm)
 		cache_put(&ipm->h, &ip_map_cache);
 }
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 22f61aee4824..fdb1386f5dcd 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -53,7 +53,8 @@
  * 	svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
  *	when both need to be taken (rare), svc_serv->sv_lock is first.
  *	BKL protects svc_serv->sv_nrthread.
- *	svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list
+ *	svc_sock->sk_lock protects the svc_sock->sk_deferred list
+ *             and the ->sk_info_authunix cache.
  *	svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
  *
  *	Some flags can be set to certain values at any time
@@ -1633,7 +1634,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	svsk->sk_server = serv;
 	atomic_set(&svsk->sk_inuse, 1);
 	svsk->sk_lastrecv = get_seconds();
-	spin_lock_init(&svsk->sk_defer_lock);
+	spin_lock_init(&svsk->sk_lock);
 	INIT_LIST_HEAD(&svsk->sk_deferred);
 	INIT_LIST_HEAD(&svsk->sk_ready);
 	mutex_init(&svsk->sk_mutex);
@@ -1857,9 +1858,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
 	dprintk("revisit queued\n");
 	svsk = dr->svsk;
 	dr->svsk = NULL;
-	spin_lock_bh(&svsk->sk_defer_lock);
+	spin_lock(&svsk->sk_lock);
 	list_add(&dr->handle.recent, &svsk->sk_deferred);
-	spin_unlock_bh(&svsk->sk_defer_lock);
+	spin_unlock(&svsk->sk_lock);
 	set_bit(SK_DEFERRED, &svsk->sk_flags);
 	svc_sock_enqueue(svsk);
 	svc_sock_put(svsk);
@@ -1925,7 +1926,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
 
 	if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
 		return NULL;
-	spin_lock_bh(&svsk->sk_defer_lock);
+	spin_lock(&svsk->sk_lock);
 	clear_bit(SK_DEFERRED, &svsk->sk_flags);
 	if (!list_empty(&svsk->sk_deferred)) {
 		dr = list_entry(svsk->sk_deferred.next,
@@ -1934,6 +1935,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
 		list_del_init(&dr->handle.recent);
 		set_bit(SK_DEFERRED, &svsk->sk_flags);
 	}
-	spin_unlock_bh(&svsk->sk_defer_lock);
+	spin_unlock(&svsk->sk_lock);
 	return dr;
 }
-- 
cgit v1.2.3


From cd123012d99fde4759500fee611e724e4f3016e3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 9 May 2007 02:34:50 -0700
Subject: RPC: add wrapper for svc_reserve to account for checksum

When the kernel calls svc_reserve to downsize the expected size of an RPC
reply, it fails to account for the possibility of a checksum at the end of
the packet.  If a client mounts a NFSv2/3 with sec=krb5i/p, and does I/O
then you'll generally see messages similar to this in the server's ring
buffer:

RPC request reserved 164 but used 208

While I was never able to verify it, I suspect that this problem is also
the root cause of some oopses I've seen under these conditions:

https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=227726

This is probably also a problem for other sec= types and for NFSv4.  The
large reserved size for NFSv4 compound packets seems to generally paper
over the problem, however.

This patch adds a wrapper for svc_reserve that accounts for the possibility
of a checksum.  It also fixes up the appropriate callers of svc_reserve to
call the wrapper.  For now, it just uses a hardcoded value that I
determined via testing.  That value may need to be revised upward as things
change, or we may want to eventually add a new auth_op that attempts to
calculate this somehow.

Unfortunately, there doesn't seem to be a good way to reliably determine
the expected checksum length prior to actually calculating it, particularly
with schemes like spkm3.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Acked-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs3proc.c         |  2 +-
 fs/nfsd/nfsproc.c          |  2 +-
 include/linux/sunrpc/svc.h | 19 +++++++++++++++++++
 net/sunrpc/svc.c           |  2 +-
 4 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7f5bad0393b1..eac82830bfd7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -177,7 +177,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	if (max_blocksize < resp->count)
 		resp->count = max_blocksize;
 
-	svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cc2eec981b8..b2c7147aa921 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -155,7 +155,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				argp->count);
 		argp->count = NFSSVC_MAXBLKSIZE_V2;
 	}
-	svc_reserve(rqstp, (19<<2) + argp->count + 4);
+	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 35fa4d5aadd0..4a7ae8ab6eb8 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -396,4 +396,23 @@ char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
 
 #define	RPC_MAX_ADDRBUFLEN	(63U)
 
+/*
+ * When we want to reduce the size of the reserved space in the response
+ * buffer, we need to take into account the size of any checksum data that
+ * may be at the end of the packet. This is difficult to determine exactly
+ * for all cases without actually generating the checksum, so we just use a
+ * static value.
+ */
+static inline void
+svc_reserve_auth(struct svc_rqst *rqstp, int space)
+{
+	int			added_space = 0;
+
+	switch(rqstp->rq_authop->flavour) {
+		case RPC_AUTH_GSS:
+			added_space = RPC_MAX_AUTH_SIZE;
+	}
+	return svc_reserve(rqstp, space + added_space);
+}
+
 #endif /* SUNRPC_SVC_H */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b7503c103ae8..e673ef993904 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -907,7 +907,7 @@ svc_process(struct svc_rqst *rqstp)
 	 * better idea of reply size
 	 */
 	if (procp->pc_xdrressize)
-		svc_reserve(rqstp, procp->pc_xdrressize<<2);
+		svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
 
 	/* Call the function that processes the request. */
 	if (!versp->vs_dispatch) {
-- 
cgit v1.2.3


From b8522ead3534c6cd06752b47a3bc380956191a2a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 9 May 2007 02:34:58 -0700
Subject: aio is unlikely

Stick an unlikely() around is_aio(): I assert that most IO is synchronous.

Cc: Suparna Bhattacharya <suparna@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13c9e62..43dc2ebfaa0e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -226,7 +226,8 @@ int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		__put_ioctx(kioctx);					\
 } while (0)
 
-#define in_aio() !is_sync_wait(current->io_wait)
+#define in_aio() (unlikely(!is_sync_wait(current->io_wait)))
+
 /* may be used for debugging */
 #define warn_if_async()							\
 do {									\
-- 
cgit v1.2.3


From f34c506b0385b43abd25c490335036ecbb173aed Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 9 May 2007 02:34:59 -0700
Subject: declare struct ktime

Some smarty went and inflicted ktime_t as a typedef upon us, so we cannot
forward declare it.

Create a new `union ktime', map ktime_t onto that.  Now we need to kill off
this ktime_t thing.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ktime.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 81bb9c7a4eb3..c762954bda14 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -43,7 +43,7 @@
  * plain scalar nanosecond based representation can be selected by the
  * config switch CONFIG_KTIME_SCALAR.
  */
-typedef union {
+union ktime {
 	s64	tv64;
 #if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
 	struct {
@@ -54,7 +54,9 @@ typedef union {
 # endif
 	} tv;
 #endif
-} ktime_t;
+};
+
+typedef union ktime ktime_t;		/* Kill this */
 
 #define KTIME_MAX			((s64)~((u64)1 << 63))
 #if (BITS_PER_LONG == 64)
-- 
cgit v1.2.3


From c19384b5b296905d4988c7c684ff540a0f9d65be Mon Sep 17 00:00:00 2001
From: Pierre Peiffer <pierre.peiffer@bull.net>
Date: Wed, 9 May 2007 02:35:02 -0700
Subject: Make futex_wait() use an hrtimer for timeout

This patch modifies futex_wait() to use an hrtimer + schedule() in place of
schedule_timeout().

schedule_timeout() is tick based, therefore the timeout granularity is the
tick (1 ms, 4 ms or 10 ms depending on HZ).  By using a high resolution timer
for timeout wakeup, we can attain a much finer timeout granularity (in the
microsecond range).  This parallels what is already done for futex_lock_pi().

The timeout passed to the syscall is no longer converted to jiffies and is
therefore passed to do_futex() and futex_wait() as an absolute ktime_t
therefore keeping nanosecond resolution.

Also this removes the need to pass the nanoseconds timeout part to
futex_lock_pi() in val2.

In futex_wait(), if there is no timeout then a regular schedule() is
performed.  Otherwise, an hrtimer is fired before schedule() is called.

[akpm@linux-foundation.org: fix `make headers_check']
Signed-off-by: Sebastien Dugue <sebastien.dugue@bull.net>
Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/futex.h |  4 ++-
 kernel/futex.c        | 89 ++++++++++++++++++++++++++-------------------------
 kernel/futex_compat.c | 19 ++++++-----
 3 files changed, 57 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 820125c628c1..34e54f2b8997 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -3,6 +3,8 @@
 
 #include <linux/sched.h>
 
+union ktime;
+
 /* Second argument to futex syscall */
 
 
@@ -94,7 +96,7 @@ struct robust_list_head {
 #define ROBUST_LIST_LIMIT	2048
 
 #ifdef __KERNEL__
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 
 extern int
diff --git a/kernel/futex.c b/kernel/futex.c
index 685ee2362a5e..e1246ccbf89a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1001,16 +1001,16 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 }
 
 static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait_abstime(u32 __user *uaddr, u32 val,
-			int timed, unsigned long abs_time)
+static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
-	unsigned long time_left = 0;
 	u32 uval;
 	int ret;
+	struct hrtimer_sleeper t;
+	int rem = 0;
 
 	q.pi_state = NULL;
  retry:
@@ -1088,20 +1088,29 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	 * !plist_node_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
 	 */
-	time_left = 0;
 	if (likely(!plist_node_empty(&q.list))) {
-		unsigned long rel_time;
-
-		if (timed) {
-			unsigned long now = jiffies;
-			if (time_after(now, abs_time))
-				rel_time = 0;
-			else
-				rel_time = abs_time - now;
-		} else
-			rel_time = MAX_SCHEDULE_TIMEOUT;
+		if (!abs_time)
+			schedule();
+		else {
+			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+			hrtimer_init_sleeper(&t, current);
+			t.timer.expires = *abs_time;
+
+			hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+
+			/*
+			 * the timer could have already expired, in which
+			 * case current would be flagged for rescheduling.
+			 * Don't bother calling schedule.
+			 */
+			if (likely(t.task))
+				schedule();
+
+			hrtimer_cancel(&t.timer);
 
-		time_left = schedule_timeout(rel_time);
+			/* Flag if a timeout occured */
+			rem = (t.task == NULL);
+		}
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -1113,14 +1122,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		return 0;
-	if (time_left == 0)
+	if (rem)
 		return -ETIMEDOUT;
 
 	/*
 	 * We expect signal_pending(current), but another thread may
 	 * have handled it for us already.
 	 */
-	if (time_left == MAX_SCHEDULE_TIMEOUT)
+	if (!abs_time)
 		return -ERESTARTSYS;
 	else {
 		struct restart_block *restart;
@@ -1128,8 +1137,7 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 		restart->fn = futex_wait_restart;
 		restart->arg0 = (unsigned long)uaddr;
 		restart->arg1 = (unsigned long)val;
-		restart->arg2 = (unsigned long)timed;
-		restart->arg3 = abs_time;
+		restart->arg2 = (unsigned long)abs_time;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1141,21 +1149,15 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
-{
-	int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
-	return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
-}
 
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->arg0;
 	u32 val = (u32)restart->arg1;
-	int timed = (int)restart->arg2;
-	unsigned long abs_time = restart->arg3;
+	ktime_t *abs_time = (ktime_t *)restart->arg2;
 
 	restart->fn = do_no_restart_syscall;
-	return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+	return (long)futex_wait(uaddr, val, abs_time);
 }
 
 
@@ -1165,8 +1167,8 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
-			 long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
+			 int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
@@ -1178,11 +1180,11 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (refill_pi_state_cache())
 		return -ENOMEM;
 
-	if (sec != MAX_SCHEDULE_TIMEOUT) {
+	if (time) {
 		to = &timeout;
 		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
-		to->timer.expires = ktime_set(sec, nsec);
+		to->timer.expires = *time;
 	}
 
 	q.pi_state = NULL;
@@ -1818,7 +1820,7 @@ void exit_robust_list(struct task_struct *curr)
 	}
 }
 
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
@@ -1844,13 +1846,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
 	case FUTEX_LOCK_PI:
-		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+		ret = futex_lock_pi(uaddr, val, timeout, 0);
 		break;
 	case FUTEX_UNLOCK_PI:
 		ret = futex_unlock_pi(uaddr);
 		break;
 	case FUTEX_TRYLOCK_PI:
-		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+		ret = futex_lock_pi(uaddr, 0, timeout, 1);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -1863,21 +1865,20 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			  struct timespec __user *utime, u32 __user *uaddr2,
 			  u32 val3)
 {
-	struct timespec t;
-	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	struct timespec ts;
+	ktime_t t, *tp = NULL;
 	u32 val2 = 0;
 
 	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
-		if (copy_from_user(&t, utime, sizeof(t)) != 0)
+		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
-		if (!timespec_valid(&t))
+		if (!timespec_valid(&ts))
 			return -EINVAL;
+
+		t = timespec_to_ktime(ts);
 		if (op == FUTEX_WAIT)
-			timeout = timespec_to_jiffies(&t) + 1;
-		else {
-			timeout = t.tv_sec;
-			val2 = t.tv_nsec;
-		}
+			t = ktime_add(ktime_get(), t);
+		tp = &t;
 	}
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
@@ -1885,7 +1886,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (u32) (unsigned long) utime;
 
-	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
 static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd0..dff27c471ea6 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,23 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 		struct compat_timespec __user *utime, u32 __user *uaddr2,
 		u32 val3)
 {
-	struct timespec t;
-	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	struct timespec ts;
+	ktime_t t, *tp = NULL;
 	int val2 = 0;
 
 	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
-		if (get_compat_timespec(&t, utime))
+		if (get_compat_timespec(&ts, utime))
 			return -EFAULT;
-		if (!timespec_valid(&t))
+		if (!timespec_valid(&ts))
 			return -EINVAL;
+
+		t = timespec_to_ktime(ts);
 		if (op == FUTEX_WAIT)
-			timeout = timespec_to_jiffies(&t) + 1;
-		else {
-			timeout = t.tv_sec;
-			val2 = t.tv_nsec;
-		}
+			t = ktime_add(ktime_get(), t);
+		tp = &t;
 	}
 	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
-- 
cgit v1.2.3


From d0aa7a70bf03b9de9e995ab272293be1f7937822 Mon Sep 17 00:00:00 2001
From: Pierre Peiffer <pierre.peiffer@bull.net>
Date: Wed, 9 May 2007 02:35:02 -0700
Subject: futex_requeue_pi optimization

This patch provides the futex_requeue_pi functionality, which allows some
threads waiting on a normal futex to be requeued on the wait-queue of a
PI-futex.

This provides an optimization, already used for (normal) futexes, to be used
with the PI-futexes.

This optimization is currently used by the glibc in pthread_broadcast, when
using "normal" mutexes.  With futex_requeue_pi, it can be used with
PRIO_INHERIT mutexes too.

Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/futex.h   |   9 +-
 kernel/futex.c          | 541 +++++++++++++++++++++++++++++++++++++++++++-----
 kernel/futex_compat.c   |   3 +-
 kernel/rtmutex.c        |  41 +---
 kernel/rtmutex_common.h |  34 +++
 5 files changed, 540 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 34e54f2b8997..1bd8dfcb037b 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -17,6 +17,7 @@ union ktime;
 #define FUTEX_LOCK_PI		6
 #define FUTEX_UNLOCK_PI		7
 #define FUTEX_TRYLOCK_PI	8
+#define FUTEX_CMP_REQUEUE_PI	9
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -84,10 +85,15 @@ struct robust_list_head {
  */
 #define FUTEX_OWNER_DIED	0x40000000
 
+/*
+ * Some processes have been requeued on this PI-futex
+ */
+#define FUTEX_WAITER_REQUEUED	0x20000000
+
 /*
  * The rest of the robust-futex field is for the TID:
  */
-#define FUTEX_TID_MASK		0x3fffffff
+#define FUTEX_TID_MASK		0x0fffffff
 
 /*
  * This limit protects against a deliberately circular list.
@@ -111,6 +117,7 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
  * We set bit 0 to indicate if it's an inode-based key.
  */
 union futex_key {
+	u32 __user *uaddr;
 	struct {
 		unsigned long pgoff;
 		struct inode *inode;
diff --git a/kernel/futex.c b/kernel/futex.c
index e1246ccbf89a..4a60ef55dab4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -53,6 +53,12 @@
 
 #include "rtmutex_common.h"
 
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -102,6 +108,12 @@ struct futex_q {
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
+
+	/*
+	 * This waiter is used in case of requeue from a
+	 * normal futex to a PI-futex
+	 */
+	struct rt_mutex_waiter waiter;
 };
 
 /*
@@ -180,6 +192,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 
+	/* Save the user address in the ley */
+	key->uaddr = uaddr;
+
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
@@ -439,7 +454,8 @@ void exit_pi_state_list(struct task_struct *curr)
 }
 
 static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+		union futex_key *key, struct futex_pi_state **ps)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
@@ -450,7 +466,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	head = &hb->chain;
 
 	plist_for_each_entry_safe(this, next, head, list) {
-		if (match_futex(&this->key, &me->key)) {
+		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -465,7 +481,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			WARN_ON(!atomic_read(&pi_state->refcount));
 
 			atomic_inc(&pi_state->refcount);
-			me->pi_state = pi_state;
+			*ps = pi_state;
 
 			return 0;
 		}
@@ -492,7 +508,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 
 	/* Store the key for possible exit cleanups: */
-	pi_state->key = me->key;
+	pi_state->key = *key;
 
 	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
 	put_task_struct(p);
 
-	me->pi_state = pi_state;
+	*ps = pi_state;
 
 	return 0;
 }
@@ -562,6 +578,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
+		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
 
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -665,6 +683,254 @@ out:
 	return ret;
 }
 
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+			    union futex_key *key,
+			    struct futex_pi_state **pi_state)
+{
+	u32 curval, uval, newval;
+
+retry:
+	/*
+	 * We can't handle a fault cleanly because we can't
+	 * release the locks here. Simply return the fault.
+	 */
+	if (get_futex_value_locked(&curval, uaddr))
+		return -EFAULT;
+
+	/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+	if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+	    != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+		/*
+		 * No waiters yet, we prepare the futex to have some waiters.
+		 */
+
+		uval = curval;
+		newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+		pagefault_disable();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		pagefault_enable();
+
+		if (unlikely(curval == -EFAULT))
+			return -EFAULT;
+		if (unlikely(curval != uval))
+			goto retry;
+	}
+
+	if (!(curval & FUTEX_TID_MASK)
+	    || lookup_pi_state(curval, hb, key, pi_state)) {
+		/* the futex has no owner (yet) or the lookup failed:
+		   allocate one pi_state without owner */
+
+		*pi_state = alloc_pi_state();
+
+		/* Already stores the key: */
+		(*pi_state)->key = *key;
+
+		/* init the mutex without owner */
+		__rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+	}
+
+	return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+			    int nr_wake, int nr_requeue, u32 *cmpval)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *hb1, *hb2;
+	struct plist_head *head1;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state2 = NULL;
+	struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+	struct rt_mutex *lock2 = NULL;
+	int ret, drop_count = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+retry:
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
+
+	double_lock_hb(hb1, hb2);
+
+	if (likely(cmpval != NULL)) {
+		u32 curval;
+
+		ret = get_futex_value_locked(&curval, uaddr1);
+
+		if (unlikely(ret)) {
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
+
+			/*
+			 * If we would have faulted, release mmap_sem, fault
+			 * it in and start all over again.
+			 */
+			up_read(&current->mm->mmap_sem);
+
+			ret = get_user(curval, uaddr1);
+
+			if (!ret)
+				goto retry;
+
+			return ret;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+
+	head1 = &hb1->chain;
+	plist_for_each_entry_safe(this, next, head1, list) {
+		if (!match_futex (&this->key, &key1))
+			continue;
+		if (++ret <= nr_wake) {
+			wake_futex(this);
+		} else {
+			/*
+			 * FIRST: get and set the pi_state
+			 */
+			if (!pi_state2) {
+				int s;
+				/* do this only the first time we requeue someone */
+				s = lookup_pi_state_for_requeue(uaddr2, hb2,
+								&key2, &pi_state2);
+				if (s) {
+					ret = s;
+					goto out_unlock;
+				}
+
+				lock2 = &pi_state2->pi_mutex;
+				spin_lock(&lock2->wait_lock);
+
+				/* Save the top waiter of the wait_list */
+				if (rt_mutex_has_waiters(lock2))
+					top_waiter = rt_mutex_top_waiter(lock2);
+			} else
+				atomic_inc(&pi_state2->refcount);
+
+
+			this->pi_state = pi_state2;
+
+			/*
+			 * SECOND: requeue futex_q to the correct hashbucket
+			 */
+
+			/*
+			 * If key1 and key2 hash to the same bucket, no need to
+			 * requeue.
+			 */
+			if (likely(head1 != &hb2->chain)) {
+				plist_del(&this->list, &hb1->chain);
+				plist_add(&this->list, &hb2->chain);
+				this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+				this->list.plist.lock = &hb2->lock;
+#endif
+			}
+			this->key = key2;
+			get_futex_key_refs(&key2);
+			drop_count++;
+
+
+			/*
+			 * THIRD: queue it to lock2
+			 */
+			spin_lock_irq(&this->task->pi_lock);
+			waiter = &this->waiter;
+			waiter->task = this->task;
+			waiter->lock = lock2;
+			plist_node_init(&waiter->list_entry, this->task->prio);
+			plist_node_init(&waiter->pi_list_entry, this->task->prio);
+			plist_add(&waiter->list_entry, &lock2->wait_list);
+			this->task->pi_blocked_on = waiter;
+			spin_unlock_irq(&this->task->pi_lock);
+
+			if (ret - nr_wake >= nr_requeue)
+				break;
+		}
+	}
+
+	/* If we've requeued some tasks and the top_waiter of the rt_mutex
+	   has changed, we must adjust the priority of the owner, if any */
+	if (drop_count) {
+		struct task_struct *owner = rt_mutex_owner(lock2);
+		if (owner &&
+		    (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+			int chain_walk = 0;
+
+			spin_lock_irq(&owner->pi_lock);
+			if (top_waiter)
+				plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+			else
+				/*
+				 * There was no waiters before the requeue,
+				 * the flag must be updated
+				 */
+				mark_rt_mutex_waiters(lock2);
+
+			plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+			__rt_mutex_adjust_prio(owner);
+			if (owner->pi_blocked_on) {
+				chain_walk = 1;
+				get_task_struct(owner);
+			}
+
+			spin_unlock_irq(&owner->pi_lock);
+			spin_unlock(&lock2->wait_lock);
+
+			if (chain_walk)
+				rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+							   current);
+		} else {
+			/* No owner or the top_waiter does not change */
+			mark_rt_mutex_waiters(lock2);
+			spin_unlock(&lock2->wait_lock);
+		}
+	}
+
+out_unlock:
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
+
+	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	while (--drop_count >= 0)
+		drop_futex_key_refs(&key1);
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
@@ -984,9 +1250,10 @@ static int unqueue_me(struct futex_q *q)
 
 /*
  * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
  */
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
 {
 	WARN_ON(plist_node_empty(&q->list));
 	plist_del(&q->list, &q->list.plist);
@@ -995,11 +1262,65 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 	free_pi_state(q->pi_state);
 	q->pi_state = NULL;
 
-	spin_unlock(&hb->lock);
+	spin_unlock(q->lock_ptr);
 
 	drop_futex_key_refs(&q->key);
 }
 
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be  held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				struct futex_hash_bucket *hb,
+				struct task_struct *curr)
+{
+	u32 newtid = curr->pid | FUTEX_WAITERS;
+	struct futex_pi_state *pi_state = q->pi_state;
+	u32 uval, curval, newval;
+	int ret;
+
+	/* Owner died? */
+	if (pi_state->owner != NULL) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+	} else
+		newtid |= FUTEX_OWNER_DIED;
+
+	pi_state->owner = curr;
+
+	spin_lock_irq(&curr->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &curr->pi_state_list);
+	spin_unlock_irq(&curr->pi_lock);
+
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(q);
+	up_read(&curr->mm->mmap_sem);
+	/*
+	 * We own it, so we have to replace the pending owner
+	 * TID. This must be atomic as we have preserve the
+	 * owner died bit here.
+	 */
+	ret = get_user(uval, uaddr);
+	while (!ret) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
+		curval = futex_atomic_cmpxchg_inatomic(uaddr,
+						       uval, newval);
+		if (curval == -EFAULT)
+ 			ret = -EFAULT;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+	return ret;
+}
+
 static long futex_wait_restart(struct restart_block *restart);
 static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 {
@@ -1009,7 +1330,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	struct futex_q q;
 	u32 uval;
 	int ret;
-	struct hrtimer_sleeper t;
+	struct hrtimer_sleeper t, *to = NULL;
 	int rem = 0;
 
 	q.pi_state = NULL;
@@ -1063,6 +1384,14 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	if (uval != val)
 		goto out_unlock_release_sem;
 
+	/*
+	 * This rt_mutex_waiter structure is prepared here and will
+	 * be used only if this task is requeued from a normal futex to
+	 * a PI-futex with futex_requeue_pi.
+	 */
+	debug_rt_mutex_init_waiter(&q.waiter);
+	q.waiter.task = NULL;
+
 	/* Only actually queue if *uaddr contained val.  */
 	__queue_me(&q, hb);
 
@@ -1092,6 +1421,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		if (!abs_time)
 			schedule();
 		else {
+			to = &t;
 			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
 			t.timer.expires = *abs_time;
@@ -1119,6 +1449,66 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * we are the only user of it.
 	 */
 
+	if (q.pi_state) {
+		/*
+		 * We were woken but have been requeued on a PI-futex.
+		 * We have to complete the lock acquisition by taking
+		 * the rtmutex.
+		 */
+
+		struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+		spin_lock(&lock->wait_lock);
+		if (unlikely(q.waiter.task)) {
+			remove_waiter(lock, &q.waiter);
+		}
+		spin_unlock(&lock->wait_lock);
+
+		if (rem)
+			ret = -ETIMEDOUT;
+		else
+			ret = rt_mutex_timed_lock(lock, to, 1);
+
+		down_read(&curr->mm->mmap_sem);
+		spin_lock(q.lock_ptr);
+
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case.
+		 */
+		if (!ret && q.pi_state->owner != curr) {
+			/*
+			 * We MUST play with the futex we were requeued on,
+			 * NOT the current futex.
+			 * We can retrieve it from the key of the pi_state
+			 */
+			uaddr = q.pi_state->key.uaddr;
+
+			/* mmap_sem and hash_bucket lock are unlocked at
+			   return of this function */
+			ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+		} else {
+			/*
+			 * Catch the rare case, where the lock was released
+			 * when we were on the way back before we locked
+			 * the hash bucket.
+			 */
+			if (ret && q.pi_state->owner == curr) {
+				if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+					ret = 0;
+			}
+			/* Unqueue and drop the lock */
+			unqueue_me_pi(&q);
+			up_read(&curr->mm->mmap_sem);
+		}
+
+		debug_rt_mutex_free_waiter(&q.waiter);
+
+		return ret;
+	}
+
+	debug_rt_mutex_free_waiter(&q.waiter);
+
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		return 0;
@@ -1161,6 +1551,51 @@ static long futex_wait_restart(struct restart_block *restart)
 }
 
 
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+			       union futex_key *key, struct task_struct *p)
+{
+	struct plist_head *head;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state = NULL;
+	struct rt_mutex *lock;
+
+	/* Search a waiter that should already exists */
+
+	head = &hb->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, key)) {
+			pi_state = this->pi_state;
+			break;
+		}
+	}
+
+	BUG_ON(!pi_state);
+
+	/* set p as pi_state's owner */
+	lock = &pi_state->pi_mutex;
+
+	spin_lock(&lock->wait_lock);
+	spin_lock_irq(&p->pi_lock);
+
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+
+
+	/* set p as pi_mutex's owner */
+	debug_rt_mutex_proxy_lock(lock, p);
+	WARN_ON(rt_mutex_owner(lock));
+	rt_mutex_set_owner(lock, p, 0);
+	rt_mutex_deadlock_account_lock(lock, p);
+
+	plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+		  &p->pi_waiters);
+	__rt_mutex_adjust_prio(p);
+
+	spin_unlock_irq(&p->pi_lock);
+	spin_unlock(&lock->wait_lock);
+}
+
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
  * and failed. The kernel side here does the whole locking operation:
@@ -1175,7 +1610,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, attempt = 0;
+	int ret, lock_held, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1198,6 +1633,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
+	lock_held = 0;
+
 	/*
 	 * To avoid races, we attempt to take the lock here again
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1216,7 +1653,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
 		if (!detect && 0)
 			force_sig(SIGKILL, current);
-		ret = -EDEADLK;
+		/*
+		 * Normally, this check is done in user space.
+		 * In case of requeue, the owner may attempt to lock this futex,
+		 * even if the ownership has already been given by the previous
+		 * waker.
+		 * In the usual case, this is a case of deadlock, but not in case
+		 * of REQUEUE_PI.
+		 */
+		if (!(curval & FUTEX_WAITER_REQUEUED))
+			ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
@@ -1228,7 +1674,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 		goto out_unlock_release_sem;
 
 	uval = curval;
-	newval = uval | FUTEX_WAITERS;
+	/*
+	 * In case of a requeue, check if there already is an owner
+	 * If not, just take the futex.
+	 */
+	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+		/* set current as futex owner */
+		newval = curval | current->pid;
+		lock_held = 1;
+	} else
+		/* Set the WAITERS flag, so the owner will know it has someone
+		   to wake at next unlock */
+		newval = curval | FUTEX_WAITERS;
 
 	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1239,11 +1696,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	if (unlikely(curval != uval))
 		goto retry_locked;
 
+	if (lock_held) {
+		set_pi_futex_owner(hb, &q.key, curr);
+		goto out_unlock_release_sem;
+	}
+
 	/*
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, &q);
+	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
 
 	if (unlikely(ret)) {
 		/*
@@ -1306,45 +1768,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * Got the lock. We might not be the anticipated owner if we
 	 * did a lock-steal - fix up the PI-state in that case.
 	 */
-	if (!ret && q.pi_state->owner != curr) {
-		u32 newtid = current->pid | FUTEX_WAITERS;
-
-		/* Owner died? */
-		if (q.pi_state->owner != NULL) {
-			spin_lock_irq(&q.pi_state->owner->pi_lock);
-			WARN_ON(list_empty(&q.pi_state->list));
-			list_del_init(&q.pi_state->list);
-			spin_unlock_irq(&q.pi_state->owner->pi_lock);
-		} else
-			newtid |= FUTEX_OWNER_DIED;
-
-		q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
-		/*
-		 * We own it, so we have to replace the pending owner
-		 * TID. This must be atomic as we have preserve the
-		 * owner died bit here.
-		 */
-		ret = get_user(uval, uaddr);
-		while (!ret) {
-			newval = (uval & FUTEX_OWNER_DIED) | newtid;
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			if (curval == -EFAULT)
-				ret = -EFAULT;
-			if (curval == uval)
-				break;
-			uval = curval;
-		}
-	} else {
+	if (!ret && q.pi_state->owner != curr)
+		/* mmap_sem is unlocked at return of this function */
+		ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+	else {
 		/*
 		 * Catch the rare case, where the lock was released
 		 * when we were on the way back before we locked
@@ -1355,7 +1782,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 				ret = 0;
 		}
 		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
+		unqueue_me_pi(&q);
 		up_read(&curr->mm->mmap_sem);
 	}
 
@@ -1724,6 +2151,8 @@ retry:
 		 * userspace.
 		 */
 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+		/* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+		mval |= (uval & FUTEX_WAITER_REQUEUED);
 		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
 
 		if (nval == -EFAULT)
@@ -1854,6 +2283,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_TRYLOCK_PI:
 		ret = futex_lock_pi(uaddr, 0, timeout, 1);
 		break;
+	case FUTEX_CMP_REQUEUE_PI:
+		ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1883,7 +2315,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+	    || op == FUTEX_CMP_REQUEUE_PI)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index dff27c471ea6..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -156,7 +156,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 			t = ktime_add(ktime_get(), t);
 		tp = &t;
 	}
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+	    || op == FUTEX_CMP_REQUEUE_PI)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
  * state.
  */
 
-static void
+void
 rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
 		   unsigned long mask)
 {
@@ -80,29 +80,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 		clear_rt_mutex_waiters(lock);
 }
 
-/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
-	unsigned long owner, *p = (unsigned long *) &lock->owner;
-
-	do {
-		owner = *p;
-	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n)	(0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
-	lock->owner = (struct task_struct *)
-			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
 /*
  * Calculate task priority from the waiter list priority
  *
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
  *
  * This can be both boosting and unboosting. task->pi_lock must be held.
  */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
 {
 	int prio = rt_mutex_getprio(task);
 
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
  * Decreases task's usage by one - may thus free the task.
  * Returns 0 or -EDEADLK.
  */
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
-				      int deadlock_detect,
-				      struct rt_mutex *orig_lock,
-				      struct rt_mutex_waiter *orig_waiter,
-				      struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+			       int deadlock_detect,
+			       struct rt_mutex *orig_lock,
+			       struct rt_mutex_waiter *orig_waiter,
+			       struct task_struct *top_task)
 {
 	struct rt_mutex *lock;
 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
  *
  * Must be called with lock->wait_lock held
  */
-static void remove_waiter(struct rt_mutex *lock,
-			  struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+		   struct rt_mutex_waiter *waiter)
 {
 	int first = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -112,6 +112,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	do {
+		owner = *p;
+	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n)	(0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+			       unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+				      int deadlock_detect,
+				      struct rt_mutex *orig_lock,
+				      struct rt_mutex_waiter *orig_waiter,
+				      struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+			  struct rt_mutex_waiter *waiter);
 #endif
-- 
cgit v1.2.3


From 34f01cc1f512fa783302982776895c73714ebbc2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 9 May 2007 02:35:04 -0700
Subject: FUTEX: new PRIVATE futexes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  Analysis of current linux futex code :
  --------------------------------------

A central hash table futex_queues[] holds all contexts (futex_q) of waiting
threads.

Each futex_wait()/futex_wait() has to obtain a spinlock on a hash slot to
perform lookups or insert/deletion of a futex_q.

When a futex_wait() is done, calling thread has to :

1) - Obtain a read lock on mmap_sem to be able to validate the user pointer
     (calling find_vma()). This validation tells us if the futex uses
     an inode based store (mapped file), or mm based store (anonymous mem)

2) - compute a hash key

3) - Atomic increment of reference counter on an inode or a mm_struct

4) - lock part of futex_queues[] hash table

5) - perform the test on value of futex.
	(rollback is value != expected_value, returns EWOULDBLOCK)
	(various loops if test triggers mm faults)

6) queue the context into hash table, release the lock got in 4)

7) - release the read_lock on mmap_sem

   <block>

8) Eventually unqueue the context (but rarely, as this part  may be done
   by the futex_wake())

Futexes were designed to improve scalability but current implementation has
various problems :

- Central hashtable :

  This means scalability problems if many processes/threads want to use
  futexes at the same time.
  This means NUMA unbalance because this hashtable is located on one node.

- Using mmap_sem on every futex() syscall :

  Even if mmap_sem is a rw_semaphore, up_read()/down_read() are doing atomic
  ops on mmap_sem, dirtying cache line :
    - lot of cache line ping pongs on SMP configurations.

  mmap_sem is also extensively used by mm code (page faults, mmap()/munmap())
  Highly threaded processes might suffer from mmap_sem contention.

  mmap_sem is also used by oprofile code. Enabling oprofile hurts threaded
  programs because of contention on the mmap_sem cache line.

- Using an atomic_inc()/atomic_dec() on inode ref counter or mm ref counter:
  It's also a cache line ping pong on SMP. It also increases mmap_sem hold time
  because of cache misses.

Most of these scalability problems come from the fact that futexes are in
one global namespace.  As we use a central hash table, we must make sure
they are all using the same reference (given by the mm subsystem).  We
chose to force all futexes be 'shared'.  This has a cost.

But fact is POSIX defined PRIVATE and SHARED, allowing clear separation,
and optimal performance if carefuly implemented.  Time has come for linux
to have better threading performance.

The goal is to permit new futex commands to avoid :
 - Taking the mmap_sem semaphore, conflicting with other subsystems.
 - Modifying a ref_count on mm or an inode, still conflicting with mm or fs.

This is possible because, for one process using PTHREAD_PROCESS_PRIVATE
futexes, we only need to distinguish futexes by their virtual address, no
matter the underlying mm storage is.

If glibc wants to exploit this new infrastructure, it should use new
_PRIVATE futex subcommands for PTHREAD_PROCESS_PRIVATE futexes.  And be
prepared to fallback on old subcommands for old kernels.  Using one global
variable with the FUTEX_PRIVATE_FLAG or 0 value should be OK.

PTHREAD_PROCESS_SHARED futexes should still use the old subcommands.

Compatibility with old applications is preserved, they still hit the
scalability problems, but new applications can fly :)

Note : the same SHARED futex (mapped on a file) can be used by old binaries
*and* new binaries, because both binaries will use the old subcommands.

Note : Vast majority of futexes should be using PROCESS_PRIVATE semantic,
as this is the default semantic. Almost all applications should benefit
of this changes (new kernel and updated libc)

Some bench results on a Pentium M 1.6 GHz (SMP kernel on a UP machine)

/* calling futex_wait(addr, value) with value != *addr */
433 cycles per futex(FUTEX_WAIT) call (mixing 2 futexes)
424 cycles per futex(FUTEX_WAIT) call (using one futex)
334 cycles per futex(FUTEX_WAIT_PRIVATE) call (mixing 2 futexes)
334 cycles per futex(FUTEX_WAIT_PRIVATE) call (using one futex)
For reference :
187 cycles per getppid() call
188 cycles per umask() call
181 cycles per ni_syscall() call

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Pierre Peiffer <pierre.peiffer@bull.net>
Cc: "Ulrich Drepper" <drepper@gmail.com>
Cc: "Nick Piggin" <nickpiggin@yahoo.com.au>
Cc: "Ingo Molnar" <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/futex.h |  29 ++++-
 kernel/futex.c        | 324 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 236 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1bd8dfcb037b..899fc7f20edd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -19,6 +19,18 @@ union ktime;
 #define FUTEX_TRYLOCK_PI	8
 #define FUTEX_CMP_REQUEUE_PI	9
 
+#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_CMD_MASK		~FUTEX_PRIVATE_FLAG
+
+#define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_REQUEUE_PRIVATE	(FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_OP_PRIVATE	(FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
+#define FUTEX_LOCK_PI_PRIVATE	(FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_UNLOCK_PI_PRIVATE	(FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -114,8 +126,18 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
  * Don't rearrange members without looking at hash_futex().
  *
  * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
+ * We use the two low order bits of offset to tell what is the kind of key :
+ *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
+ *       (no reference on an inode or mm)
+ *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
+ *	mapped on a file (reference on the underlying inode)
+ *  10 : Shared futex (PTHREAD_PROCESS_SHARED)
+ *       (but private mapping on an mm, and reference taken on it)
+*/
+
+#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
+#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+
 union futex_key {
 	u32 __user *uaddr;
 	struct {
@@ -134,7 +156,8 @@ union futex_key {
 		int offset;
 	} both;
 };
-int get_futex_key(u32 __user *uaddr, union futex_key *key);
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
+		  union futex_key *key);
 void get_futex_key_refs(union futex_key *key);
 void drop_futex_key_refs(union futex_key *key);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 4a60ef55dab4..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  *
+ *  PRIVATE futexes by Eric Dumazet
+ *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -150,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 		&& key1->both.offset == key2->both.offset);
 }
 
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ *	&current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
  *
  * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
  */
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+		  union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -174,10 +184,24 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * The futex address must be "naturally" aligned.
 	 */
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((key->both.offset % sizeof(u32)) != 0))
+	if (unlikely((address % sizeof(u32)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;
 
+	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * virtual address, we dont even have to find the underlying vma.
+	 * Note : We do have to check 'uaddr' is a valid user address,
+	 *        but access_ok() should be faster than find_vma()
+	 */
+	if (!fshared) {
+		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+			return -EFAULT;
+		key->private.mm = mm;
+		key->private.address = address;
+		return 0;
+	}
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping.  So check vma first.
@@ -205,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * mappings of _writable_ handles.
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 		return 0;
@@ -214,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * Linear file mappings are also simple.
 	 */
 	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
@@ -242,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
  * Take a reference to the resource addressed by a key.
  * Can be called while holding spinlocks.
  *
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.
  */
 inline void get_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			atomic_inc(&key->shared.inode->i_count);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			atomic_inc(&key->private.mm->mm_count);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -262,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
  */
 void drop_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			iput(key->shared.inode);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			mmdrop(key->private.mm);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -283,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 }
 
 /*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
  */
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+			      struct rw_semaphore *fshared, int attempt)
 {
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
+	int ret = -EFAULT;
 
-	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
-	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
-		return -EFAULT;
+	if (attempt > 2)
+		return ret;
 
-	switch (handle_mm_fault(mm, vma, address, 1)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	default:
-		return -EFAULT;
+	if (!fshared)
+		down_read(&mm->mmap_sem);
+	vma = find_vma(mm, address);
+	if (vma && address >= vma->vm_start &&
+	    (vma->vm_flags & VM_WRITE)) {
+		switch (handle_mm_fault(mm, vma, address, 1)) {
+		case VM_FAULT_MINOR:
+			ret = 0;
+			current->min_flt++;
+			break;
+		case VM_FAULT_MAJOR:
+			ret = 0;
+			current->maj_flt++;
+			break;
+		}
 	}
-	return 0;
+	if (!fshared)
+		up_read(&mm->mmap_sem);
+	return ret;
 }
 
 /*
@@ -647,7 +688,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      int nr_wake)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -655,9 +697,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 	union futex_key key;
 	int ret;
 
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -679,7 +722,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -746,7 +790,9 @@ retry:
  * and requeue the next nr_requeue waiters following hashed on
  * one physical page to another physical page (PI-futex uaddr2)
  */
-static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue_pi(u32 __user *uaddr1,
+			    struct rw_semaphore *fshared,
+			    u32 __user *uaddr2,
 			    int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
@@ -765,12 +811,13 @@ retry:
 	/*
 	 * First take all the futex related locks:
 	 */
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -793,7 +840,8 @@ retry:
 			 * If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
-			up_read(&current->mm->mmap_sem);
+			if (fshared)
+				up_read(fshared);
 
 			ret = get_user(curval, uaddr1);
 
@@ -927,7 +975,8 @@ out_unlock:
 		drop_futex_key_refs(&key1);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -936,7 +985,8 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+	      u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
@@ -946,12 +996,13 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
 	int ret, op_ret, attempt = 0;
 
 retryfull:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -991,11 +1042,10 @@ retry:
 		 * still holding the mmap_sem.
 		 */
 		if (attempt++) {
-			if (futex_handle_fault((unsigned long)uaddr2,
-						attempt)) {
-				ret = -EFAULT;
+			ret = futex_handle_fault((unsigned long)uaddr2,
+						fshared, attempt);
+			if (ret)
 				goto out;
-			}
 			goto retry;
 		}
 
@@ -1003,7 +1053,8 @@ retry:
 		 * If we would have faulted, release mmap_sem,
 		 * fault it in and start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(dummy, uaddr2);
 		if (ret)
@@ -1040,7 +1091,8 @@ retry:
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1048,7 +1100,8 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+			 u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
@@ -1058,12 +1111,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 	int ret, drop_count = 0;
 
  retry:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1086,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 			 * If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
-			up_read(&current->mm->mmap_sem);
+			if (fshared)
+				up_read(fshared);
 
 			ret = get_user(curval, uaddr1);
 
@@ -1139,7 +1194,8 @@ out_unlock:
 		drop_futex_key_refs(&key1);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1273,7 +1329,8 @@ static void unqueue_me_pi(struct futex_q *q)
  * The cur->mm semaphore must be  held, it is released at return of this
  * function.
  */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+				struct futex_q *q,
 				struct futex_hash_bucket *hb,
 				struct task_struct *curr)
 {
@@ -1300,7 +1357,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(q);
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	/*
 	 * We own it, so we have to replace the pending owner
 	 * TID. This must be atomic as we have preserve the
@@ -1321,8 +1379,15 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	return ret;
 }
 
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED  1
+
 static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      u32 val, ktime_t *abs_time)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
@@ -1335,9 +1400,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 
 	q.pi_state = NULL;
  retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -1360,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * a wakeup when *uaddr != val on entry to the syscall.  This is
 	 * rare, but normal.
 	 *
-	 * We hold the mmap semaphore, so the mapping cannot have changed
-	 * since we looked it up in get_futex_key.
+	 * for shared futexes, we hold the mmap semaphore, so the mapping
+	 * cannot have changed since we looked it up in get_futex_key.
 	 */
 	ret = get_futex_value_locked(&uval, uaddr);
 
@@ -1372,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&curr->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(uval, uaddr);
 
@@ -1399,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -1469,7 +1537,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		else
 			ret = rt_mutex_timed_lock(lock, to, 1);
 
-		down_read(&curr->mm->mmap_sem);
+		if (fshared)
+			down_read(fshared);
 		spin_lock(q.lock_ptr);
 
 		/*
@@ -1486,7 +1555,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 
 			/* mmap_sem and hash_bucket lock are unlocked at
 			   return of this function */
-			ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+			ret = fixup_pi_state_owner(uaddr, fshared,
+						   &q, hb, curr);
 		} else {
 			/*
 			 * Catch the rare case, where the lock was released
@@ -1499,7 +1569,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 			}
 			/* Unqueue and drop the lock */
 			unqueue_me_pi(&q);
-			up_read(&curr->mm->mmap_sem);
+			if (fshared)
+				up_read(fshared);
 		}
 
 		debug_rt_mutex_free_waiter(&q.waiter);
@@ -1528,6 +1599,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		restart->arg0 = (unsigned long)uaddr;
 		restart->arg1 = (unsigned long)val;
 		restart->arg2 = (unsigned long)abs_time;
+		restart->arg3 = 0;
+		if (fshared)
+			restart->arg3 |= ARG3_SHARED;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1535,7 +1609,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	queue_unlock(&q, hb);
 
  out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1545,9 +1620,12 @@ static long futex_wait_restart(struct restart_block *restart)
 	u32 __user *uaddr = (u32 __user *)restart->arg0;
 	u32 val = (u32)restart->arg1;
 	ktime_t *abs_time = (ktime_t *)restart->arg2;
+	struct rw_semaphore *fshared = NULL;
 
 	restart->fn = do_no_restart_syscall;
-	return (long)futex_wait(uaddr, val, abs_time);
+	if (restart->arg3 & ARG3_SHARED)
+		fshared = &current->mm->mmap_sem;
+	return (long)futex_wait(uaddr, fshared, val, abs_time);
 }
 
 
@@ -1602,8 +1680,8 @@ static void set_pi_futex_owner(struct futex_hash_bucket *hb,
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
-			 int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
@@ -1624,9 +1702,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 
 	q.pi_state = NULL;
  retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -1747,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	WARN_ON(!q.pi_state);
 	/*
@@ -1761,7 +1841,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 		ret = ret ? 0 : -EWOULDBLOCK;
 	}
 
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 	spin_lock(q.lock_ptr);
 
 	/*
@@ -1770,7 +1851,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 */
 	if (!ret && q.pi_state->owner != curr)
 		/* mmap_sem is unlocked at return of this function */
-		ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
 	else {
 		/*
 		 * Catch the rare case, where the lock was released
@@ -1783,7 +1864,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 		}
 		/* Unqueue and drop the lock */
 		unqueue_me_pi(&q);
-		up_read(&curr->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 	}
 
 	if (!detect && ret == -EDEADLK && 0)
@@ -1795,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	queue_unlock(&q, hb);
 
  out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 
  uaddr_faulted:
@@ -1806,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock_release_sem;
-		}
 		goto retry_locked;
 	}
 
 	queue_unlock(&q, hb);
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1828,7 +1912,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -1848,9 +1932,10 @@ retry:
 	/*
 	 * First take all the futex related locks:
 	 */
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1909,7 +1994,8 @@ retry_locked:
 out_unlock:
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	return ret;
 
@@ -1921,15 +2007,16 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock;
-		}
 		goto retry_locked;
 	}
 
 	spin_unlock(&hb->lock);
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1981,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	struct futex_q *q;
 	struct file *filp;
 	int ret, err;
+	struct rw_semaphore *fshared;
 	static unsigned long printk_interval;
 
 	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -2022,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	}
 	q->pi_state = NULL;
 
-	down_read(&current->mm->mmap_sem);
-	err = get_futex_key(uaddr, &q->key);
+	fshared = &current->mm->mmap_sem;
+	down_read(fshared);
+	err = get_futex_key(uaddr, fshared, &q->key);
 
 	if (unlikely(err != 0)) {
-		up_read(&current->mm->mmap_sem);
+		up_read(fshared);
 		kfree(q);
 		goto error;
 	}
@@ -2038,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	filp->private_data = q;
 
 	queue_me(q, ret, filp);
-	up_read(&current->mm->mmap_sem);
+	up_read(fshared);
 
 	/* Now we map fd to filp, so userspace can access it */
 	fd_install(ret, filp);
@@ -2167,7 +2256,7 @@ retry:
 		 */
 		if (!pi) {
 			if (uval & FUTEX_WAITERS)
-				futex_wake(uaddr, 1);
+				futex_wake(uaddr, &curr->mm->mmap_sem, 1);
 		}
 	}
 	return 0;
@@ -2223,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
 		return;
 
 	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -2253,38 +2343,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
+	int cmd = op & FUTEX_CMD_MASK;
+	struct rw_semaphore *fshared = NULL;
+
+	if (!(op & FUTEX_PRIVATE_FLAG))
+		fshared = &current->mm->mmap_sem;
 
-	switch (op) {
+	switch (cmd) {
 	case FUTEX_WAIT:
-		ret = futex_wait(uaddr, val, timeout);
+		ret = futex_wait(uaddr, fshared, val, timeout);
 		break;
 	case FUTEX_WAKE:
-		ret = futex_wake(uaddr, val);
+		ret = futex_wake(uaddr, fshared, val);
 		break;
 	case FUTEX_FD:
 		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
 		break;
 	case FUTEX_CMP_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
 		break;
 	case FUTEX_WAKE_OP:
-		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
 		break;
 	case FUTEX_LOCK_PI:
-		ret = futex_lock_pi(uaddr, val, timeout, 0);
+		ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
 		break;
 	case FUTEX_UNLOCK_PI:
-		ret = futex_unlock_pi(uaddr);
+		ret = futex_unlock_pi(uaddr, fshared);
 		break;
 	case FUTEX_TRYLOCK_PI:
-		ret = futex_lock_pi(uaddr, 0, timeout, 1);
+		ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
 		break;
 	case FUTEX_CMP_REQUEUE_PI:
-		ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+		ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -2300,23 +2395,24 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	struct timespec ts;
 	ktime_t t, *tp = NULL;
 	u32 val2 = 0;
+	int cmd = op & FUTEX_CMD_MASK;
 
-	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
 			return -EINVAL;
 
 		t = timespec_to_ktime(ts);
-		if (op == FUTEX_WAIT)
+		if (cmd == FUTEX_WAIT)
 			t = ktime_add(ktime_get(), t);
 		tp = &t;
 	}
 	/*
-	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
-	    || op == FUTEX_CMP_REQUEUE_PI)
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+	    || cmd == FUTEX_CMP_REQUEUE_PI)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-- 
cgit v1.2.3


From 01f2705daf5a36208e69d7cf95db9c330f843af6 Mon Sep 17 00:00:00 2001
From: Nate Diller <nate.diller@gmail.com>
Date: Wed, 9 May 2007 02:35:07 -0700
Subject: fs: convert core functions to zero_user_page

It's very common for file systems to need to zero part or all of a page,
the simplist way is just to use kmap_atomic() and memset().  There's
actually a library function in include/linux/highmem.h that does exactly
that, but it's confusingly named memclear_highpage_flush(), which is
descriptive of *how* it does the work rather than what the *purpose* is.
So this patchset renames the function to zero_user_page(), and calls it
from the various places that currently open code it.

This first patch introduces the new function call, and converts all the
core kernel callsites, both the open-coded ones and the old
memclear_highpage_flush() ones.  Following this patch is a series of
conversions for each file system individually, per AKPM, and finally a
patch deprecating the old call.  The diffstat below shows the entire
patchset.

[akpm@linux-foundation.org: fix a few things]
Signed-off-by: Nate Diller <nate.diller@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/loop.c    |  6 +-----
 fs/buffer.c             | 56 +++++++++++--------------------------------------
 fs/direct-io.c          |  8 ++-----
 fs/mpage.c              | 15 +++++--------
 include/linux/highmem.h | 28 +++++++++++++++++--------
 mm/filemap_xip.c        |  7 +------
 mm/truncate.c           |  3 ++-
 7 files changed, 42 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index af6d7274a7cc..18cdd8c77626 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -243,17 +243,13 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
-			char *kaddr;
-
 			/*
 			 * The transfer failed, but we still write the data to
 			 * keep prepare/commit calls balanced.
 			 */
 			printk(KERN_ERR "loop: transfer error block %llu\n",
 			       (unsigned long long)index);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + offset, 0, size);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_page(page, offset, size, KM_USER0);
 		}
 		flush_dcache_page(page);
 		ret = aops->commit_write(file, page, offset,
diff --git a/fs/buffer.c b/fs/buffer.c
index eb820b82a636..fc2d763a8d78 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1846,13 +1846,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (block_start >= to)
 			break;
 		if (buffer_new(bh)) {
-			void *kaddr;
-
 			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr+block_start, 0, bh->b_size);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_page(page, block_start, bh->b_size, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
@@ -1940,10 +1935,8 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 					SetPageError(page);
 			}
 			if (!buffer_mapped(bh)) {
-				void *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + i * blocksize, 0, blocksize);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
+				zero_user_page(page, i * blocksize, blocksize,
+						KM_USER0);
 				if (!err)
 					set_buffer_uptodate(bh);
 				continue;
@@ -2086,7 +2079,6 @@ int cont_prepare_write(struct page *page, unsigned offset,
 	long status;
 	unsigned zerofrom;
 	unsigned blocksize = 1 << inode->i_blkbits;
-	void *kaddr;
 
 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
 		status = -ENOMEM;
@@ -2108,10 +2100,8 @@ int cont_prepare_write(struct page *page, unsigned offset,
 						PAGE_CACHE_SIZE, get_block);
 		if (status)
 			goto out_unmap;
-		kaddr = kmap_atomic(new_page, KM_USER0);
-		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
-		flush_dcache_page(new_page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
+				KM_USER0);
 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
 		unlock_page(new_page);
 		page_cache_release(new_page);
@@ -2138,10 +2128,7 @@ int cont_prepare_write(struct page *page, unsigned offset,
 	if (status)
 		goto out1;
 	if (zerofrom < offset) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr+zerofrom, 0, offset-zerofrom);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
 		__block_commit_write(inode, page, zerofrom, offset);
 	}
 	return 0;
@@ -2340,10 +2327,7 @@ failed:
 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
 	 * so we'll later zero out any blocks which _were_ allocated.
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
 	SetPageUptodate(page);
 	set_page_dirty(page);
 	return ret;
@@ -2382,7 +2366,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 	int ret;
 
 	/* Is the page fully inside i_size? */
@@ -2413,10 +2396,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
@@ -2437,7 +2417,6 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned to;
 	struct page *page;
 	const struct address_space_operations *a_ops = mapping->a_ops;
-	char *kaddr;
 	int ret = 0;
 
 	if ((offset & (blocksize - 1)) == 0)
@@ -2451,10 +2430,8 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	to = (offset + blocksize) & ~(blocksize - 1);
 	ret = a_ops->prepare_write(NULL, page, offset, to);
 	if (ret == 0) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+				KM_USER0);
 		/*
 		 * It would be more correct to call aops->commit_write()
 		 * here, but this is more efficient.
@@ -2480,7 +2457,6 @@ int block_truncate_page(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head *bh;
-	void *kaddr;
 	int err;
 
 	blocksize = 1 << inode->i_blkbits;
@@ -2534,11 +2510,7 @@ int block_truncate_page(struct address_space *mapping,
 			goto unlock;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, length);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
+	zero_user_page(page, offset, length, KM_USER0);
 	mark_buffer_dirty(bh);
 	err = 0;
 
@@ -2559,7 +2531,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
@@ -2585,10 +2556,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
 	return __block_write_full_page(inode, page, get_block, wbc);
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d9d0833444f5..8aa2d8b04ef1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -867,7 +867,6 @@ static int do_direct_IO(struct dio *dio)
 do_holes:
 			/* Handle holes */
 			if (!buffer_mapped(map_bh)) {
-				char *kaddr;
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
@@ -888,11 +887,8 @@ do_holes:
 					page_cache_release(page);
 					goto out;
 				}
-				kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + (block_in_page << blkbits),
-						0, 1 << blkbits);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
+				zero_user_page(page, block_in_page << blkbits,
+						1 << blkbits, KM_USER0);
 				dio->block_in_file++;
 				block_in_page++;
 				goto next_block;
diff --git a/fs/mpage.c b/fs/mpage.c
index fa2441f57b41..0fb914fc2ee0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -284,11 +284,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	}
 
 	if (first_hole != blocks_per_page) {
-		char *kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + (first_hole << blkbits), 0,
-				PAGE_CACHE_SIZE - (first_hole << blkbits));
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, first_hole << blkbits,
+				PAGE_CACHE_SIZE - (first_hole << blkbits),
+				KM_USER0);
 		if (first_hole == 0) {
 			SetPageUptodate(page);
 			unlock_page(page);
@@ -576,14 +574,11 @@ page_is_mapped:
 		 * written out to the file."
 		 */
 		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
-		char *kaddr;
 
 		if (page->index > end_index || !offset)
 			goto confused;
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+				KM_USER0);
 	}
 
 	/*
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index a515eb0afdfb..b5f2ab42d984 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -94,17 +94,27 @@ static inline void clear_highpage(struct page *page)
 
 /*
  * Same but also flushes aliased cache contents to RAM.
+ *
+ * This must be a macro because KM_USER0 and friends aren't defined if
+ * !CONFIG_HIGHMEM
  */
-static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size)
+#define zero_user_page(page, offset, size, km_type)		\
+	do {							\
+		void *kaddr;					\
+								\
+		BUG_ON((offset) + (size) > PAGE_SIZE);		\
+								\
+		kaddr = kmap_atomic(page, km_type);		\
+		memset((char *)kaddr + (offset), 0, (size));	\
+		flush_dcache_page(page);			\
+		kunmap_atomic(kaddr, (km_type));		\
+	} while (0)
+
+
+static inline void memclear_highpage_flush(struct page *page,
+			unsigned int offset, unsigned int size)
 {
-	void *kaddr;
-
-	BUG_ON(offset + size > PAGE_SIZE);
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset((char *)kaddr + offset, 0, size);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, offset, size, KM_USER0);
 }
 
 #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..1b49dab9b25d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -434,7 +434,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned blocksize;
 	unsigned length;
 	struct page *page;
-	void *kaddr;
 
 	BUG_ON(!mapping->a_ops->get_xip_page);
 
@@ -458,11 +457,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 		else
 			return PTR_ERR(page);
 	}
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, length);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	flush_dcache_page(page);
+	zero_user_page(page, offset, length, KM_USER0);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d18ab0e..4fbe1a2da5fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,6 +12,7 @@
 #include <linux/swap.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
+#include <linux/highmem.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
-	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
 	if (PagePrivate(page))
 		do_invalidatepage(page, partial);
 }
-- 
cgit v1.2.3


From f37bc2712b54ec641e0c0c8634f1a4b61d9956c0 Mon Sep 17 00:00:00 2001
From: Nate Diller <nate.diller@gmail.com>
Date: Wed, 9 May 2007 02:35:09 -0700
Subject: fs: deprecate memclear_highpage_flush

Now that all the in-tree users are converted over to zero_user_page(),
deprecate the old memclear_highpage_flush() call.

Signed-off-by: Nate Diller <nate.diller@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index b5f2ab42d984..98e2cce996a4 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -110,8 +110,7 @@ static inline void clear_highpage(struct page *page)
 		kunmap_atomic(kaddr, (km_type));		\
 	} while (0)
 
-
-static inline void memclear_highpage_flush(struct page *page,
+static inline void __deprecated memclear_highpage_flush(struct page *page,
 			unsigned int offset, unsigned int size)
 {
 	zero_user_page(page, offset, size, KM_USER0);
-- 
cgit v1.2.3


From 8bb7844286fb8c9fce6f65d8288aeb09d03a5e0d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 9 May 2007 02:35:10 -0700
Subject: Add suspend-related notifications for CPU hotplug

Since nonboot CPUs are now disabled after tasks and devices have been
frozen and the CPU hotplug infrastructure is used for this purpose, we need
special CPU hotplug notifications that will help the CPU-hotplug-aware
subsystems distinguish normal CPU hotplug events from CPU hotplug events
related to a system-wide suspend or resume operation in progress.  This
patch introduces such notifications and causes them to be used during
suspend and resume transitions.  It also changes all of the
CPU-hotplug-aware subsystems to take these notifications into consideration
(for now they are handled in the same way as the corresponding "normal"
ones).

[oleg@tv-sign.ru: cleanups]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cpu-hotplug.txt             |  9 ++++++--
 arch/i386/kernel/cpu/intel_cacheinfo.c    |  2 ++
 arch/i386/kernel/cpu/mcheck/therm_throt.c |  2 ++
 arch/i386/kernel/cpuid.c                  |  2 ++
 arch/i386/kernel/microcode.c              |  3 +++
 arch/i386/kernel/msr.c                    |  2 ++
 arch/ia64/kernel/err_inject.c             |  2 ++
 arch/ia64/kernel/palinfo.c                |  2 ++
 arch/ia64/kernel/salinfo.c                |  2 ++
 arch/ia64/kernel/topology.c               |  2 ++
 arch/powerpc/kernel/sysfs.c               |  2 ++
 arch/powerpc/mm/numa.c                    |  3 +++
 arch/s390/appldata/appldata_base.c        |  2 ++
 arch/s390/kernel/smp.c                    |  2 ++
 arch/x86_64/kernel/mce.c                  |  2 ++
 arch/x86_64/kernel/mce_amd.c              |  2 ++
 arch/x86_64/kernel/vsyscall.c             |  2 +-
 block/ll_rw_blk.c                         |  2 +-
 drivers/base/topology.c                   |  3 +++
 drivers/cpufreq/cpufreq.c                 |  3 +++
 drivers/cpufreq/cpufreq_stats.c           |  2 ++
 drivers/hwmon/coretemp.c                  |  2 ++
 drivers/infiniband/hw/ehca/ehca_irq.c     |  6 ++++++
 drivers/kvm/kvm_main.c                    |  3 +++
 fs/buffer.c                               |  2 +-
 fs/xfs/xfs_mount.c                        |  3 +++
 include/linux/notifier.h                  | 12 +++++++++++
 kernel/cpu.c                              | 34 ++++++++++++++++---------------
 kernel/hrtimer.c                          |  2 ++
 kernel/profile.c                          |  4 ++++
 kernel/rcupdate.c                         |  2 ++
 kernel/relay.c                            |  2 ++
 kernel/sched.c                            | 10 +++++++++
 kernel/softirq.c                          |  4 ++++
 kernel/softlockup.c                       |  4 ++++
 kernel/timer.c                            |  2 ++
 kernel/workqueue.c                        |  2 ++
 lib/radix-tree.c                          |  2 +-
 mm/page_alloc.c                           |  5 ++++-
 mm/slab.c                                 |  6 ++++++
 mm/slub.c                                 |  2 ++
 mm/swap.c                                 |  2 +-
 mm/vmscan.c                               |  2 +-
 mm/vmstat.c                               |  3 +++
 net/core/dev.c                            |  2 +-
 net/core/flow.c                           |  2 +-
 net/iucv/iucv.c                           |  6 ++++++
 47 files changed, 152 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index cc60d29b954c..b6d24c22274b 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -217,14 +217,17 @@ Q: What happens when a CPU is being logically offlined?
 A: The following happen, listed in no particular order :-)
 
 - A notification is sent to in-kernel registered modules by sending an event
-  CPU_DOWN_PREPARE
+  CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
+  CPU is being offlined while tasks are frozen due to a suspend operation in
+  progress
 - All process is migrated away from this outgoing CPU to a new CPU
 - All interrupts targeted to this CPU is migrated to a new CPU
 - timers/bottom half/task lets are also migrated to a new CPU
 - Once all services are migrated, kernel calls an arch specific routine
   __cpu_disable() to perform arch specific cleanup.
 - Once this is successful, an event for successful cleanup is sent by an event
-  CPU_DEAD.
+  CPU_DEAD (or CPU_DEAD_FROZEN if tasks are frozen due to a suspend while the
+  CPU is being offlined).
 
   "It is expected that each service cleans up when the CPU_DOWN_PREPARE
   notifier is called, when CPU_DEAD is called its expected there is nothing
@@ -242,9 +245,11 @@ A: This is what you would need in your kernel code to receive notifications.
 
 		switch (action) {
 		case CPU_ONLINE:
+		case CPU_ONLINE_FROZEN:
 			foobar_online_action(cpu);
 			break;
 		case CPU_DEAD:
+		case CPU_DEAD_FROZEN:
 			foobar_dead_action(cpu);
 			break;
 		}
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 80b4c5d421b1..e5be819492ef 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -733,9 +733,11 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 	sys_dev = get_cpu_sysdev(cpu);
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cache_add_dev(sys_dev);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		cache_remove_dev(sys_dev);
 		break;
 	}
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 065005c3f168..5b0a040213c2 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -137,10 +137,12 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
 	mutex_lock(&therm_cpu_lock);
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		err = thermal_throttle_add_dev(sys_dev);
 		WARN_ON(err);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		thermal_throttle_remove_dev(sys_dev);
 		break;
 	}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index eeae0d992337..5c2faa10e9fa 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -169,9 +169,11 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cpuid_device_create(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
 		break;
 	}
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index cbe7ec8dbb9f..7d934e493e74 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -775,10 +775,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	sys_dev = get_cpu_sysdev(cpu);
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		mc_sysdev_add(sys_dev);
 		break;
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 		mc_sysdev_remove(sys_dev);
 		break;
 	}
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 8cd0a91ce107..0c1069b8d638 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -153,9 +153,11 @@ static int msr_class_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		msr_device_create(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
 		break;
 	}
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index d3e9f33e8bdd..6a49600cf337 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -236,9 +236,11 @@ static int __cpuinit err_inject_cpu_callback(struct notifier_block *nfb,
 	sys_dev = get_cpu_sysdev(cpu);
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		err_inject_add_dev(sys_dev);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		err_inject_remove_dev(sys_dev);
 		break;
 	}
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index a71df9ae0397..85829e27785c 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -975,9 +975,11 @@ static int palinfo_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		create_palinfo_proc_entries(hotcpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		remove_palinfo_proc_entries(hotcpu);
 		break;
 	}
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index a51f1d0bfb70..89f6b138a62c 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -582,6 +582,7 @@ salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu
 	struct salinfo_data *data;
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		spin_lock_irqsave(&data_saved_lock, flags);
 		for (i = 0, data = salinfo_data;
 		     i < ARRAY_SIZE(salinfo_data);
@@ -592,6 +593,7 @@ salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu
 		spin_unlock_irqrestore(&data_saved_lock, flags);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		spin_lock_irqsave(&data_saved_lock, flags);
 		for (i = 0, data = salinfo_data;
 		     i < ARRAY_SIZE(salinfo_data);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 687500ddb4b8..94ae3c87d828 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -412,9 +412,11 @@ static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 	sys_dev = get_cpu_sysdev(cpu);
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cache_add_dev(sys_dev);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		cache_remove_dev(sys_dev);
 		break;
 	}
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index cae39d9dfe48..68991c2d4a1b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -342,10 +342,12 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		register_cpu_online(cpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		unregister_cpu_online(cpu);
 		break;
 #endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b3a592b25ab3..de45aa82d97b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -252,12 +252,15 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		numa_setup_cpu(lcpu);
 		ret = NOTIFY_OK;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		unmap_cpu_from_node(lcpu);
 		break;
 		ret = NOTIFY_OK;
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index ee89b33145d5..81a2b92ab0c2 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -567,9 +567,11 @@ appldata_cpu_notify(struct notifier_block *self,
 {
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		appldata_online_cpu((long) hcpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		appldata_offline_cpu((long) hcpu);
 		break;
 	default:
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b7977027a28f..09f028a3266b 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -789,10 +789,12 @@ static int __cpuinit smp_cpu_notify(struct notifier_block *self,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		if (sysdev_create_file(s, &attr_capability))
 			return NOTIFY_BAD;
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		sysdev_remove_file(s, &attr_capability);
 		break;
 	}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 442169640e45..a14375dd5425 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -720,9 +720,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		mce_create_device(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		mce_remove_device(cpu);
 		break;
 	}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d0bd5d66e103..03356e64f9c8 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -654,9 +654,11 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		threshold_create_device(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		threshold_remove_device(cpu);
 		break;
 	default:
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index dc32cef96195..51d4c6fa88c8 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -327,7 +327,7 @@ static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
-	if (action == CPU_ONLINE)
+	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
 	return NOTIFY_DONE;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index df506571ed60..cd54672da99f 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3507,7 +3507,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
 	 * If a CPU goes away, splice its entries to the current CPU
 	 * and trigger a run of the softirq
 	 */
-	if (action == CPU_DEAD) {
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 		int cpu = (unsigned long) hcpu;
 
 		local_irq_disable();
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 067a9e8bc377..8d8cdfec6529 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -126,10 +126,13 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		rc = topology_add_dev(cpu);
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		topology_remove_dev(cpu);
 		break;
 	}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 893dbaf386fb..eb37fba9b7ef 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1685,9 +1685,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 	if (sys_dev) {
 		switch (action) {
 		case CPU_ONLINE:
+		case CPU_ONLINE_FROZEN:
 			cpufreq_add_dev(sys_dev);
 			break;
 		case CPU_DOWN_PREPARE:
+		case CPU_DOWN_PREPARE_FROZEN:
 			if (unlikely(lock_policy_rwsem_write(cpu)))
 				BUG();
 
@@ -1699,6 +1701,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 			__cpufreq_remove_dev(sys_dev);
 			break;
 		case CPU_DOWN_FAILED:
+		case CPU_DOWN_FAILED_FROZEN:
 			cpufreq_add_dev(sys_dev);
 			break;
 		}
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index d1c7cac9316c..d2f0cbd8b8f3 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -313,9 +313,11 @@ static int cpufreq_stat_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cpufreq_update_policy(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		cpufreq_stats_free_table(cpu);
 		break;
 	}
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 03b1f650d1c4..75e3911810a3 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -309,9 +309,11 @@ static int coretemp_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		coretemp_device_add(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		coretemp_device_remove(cpu);
 		break;
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index f284be1c9166..82dda2faf4d0 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -745,6 +745,7 @@ static int comp_pool_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
 		if(!create_comp_task(pool, cpu)) {
 			ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
@@ -752,24 +753,29 @@ static int comp_pool_callback(struct notifier_block *nfb,
 		}
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
 		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 		kthread_bind(cct->task, any_online_cpu(cpu_online_map));
 		destroy_comp_task(pool, cpu);
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
 		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 		kthread_bind(cct->task, cpu);
 		wake_up_process(cct->task);
 		break;
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
 		break;
 	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
 		destroy_comp_task(pool, cpu);
 		take_over_work(pool, cpu);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index c8b8cfa332bb..0d892600ff00 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2889,7 +2889,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 
 	switch (val) {
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
 		       cpu);
 		decache_vcpus_on_cpu(cpu);
@@ -2897,6 +2899,7 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 					 NULL, 0, 1);
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
 		smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
diff --git a/fs/buffer.c b/fs/buffer.c
index fc2d763a8d78..aecd057cd0e0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2946,7 +2946,7 @@ static void buffer_exit_cpu(int cpu)
 static int buffer_cpu_notify(struct notifier_block *self,
 			      unsigned long action, void *hcpu)
 {
-	if (action == CPU_DEAD)
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		buffer_exit_cpu((unsigned long)hcpu);
 	return NOTIFY_OK;
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f5aa3ef855fb..a96bde6df96d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1734,11 +1734,13 @@ xfs_icsb_cpu_notify(
 			per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		/* Easy Case - initialize the area and locks, and
 		 * then rebalance when online does everything else for us. */
 		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		xfs_icsb_lock(mp);
 		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
 		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
@@ -1746,6 +1748,7 @@ xfs_icsb_cpu_notify(
 		xfs_icsb_unlock(mp);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		/* Disable all the counters, then fold the dead cpu's
 		 * count into the total on the global superblock and
 		 * re-enable the counters. */
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 1903e5490c04..9431101bf876 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -197,5 +197,17 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_LOCK_ACQUIRE	0x0008 /* Acquire all hotcpu locks */
 #define CPU_LOCK_RELEASE	0x0009 /* Release all hotcpu locks */
 
+/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
+ * operation in progress
+ */
+#define CPU_TASKS_FROZEN	0x0010
+
+#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)
+#define CPU_UP_PREPARE_FROZEN	(CPU_UP_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_UP_CANCELED_FROZEN	(CPU_UP_CANCELED | CPU_TASKS_FROZEN)
+#define CPU_DOWN_PREPARE_FROZEN	(CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_DOWN_FAILED_FROZEN	(CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
+#define CPU_DEAD_FROZEN		(CPU_DEAD | CPU_TASKS_FROZEN)
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 28cb6c71a47a..369d2892687d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -120,12 +120,13 @@ static int take_cpu_down(void *unused)
 }
 
 /* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 	void *hcpu = (void *)(long)cpu;
+	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 
 	if (num_online_cpus() == 1)
 		return -EBUSY;
@@ -134,11 +135,11 @@ static int _cpu_down(unsigned int cpu)
 		return -EINVAL;
 
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
-	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
 	if (err == NOTIFY_BAD) {
-		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, hcpu,
-					  nr_calls, NULL);
+		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+					  hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
 		err = -EINVAL;
@@ -157,7 +158,7 @@ static int _cpu_down(unsigned int cpu)
 
 	if (IS_ERR(p) || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
@@ -176,7 +177,8 @@ static int _cpu_down(unsigned int cpu)
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, hcpu) == NOTIFY_BAD)
+	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
+				    hcpu) == NOTIFY_BAD)
 		BUG();
 
 	check_for_tasks(cpu);
@@ -186,8 +188,7 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out_release:
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE,
-						(void *)(long)cpu);
+	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
 	return err;
 }
 
@@ -199,7 +200,7 @@ int cpu_down(unsigned int cpu)
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 	else
-		err = _cpu_down(cpu);
+		err = _cpu_down(cpu, 0);
 
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
@@ -207,16 +208,17 @@ int cpu_down(unsigned int cpu)
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu)
+static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 {
 	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
+	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
-	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu,
+	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
 							-1, &nr_calls);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
@@ -234,12 +236,12 @@ static int __cpuinit _cpu_up(unsigned int cpu)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
 
 out_notify:
 	if (ret != 0)
 		__raw_notifier_call_chain(&cpu_chain,
-				CPU_UP_CANCELED, hcpu, nr_calls, NULL);
+				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
 
 	return ret;
@@ -253,7 +255,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 	else
-		err = _cpu_up(cpu);
+		err = _cpu_up(cpu, 0);
 
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
@@ -283,7 +285,7 @@ int disable_nonboot_cpus(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
-		error = _cpu_down(cpu);
+		error = _cpu_down(cpu, 1);
 		if (!error) {
 			cpu_set(cpu, frozen_cpus);
 			printk("CPU%d is down\n", cpu);
@@ -318,7 +320,7 @@ void enable_nonboot_cpus(void)
 	suspend_cpu_hotplug = 1;
 	printk("Enabling non-boot CPUs ...\n");
 	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = _cpu_up(cpu);
+		error = _cpu_up(cpu, 1);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
 			continue;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a8..23c03f43e196 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 	switch (action) {
 
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		init_hrtimers_cpu(cpu);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
 		migrate_hrtimers(cpu);
 		break;
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd8..cc91b9bf759d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		node = cpu_to_node(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 		__free_page(page);
 		return NOTIFY_BAD;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		cpu_set(cpu, prof_cpu_mask);
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		cpu_clear(cpu, prof_cpu_mask);
 		if (per_cpu(cpu_profile_hits, cpu)[0]) {
 			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84c..2c2dd8410dc4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	long cpu = (long)hcpu;
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		rcu_online_cpu(cpu);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		rcu_offline_cpu(cpu);
 		break;
 	default:
diff --git a/kernel/relay.c b/kernel/relay.c
index e804589c863c..61a504900eaa 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -484,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 
 	switch(action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		mutex_lock(&relay_channels_mutex);
 		list_for_each_entry(chan, &relay_channels, list) {
 			if (chan->buf[hotcpu])
@@ -500,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 		mutex_unlock(&relay_channels_mutex);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		/* No need to flush the cpu : will be flushed upon
 		 * final relay_flush() call. */
 		break;
diff --git a/kernel/sched.c b/kernel/sched.c
index fe1a9c2b855a..799d23b4e35d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5394,6 +5394,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
@@ -5407,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		/* Strictly unneccessary, as first user will wake it. */
 		wake_up_process(cpu_rq(cpu)->migration_thread);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		if (!cpu_rq(cpu)->migration_thread)
 			break;
 		/* Unbind it from offline cpu so it can run.  Fall thru. */
@@ -5423,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
@@ -6912,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 {
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		/*
 		 * Fall through and re-initialise the domains.
 		 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd8..0b9886a00e74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
   		per_cpu(ksoftirqd, hotcpu) = p;
  		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		wake_up_process(per_cpu(ksoftirqd, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		if (!per_cpu(ksoftirqd, hotcpu))
 			break;
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(ksoftirqd, hotcpu),
 			     any_online_cpu(cpu_online_map));
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
 		kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247ad..0131e296ffb4 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		BUG_ON(per_cpu(watchdog_task, hotcpu));
 		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
 		if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		kthread_bind(p, hotcpu);
  		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		if (!per_cpu(watchdog_task, hotcpu))
 			break;
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(watchdog_task, hotcpu),
 			     any_online_cpu(cpu_online_map));
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		p = per_cpu(watchdog_task, hotcpu);
 		per_cpu(watchdog_task, hotcpu) = NULL;
 		kthread_stop(p);
diff --git a/kernel/timer.c b/kernel/timer.c
index 58f6dd07c80b..de85f8491c1d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
 	long cpu = (long)hcpu;
 	switch(action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		if (init_timers_cpu(cpu) < 0)
 			return NOTIFY_BAD;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		migrate_timers(cpu);
 		break;
 #endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b976ed87dd37..fb56fedd5c02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -799,6 +799,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	struct cpu_workqueue_struct *cwq;
 	struct workqueue_struct *wq;
 
+	action &= ~CPU_TASKS_FROZEN;
+
 	switch (action) {
 	case CPU_LOCK_ACQUIRE:
 		mutex_lock(&workqueue_mutex);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d69ddbe43865..402eb4eb6b23 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1004,7 +1004,7 @@ static int radix_tree_callback(struct notifier_block *nfb,
        struct radix_tree_preload *rtp;
 
        /* Free per-cpu pool of perloaded nodes */
-       if (action == CPU_DEAD) {
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                rtp = &per_cpu(radix_tree_preloads, cpu);
                while (rtp->nr) {
                        kmem_cache_free(radix_tree_node_cachep,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6fd0b7455b0b..d53cbf8acb8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2148,11 +2148,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		if (process_zones(cpu))
 			ret = NOTIFY_BAD;
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		free_zone_pagesets(cpu);
 		break;
 	default:
@@ -3012,7 +3015,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 {
 	int cpu = (unsigned long)hcpu;
 
-	if (action == CPU_DEAD) {
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 		local_irq_disable();
 		__drain_pages(cpu);
 		vm_events_fold_cpu(cpu);
diff --git a/mm/slab.c b/mm/slab.c
index 1a7a10de2a4d..6f3d6e240c61 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1190,6 +1190,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		mutex_lock(&cache_chain_mutex);
 		break;
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		/*
 		 * We need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
@@ -1276,10 +1277,12 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		}
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		start_cpu_timer(cpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
   	case CPU_DOWN_PREPARE:
+  	case CPU_DOWN_PREPARE_FROZEN:
 		/*
 		 * Shutdown cache reaper. Note that the cache_chain_mutex is
 		 * held so that if cache_reap() is invoked it cannot do
@@ -1291,9 +1294,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		per_cpu(reap_work, cpu).work.func = NULL;
   		break;
   	case CPU_DOWN_FAILED:
+  	case CPU_DOWN_FAILED_FROZEN:
 		start_cpu_timer(cpu);
   		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		/*
 		 * Even if all the cpus of a node are down, we don't free the
 		 * kmem_list3 of any cache. This to avoid a race between
@@ -1305,6 +1310,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		/* fall thru */
 #endif
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
diff --git a/mm/slub.c b/mm/slub.c
index f7c120b93c41..a581fa8ae11a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2514,7 +2514,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		for_all_slabs(__flush_cpu_slab, cpu);
 		break;
 	default:
diff --git a/mm/swap.c b/mm/swap.c
index 218c52a24a21..d3cb966fe992 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 	long *committed;
 
 	committed = &per_cpu(committed_space, (long)hcpu);
-	if (action == CPU_DEAD) {
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 		atomic_add(*committed, &vm_committed_space);
 		*committed = 0;
 		__lru_add_drain((long)hcpu);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c8e75a1cfcd..1be5a6376ef0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1528,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	pg_data_t *pgdat;
 	cpumask_t mask;
 
-	if (action == CPU_ONLINE) {
+	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
 		for_each_online_pgdat(pgdat) {
 			mask = node_to_cpumask(pgdat->node_id);
 			if (any_online_cpu(mask) != NR_CPUS)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6c488d6ac425..9a66dc4aed43 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -650,8 +650,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 {
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		refresh_zone_stat_thresholds();
 		break;
 	default:
diff --git a/net/core/dev.c b/net/core/dev.c
index 4317c1be4d3f..8301e2ac747f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3450,7 +3450,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
 	struct softnet_data *sd, *oldsd;
 
-	if (action != CPU_DEAD)
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
 	local_irq_disable();
diff --git a/net/core/flow.c b/net/core/flow.c
index 5d25697920b1..051430545a05 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -338,7 +338,7 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
-	if (action == CPU_DEAD)
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		__flow_cache_shrink((unsigned long)hcpu, 0);
 	return NOTIFY_OK;
 }
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index fb3faf72e850..b7333061016d 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -556,6 +556,7 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		if (!percpu_populate(iucv_irq_data,
 				     sizeof(struct iucv_irq_data),
 				     GFP_KERNEL|GFP_DMA, cpu))
@@ -567,15 +568,20 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self,
 		}
 		break;
 	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		percpu_depopulate(iucv_param, cpu);
 		percpu_depopulate(iucv_irq_data, cpu);
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu);
 		break;
 	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
 		cpumask = iucv_buffer_cpumask;
 		cpu_clear(cpu, cpumask);
 		if (cpus_empty(cpumask))
-- 
cgit v1.2.3


From d1187ed21026fd512b87851d0ca26d9ae16f9059 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 9 May 2007 02:35:12 -0700
Subject: vmstat: use our own timer events

vmstat is currently using the cache reaper to periodically bring the
statistics up to date.  The cache reaper does only exists in SLUB as a way to
provide compatibility with SLAB.  This patch removes the vmstat calls from the
slab allocators and provides its own handling.

The advantage is also that we can use a different frequency for the updates.
Refreshing vm stats is a pretty fast job so we can run this every second and
stagger this by only one tick.  This will lead to some overlap in large
systems.  F.e a system running at 250 HZ with 1024 processors will have 4 vm
updates occurring at once.

However, the vm stats update only accesses per node information.  It is only
necessary to stagger the vm statistics updates per processor in each node.  Vm
counter updates occurring on distant nodes will not cause cacheline
contention.

We could implement an alternate approach that runs the first processor on each
node at the second and then each of the other processor on a node on a
subsequent tick.  That may be useful to keep a large amount of the second free
of timer activity.  Maybe the timer folks will have some feedback on this one?

[jirislaby@gmail.com: add missing break]
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  3 ---
 mm/slab.c              |  1 -
 mm/slub.c              |  1 -
 mm/vmstat.c            | 40 ++++++++++++++++++++++++++++++++++++----
 4 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index acb1f105870c..d9325cf8a134 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -212,8 +212,6 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
-void refresh_vm_stats(void);
-
 #else /* CONFIG_SMP */
 
 /*
@@ -260,7 +258,6 @@ static inline void __dec_zone_page_state(struct page *page,
 #define mod_zone_page_state __mod_zone_page_state
 
 static inline void refresh_cpu_vm_stats(int cpu) { }
-static inline void refresh_vm_stats(void) { }
 #endif
 
 #endif /* _LINUX_VMSTAT_H */
diff --git a/mm/slab.c b/mm/slab.c
index 6f3d6e240c61..e50908b2bfac 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4156,7 +4156,6 @@ next:
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
 	next_reap_node();
-	refresh_cpu_vm_stats(smp_processor_id());
 out:
 	/* Set up the next iteration */
 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
diff --git a/mm/slub.c b/mm/slub.c
index a581fa8ae11a..dbb206503a8d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2580,7 +2580,6 @@ static DEFINE_PER_CPU(struct delayed_work, reap_work);
 static void cache_reap(struct work_struct *unused)
 {
 	next_reap_node();
-	refresh_cpu_vm_stats(smp_processor_id());
 	schedule_delayed_work(&__get_cpu_var(reap_work),
 				      REAPTIMEOUT_CPUC);
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a66dc4aed43..9d824643a22f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -640,6 +640,22 @@ const struct seq_operations vmstat_op = {
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+
+static void vmstat_update(struct work_struct *w)
+{
+	refresh_cpu_vm_stats(smp_processor_id());
+	schedule_delayed_work(&__get_cpu_var(vmstat_work), HZ);
+}
+
+static void __devinit start_cpu_timer(int cpu)
+{
+	struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+
+	INIT_DELAYED_WORK(vmstat_work, vmstat_update);
+	schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
+}
+
 /*
  * Use the cpu notifier to insure that the thresholds are recalculated
  * when necessary.
@@ -648,11 +664,22 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
+	long cpu = (long)hcpu;
+
 	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		start_cpu_timer(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+		per_cpu(vmstat_work, cpu).work.func = NULL;
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		start_cpu_timer(cpu);
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		refresh_zone_stat_thresholds();
@@ -668,8 +695,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier =
 
 int __init setup_vmstat(void)
 {
+	int cpu;
+
 	refresh_zone_stat_thresholds();
 	register_cpu_notifier(&vmstat_notifier);
+
+	for_each_online_cpu(cpu)
+		start_cpu_timer(cpu);
 	return 0;
 }
 module_init(setup_vmstat)
-- 
cgit v1.2.3


From 4037d452202e34214e8a939fa5621b2b3bbb45b7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 9 May 2007 02:35:14 -0700
Subject: Move remote node draining out of slab allocators

Currently the slab allocators contain callbacks into the page allocator to
perform the draining of pagesets on remote nodes.  This requires SLUB to have
a whole subsystem in order to be compatible with SLAB.  Moving node draining
out of the slab allocators avoids a section of code in SLUB.

Move the node draining so that is is done when the vm statistics are updated.
At that point we are already touching all the cachelines with the pagesets of
a processor.

Add a expire counter there.  If we have to update per zone or global vm
statistics then assume that the pageset will require subsequent draining.

The expire counter will be decremented on each vm stats update pass until it
reaches zero.  Then we will drain one batch from the pageset.  The draining
will cause vm counter updates which will then cause another expiration until
the pcp is empty.  So we will drain a batch every 3 seconds.

Note that remote node draining is a somewhat esoteric feature that is required
on large NUMA systems because otherwise significant portions of system memory
can become trapped in pcp queues.  The number of pcp is determined by the
number of processors and nodes in a system.  A system with 4 processors and 2
nodes has 8 pcps which is okay.  But a system with 1024 processors and 512
nodes has 512k pcps with a high potential for large amount of memory being
caught in them.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    |  6 +---
 include/linux/mmzone.h |  3 ++
 mm/page_alloc.c        | 45 +++++++++------------------
 mm/slab.c              |  6 ----
 mm/slub.c              | 84 --------------------------------------------------
 mm/vmstat.c            | 54 +++++++++++++++++++++++++++++---
 6 files changed, 67 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3d96e2..0d2ef0b082a6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e83042..d09b1345a3a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
 
 struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+	s8 expire;
+#endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d53cbf8acb8e..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
 
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	int i;
-	enum zone_type z;
 	unsigned long flags;
+	int to_drain;
 
-	for (z = 0; z < MAX_NR_ZONES; z++) {
-		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-		struct per_cpu_pageset *pset;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, smp_processor_id());
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			if (pcp->count) {
-				int to_drain;
-
-				local_irq_save(flags);
-				if (pcp->count >= pcp->batch)
-					to_drain = pcp->batch;
-				else
-					to_drain = pcp->count;
-				free_pages_bulk(zone, to_drain, &pcp->list, 0);
-				pcp->count -= to_drain;
-				local_irq_restore(flags);
-			}
-		}
-	}
+	local_irq_save(flags);
+	if (pcp->count >= pcp->batch)
+		to_drain = pcp->batch;
+	else
+		to_drain = pcp->count;
+	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	pcp->count -= to_drain;
+	local_irq_restore(flags);
 }
 #endif
 
diff --git a/mm/slab.c b/mm/slab.c
index e50908b2bfac..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
 {
 	int node = __get_cpu_var(reap_node);
 
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
 	node = next_node(node, node_online_map);
 	if (unlikely(node >= MAX_NUMNODES))
 		node = first_node(node_online_map);
diff --git a/mm/slub.c b/mm/slub.c
index dbb206503a8d..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	__get_cpu_var(reap_node) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC	(2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
-	next_reap_node();
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
-}
-
-static void __devinit start_cpu_timer(int cpu)
-{
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-
-	/*
-	 * When this gets called from do_initcalls via cpucache_init(),
-	 * init_workqueues() has already run, so keventd will be setup
-	 * at that time.
-	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-	}
-}
-
-static int __init cpucache_init(void)
-{
-	int cpu;
-
-	/*
-	 * Register the timers that drain pcp pages and update vm statistics
-	 */
-	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
-	return 0;
-}
-__initcall(cpucache_init);
-#endif
-
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 006eb7621869..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
 
 /*
  * Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
  */
 void refresh_cpu_vm_stats(int cpu)
 {
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
 	unsigned long flags;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pcp;
+		struct per_cpu_pageset *p;
 
 		if (!populated_zone(zone))
 			continue;
 
-		pcp = zone_pcp(zone, cpu);
+		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (pcp->vm_stat_diff[i]) {
+			if (p->vm_stat_diff[i]) {
 				local_irq_save(flags);
-				zone_page_state_add(pcp->vm_stat_diff[i],
+				zone_page_state_add(p->vm_stat_diff[i],
 					zone, i);
-				pcp->vm_stat_diff[i] = 0;
+				p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+				/* 3 seconds idle till flush */
+				p->expire = 3;
+#endif
 				local_irq_restore(flags);
 			}
+#ifdef CONFIG_NUMA
+		/*
+		 * Deal with draining the remote pageset of this
+		 * processor
+		 *
+		 * Check if there are pages remaining in this pageset
+		 * if not then there is nothing to expire.
+		 */
+		if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+			continue;
+
+		/*
+		 * We never drain zones local to this processor.
+		 */
+		if (zone_to_nid(zone) == numa_node_id()) {
+			p->expire = 0;
+			continue;
+		}
+
+		p->expire--;
+		if (p->expire)
+			continue;
+
+		if (p->pcp[0].count)
+			drain_zone_pages(zone, p->pcp + 0);
+
+		if (p->pcp[1].count)
+			drain_zone_pages(zone, p->pcp + 1);
+#endif
 	}
 }
 
-- 
cgit v1.2.3


From b52f52a093bb1e841e014c2087b5bee7162da413 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 May 2007 02:35:15 -0700
Subject: clocksource: fix resume logic

We need to make sure that the clocksources are resumed, when timekeeping is
resumed.  The current resume logic does not guarantee this.

Add a resume function pointer to the clocksource struct, so clocksource
drivers which need to reinitialize the clocksource can provide a resume
function.

Add a resume function, which calls the maybe available clocksource resume
functions and resets the watchdog function, so a stable TSC can be used
accross suspend/resume.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clocksource.h |  3 +++
 kernel/time/clocksource.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c              |  2 ++
 3 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 2665ca04cf8f..bf297b03a4e4 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@ struct clocksource;
  * @shift:		cycle to nanosecond divisor (power of two)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
+ * @resume:		resume function for the clocksource, if necessary
  * @cycle_interval:	Used internally by timekeeping core, please ignore.
  * @xtime_interval:	Used internally by timekeeping core, please ignore.
  */
@@ -65,6 +66,7 @@ struct clocksource {
 	u32 shift;
 	unsigned long flags;
 	cycle_t (*vread)(void);
+	void (*resume)(void);
 
 	/* timekeeping specific data, ignore */
 	cycle_t cycle_interval;
@@ -209,6 +211,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 extern int clocksource_register(struct clocksource*);
 extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
+extern void clocksource_resume(void);
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index db0c725de5ea..3db5c3c460d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,6 +74,8 @@ static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
 static DEFINE_SPINLOCK(watchdog_lock);
 static cycle_t watchdog_last;
+static int watchdog_resumed;
+
 /*
  * Interval: 0.5sec Threshold: 0.0625s
  */
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
 	struct clocksource *cs, *tmp;
 	cycle_t csnow, wdnow;
 	int64_t wd_nsec, cs_nsec;
+	int resumed;
 
 	spin_lock(&watchdog_lock);
 
+	resumed = watchdog_resumed;
+	if (unlikely(resumed))
+		watchdog_resumed = 0;
+
 	wdnow = watchdog->read();
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
 	watchdog_last = wdnow;
 
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		csnow = cs->read();
+
+		if (unlikely(resumed)) {
+			cs->wd_last = csnow;
+			continue;
+		}
+
 		/* Initialized ? */
 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
 			if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
 	}
 	spin_unlock(&watchdog_lock);
 }
+static void clocksource_resume_watchdog(void)
+{
+	spin_lock(&watchdog_lock);
+	watchdog_resumed = 1;
+	spin_unlock(&watchdog_lock);
+}
+
 static void clocksource_check_watchdog(struct clocksource *cs)
 {
 	struct clocksource *cse;
@@ -182,8 +202,33 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
 		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 }
+
+static inline void clocksource_resume_watchdog(void) { }
 #endif
 
+/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+	struct list_head *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&clocksource_lock, flags);
+
+	list_for_each(tmp, &clocksource_list) {
+		struct clocksource *cs;
+
+		cs = list_entry(tmp, struct clocksource, list);
+		if (cs->resume)
+			cs->resume();
+	}
+
+	clocksource_resume_watchdog();
+
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
 /**
  * clocksource_get_next - Returns the selected clocksource
  *
diff --git a/kernel/timer.c b/kernel/timer.c
index de85f8491c1d..59a28b1752f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1499,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
 		prev = &curr->next;
 	}
 
+	clocksource_resume();
+
 	write_seqlock_irqsave(&xtime_lock, flags);
 	if (ti == time_interpolator) {
 		/* we lost the best time-interpolator: */
-- 
cgit v1.2.3


From e61a1c1c4f240cec61300c8f27518c3e47570fd4 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Wed, 9 May 2007 02:35:15 -0700
Subject: Allow arch to initialize arch field of the module structure

This will later allow an arch to add module specific information via linker
generated tables instead of poking directly in the module object structure.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/module.h | 3 +++
 scripts/mod/modpost.c  | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 6d3dc9c4ff96..792d483c9af7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -356,6 +356,9 @@ struct module
 	   keeping pointers to this stuff */
 	char *args;
 };
+#ifndef MODULE_ARCH_INIT
+#define MODULE_ARCH_INIT {}
+#endif
 
 /* FIXME: It'd be nice to isolate modules during init, too, so they
    aren't used before they (may) fail.  But presently too much code
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 480e18b00aa6..113dc77b9f60 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1343,6 +1343,7 @@ static void add_header(struct buffer *b, struct module *mod)
 		buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n"
 			      " .exit = cleanup_module,\n"
 			      "#endif\n");
+	buf_printf(b, " .arch = MODULE_ARCH_INIT,\n");
 	buf_printf(b, "};\n");
 }
 
-- 
cgit v1.2.3


From f7e4217b007d1f73e7e3cf10ba4fea4a608c603f Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Wed, 9 May 2007 02:35:17 -0700
Subject: rename thread_info to stack

This finally renames the thread_info field in task structure to stack, so that
the assumptions about this field are gone and archs have more freedom about
placing the thread_info structure.

Nonbroken archs which have a proper thread pointer can do the access to both
current thread and task structure via a single pointer.

It'll allow for a few more cleanups of the fork code, from which e.g.  ia64
could benefit.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
[akpm@linux-foundation.org: build fix]
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Andi Kleen <ak@muc.de>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/blackfin/kernel/ptrace.c     | 6 +++---
 arch/h8300/kernel/asm-offsets.c   | 2 +-
 arch/ia64/kernel/mca.c            | 2 +-
 arch/mips/kernel/asm-offsets.c    | 2 +-
 arch/parisc/kernel/asm-offsets.c  | 2 +-
 arch/powerpc/kernel/asm-offsets.c | 2 +-
 arch/ppc/kernel/asm-offsets.c     | 2 +-
 arch/s390/kernel/asm-offsets.c    | 2 +-
 arch/sparc/kernel/asm-offsets.c   | 2 +-
 arch/v850/kernel/asm-offsets.c    | 2 +-
 arch/xtensa/kernel/asm-offsets.c  | 2 +-
 include/asm-alpha/thread_info.h   | 8 ++++----
 include/asm-blackfin/processor.h  | 6 +++---
 include/asm-blackfin/system.h     | 4 ++--
 include/asm-m68k/thread_info.h    | 6 +++---
 include/asm-x86_64/system.h       | 2 +-
 include/linux/init_task.h         | 2 +-
 include/linux/sched.h             | 8 ++++----
 kernel/fork.c                     | 4 ++--
 19 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index d7c8e514cb92..e718bb4a1ef0 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -73,7 +73,7 @@
 static inline struct pt_regs *get_user_regs(struct task_struct *task)
 {
 	return (struct pt_regs *)
-	    ((unsigned long)task->thread_info +
+	    ((unsigned long)task_stack_page(task) +
 	     (THREAD_SIZE - sizeof(struct pt_regs)));
 }
 
@@ -99,7 +99,7 @@ static inline long get_reg(struct task_struct *task, int regno)
 	unsigned char *reg_ptr;
 
 	struct pt_regs *regs =
-	    (struct pt_regs *)((unsigned long)task->thread_info +
+	    (struct pt_regs *)((unsigned long)task_stack_page(task) +
 			       (THREAD_SIZE - sizeof(struct pt_regs)));
 	reg_ptr = (char *)regs;
 
@@ -125,7 +125,7 @@ put_reg(struct task_struct *task, int regno, unsigned long data)
 	char * reg_ptr;
 
 	struct pt_regs *regs =
-	    (struct pt_regs *)((unsigned long)task->thread_info +
+	    (struct pt_regs *)((unsigned long)task_stack_page(task) +
 			       (THREAD_SIZE - sizeof(struct pt_regs)));
 	reg_ptr = (char *)regs;
 
diff --git a/arch/h8300/kernel/asm-offsets.c b/arch/h8300/kernel/asm-offsets.c
index b78b82ad28a3..fc30b4fd0914 100644
--- a/arch/h8300/kernel/asm-offsets.c
+++ b/arch/h8300/kernel/asm-offsets.c
@@ -30,7 +30,7 @@ int main(void)
 	DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
 	DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
 	DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
-	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
 	DEFINE(TASK_MM, offsetof(struct task_struct, mm));
 	DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
 
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 1d7cc7e2ce32..f8ae709de0b5 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1689,7 +1689,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	ti->preempt_count = 1;
 	ti->task = p;
 	ti->cpu = cpu;
-	p->thread_info = ti;
+	p->stack = ti;
 	p->state = TASK_UNINTERRUPTIBLE;
 	cpu_set(cpu, p->cpus_allowed);
 	INIT_LIST_HEAD(&p->tasks);
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 761a779d5c4f..3b27309d54b1 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -82,7 +82,7 @@ void output_task_defines(void)
 {
 	text("/* MIPS task_struct offsets. */");
 	offset("#define TASK_STATE         ", struct task_struct, state);
-	offset("#define TASK_THREAD_INFO   ", struct task_struct, thread_info);
+	offset("#define TASK_THREAD_INFO   ", struct task_struct, stack);
 	offset("#define TASK_FLAGS         ", struct task_struct, flags);
 	offset("#define TASK_MM            ", struct task_struct, mm);
 	offset("#define TASK_PID           ", struct task_struct, pid);
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 54fdb959149c..d3b7917a87cb 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -54,7 +54,7 @@
 
 int main(void)
 {
-	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+	DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
 	DEFINE(TASK_STATE, offsetof(struct task_struct, state));
 	DEFINE(TASK_FLAGS, offsetof(struct task_struct, flags));
 	DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending));
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8f48560b7ee2..37bc35e69dbe 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -58,7 +58,7 @@ int main(void)
 #ifdef CONFIG_PPC64
 	DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
 #else
-	DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info));
+	DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
 	DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
 #endif /* CONFIG_PPC64 */
 
diff --git a/arch/ppc/kernel/asm-offsets.c b/arch/ppc/kernel/asm-offsets.c
index c5850a272650..e8e94321b59e 100644
--- a/arch/ppc/kernel/asm-offsets.c
+++ b/arch/ppc/kernel/asm-offsets.c
@@ -35,7 +35,7 @@ int
 main(void)
 {
 	DEFINE(THREAD, offsetof(struct task_struct, thread));
-	DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info));
+	DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
 	DEFINE(MM, offsetof(struct task_struct, mm));
 	DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
 	DEFINE(KSP, offsetof(struct thread_struct, ksp));
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ec514fe5ccd0..1375f8a4469e 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -15,7 +15,7 @@
 
 int main(void)
 {
-	DEFINE(__THREAD_info, offsetof(struct task_struct, thread_info),);
+	DEFINE(__THREAD_info, offsetof(struct task_struct, stack),);
 	DEFINE(__THREAD_ksp, offsetof(struct task_struct, thread.ksp),);
 	DEFINE(__THREAD_per, offsetof(struct task_struct, thread.per_info),);
 	DEFINE(__THREAD_mm_segment,
diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
index 29d7cfd1c970..6773ed76e414 100644
--- a/arch/sparc/kernel/asm-offsets.c
+++ b/arch/sparc/kernel/asm-offsets.c
@@ -28,7 +28,7 @@ int foo(void)
 	DEFINE(AOFF_task_gid, offsetof(struct task_struct, gid));
 	DEFINE(AOFF_task_euid, offsetof(struct task_struct, euid));
 	DEFINE(AOFF_task_egid, offsetof(struct task_struct, egid));
-	/* DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info)); */
+	/* DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); */
 	DEFINE(ASIZ_task_uid,	sizeof(current->uid));
 	DEFINE(ASIZ_task_gid,	sizeof(current->gid));
 	DEFINE(ASIZ_task_euid,	sizeof(current->euid));
diff --git a/arch/v850/kernel/asm-offsets.c b/arch/v850/kernel/asm-offsets.c
index 24f291369070..cee5c3142d41 100644
--- a/arch/v850/kernel/asm-offsets.c
+++ b/arch/v850/kernel/asm-offsets.c
@@ -29,7 +29,7 @@ int main (void)
 	DEFINE (TASK_PTRACE, offsetof (struct task_struct, ptrace));
 	DEFINE (TASK_BLOCKED, offsetof (struct task_struct, blocked));
 	DEFINE (TASK_THREAD, offsetof (struct task_struct, thread));
-	DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, thread_info));
+	DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, stack));
 	DEFINE (TASK_MM, offsetof (struct task_struct, mm));
 	DEFINE (TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
 	DEFINE (TASK_PID, offsetof (struct task_struct, pid));
diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
index b256cfbef344..698079b3a336 100644
--- a/arch/xtensa/kernel/asm-offsets.c
+++ b/arch/xtensa/kernel/asm-offsets.c
@@ -70,7 +70,7 @@ int main(void)
 	DEFINE(TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
 	DEFINE(TASK_PID, offsetof (struct task_struct, pid));
 	DEFINE(TASK_THREAD, offsetof (struct task_struct, thread));
-	DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, thread_info));
+	DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, stack));
 	DEFINE(TASK_STRUCT_SIZE, sizeof (struct task_struct));
 	BLANK();
 
diff --git a/include/asm-alpha/thread_info.h b/include/asm-alpha/thread_info.h
index eeb3bef91e11..f4defc2bd3fb 100644
--- a/include/asm-alpha/thread_info.h
+++ b/include/asm-alpha/thread_info.h
@@ -97,7 +97,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
 				 1 << TIF_UAC_SIGBUS)
 
 #define SET_UNALIGN_CTL(task,value)	({				     \
-	(task)->thread_info->flags = (((task)->thread_info->flags &	     \
+	task_thread_info(task)->flags = ((task_thread_info(task)->flags &    \
 		~ALPHA_UAC_MASK)					     \
 		| (((value) << ALPHA_UAC_SHIFT)       & (1<<TIF_UAC_NOPRINT))\
 		| (((value) << (ALPHA_UAC_SHIFT + 1)) & (1<<TIF_UAC_SIGBUS)) \
@@ -105,11 +105,11 @@ register struct thread_info *__current_thread_info __asm__("$8");
 	0; })
 
 #define GET_UNALIGN_CTL(task,value)	({				\
-	put_user(((task)->thread_info->flags & (1 << TIF_UAC_NOPRINT))	\
+	put_user((task_thread_info(task)->flags & (1 << TIF_UAC_NOPRINT))\
 		  >> ALPHA_UAC_SHIFT					\
-		 | ((task)->thread_info->flags & (1 << TIF_UAC_SIGBUS))	\
+		 | (task_thread_info(task)->flags & (1 << TIF_UAC_SIGBUS))\
 		 >> (ALPHA_UAC_SHIFT + 1)				\
-		 | ((task)->thread_info->flags & (1 << TIF_UAC_NOFIX))	\
+		 | (task_thread_info(task)->flags & (1 << TIF_UAC_NOFIX))\
 		 >> (ALPHA_UAC_SHIFT - 1),				\
 		 (int __user *)(value));				\
 	})
diff --git a/include/asm-blackfin/processor.h b/include/asm-blackfin/processor.h
index 997465c93e82..0336ff132c16 100644
--- a/include/asm-blackfin/processor.h
+++ b/include/asm-blackfin/processor.h
@@ -58,10 +58,10 @@ do {									\
 	(_regs)->pc = (_pc);						\
 	if (current->mm)						\
 		(_regs)->p5 = current->mm->start_data;			\
-	current->thread_info->l1_task_info.stack_start			\
+	task_thread_info(current)->l1_task_info.stack_start		\
 		= (void *)current->mm->context.stack_start;		\
-	current->thread_info->l1_task_info.lowest_sp = (void *)(_usp);	       	\
-	memcpy(L1_SCRATCH_TASK_INFO, &current->thread_info->l1_task_info,	\
+	task_thread_info(current)->l1_task_info.lowest_sp = (void *)(_usp); \
+	memcpy(L1_SCRATCH_TASK_INFO, &task_thread_info(current)->l1_task_info, \
 		sizeof(*L1_SCRATCH_TASK_INFO));				\
 	wrusp(_usp);							\
 } while(0)
diff --git a/include/asm-blackfin/system.h b/include/asm-blackfin/system.h
index b5bf6e7cb5e8..5e5f1a0566c0 100644
--- a/include/asm-blackfin/system.h
+++ b/include/asm-blackfin/system.h
@@ -239,9 +239,9 @@ asmlinkage struct task_struct *resume(struct task_struct *prev, struct task_stru
 
 #define switch_to(prev,next,last) \
 do {    \
-	memcpy (&prev->thread_info->l1_task_info, L1_SCRATCH_TASK_INFO, \
+	memcpy (&task_thread_info(prev)->l1_task_info, L1_SCRATCH_TASK_INFO, \
 		sizeof *L1_SCRATCH_TASK_INFO); \
-	memcpy (L1_SCRATCH_TASK_INFO, &next->thread_info->l1_task_info, \
+	memcpy (L1_SCRATCH_TASK_INFO, &task_thread_info(next)->l1_task_info, \
 		sizeof *L1_SCRATCH_TASK_INFO); \
 	(last) = resume (prev, next);   \
 } while (0)
diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index c4d622a57dfb..d635a3752488 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -37,17 +37,17 @@ struct thread_info {
 #define init_stack		(init_thread_union.stack)
 
 #define task_thread_info(tsk)	(&(tsk)->thread.info)
-#define task_stack_page(tsk)	((void *)(tsk)->thread_info)
+#define task_stack_page(tsk)	((tsk)->stack)
 #define current_thread_info()	task_thread_info(current)
 
 #define __HAVE_THREAD_FUNCTIONS
 
 #define setup_thread_stack(p, org) ({			\
-	*(struct task_struct **)(p)->thread_info = (p);	\
+	*(struct task_struct **)(p)->stack = (p);	\
 	task_thread_info(p)->task = (p);		\
 })
 
-#define end_of_stack(p) ((unsigned long *)(p)->thread_info + 1)
+#define end_of_stack(p) ((unsigned long *)(p)->stack + 1)
 
 /* entry.S relies on these definitions!
  * bits 0-7 are tested at every exception exit
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index b7b8021e8c43..ead9f9a56234 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -39,7 +39,7 @@
 		       [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
 		       [ti_flags] "i" (offsetof(struct thread_info, flags)),\
 		       [tif_fork] "i" (TIF_FORK),			  \
-		       [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
+		       [thread_info] "i" (offsetof(struct task_struct, stack)), \
 		       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
 		     : "memory", "cc" __EXTRA_CLOBBER)
     
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 795102309bf1..45170b2fa253 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -95,7 +95,7 @@ extern struct group_info init_groups;
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
-	.thread_info	= &init_thread_info,				\
+	.stack		= &init_thread_info,				\
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 28000b1658f9..17b72d88c4cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,7 +817,7 @@ struct prio_array;
 
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
-	struct thread_info *thread_info;
+	void *stack;
 	atomic_t usage;
 	unsigned int flags;	/* per process flags, defined below */
 	unsigned int ptrace;
@@ -1513,8 +1513,8 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 
 #ifndef __HAVE_THREAD_FUNCTIONS
 
-#define task_thread_info(task) (task)->thread_info
-#define task_stack_page(task) ((void*)((task)->thread_info))
+#define task_thread_info(task)	((struct thread_info *)(task)->stack)
+#define task_stack_page(task)	((task)->stack)
 
 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
 {
@@ -1524,7 +1524,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
 
 static inline unsigned long *end_of_stack(struct task_struct *p)
 {
-	return (unsigned long *)(p->thread_info + 1);
+	return (unsigned long *)(task_thread_info(p) + 1);
 }
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992b..5dd3979747f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
-	free_thread_info(tsk->thread_info);
+	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
 }
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	}
 
 	*tsk = *orig;
-	tsk->thread_info = ti;
+	tsk->stack = ti;
 	setup_thread_stack(tsk, orig);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-- 
cgit v1.2.3


From 0d7ebbbc6eaa5539f78ab20ed6ff1725a4e332ef Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 9 May 2007 02:35:27 -0700
Subject: compiler: introduce __used and __maybe_unused

__used is defined to be __attribute__((unused)) for all pre-3.3 gcc
compilers to suppress warnings for unused functions because perhaps they
are referenced only in inline assembly.  It is defined to be
__attribute__((used)) for gcc 3.3 and later so that the code is still
emitted for such functions.

__maybe_unused is defined to be __attribute__((unused)) for both function
and variable use if it could possibly be unreferenced due to the evaluation
of preprocessor macros.  Function prototypes shall be marked with
__maybe_unused if the actual definition of the function is dependant on
preprocessor macros.

No update to compiler-intel.h is necessary because ICC supports both
__attribute__((used)) and __attribute__((unused)) as specified by the gcc
manual.

__attribute_used__ is deprecated and will be removed once all current
code is converted to using __used.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h  |  1 +
 include/linux/compiler-gcc3.h |  6 ++++--
 include/linux/compiler-gcc4.h |  3 ++-
 include/linux/compiler.h      | 21 ++++++++++++++++++---
 4 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a9f794716a81..03ec2311fb29 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -40,3 +40,4 @@
 #define  noinline			__attribute__((noinline))
 #define __attribute_pure__		__attribute__((pure))
 #define __attribute_const__		__attribute__((__const__))
+#define __maybe_unused			__attribute__((unused))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index ecd621fd27d2..a9e2863c2dbf 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -4,9 +4,11 @@
 #include <linux/compiler-gcc.h>
 
 #if __GNUC_MINOR__ >= 3
-# define __attribute_used__	__attribute__((__used__))
+# define __used			__attribute__((__used__))
+# define __attribute_used__	__used				/* deprecated */
 #else
-# define __attribute_used__	__attribute__((__unused__))
+# define __used			__attribute__((__unused__))
+# define __attribute_used__	__used				/* deprecated */
 #endif
 
 #if __GNUC_MINOR__ >= 4
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index fd0cc7c4a636..a03e9398a6c2 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -12,7 +12,8 @@
 # define __inline		__inline	__attribute__((always_inline))
 #endif
 
-#define __attribute_used__	__attribute__((__used__))
+#define __used			__attribute__((__used__))
+#define __attribute_used__	__used			/* deprecated */
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
 #define __always_inline		inline __attribute__((always_inline))
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 3b6949b41745..498c35920762 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -108,15 +108,30 @@ extern void __chk_io_ptr(const void __iomem *);
  * Allow us to avoid 'defined but not used' warnings on functions and data,
  * as well as force them to be emitted to the assembly file.
  *
- * As of gcc 3.3, static functions that are not marked with attribute((used))
- * may be elided from the assembly file.  As of gcc 3.3, static data not so
+ * As of gcc 3.4, static functions that are not marked with attribute((used))
+ * may be elided from the assembly file.  As of gcc 3.4, static data not so
  * marked will not be elided, but this may change in a future gcc version.
  *
+ * NOTE: Because distributions shipped with a backported unit-at-a-time
+ * compiler in gcc 3.3, we must define __used to be __attribute__((used))
+ * for gcc >=3.3 instead of 3.4.
+ *
  * In prior versions of gcc, such functions and data would be emitted, but
  * would be warned about except with attribute((unused)).
+ *
+ * Mark functions that are referenced only in inline assembly as __used so
+ * the code is emitted even though it appears to be unreferenced.
  */
 #ifndef __attribute_used__
-# define __attribute_used__	/* unimplemented */
+# define __attribute_used__	/* deprecated */
+#endif
+
+#ifndef __used
+# define __used			/* unimplemented */
+#endif
+
+#ifndef __maybe_unused
+# define __maybe_unused		/* unimplemented */
 #endif
 
 /*
-- 
cgit v1.2.3


From 5a87ede94595f58934000e26e8b13398e63868b5 Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Wed, 9 May 2007 02:35:32 -0700
Subject: svgalib: move fb_get_caps to svgalib

Move fb_get_caps() method to svgalib.c as svga_get_caps() so it can be used by
s3fb, arkfb and vt8623fb.

Signed-off-by: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/arkfb.c    |  1 +
 drivers/video/s3fb.c     | 19 +------------------
 drivers/video/svgalib.c  | 17 +++++++++++++++++
 drivers/video/vt8623fb.c |  1 +
 include/linux/svga.h     |  2 ++
 5 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c
index 9288dd11dcef..ba6fede5c466 100644
--- a/drivers/video/arkfb.c
+++ b/drivers/video/arkfb.c
@@ -919,6 +919,7 @@ static struct fb_ops arkfb_ops = {
 	.fb_fillrect	= arkfb_fillrect,
 	.fb_copyarea	= cfb_copyarea,
 	.fb_imageblit	= arkfb_imageblit,
+	.fb_get_caps    = svga_get_caps,
 };
 
 
diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c
index 756fafb41d78..d11735895a01 100644
--- a/drivers/video/s3fb.c
+++ b/drivers/video/s3fb.c
@@ -796,23 +796,6 @@ static int s3fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
 	return 0;
 }
 
-/* Get capabilities of accelerator based on the mode */
-
-static void s3fb_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
-			  struct fb_var_screeninfo *var)
-{
-	if (var->bits_per_pixel == 0) {
-		/* can only support 256 8x16 bitmap */
-		caps->x = 1 << (8 - 1);
-		caps->y = 1 << (16 - 1);
-		caps->len = 256;
-	} else {
-		caps->x = ~(u32)0;
-		caps->y = ~(u32)0;
-		caps->len = ~(u32)0;
-	}
-}
-
 /* ------------------------------------------------------------------------- */
 
 /* Frame buffer operations */
@@ -829,7 +812,7 @@ static struct fb_ops s3fb_ops = {
 	.fb_fillrect	= s3fb_fillrect,
 	.fb_copyarea	= cfb_copyarea,
 	.fb_imageblit	= s3fb_imageblit,
-	.fb_get_caps    = s3fb_get_caps,
+	.fb_get_caps    = svga_get_caps,
 };
 
 /* ------------------------------------------------------------------------- */
diff --git a/drivers/video/svgalib.c b/drivers/video/svgalib.c
index 079cdc911e48..25df928d37d8 100644
--- a/drivers/video/svgalib.c
+++ b/drivers/video/svgalib.c
@@ -347,6 +347,23 @@ int svga_get_tilemax(struct fb_info *info)
 	return 256;
 }
 
+/* Get capabilities of accelerator based on the mode */
+
+void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
+		   struct fb_var_screeninfo *var)
+{
+	if (var->bits_per_pixel == 0) {
+		/* can only support 256 8x16 bitmap */
+		caps->x = 1 << (8 - 1);
+		caps->y = 1 << (16 - 1);
+		caps->len = 256;
+	} else {
+		caps->x = (var->bits_per_pixel == 4) ? 1 << (8 - 1) : ~(u32)0;
+		caps->y = ~(u32)0;
+		caps->len = ~(u32)0;
+	}
+}
+EXPORT_SYMBOL(svga_get_caps);
 
 /* ------------------------------------------------------------------------- */
 
diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c
index 8883b6f36a7a..5e9755e464a1 100644
--- a/drivers/video/vt8623fb.c
+++ b/drivers/video/vt8623fb.c
@@ -635,6 +635,7 @@ static struct fb_ops vt8623fb_ops = {
 	.fb_fillrect	= vt8623fb_fillrect,
 	.fb_copyarea	= cfb_copyarea,
 	.fb_imageblit	= vt8623fb_imageblit,
+	.fb_get_caps    = svga_get_caps,
 };
 
 
diff --git a/include/linux/svga.h b/include/linux/svga.h
index e1cc552e04fe..13ad0b82ac28 100644
--- a/include/linux/svga.h
+++ b/include/linux/svga.h
@@ -113,6 +113,8 @@ void svga_tilefill(struct fb_info *info, struct fb_tilerect *rect);
 void svga_tileblit(struct fb_info *info, struct fb_tileblit *blit);
 void svga_tilecursor(struct fb_info *info, struct fb_tilecursor *cursor);
 int svga_get_tilemax(struct fb_info *info);
+void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
+		   struct fb_var_screeninfo *var);
 
 int svga_compute_pll(const struct svga_pll *pll, u32 f_wanted, u16 *m, u16 *n, u16 *r, int node);
 int svga_check_timings(const struct svga_timing_regs *tm, struct fb_var_screeninfo *var, int node);
-- 
cgit v1.2.3


From 880169dd2edc4297b7811a0542be9766ca6945bc Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Wed, 9 May 2007 02:35:33 -0700
Subject: fbdev: add support for AVR32

Provide framebuffer page protection flags and definitions of
fb_readl/fb_writel for AVR32.

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/fbmem.c | 4 ++++
 include/linux/fb.h    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 08d4e11d9121..38c2e2558f5e 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1236,6 +1236,10 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
 #elif defined(__arm__) || defined(__sh__) || defined(__m32r__)
 	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+#elif defined(__avr32__)
+	vma->vm_page_prot = __pgprot((pgprot_val(vma->vm_page_prot)
+				      & ~_PAGE_CACHABLE)
+				     | (_PAGE_BUFFER | _PAGE_DIRTY));
 #elif defined(__ia64__)
 	if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start))
 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index dff7a728948c..c654d0e9ce33 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -868,7 +868,7 @@ struct fb_info {
 #define fb_writeq sbus_writeq
 #define fb_memset sbus_memset_io
 
-#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__)
+#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__)
 
 #define fb_readb __raw_readb
 #define fb_readw __raw_readw
-- 
cgit v1.2.3


From 5b479c91da90eef605f851508744bfe8269591a0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 9 May 2007 02:35:39 -0700
Subject: md: improve partition detection in md array

md currently uses ->media_changed to make sure rescan_partitions
is call on md array after they are assembled.

However that doesn't happen until the array is opened, which is later
than some people would like.

So use blkdev_ioctl to do the rescan immediately that the
array has been assembled.

This means we can remove all the ->change infrastructure as it was only used
to trigger a partition rescan.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c           | 26 ++++++++------------------
 drivers/md/raid1.c        |  1 -
 drivers/md/raid5.c        |  2 --
 include/linux/raid/md_k.h |  1 -
 4 files changed, 8 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 65814b0340cb..2901d0c0ee9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3104,6 +3104,7 @@ static int do_md_run(mddev_t * mddev)
 	struct gendisk *disk;
 	struct mdk_personality *pers;
 	char b[BDEVNAME_SIZE];
+	struct block_device *bdev;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -3331,7 +3332,13 @@ static int do_md_run(mddev_t * mddev)
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
-	mddev->changed = 1;
+	bdev = bdget_disk(mddev->gendisk, 0);
+	if (bdev) {
+		bd_set_size(bdev, mddev->array_size << 1);
+		blkdev_ioctl(bdev->bd_inode, NULL, BLKRRPART, 0);
+		bdput(bdev);
+	}
+
 	md_new_event(mddev);
 	kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
 	return 0;
@@ -3453,7 +3460,6 @@ static int do_md_stop(mddev_t * mddev, int mode)
 			mddev->pers = NULL;
 
 			set_capacity(disk, 0);
-			mddev->changed = 1;
 
 			if (mddev->ro)
 				mddev->ro = 0;
@@ -4593,20 +4599,6 @@ static int md_release(struct inode *inode, struct file * file)
 	return 0;
 }
 
-static int md_media_changed(struct gendisk *disk)
-{
-	mddev_t *mddev = disk->private_data;
-
-	return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
-	mddev_t *mddev = disk->private_data;
-
-	mddev->changed = 0;
-	return 0;
-}
 static struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
@@ -4614,8 +4606,6 @@ static struct block_device_operations md_fops =
 	.release	= md_release,
 	.ioctl		= md_ioctl,
 	.getgeo		= md_getgeo,
-	.media_changed	= md_media_changed,
-	.revalidate_disk= md_revalidate,
 };
 
 static int md_thread(void * arg)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870b265d..1b7130cad21f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2063,7 +2063,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 */
 	mddev->array_size = sectors>>1;
 	set_capacity(mddev->gendisk, mddev->array_size << 1);
-	mddev->changed = 1;
 	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 061375ee6592..a72e70ad0975 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3864,7 +3864,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
 	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
 	set_capacity(mddev->gendisk, mddev->array_size << 1);
-	mddev->changed = 1;
 	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3999,7 +3998,6 @@ static void end_reshape(raid5_conf_t *conf)
 		conf->mddev->array_size = conf->mddev->size *
 			(conf->raid_disks - conf->max_degraded);
 		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
-		conf->mddev->changed = 1;
 
 		bdev = bdget_disk(conf->mddev->gendisk, 0);
 		if (bdev) {
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index de72c49747c8..a121f36f4437 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -201,7 +201,6 @@ struct mddev_s
 	struct mutex			reconfig_mutex;
 	atomic_t			active;
 
-	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-- 
cgit v1.2.3


From 18137207236285989dfc0ee7f929b954199228f3 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:07 +0200
Subject: ide: fix UDMA/MWDMA/SWDMA masks (v3)

* use 0x00 instead of 0x80 to disable ->{ultra,mwdma,swdma}_mask
* add udma_mask field to ide_pci_device_t and use it to initialize
  ->ultra_mask in aec62xx, cmd64x, pdc202xx_{new,old} and piix drivers
* fix UDMA masks to match with chipset specific *_ratemask()
  (alim15x3, hpt366, serverworks and siimage drivers need UDMA mask
   filtering method - done in the next patch)

v2:
* piix: fix cable detection for 82801AA_1 and 82372FB_1
  [ Noticed by Sergei Shtylyov <sshtylyov@ru.mvista.com>. ]
* cmd64x: use hwif->cds->udma_mask
  [ Suggested by Sergei Shtylyov <sshtylyov@ru.mvista.com>. ]
* aec62xx: fix newly introduced bug - check DMA status not command register
  [ Noticed by Sergei Shtylyov <sshtylyov@ru.mvista.com>. ]

v3:
* piix: use hwif->cds->udma_mask
  [ Suggested by Sergei Shtylyov <sshtylyov@ru.mvista.com>. ]

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide.c              |  3 --
 drivers/ide/pci/aec62xx.c      | 19 +++++++++--
 drivers/ide/pci/alim15x3.c     | 13 ++++++--
 drivers/ide/pci/cmd64x.c       | 17 +++++-----
 drivers/ide/pci/pdc202xx_new.c | 10 +++++-
 drivers/ide/pci/pdc202xx_old.c |  7 +++-
 drivers/ide/pci/piix.c         | 73 ++++++++++++++++++------------------------
 drivers/ide/pci/sis5513.c      |  5 ++-
 include/linux/ide.h            |  1 +
 9 files changed, 88 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index ae5bf2be6f52..c06424fe647b 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -216,9 +216,6 @@ static void init_hwif_data(ide_hwif_t *hwif, unsigned int index)
 	hwif->bus_state	= BUSSTATE_ON;
 
 	hwif->atapi_dma = 0;		/* disable all atapi dma */ 
-	hwif->ultra_mask = 0x80;	/* disable all ultra */
-	hwif->mwdma_mask = 0x80;	/* disable all mwdma */
-	hwif->swdma_mask = 0x80;	/* disable all swdma */
 
 	init_completion(&hwif->gendev_rel_comp);
 
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index 73bdf64dbbfc..abe0b1bb55ff 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -261,11 +261,13 @@ static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const ch
 
 static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif)
 {
+	struct pci_dev *dev = hwif->pci_dev;
+
 	hwif->autodma = 0;
 	hwif->tuneproc = &aec62xx_tune_drive;
 	hwif->speedproc = &aec62xx_tune_chipset;
 
-	if (hwif->pci_dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF)
+	if (dev->device == PCI_DEVICE_ID_ARTOP_ATP850UF)
 		hwif->serialized = hwif->channel;
 
 	if (hwif->mate)
@@ -277,7 +279,15 @@ static void __devinit init_hwif_aec62xx(ide_hwif_t *hwif)
 		return;
 	}
 
-	hwif->ultra_mask = 0x7f;
+	hwif->ultra_mask = hwif->cds->udma_mask;
+
+	/* atp865 and atp865r */
+	if (hwif->ultra_mask == 0x3f) {
+		/* check bit 0x10 of DMA status register */
+		if (inb(pci_resource_start(dev, 4) + 2) & 0x10)
+ 			hwif->ultra_mask = 0x7f; /* udma0-6 */
+	}
+
 	hwif->mwdma_mask = 0x07;
 
 	hwif->ide_dma_check	= &aec62xx_config_drive_xfer_rate;
@@ -344,6 +354,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 1 */
 		.name		= "AEC6260",
 		.init_setup	= init_setup_aec62xx,
@@ -353,6 +364,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 2 */
 		.name		= "AEC6260R",
 		.init_setup	= init_setup_aec62xx,
@@ -363,6 +375,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= NEVER_BOARD,
+		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "AEC6X80",
 		.init_setup	= init_setup_aec6x80,
@@ -372,6 +385,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 4 */
 		.name		= "AEC6X80R",
 		.init_setup	= init_setup_aec6x80,
@@ -382,6 +396,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
 
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 946a12746cb5..26c27d946c67 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -783,8 +783,17 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif)
 
 	hwif->atapi_dma = 1;
 
-	if (m5229_revision > 0x20)
-		hwif->ultra_mask = 0x7f;
+	if (m5229_revision <= 0x20)
+		hwif->ultra_mask = 0x00; /* no udma */
+	else if (m5229_revision < 0xC2)
+		hwif->ultra_mask = 0x07; /* udma0-2 */
+	else if (m5229_revision == 0xC2 || m5229_revision == 0xC3)
+		hwif->ultra_mask = 0x1f; /* udma0-4 */
+	else if (m5229_revision == 0xC4)
+		hwif->ultra_mask = 0x3f; /* udma0-5 */
+	else
+		hwif->ultra_mask = 0x7f; /* udma0-6 */
+
 	hwif->mwdma_mask = 0x07;
 	hwif->swdma_mask = 0x07;
 
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 77f51ab6d439..610c45f7b4e2 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -644,15 +644,12 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
 
 	hwif->atapi_dma = 1;
 
-	hwif->ultra_mask = 0x3f;
-	hwif->mwdma_mask = 0x07;
+	hwif->ultra_mask = hwif->cds->udma_mask;
+
+	if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5)
+		hwif->ultra_mask = 0x00;
 
-	if (dev->device == PCI_DEVICE_ID_CMD_643)
-		hwif->ultra_mask = 0x80;
-	if (dev->device == PCI_DEVICE_ID_CMD_646)
-		hwif->ultra_mask = (class_rev > 0x04) ? 0x07 : 0x80;
-	if (dev->device == PCI_DEVICE_ID_CMD_648)
-		hwif->ultra_mask = 0x1f;
+	hwif->mwdma_mask = 0x07;
 
 	hwif->ide_dma_check = &cmd64x_config_drive_for_dma;
 	if (!(hwif->udma_four))
@@ -716,6 +713,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x00,0x00,0x00}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.udma_mask	= 0x00, /* no udma */
 	},{	/* 1 */
 		.name		= "CMD646",
 		.init_setup	= init_setup_cmd646,
@@ -725,6 +723,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 2 */
 		.name		= "CMD648",
 		.init_setup	= init_setup_cmd64x,
@@ -734,6 +733,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "CMD649",
 		.init_setup	= init_setup_cmd64x,
@@ -743,6 +743,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
 
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index 2cdd629c653d..a23853445af9 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -543,7 +543,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif)
 	hwif->drives[0].autotune = hwif->drives[1].autotune = 1;
 
 	hwif->atapi_dma  = 1;
-	hwif->ultra_mask = 0x7f;
+
+	hwif->ultra_mask = hwif->cds->udma_mask;
 	hwif->mwdma_mask = 0x07;
 
 	hwif->err_stops_fifo = 1;
@@ -619,6 +620,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 1 */
 		.name		= "PDC20269",
 		.init_setup	= init_setup_pdcnew,
@@ -627,6 +629,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 2 */
 		.name		= "PDC20270",
 		.init_setup	= init_setup_pdc20270,
@@ -635,6 +638,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 3 */
 		.name		= "PDC20271",
 		.init_setup	= init_setup_pdcnew,
@@ -643,6 +647,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 4 */
 		.name		= "PDC20275",
 		.init_setup	= init_setup_pdcnew,
@@ -651,6 +656,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 5 */
 		.name		= "PDC20276",
 		.init_setup	= init_setup_pdc20276,
@@ -659,6 +665,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 6 */
 		.name		= "PDC20277",
 		.init_setup	= init_setup_pdcnew,
@@ -667,6 +674,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.udma_mask	= 0x7f, /* udma0-6*/
 	}
 };
 
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index a7a639fe1eaf..d7a38067fb34 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -478,7 +478,7 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif)
 
 	hwif->drives[0].autotune = hwif->drives[1].autotune = 1;
 
-	hwif->ultra_mask = 0x3f;
+	hwif->ultra_mask = hwif->cds->udma_mask;
 	hwif->mwdma_mask = 0x07;
 	hwif->swdma_mask = 0x07;
 	hwif->atapi_dma = 1;
@@ -587,6 +587,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 16,
+		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 1 */
 		.name		= "PDC20262",
 		.init_setup	= init_setup_pdc202ata4,
@@ -597,6 +598,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 2 */
 		.name		= "PDC20263",
 		.init_setup	= init_setup_pdc202ata4,
@@ -607,6 +609,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "PDC20265",
 		.init_setup	= init_setup_pdc20265,
@@ -617,6 +620,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 4 */
 		.name		= "PDC20267",
 		.init_setup	= init_setup_pdc202xx,
@@ -627,6 +631,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
 
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 061d300ab8be..17ea44edb4eb 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -524,26 +524,14 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
 		hwif->ide_dma_clear_irq = &piix_dma_clear_irq;
 
 	hwif->atapi_dma = 1;
-	hwif->ultra_mask = 0x3f;
+
+	hwif->ultra_mask = hwif->cds->udma_mask;
 	hwif->mwdma_mask = 0x06;
 	hwif->swdma_mask = 0x04;
 
-	switch(hwif->pci_dev->device) {
-		case PCI_DEVICE_ID_INTEL_82371FB_0:
-		case PCI_DEVICE_ID_INTEL_82371FB_1:
-		case PCI_DEVICE_ID_INTEL_82371SB_1:
-			hwif->ultra_mask = 0x80;
-			break;
-		case PCI_DEVICE_ID_INTEL_82371AB:
-		case PCI_DEVICE_ID_INTEL_82443MX_1:
-		case PCI_DEVICE_ID_INTEL_82451NX:
-		case PCI_DEVICE_ID_INTEL_82801AB_1:
-			hwif->ultra_mask = 0x07;
-			break;
-		default:
-			if (!hwif->udma_four)
-				hwif->udma_four = piix_cable_detect(hwif);
-			break;
+	if (hwif->ultra_mask & 0x78) {
+		if (!hwif->udma_four)
+			hwif->udma_four = piix_cable_detect(hwif);
 	}
 
 	if (no_piix_dma)
@@ -557,7 +545,7 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
 	hwif->drives[0].autodma = hwif->autodma;
 }
 
-#define DECLARE_PIIX_DEV(name_str) \
+#define DECLARE_PIIX_DEV(name_str, udma) \
 	{						\
 		.name		= name_str,		\
 		.init_chipset	= init_chipset_piix,	\
@@ -566,11 +554,12 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
 		.autodma	= AUTODMA,		\
 		.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \
 		.bootable	= ON_BOARD,		\
+		.udma_mask	= udma,			\
 	}
 
 static ide_pci_device_t piix_pci_info[] __devinitdata = {
-	/*  0 */ DECLARE_PIIX_DEV("PIIXa"),
-	/*  1 */ DECLARE_PIIX_DEV("PIIXb"),
+	/*  0 */ DECLARE_PIIX_DEV("PIIXa", 0x00),	/* no udma */
+	/*  1 */ DECLARE_PIIX_DEV("PIIXb", 0x00),	/* no udma */
 
 	/*  2 */
 	{	/*
@@ -587,28 +576,28 @@ static ide_pci_device_t piix_pci_info[] __devinitdata = {
 		.flags		= IDEPCI_FLAG_ISA_PORTS
 	},
 
-	/*  3 */ DECLARE_PIIX_DEV("PIIX3"),
-	/*  4 */ DECLARE_PIIX_DEV("PIIX4"),
-	/*  5 */ DECLARE_PIIX_DEV("ICH0"),
-	/*  6 */ DECLARE_PIIX_DEV("PIIX4"),
-	/*  7 */ DECLARE_PIIX_DEV("ICH"),
-	/*  8 */ DECLARE_PIIX_DEV("PIIX4"),
-	/*  9 */ DECLARE_PIIX_DEV("PIIX4"),
-	/* 10 */ DECLARE_PIIX_DEV("ICH2"),
-	/* 11 */ DECLARE_PIIX_DEV("ICH2M"),
-	/* 12 */ DECLARE_PIIX_DEV("ICH3M"),
-	/* 13 */ DECLARE_PIIX_DEV("ICH3"),
-	/* 14 */ DECLARE_PIIX_DEV("ICH4"),
-	/* 15 */ DECLARE_PIIX_DEV("ICH5"),
-	/* 16 */ DECLARE_PIIX_DEV("C-ICH"),
-	/* 17 */ DECLARE_PIIX_DEV("ICH4"),
-	/* 18 */ DECLARE_PIIX_DEV("ICH5-SATA"),
-	/* 19 */ DECLARE_PIIX_DEV("ICH5"),
-	/* 20 */ DECLARE_PIIX_DEV("ICH6"),
-	/* 21 */ DECLARE_PIIX_DEV("ICH7"),
-	/* 22 */ DECLARE_PIIX_DEV("ICH4"),
-	/* 23 */ DECLARE_PIIX_DEV("ESB2"),
-	/* 24 */ DECLARE_PIIX_DEV("ICH8M"),
+	/*  3 */ DECLARE_PIIX_DEV("PIIX3", 0x00),	/* no udma */
+	/*  4 */ DECLARE_PIIX_DEV("PIIX4", 0x07),	/* udma0-2 */
+	/*  5 */ DECLARE_PIIX_DEV("ICH0",  0x07),	/* udma0-2 */
+	/*  6 */ DECLARE_PIIX_DEV("PIIX4", 0x07),	/* udma0-2 */
+	/*  7 */ DECLARE_PIIX_DEV("ICH",   0x1f),	/* udma0-4 */
+	/*  8 */ DECLARE_PIIX_DEV("PIIX4", 0x1f),	/* udma0-4 */
+	/*  9 */ DECLARE_PIIX_DEV("PIIX4", 0x07),	/* udma0-2 */
+	/* 10 */ DECLARE_PIIX_DEV("ICH2",  0x3f),	/* udma0-5 */
+	/* 11 */ DECLARE_PIIX_DEV("ICH2M", 0x3f),	/* udma0-5 */
+	/* 12 */ DECLARE_PIIX_DEV("ICH3M", 0x3f),	/* udma0-5 */
+	/* 13 */ DECLARE_PIIX_DEV("ICH3",  0x3f),	/* udma0-5 */
+	/* 14 */ DECLARE_PIIX_DEV("ICH4",  0x3f),	/* udma0-5 */
+	/* 15 */ DECLARE_PIIX_DEV("ICH5",  0x3f),	/* udma0-5 */
+	/* 16 */ DECLARE_PIIX_DEV("C-ICH", 0x3f),	/* udma0-5 */
+	/* 17 */ DECLARE_PIIX_DEV("ICH4",  0x3f),	/* udma0-5 */
+	/* 18 */ DECLARE_PIIX_DEV("ICH5-SATA", 0x3f),	/* udma0-5 */
+	/* 19 */ DECLARE_PIIX_DEV("ICH5",  0x3f),	/* udma0-5 */
+	/* 20 */ DECLARE_PIIX_DEV("ICH6",  0x3f),	/* udma0-5 */
+	/* 21 */ DECLARE_PIIX_DEV("ICH7",  0x3f),	/* udma0-5 */
+	/* 22 */ DECLARE_PIIX_DEV("ICH4",  0x3f),	/* udma0-5 */
+	/* 23 */ DECLARE_PIIX_DEV("ESB2",  0x3f),	/* udma0-5 */
+	/* 24 */ DECLARE_PIIX_DEV("ICH8M", 0x3f),	/* udma0-5 */
 };
 
 /**
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 2ba0669f36a1..6fe4ecb6c06c 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -858,6 +858,8 @@ static unsigned int __devinit ata66_sis5513 (ide_hwif_t *hwif)
 
 static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
 {
+	u8 udma_rates[] = { 0x00, 0x00, 0x07, 0x1f, 0x3f, 0x3f, 0x7f, 0x7f };
+
 	hwif->autodma = 0;
 
 	if (!hwif->irq)
@@ -873,7 +875,8 @@ static void __devinit init_hwif_sis5513 (ide_hwif_t *hwif)
 	}
 
 	hwif->atapi_dma = 1;
-	hwif->ultra_mask = 0x7f;
+
+	hwif->ultra_mask = udma_rates[chipset_family];
 	hwif->mwdma_mask = 0x07;
 	hwif->swdma_mask = 0x07;
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 418dfb5adadd..c9375c863584 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1257,6 +1257,7 @@ typedef struct ide_pci_device_s {
 	unsigned int		extra;
 	struct ide_pci_device_s	*next;
 	u8			flags;
+	u8			udma_mask;
 } ide_pci_device_t;
 
 extern int ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
-- 
cgit v1.2.3


From 2d5eaa6dd744a641e75503232a01f52d0768884c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:08 +0200
Subject: ide: rework the code for selecting the best DMA transfer mode (v3)

Depends on the "ide: fix UDMA/MWDMA/SWDMA masks" patch.

* add ide_hwif_t.udma_filter hook for filtering UDMA mask
  (use it in alim15x3, hpt366, siimage and serverworks drivers)
* add ide_max_dma_mode() for finding best DMA mode for the device
  (loosely based on some older libata-core.c code)
* convert ide_dma_speed() users to use ide_max_dma_mode()
* make ide_rate_filter() take "ide_drive_t *drive" as an argument instead
  of "u8 mode" and teach it to how to use UDMA mask to do filtering
* use ide_rate_filter() in hpt366 driver
* remove no longer needed ide_dma_speed() and *_ratemask()
* unexport eighty_ninty_three()

v2:
* rename ->filter_udma_mask to ->udma_filter
  [ Suggested by Sergei Shtylyov <sshtylyov@ru.mvista.com>. ]

v3:
* updated for scc_pata driver (fixes XFER_UDMA_6 filtering for user-space
  originated transfer mode change requests when 100MHz clock is used)

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/arm/icside.c       |   2 +-
 drivers/ide/cris/ide-cris.c    |   2 +-
 drivers/ide/ide-dma.c          |  74 ++++++++++++++++++++++++
 drivers/ide/ide-iops.c         |   2 -
 drivers/ide/ide-lib.c          | 125 ++++++-----------------------------------
 drivers/ide/ide.c              |   1 +
 drivers/ide/pci/aec62xx.c      |  32 +----------
 drivers/ide/pci/alim15x3.c     |  76 ++++++-------------------
 drivers/ide/pci/atiixp.c       |  20 +------
 drivers/ide/pci/cmd64x.c       |  65 +++++----------------
 drivers/ide/pci/cs5535.c       |  20 +------
 drivers/ide/pci/hpt34x.c       |   9 +--
 drivers/ide/pci/hpt366.c       |  67 +++++++++++-----------
 drivers/ide/pci/it8213.c       |  20 +------
 drivers/ide/pci/it821x.c       |  20 +------
 drivers/ide/pci/jmicron.c      |  21 +------
 drivers/ide/pci/pdc202xx_new.c |  14 +----
 drivers/ide/pci/pdc202xx_old.c |  27 +--------
 drivers/ide/pci/piix.c         |  66 +---------------------
 drivers/ide/pci/scc_pata.c     |  21 +------
 drivers/ide/pci/serverworks.c  |  31 ++++++----
 drivers/ide/pci/siimage.c      |  45 +++++++--------
 drivers/ide/pci/sis5513.c      |  14 +----
 drivers/ide/pci/slc90e66.c     |  13 +----
 drivers/ide/pci/tc86c001.c     |   9 +--
 drivers/ide/pci/triflex.c      |   4 +-
 include/linux/ide.h            |  10 ++--
 27 files changed, 236 insertions(+), 574 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index e2953fc1fafb..f383ace2a087 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -342,7 +342,7 @@ static int icside_dma_check(ide_drive_t *drive)
 	 * Enable DMA on any drive that has multiword DMA
 	 */
 	if (id->field_valid & 2) {
-		xfer_mode = ide_dma_speed(drive, 0);
+		xfer_mode = ide_max_dma_mode(drive);
 		goto out;
 	}
 
diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index 5e8efc89255a..9691d089fce8 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -1004,7 +1004,7 @@ static int cris_ide_build_dmatable (ide_drive_t *drive)
 
 static int cris_config_drive_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, 1);
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index fd213088b06b..f28fabb791fe 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -705,6 +705,80 @@ int ide_use_dma(ide_drive_t *drive)
 
 EXPORT_SYMBOL_GPL(ide_use_dma);
 
+static const u8 xfer_mode_bases[] = {
+	XFER_UDMA_0,
+	XFER_MW_DMA_0,
+	XFER_SW_DMA_0,
+};
+
+static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base)
+{
+	struct hd_driveid *id = drive->id;
+	ide_hwif_t *hwif = drive->hwif;
+	unsigned int mask = 0;
+
+	switch(base) {
+	case XFER_UDMA_0:
+		if ((id->field_valid & 4) == 0)
+			break;
+
+		mask = id->dma_ultra & hwif->ultra_mask;
+
+		if (hwif->udma_filter)
+			mask &= hwif->udma_filter(drive);
+
+		if ((mask & 0x78) && (eighty_ninty_three(drive) == 0))
+			mask &= 0x07;
+		break;
+	case XFER_MW_DMA_0:
+		mask = id->dma_mword & hwif->mwdma_mask;
+		break;
+	case XFER_SW_DMA_0:
+		mask = id->dma_1word & hwif->swdma_mask;
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	return mask;
+}
+
+/**
+ *	ide_max_dma_mode	-	compute DMA speed
+ *	@drive: IDE device
+ *
+ *	Checks the drive capabilities and returns the speed to use
+ *	for the DMA transfer.  Returns 0 if the drive is incapable
+ *	of DMA transfers.
+ */
+
+u8 ide_max_dma_mode(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	unsigned int mask;
+	int x, i;
+	u8 mode = 0;
+
+	if (drive->media != ide_disk && hwif->atapi_dma == 0)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(xfer_mode_bases); i++) {
+		mask = ide_get_mode_mask(drive, xfer_mode_bases[i]);
+		x = fls(mask) - 1;
+		if (x >= 0) {
+			mode = xfer_mode_bases[i] + x;
+			break;
+		}
+	}
+
+	printk(KERN_DEBUG "%s: selected mode 0x%x\n", drive->name, mode);
+
+	return mode;
+}
+
+EXPORT_SYMBOL_GPL(ide_max_dma_mode);
+
 void ide_dma_verbose(ide_drive_t *drive)
 {
 	struct hd_driveid *id	= drive->id;
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 3caa176b3155..ed6128f6cd98 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -592,8 +592,6 @@ u8 eighty_ninty_three (ide_drive_t *drive)
 	return 1;
 }
 
-EXPORT_SYMBOL(eighty_ninty_three);
-
 int ide_ata66_check (ide_drive_t *drive, ide_task_t *args)
 {
 	if ((args->tfRegister[IDE_COMMAND_OFFSET] == WIN_SETFEATURES) &&
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 68719314df3f..4557fc5a3ea3 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -69,123 +69,34 @@ char *ide_xfer_verbose (u8 xfer_rate)
 EXPORT_SYMBOL(ide_xfer_verbose);
 
 /**
- *	ide_dma_speed	-	compute DMA speed
- *	@drive: drive
- *	@mode:	modes available
- *
- *	Checks the drive capabilities and returns the speed to use
- *	for the DMA transfer.  Returns 0 if the drive is incapable
- *	of DMA transfers.
- */
- 
-u8 ide_dma_speed(ide_drive_t *drive, u8 mode)
-{
-	struct hd_driveid *id   = drive->id;
-	ide_hwif_t *hwif	= HWIF(drive);
-	u8 ultra_mask, mwdma_mask, swdma_mask;
-	u8 speed = 0;
-
-	if (drive->media != ide_disk && hwif->atapi_dma == 0)
-		return 0;
-
-	/* Capable of UltraDMA modes? */
-	ultra_mask = id->dma_ultra & hwif->ultra_mask;
-
-	if (!(id->field_valid & 4))
-		mode = 0;	/* fallback to MW/SW DMA if no UltraDMA */
-
-	switch (mode) {
-	case 4:
-		if (ultra_mask & 0x40) {
-			speed = XFER_UDMA_6;
-			break;
-		}
-	case 3:
-		if (ultra_mask & 0x20) {
-			speed = XFER_UDMA_5;
-			break;
-		}
-	case 2:
-		if (ultra_mask & 0x10) {
-			speed = XFER_UDMA_4;
-			break;
-		}
-		if (ultra_mask & 0x08) {
-			speed = XFER_UDMA_3;
-			break;
-		}
-	case 1:
-		if (ultra_mask & 0x04) {
-			speed = XFER_UDMA_2;
-			break;
-		}
-		if (ultra_mask & 0x02) {
-			speed = XFER_UDMA_1;
-			break;
-		}
-		if (ultra_mask & 0x01) {
-			speed = XFER_UDMA_0;
-			break;
-		}
-	case 0:
-		mwdma_mask = id->dma_mword & hwif->mwdma_mask;
-
-		if (mwdma_mask & 0x04) {
-			speed = XFER_MW_DMA_2;
-			break;
-		}
-		if (mwdma_mask & 0x02) {
-			speed = XFER_MW_DMA_1;
-			break;
-		}
-		if (mwdma_mask & 0x01) {
-			speed = XFER_MW_DMA_0;
-			break;
-		}
-
-		swdma_mask = id->dma_1word & hwif->swdma_mask;
-
-		if (swdma_mask & 0x04) {
-			speed = XFER_SW_DMA_2;
-			break;
-		}
-		if (swdma_mask & 0x02) {
-			speed = XFER_SW_DMA_1;
-			break;
-		}
-		if (swdma_mask & 0x01) {
-			speed = XFER_SW_DMA_0;
-			break;
-		}
-	}
-
-	return speed;
-}
-EXPORT_SYMBOL(ide_dma_speed);
-
-
-/**
- *	ide_rate_filter		-	return best speed for mode
- *	@mode: modes available
+ *	ide_rate_filter		-	filter transfer mode
+ *	@drive: IDE device
  *	@speed: desired speed
  *
- *	Given the available DMA/UDMA mode this function returns
+ *	Given the available transfer modes this function returns
  *	the best available speed at or below the speed requested.
+ *
+ *	FIXME: filter also PIO/SWDMA/MWDMA modes
  */
 
-u8 ide_rate_filter (u8 mode, u8 speed) 
+u8 ide_rate_filter(ide_drive_t *drive, u8 speed)
 {
 #ifdef CONFIG_BLK_DEV_IDEDMA
-	static u8 speed_max[] = {
-		XFER_MW_DMA_2, XFER_UDMA_2, XFER_UDMA_4,
-		XFER_UDMA_5, XFER_UDMA_6
-	};
+	ide_hwif_t *hwif = drive->hwif;
+	u8 mask = hwif->ultra_mask, mode = XFER_MW_DMA_2;
+
+	if (hwif->udma_filter)
+		mask = hwif->udma_filter(drive);
+
+	if ((mask & 0x78) && (eighty_ninty_three(drive) == 0))
+		mask &= 0x07;
+
+	if (mask)
+		mode = fls(mask) - 1 + XFER_UDMA_0;
 
 //	printk("%s: mode 0x%02x, speed 0x%02x\n", __FUNCTION__, mode, speed);
 
-	/* So that we remember to update this if new modes appear */
-	BUG_ON(mode > 4);
-	return min(speed, speed_max[mode]);
+	return min(speed, mode);
 #else /* !CONFIG_BLK_DEV_IDEDMA */
 	return min(speed, (u8)XFER_PIO_4);
 #endif /* CONFIG_BLK_DEV_IDEDMA */
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index c06424fe647b..73f752142f9e 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -477,6 +477,7 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
 
 	hwif->tuneproc			= tmp_hwif->tuneproc;
 	hwif->speedproc			= tmp_hwif->speedproc;
+	hwif->udma_filter		= tmp_hwif->udma_filter;
 	hwif->selectproc		= tmp_hwif->selectproc;
 	hwif->reset_poll		= tmp_hwif->reset_poll;
 	hwif->pre_reset			= tmp_hwif->pre_reset;
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index abe0b1bb55ff..099539e8c7a3 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -87,38 +87,12 @@ static u8 pci_bus_clock_list_ultra (u8 speed, struct chipset_bus_clock_list_entr
 	return chipset_table->ultra_settings;
 }
 
-static u8 aec62xx_ratemask (ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= HWIF(drive);
-	u8 mode;
-
-	switch(hwif->pci_dev->device) {
-		case PCI_DEVICE_ID_ARTOP_ATP865:
-		case PCI_DEVICE_ID_ARTOP_ATP865R:
-			mode = (inb(hwif->channel ?
-				    hwif->mate->dma_status :
-				    hwif->dma_status) & 0x10) ? 4 : 3;
-			break;
-		case PCI_DEVICE_ID_ARTOP_ATP860:
-		case PCI_DEVICE_ID_ARTOP_ATP860R:
-			mode = 2;
-			break;
-		case PCI_DEVICE_ID_ARTOP_ATP850UF:
-		default:
-			return 1;
-	}
-
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 static int aec6210_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
 	u16 d_conf		= 0;
-	u8 speed	= ide_rate_filter(aec62xx_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	u8 ultra = 0, ultra_conf = 0;
 	u8 tmp0 = 0, tmp1 = 0, tmp2 = 0;
 	unsigned long flags;
@@ -145,7 +119,7 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
-	u8 speed	= ide_rate_filter(aec62xx_ratemask(drive), xferspeed);
+	u8 speed	= ide_rate_filter(drive, xferspeed);
 	u8 unit		= (drive->select.b.unit & 0x01);
 	u8 tmp1 = 0, tmp2 = 0;
 	u8 ultra = 0, drive_conf = 0, ultra_conf = 0;
@@ -183,7 +157,7 @@ static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, aec62xx_ratemask(drive));	
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!(speed))
 		return 0;
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 26c27d946c67..76643b626bdd 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -378,74 +378,31 @@ static void ali15x3_tune_drive (ide_drive_t *drive, u8 pio)
 }
 
 /**
- *	ali15x3_can_ultra	-	check for ultra DMA support
- *	@drive: drive to do the check
+ *	ali_udma_filter		-	compute UDMA mask
+ *	@drive: IDE device
  *
- *	Check the drive and controller revisions. Return 0 if UDMA is
- *	not available, or 1 if UDMA can be used. The actual rules for
- *	the ALi are
+ *	Return available UDMA modes.
+ *
+ *	The actual rules for the ALi are:
  *		No UDMA on revisions <= 0x20
  *		Disk only for revisions < 0xC2
  *		Not WDC drives for revisions < 0xC2
  *
  *	FIXME: WDC ifdef needs to die
  */
- 
-static u8 ali15x3_can_ultra (ide_drive_t *drive)
-{
-#ifndef CONFIG_WDC_ALI15X3
-	struct hd_driveid *id	= drive->id;
-#endif /* CONFIG_WDC_ALI15X3 */
-
-	if (m5229_revision <= 0x20) {
-		return 0;
-	} else if ((m5229_revision < 0xC2) &&
-#ifndef CONFIG_WDC_ALI15X3
-		   ((chip_is_1543c_e && strstr(id->model, "WDC ")) ||
-		    (drive->media!=ide_disk))) {
-#else /* CONFIG_WDC_ALI15X3 */
-		   (drive->media!=ide_disk)) {
-#endif /* CONFIG_WDC_ALI15X3 */
-		return 0;
-	} else {
-		return 1;
-	}
-}
 
-/**
- *	ali15x3_ratemask	-	generate DMA mode list
- *	@drive: drive to compute against
- *
- *	Generate a list of the available DMA modes for the drive. 
- *	FIXME: this function contains lots of bogus masking we can dump
- *
- *	Return the highest available mode (UDMA33, UDMA66, UDMA100,..)
- */
- 
-static u8 ali15x3_ratemask (ide_drive_t *drive)
+static u8 ali_udma_filter(ide_drive_t *drive)
 {
-	u8 mode = 0, can_ultra	= ali15x3_can_ultra(drive);
-
-	if (m5229_revision > 0xC4 && can_ultra) {
-		mode = 4;
-	} else if (m5229_revision == 0xC4 && can_ultra) {
-		mode = 3;
-	} else if (m5229_revision >= 0xC2 && can_ultra) {
-		mode = 2;
-	} else if (can_ultra) {
-		return 1;
-	} else {
-		return 0;
+	if (m5229_revision > 0x20 && m5229_revision < 0xC2) {
+		if (drive->media != ide_disk)
+			return 0;
+#ifndef CONFIG_WDC_ALI15X3
+		if (chip_is_1543c_e && strstr(drive->id->model, "WDC "))
+			return 0;
+#endif
 	}
 
-	/*
-	 *	If the drive sees no suitable cable then UDMA 33
-	 *	is the highest permitted mode
-	 */
-	 
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
+	return drive->hwif->ultra_mask;
 }
 
 /**
@@ -461,7 +418,7 @@ static int ali15x3_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
-	u8 speed		= ide_rate_filter(ali15x3_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	u8 speed1		= speed;
 	u8 unit			= (drive->select.b.unit & 0x01);
 	u8 tmpbyte		= 0x00;
@@ -511,7 +468,7 @@ static int ali15x3_tune_chipset (ide_drive_t *drive, u8 xferspeed)
  
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, ali15x3_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!(speed))
 		return 0;
@@ -771,6 +728,7 @@ static void __devinit init_hwif_common_ali15x3 (ide_hwif_t *hwif)
 	hwif->autodma = 0;
 	hwif->tuneproc = &ali15x3_tune_drive;
 	hwif->speedproc = &ali15x3_tune_chipset;
+	hwif->udma_filter = &ali_udma_filter;
 
 	/* don't use LBA48 DMA on ALi devices before rev 0xC5 */
 	hwif->no_lba48_dma = (m5229_revision <= 0xC4) ? 1 : 0;
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index 2d48af32e3f4..f7e80d6076d7 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -48,22 +48,6 @@ static int save_mdma_mode[4];
 
 static DEFINE_SPINLOCK(atiixp_lock);
 
-/**
- *	atiixp_ratemask		-	compute rate mask for ATIIXP IDE
- *	@drive: IDE drive to compute for
- *
- *	Returns the available modes for the ATIIXP IDE controller.
- */
-
-static u8 atiixp_ratemask(ide_drive_t *drive)
-{
-	u8 mode = 3;
-
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /**
  *	atiixp_dma_2_pio		-	return the PIO mode matching DMA
  *	@xfer_rate: transfer speed
@@ -189,7 +173,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed)
 	u16 tmp16;
 	u8 speed, pio;
 
-	speed = ide_rate_filter(atiixp_ratemask(drive), xferspeed);
+	speed = ide_rate_filter(drive, xferspeed);
 
 	spin_lock_irqsave(&atiixp_lock, flags);
 
@@ -233,7 +217,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed)
 
 static int atiixp_config_drive_for_dma(ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, atiixp_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 610c45f7b4e2..19f5ac1f866c 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -292,55 +292,6 @@ static void cmd64x_tune_drive (ide_drive_t *drive, u8 pio)
 	(void) ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
 
-static u8 cmd64x_ratemask (ide_drive_t *drive)
-{
-	struct pci_dev *dev	= HWIF(drive)->pci_dev;
-	u8 mode = 0;
-
-	switch(dev->device) {
-		case PCI_DEVICE_ID_CMD_649:
-			mode = 3;
-			break;
-		case PCI_DEVICE_ID_CMD_648:
-			mode = 2;
-			break;
-		case PCI_DEVICE_ID_CMD_643:
-			return 0;
-
-		case PCI_DEVICE_ID_CMD_646:
-		{
-			unsigned int class_rev	= 0;
-			pci_read_config_dword(dev,
-				PCI_CLASS_REVISION, &class_rev);
-			class_rev &= 0xff;
-		/*
-		 * UltraDMA only supported on PCI646U and PCI646U2, which
-		 * correspond to revisions 0x03, 0x05 and 0x07 respectively.
-		 * Actually, although the CMD tech support people won't
-		 * tell me the details, the 0x03 revision cannot support
-		 * UDMA correctly without hardware modifications, and even
-		 * then it only works with Quantum disks due to some
-		 * hold time assumptions in the 646U part which are fixed
-		 * in the 646U2.
-		 *
-		 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets.
-		 */
-			switch(class_rev) {
-				case 0x07:
-				case 0x05:
-					return 1;
-				case 0x03:
-				case 0x01:
-				default:
-					return 0;
-			}
-		}
-	}
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 static int cmd64x_tune_chipset (ide_drive_t *drive, u8 speed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
@@ -348,7 +299,7 @@ static int cmd64x_tune_chipset (ide_drive_t *drive, u8 speed)
 	u8 unit			= drive->dn & 0x01;
 	u8 regU = 0, pciU	= hwif->channel ? UDIDETCR1 : UDIDETCR0;
 
-	speed = ide_rate_filter(cmd64x_ratemask(drive), speed);
+	speed = ide_rate_filter(drive, speed);
 
 	if (speed >= XFER_SW_DMA_0) {
 		(void) pci_read_config_byte(dev, pciU, &regU);
@@ -403,7 +354,7 @@ static int cmd64x_tune_chipset (ide_drive_t *drive, u8 speed)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed	= ide_dma_speed(drive, cmd64x_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
@@ -646,6 +597,18 @@ static void __devinit init_hwif_cmd64x(ide_hwif_t *hwif)
 
 	hwif->ultra_mask = hwif->cds->udma_mask;
 
+	/*
+	 * UltraDMA only supported on PCI646U and PCI646U2, which
+	 * correspond to revisions 0x03, 0x05 and 0x07 respectively.
+	 * Actually, although the CMD tech support people won't
+	 * tell me the details, the 0x03 revision cannot support
+	 * UDMA correctly without hardware modifications, and even
+	 * then it only works with Quantum disks due to some
+	 * hold time assumptions in the 646U part which are fixed
+	 * in the 646U2.
+	 *
+	 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets.
+	 */
 	if (dev->device == PCI_DEVICE_ID_CMD_646 && class_rev < 5)
 		hwif->ultra_mask = 0x00;
 
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 45f43efbf92c..66a101e470d0 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -127,20 +127,6 @@ static void cs5535_set_speed(ide_drive_t *drive, u8 speed)
 	}
 }
 
-static u8 cs5535_ratemask(ide_drive_t *drive)
-{
-	/* eighty93 will return 1 if it's 80core and capable of
-	exceeding udma2, 0 otherwise. we need ratemask to set
-	the max speed and if we can > udma2 then we return 2
-	which selects speed_max as udma4 which is the 5535's max
-	speed, and 1 selects udma2 which is the max for 40c */
-	if (!eighty_ninty_three(drive))
-		return 1;
-
-	return 2;
-}
-
-
 /****
  *	cs5535_set_drive         -     Configure the drive to the new speed
  *	@drive: Drive to set up
@@ -151,7 +137,7 @@ static u8 cs5535_ratemask(ide_drive_t *drive)
  */
 static int cs5535_set_drive(ide_drive_t *drive, u8 speed)
 {
-	speed = ide_rate_filter(cs5535_ratemask(drive), speed);
+	speed = ide_rate_filter(drive, speed);
 	ide_config_drive_speed(drive, speed);
 	cs5535_set_speed(drive, speed);
 
@@ -180,9 +166,7 @@ static void cs5535_tuneproc(ide_drive_t *drive, u8 xferspeed)
 
 static int cs5535_config_drive_for_dma(ide_drive_t *drive)
 {
-	u8 speed;
-
-	speed = ide_dma_speed(drive, cs5535_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	/* If no DMA speed was available then let dma_check hit pio */
 	if (!speed) {
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 924eaa3a5708..473e1b33dbf7 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -43,15 +43,10 @@
 
 #define HPT343_DEBUG_DRIVE_INFO		0
 
-static u8 hpt34x_ratemask (ide_drive_t *drive)
-{
-	return 1;
-}
-
 static int hpt34x_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
 	struct pci_dev *dev	= HWIF(drive)->pci_dev;
-	u8 speed	= ide_rate_filter(hpt34x_ratemask(drive), xferspeed);
+	u8 speed = ide_rate_filter(drive, xferspeed);
 	u32 reg1= 0, tmp1 = 0, reg2 = 0, tmp2 = 0;
 	u8			hi_speed, lo_speed;
 
@@ -98,7 +93,7 @@ static void hpt34x_tune_drive (ide_drive_t *drive, u8 pio)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, hpt34x_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!(speed))
 		return 0;
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index cf9d344d19f8..de5ad9c35dc6 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -514,43 +514,31 @@ static int check_in_drive_list(ide_drive_t *drive, const char **list)
 	return 0;
 }
 
-static u8 hpt3xx_ratemask(ide_drive_t *drive)
-{
-	struct hpt_info *info	= pci_get_drvdata(HWIF(drive)->pci_dev);
-	u8 mode			= info->max_mode;
-
-	if (!eighty_ninty_three(drive) && mode)
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /*
  *	Note for the future; the SATA hpt37x we must set
  *	either PIO or UDMA modes 0,4,5
  */
- 
-static u8 hpt3xx_ratefilter(ide_drive_t *drive, u8 speed)
+
+static u8 hpt3xx_udma_filter(ide_drive_t *drive)
 {
 	struct hpt_info *info	= pci_get_drvdata(HWIF(drive)->pci_dev);
 	u8 chip_type		= info->chip_type;
-	u8 mode			= hpt3xx_ratemask(drive);
-
-	if (drive->media != ide_disk)
-		return min(speed, (u8)XFER_PIO_4);
+	u8 mode			= info->max_mode;
+	u8 mask;
 
 	switch (mode) {
 		case 0x04:
-			speed = min_t(u8, speed, XFER_UDMA_6);
+			mask = 0x7f;
 			break;
 		case 0x03:
-			speed = min_t(u8, speed, XFER_UDMA_5);
+			mask = 0x3f;
 			if (chip_type >= HPT374)
 				break;
 			if (!check_in_drive_list(drive, bad_ata100_5))
 				goto check_bad_ata33;
 			/* fall thru */
 		case 0x02:
-			speed = min_t(u8, speed, XFER_UDMA_4);
+			mask = 0x1f;
 
 			/*
 			 * CHECK ME, Does this need to be changed to HPT374 ??
@@ -561,13 +549,13 @@ static u8 hpt3xx_ratefilter(ide_drive_t *drive, u8 speed)
 			    !check_in_drive_list(drive, bad_ata66_4))
 				goto check_bad_ata33;
 
-			speed = min_t(u8, speed, XFER_UDMA_3);
+			mask = 0x0f;
 			if (HPT366_ALLOW_ATA66_3 &&
 			    !check_in_drive_list(drive, bad_ata66_3))
 				goto check_bad_ata33;
 			/* fall thru */
 		case 0x01:
-			speed = min_t(u8, speed, XFER_UDMA_2);
+			mask = 0x07;
 
 		check_bad_ata33:
 			if (chip_type >= HPT370A)
@@ -577,10 +565,10 @@ static u8 hpt3xx_ratefilter(ide_drive_t *drive, u8 speed)
 			/* fall thru */
 		case 0x00:
 		default:
-			speed = min_t(u8, speed, XFER_MW_DMA_2);
+			mask = 0x00;
 			break;
 	}
-	return speed;
+	return mask;
 }
 
 static u32 get_speed_setting(u8 speed, struct hpt_info *info)
@@ -608,12 +596,19 @@ static int hpt36x_tune_chipset(ide_drive_t *drive, u8 xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev  *dev	= hwif->pci_dev;
 	struct hpt_info	*info	= pci_get_drvdata(dev);
-	u8  speed		= hpt3xx_ratefilter(drive, xferspeed);
+	u8  speed		= ide_rate_filter(drive, xferspeed);
 	u8  itr_addr		= drive->dn ? 0x44 : 0x40;
-	u32 itr_mask		= speed < XFER_MW_DMA_0 ? 0x30070000 :
-				 (speed < XFER_UDMA_0   ? 0xc0070000 : 0xc03800ff);
-	u32 new_itr		= get_speed_setting(speed, info);
 	u32 old_itr		= 0;
+	u32 itr_mask, new_itr;
+
+	/* TODO: move this to ide_rate_filter() [ check ->atapi_dma ] */
+	if (drive->media != ide_disk)
+		speed = min_t(u8, speed, XFER_PIO_4);
+
+	itr_mask = speed < XFER_MW_DMA_0 ? 0x30070000 :
+		  (speed < XFER_UDMA_0   ? 0xc0070000 : 0xc03800ff);
+
+	new_itr = get_speed_setting(speed, info);
 
 	/*
 	 * Disable on-chip PIO FIFO/buffer (and PIO MST mode as well)
@@ -633,12 +628,19 @@ static int hpt37x_tune_chipset(ide_drive_t *drive, u8 xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev  *dev	= hwif->pci_dev;
 	struct hpt_info	*info	= pci_get_drvdata(dev);
-	u8  speed		= hpt3xx_ratefilter(drive, xferspeed);
+	u8  speed		= ide_rate_filter(drive, xferspeed);
 	u8  itr_addr		= 0x40 + (drive->dn * 4);
-	u32 itr_mask		= speed < XFER_MW_DMA_0 ? 0x303c0000 :
-				 (speed < XFER_UDMA_0   ? 0xc03c0000 : 0xc1c001ff);
-	u32 new_itr		= get_speed_setting(speed, info);
 	u32 old_itr		= 0;
+	u32 itr_mask, new_itr;
+
+	/* TODO: move this to ide_rate_filter() [ check ->atapi_dma ] */
+	if (drive->media != ide_disk)
+		speed = min_t(u8, speed, XFER_PIO_4);
+
+	itr_mask = speed < XFER_MW_DMA_0 ? 0x303c0000 :
+		  (speed < XFER_UDMA_0   ? 0xc03c0000 : 0xc1c001ff);
+
+	new_itr = get_speed_setting(speed, info);
 
 	pci_read_config_dword(dev, itr_addr, &old_itr);
 	new_itr = (new_itr & ~itr_mask) | (old_itr & itr_mask);
@@ -676,7 +678,7 @@ static void hpt3xx_tune_drive(ide_drive_t *drive, u8 pio)
  */
 static int config_chipset_for_dma(ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, hpt3xx_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
@@ -1271,6 +1273,7 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
 	hwif->intrproc			= &hpt3xx_intrproc;
 	hwif->maskproc			= &hpt3xx_maskproc;
 	hwif->busproc			= &hpt3xx_busproc;
+	hwif->udma_filter		= &hpt3xx_udma_filter;
 
 	/*
 	 * HPT3xxN chips have some complications:
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index 424f00bb160d..02b56cb7bb1b 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -17,22 +17,6 @@
 
 #include <asm/io.h>
 
-/*
- *	it8213_ratemask	-	Compute available modes
- *	@drive: IDE drive
- *
- *	Compute the available speeds for the devices on the interface. This
- *	is all modes to ATA133 clipped by drive cable setup.
- */
-
-static u8 it8213_ratemask (ide_drive_t *drive)
-{
-	u8 mode	= 4;
-	if (!eighty_ninty_three(drive))
-		mode = min_t(u8, mode, 1);
-	return mode;
-}
-
 /**
  *	it8213_dma_2_pio		-	return the PIO mode matching DMA
  *	@xfer_rate: transfer speed
@@ -145,7 +129,7 @@ static int it8213_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
 	u8 maslave		= 0x40;
-	u8 speed		= ide_rate_filter(it8213_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	int a_speed		= 3 << (drive->dn * 4);
 	int u_flag		= 1 << drive->dn;
 	int v_flag		= 0x01 << drive->dn;
@@ -222,7 +206,7 @@ static int it8213_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, it8213_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 4e1254813ee0..442f658c6ae7 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -228,22 +228,6 @@ static void it821x_clock_strategy(ide_drive_t *drive)
 	}
 }
 
-/**
- *	it821x_ratemask	-	Compute available modes
- *	@drive: IDE drive
- *
- *	Compute the available speeds for the devices on the interface. This
- *	is all modes to ATA133 clipped by drive cable setup.
- */
-
-static u8 it821x_ratemask (ide_drive_t *drive)
-{
-	u8 mode	= 4;
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /**
  *	it821x_tunepio	-	tune a drive
  *	@drive: drive to tune
@@ -438,7 +422,7 @@ static int it821x_tune_chipset (ide_drive_t *drive, byte xferspeed)
 
 	ide_hwif_t *hwif	= drive->hwif;
 	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	u8 speed		= ide_rate_filter(it821x_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 
 	switch (speed) {
 	case XFER_PIO_4:
@@ -488,7 +472,7 @@ static int it821x_tune_chipset (ide_drive_t *drive, byte xferspeed)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed	= ide_dma_speed(drive, it821x_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (speed == 0)
 		return 0;
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index be4fc96c29e0..dbb3c199cba9 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -21,22 +21,6 @@ typedef enum {
 	PORT_SATA = 2,
 } port_type;
 
-/**
- *	jmicron_ratemask	-	Compute available modes
- *	@drive: IDE drive
- *
- *	Compute the available speeds for the devices on the interface. This
- *	is all modes to ATA133 clipped by drive cable setup.
- */
-
-static u8 jmicron_ratemask(ide_drive_t *drive)
-{
-	u8 mode	= 4;
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /**
  *	ata66_jmicron		-	Cable check
  *	@hwif: IDE port
@@ -129,8 +113,7 @@ static void config_jmicron_chipset_for_pio (ide_drive_t *drive, byte set_speed)
 
 static int jmicron_tune_chipset (ide_drive_t *drive, byte xferspeed)
 {
-
-	u8 speed		= ide_rate_filter(jmicron_ratemask(drive), xferspeed);
+	u8 speed = ide_rate_filter(drive, xferspeed);
 
 	return ide_config_drive_speed(drive, speed);
 }
@@ -145,7 +128,7 @@ static int jmicron_tune_chipset (ide_drive_t *drive, byte xferspeed)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed	= ide_dma_speed(drive, jmicron_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index a23853445af9..772ca4007de4 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -82,16 +82,6 @@ static u8 max_dma_rate(struct pci_dev *pdev)
 	return mode;
 }
 
-static u8 pdcnew_ratemask(ide_drive_t *drive)
-{
-	u8 mode = max_dma_rate(HWIF(drive)->pci_dev);
-
-	if (!eighty_ninty_three(drive))
-		mode = min_t(u8, mode, 1);
-
-	return	mode;
-}
-
 /**
  * get_indexed_reg - Get indexed register
  * @hwif: for the port address
@@ -164,7 +154,7 @@ static int pdcnew_tune_chipset(ide_drive_t *drive, u8 speed)
 	u8 adj			= (drive->dn & 1) ? 0x08 : 0x00;
 	int			err;
 
-	speed = ide_rate_filter(pdcnew_ratemask(drive), speed);
+	speed = ide_rate_filter(drive, speed);
 
 	/*
 	 * Issue SETFEATURES_XFER to the drive first. PDC202xx hardware will
@@ -267,7 +257,7 @@ static int config_chipset_for_dma(ide_drive_t *drive)
 		set_indexed_reg(hwif, 0x13 + adj, tmp | 0x03);
 	}
 
-	speed = ide_dma_speed(drive, pdcnew_ratemask(drive));
+	speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index d7a38067fb34..207a6191fac7 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -101,35 +101,12 @@ static const char *pdc_quirk_drives[] = {
 #define	MC1		0x02	/* DMA"C" timing */
 #define	MC0		0x01	/* DMA"C" timing */
 
-static u8 pdc202xx_ratemask (ide_drive_t *drive)
-{
-	u8 mode;
-
-	switch(HWIF(drive)->pci_dev->device) {
-		case PCI_DEVICE_ID_PROMISE_20267:
-		case PCI_DEVICE_ID_PROMISE_20265:
-			mode = 3;
-			break;
-		case PCI_DEVICE_ID_PROMISE_20263:
-		case PCI_DEVICE_ID_PROMISE_20262:
-			mode = 2;
-			break;
-		case PCI_DEVICE_ID_PROMISE_20246:
-			return 1;
-		default:
-			return 0;
-	}
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 static int pdc202xx_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
 	u8 drive_pci		= 0x60 + (drive->dn << 2);
-	u8 speed	= ide_rate_filter(pdc202xx_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 
 	u32			drive_conf;
 	u8			AP, BP, CP, DP;
@@ -308,7 +285,7 @@ chipset_is_set:
 	if (drive->media == ide_disk)	/* PREFETCH_EN */
 		pci_write_config_byte(dev, (drive_pci), AP|PREFETCH_EN);
 
-	speed = ide_dma_speed(drive, pdc202xx_ratemask(drive));
+	speed = ide_max_dma_mode(drive);
 
 	if (!(speed)) {
 		/* restore original pci-config space */
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 17ea44edb4eb..84d3938a430a 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -105,68 +105,6 @@
 
 static int no_piix_dma;
 
-/**
- *	piix_ratemask		-	compute rate mask for PIIX IDE
- *	@drive: IDE drive to compute for
- *
- *	Returns the available modes for the PIIX IDE controller.
- */
- 
-static u8 piix_ratemask (ide_drive_t *drive)
-{
-	struct pci_dev *dev	= HWIF(drive)->pci_dev;
-	u8 mode;
-
-	switch(dev->device) {
-		case PCI_DEVICE_ID_INTEL_82801EB_1:
-			mode = 3;
-			break;
-		/* UDMA 100 capable */
-		case PCI_DEVICE_ID_INTEL_82801BA_8:
-		case PCI_DEVICE_ID_INTEL_82801BA_9:
-		case PCI_DEVICE_ID_INTEL_82801CA_10:
-		case PCI_DEVICE_ID_INTEL_82801CA_11:
-		case PCI_DEVICE_ID_INTEL_82801E_11:
-		case PCI_DEVICE_ID_INTEL_82801DB_1:
-		case PCI_DEVICE_ID_INTEL_82801DB_10:
-		case PCI_DEVICE_ID_INTEL_82801DB_11:
-		case PCI_DEVICE_ID_INTEL_82801EB_11:
-		case PCI_DEVICE_ID_INTEL_ESB_2:
-		case PCI_DEVICE_ID_INTEL_ICH6_19:
-		case PCI_DEVICE_ID_INTEL_ICH7_21:
-		case PCI_DEVICE_ID_INTEL_ESB2_18:
-		case PCI_DEVICE_ID_INTEL_ICH8_6:
-			mode = 3;
-			break;
-		/* UDMA 66 capable */
-		case PCI_DEVICE_ID_INTEL_82801AA_1:
-		case PCI_DEVICE_ID_INTEL_82372FB_1:
-			mode = 2;
-			break;
-		/* UDMA 33 capable */
-		case PCI_DEVICE_ID_INTEL_82371AB:
-		case PCI_DEVICE_ID_INTEL_82443MX_1:
-		case PCI_DEVICE_ID_INTEL_82451NX:
-		case PCI_DEVICE_ID_INTEL_82801AB_1:
-			return 1;
-		/* Non UDMA capable (MWDMA2) */
-		case PCI_DEVICE_ID_INTEL_82371SB_1:
-		case PCI_DEVICE_ID_INTEL_82371FB_1:
-		case PCI_DEVICE_ID_INTEL_82371FB_0:
-		case PCI_DEVICE_ID_INTEL_82371MX:
-		default:
-			return 0;
-	}
-	
-	/*
-	 *	If we are UDMA66 capable fall back to UDMA33 
-	 *	if the drive cannot see an 80pin cable.
-	 */
-	if (!eighty_ninty_three(drive))
-		mode = min_t(u8, mode, 1);
-	return mode;
-}
-
 /**
  *	piix_dma_2_pio		-	return the PIO mode matching DMA
  *	@xfer_rate: transfer speed
@@ -301,7 +239,7 @@ static int piix_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
 	u8 maslave		= hwif->channel ? 0x42 : 0x40;
-	u8 speed		= ide_rate_filter(piix_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	int a_speed		= 3 << (drive->dn * 4);
 	int u_flag		= 1 << drive->dn;
 	int v_flag		= 0x01 << drive->dn;
@@ -376,7 +314,7 @@ static int piix_tune_chipset (ide_drive_t *drive, u8 xferspeed)
  
 static int piix_config_drive_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, piix_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	/*
 	 * If no DMA speed was available or the chipset has DMA bugs
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index f84bf791f72e..cbf936325355 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -189,23 +189,6 @@ scc_ide_outsl(unsigned long port, void *addr, u32 count)
 	}
 }
 
-/**
- *	scc_ratemask	-	Compute available modes
- *	@drive: IDE drive
- *
- *	Compute the available speeds for the devices on the interface.
- *	Enforce UDMA33 as a limit if there is no 80pin cable present.
- */
-
-static u8 scc_ratemask(ide_drive_t *drive)
-{
-	u8 mode = 4;
-
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /**
  *	scc_tuneproc	-	tune a drive PIO mode
  *	@drive: drive to tune
@@ -273,7 +256,7 @@ static void scc_tuneproc(ide_drive_t *drive, byte mode_wanted)
 static int scc_tune_chipset(ide_drive_t *drive, byte xferspeed)
 {
 	ide_hwif_t *hwif = HWIF(drive);
-	u8 speed = ide_rate_filter(scc_ratemask(drive), xferspeed);
+	u8 speed = ide_rate_filter(drive, xferspeed);
 	struct scc_ports *ports = ide_get_hwifdata(hwif);
 	unsigned long ctl_base = ports->ctl;
 	unsigned long cckctrl_port = ctl_base + 0xff0;
@@ -347,7 +330,7 @@ static int scc_tune_chipset(ide_drive_t *drive, byte xferspeed)
 
 static int scc_config_chipset_for_dma(ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, scc_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index dbcd37a0c652..2fa6d92d16cc 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -65,16 +65,16 @@ static int check_in_drive_lists (ide_drive_t *drive, const char **list)
 	return 0;
 }
 
-static u8 svwks_ratemask (ide_drive_t *drive)
+static u8 svwks_udma_filter(ide_drive_t *drive)
 {
 	struct pci_dev *dev     = HWIF(drive)->pci_dev;
-	u8 mode = 0;
+	u8 mask = 0;
 
 	if (!svwks_revision)
 		pci_read_config_byte(dev, PCI_REVISION_ID, &svwks_revision);
 
 	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_HT1000IDE)
-		return 2;
+		return 0x1f;
 	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) {
 		u32 reg = 0;
 		if (isa_dev)
@@ -86,25 +86,31 @@ static u8 svwks_ratemask (ide_drive_t *drive)
 		if(drive->media == ide_disk)
 			return 0;
 		/* Check the OSB4 DMA33 enable bit */
-		return ((reg & 0x00004000) == 0x00004000) ? 1 : 0;
+		return ((reg & 0x00004000) == 0x00004000) ? 0x07 : 0;
 	} else if (svwks_revision < SVWKS_CSB5_REVISION_NEW) {
-		return 1;
+		return 0x07;
 	} else if (svwks_revision >= SVWKS_CSB5_REVISION_NEW) {
-		u8 btr = 0;
+		u8 btr = 0, mode;
 		pci_read_config_byte(dev, 0x5A, &btr);
 		mode = btr & 0x3;
-		if (!eighty_ninty_three(drive))
-			mode = min(mode, (u8)1);
+
 		/* If someone decides to do UDMA133 on CSB5 the same
 		   issue will bite so be inclusive */
 		if (mode > 2 && check_in_drive_lists(drive, svwks_bad_ata100))
 			mode = 2;
+
+		switch(mode) {
+		case 2:	 mask = 0x1f; break;
+		case 1:	 mask = 0x07; break;
+		default: mask = 0x00; break;
+		}
 	}
 	if (((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
 	     (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) &&
 	    (!(PCI_FUNC(dev->devfn) & 1)))
-		mode = 2;
-	return mode;
+		mask = 0x1f;
+
+	return mask;
 }
 
 static u8 svwks_csb_check (struct pci_dev *dev)
@@ -141,7 +147,7 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	if (xferspeed == 255)	/* PIO auto-tuning */
 		speed = XFER_PIO_0 + pio;
 	else
-		speed = ide_rate_filter(svwks_ratemask(drive), xferspeed);
+		speed = ide_rate_filter(drive, xferspeed);
 
 	/* If we are about to put a disk into UDMA mode we screwed up.
 	   Our code assumes we never _ever_ do this on an OSB4 */
@@ -304,7 +310,7 @@ static void svwks_tune_drive (ide_drive_t *drive, u8 pio)
 
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, svwks_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!(speed))
 		speed = XFER_PIO_0 + ide_get_best_pio_mode(drive, 255, 5, NULL);
@@ -500,6 +506,7 @@ static void __devinit init_hwif_svwks (ide_hwif_t *hwif)
 
 	hwif->tuneproc = &svwks_tune_drive;
 	hwif->speedproc = &svwks_tune_chipset;
+	hwif->udma_filter = &svwks_udma_filter;
 
 	hwif->atapi_dma = 1;
 
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index c0188de3cc66..5314ec913724 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -122,45 +122,41 @@ static inline unsigned long siimage_seldev(ide_drive_t *drive, int r)
 }
 
 /**
- *	siimage_ratemask	-	Compute available modes
- *	@drive: IDE drive
+ *	sil_udma_filter		-	compute UDMA mask
+ *	@drive: IDE device
+ *
+ *	Compute the available UDMA speeds for the device on the interface.
  *
- *	Compute the available speeds for the devices on the interface.
  *	For the CMD680 this depends on the clocking mode (scsc), for the
- *	SI3312 SATA controller life is a bit simpler. Enforce UDMA33
- *	as a limit if there is no 80pin cable present.
+ *	SI3112 SATA controller life is a bit simpler.
  */
- 
-static byte siimage_ratemask (ide_drive_t *drive)
+
+static u8 sil_udma_filter(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif	= HWIF(drive);
-	u8 mode	= 0, scsc = 0;
+	ide_hwif_t *hwif = drive->hwif;
 	unsigned long base = (unsigned long) hwif->hwif_data;
+	u8 mask = 0, scsc = 0;
 
 	if (hwif->mmio)
 		scsc = hwif->INB(base + 0x4A);
 	else
 		pci_read_config_byte(hwif->pci_dev, 0x8A, &scsc);
 
-	if(is_sata(hwif))
-	{
-		if(strstr(drive->id->model, "Maxtor"))
-			return 3;
-		return 4;
+	if (is_sata(hwif)) {
+		mask = strstr(drive->id->model, "Maxtor") ? 0x3f : 0x7f;
+		goto out;
 	}
-	
+
 	if ((scsc & 0x30) == 0x10)	/* 133 */
-		mode = 4;
+		mask = 0x7f;
 	else if ((scsc & 0x30) == 0x20)	/* 2xPCI */
-		mode = 4;
+		mask = 0x7f;
 	else if ((scsc & 0x30) == 0x00)	/* 100 */
-		mode = 3;
+		mask = 0x3f;
 	else 	/* Disabled ? */
 		BUG();
-
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
+out:
+	return mask;
 }
 
 /**
@@ -306,7 +302,7 @@ static int siimage_tune_chipset (ide_drive_t *drive, byte xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	u16 ultra = 0, multi	= 0;
 	u8 mode = 0, unit	= drive->select.b.unit;
-	u8 speed		= ide_rate_filter(siimage_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	unsigned long base	= (unsigned long)hwif->hwif_data;
 	u8 scsc = 0, addr_mask	= ((hwif->channel) ?
 				    ((hwif->mmio) ? 0xF4 : 0x84) :
@@ -389,7 +385,7 @@ static int siimage_tune_chipset (ide_drive_t *drive, byte xferspeed)
  
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed	= ide_dma_speed(drive, siimage_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
@@ -989,6 +985,7 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif)
 	hwif->tuneproc	= &siimage_tuneproc;
 	hwif->reset_poll = &siimage_reset_poll;
 	hwif->pre_reset = &siimage_pre_reset;
+	hwif->udma_filter = &sil_udma_filter;
 
 	if(is_sata(hwif)) {
 		static int first = 1;
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 6fe4ecb6c06c..83c80ed73e99 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -428,16 +428,6 @@ static int sis_get_info (char *buffer, char **addr, off_t offset, int count)
 }
 #endif /* defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static u8 sis5513_ratemask (ide_drive_t *drive)
-{
-	u8 rates[] = { 0, 0, 1, 2, 3, 3, 4, 4 };
-	u8 mode = rates[chipset_family];
-
-	if (!eighty_ninty_three(drive))
-		mode = min(mode, (u8)1);
-	return mode;
-}
-
 /*
  * Configuration functions
  */
@@ -563,7 +553,7 @@ static int sis5513_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	u8 drive_pci, reg, speed;
 	u32 regdw;
 
-	speed = ide_rate_filter(sis5513_ratemask(drive), xferspeed);
+	speed = ide_rate_filter(drive, xferspeed);
 
 	/* See config_art_rwp_pio for drive pci config registers */
 	drive_pci = 0x40;
@@ -653,7 +643,7 @@ static void sis5513_tune_drive (ide_drive_t *drive, u8 pio)
  */
 static int config_chipset_for_dma (ide_drive_t *drive)
 {
-	u8 speed	= ide_dma_speed(drive, sis5513_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 #ifdef DEBUG
 	printk("SIS5513: config_chipset_for_dma, drive %d, ultra %x\n",
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 852ccb36da1d..9e95a5cbf984 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -21,15 +21,6 @@
 
 #include <asm/io.h>
 
-static u8 slc90e66_ratemask (ide_drive_t *drive)
-{
-	u8 mode	= 2;
-
-	if (!eighty_ninty_three(drive))
-		mode = min_t(u8, mode, 1);
-	return mode;
-}
-
 static u8 slc90e66_dma_2_pio (u8 xfer_rate) {
 	switch(xfer_rate) {
 		case XFER_UDMA_4:
@@ -122,7 +113,7 @@ static int slc90e66_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
 	u8 maslave		= hwif->channel ? 0x42 : 0x40;
-	u8 speed	= ide_rate_filter(slc90e66_ratemask(drive), xferspeed);
+	u8 speed		= ide_rate_filter(drive, xferspeed);
 	int sitre = 0, a_speed	= 7 << (drive->dn * 4);
 	int u_speed = 0, u_flag = 1 << drive->dn;
 	u16			reg4042, reg44, reg48, reg4a;
@@ -171,7 +162,7 @@ static int slc90e66_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static int slc90e66_config_drive_for_dma (ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, slc90e66_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index 0b6d81d6ce48..168f035caa3f 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -13,18 +13,13 @@
 #include <linux/pci.h>
 #include <linux/ide.h>
 
-static inline u8 tc86c001_ratemask(ide_drive_t *drive)
-{
-	return eighty_ninty_three(drive) ? 2 : 1;
-}
-
 static int tc86c001_tune_chipset(ide_drive_t *drive, u8 speed)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	unsigned long scr_port	= hwif->config_data + (drive->dn ? 0x02 : 0x00);
 	u16 mode, scr		= hwif->INW(scr_port);
 
-	speed = ide_rate_filter(tc86c001_ratemask(drive), speed);
+	speed = ide_rate_filter(drive, speed);
 
 	switch (speed) {
 		case XFER_UDMA_4:	mode = 0x00c0; break;
@@ -174,7 +169,7 @@ static int tc86c001_busproc(ide_drive_t *drive, int state)
 
 static int config_chipset_for_dma(ide_drive_t *drive)
 {
-	u8 speed = ide_dma_speed(drive, tc86c001_ratemask(drive));
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index 5e06179c3469..8a877235b949 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -48,7 +48,7 @@ static int triflex_tune_chipset(ide_drive_t *drive, u8 xferspeed)
 	u16 timing = 0;
 	u32 triflex_timings = 0;
 	u8 unit = (drive->select.b.unit & 0x01);
-	u8 speed = ide_rate_filter(0, xferspeed);
+	u8 speed = ide_rate_filter(drive, xferspeed);
 	
 	pci_read_config_dword(dev, channel_offset, &triflex_timings);
 	
@@ -102,7 +102,7 @@ static void triflex_tune_drive(ide_drive_t *drive, u8 pio)
 
 static int triflex_config_drive_for_dma(ide_drive_t *drive)
 {
-	int speed = ide_dma_speed(drive, 0); /* No ultra speeds */
+	u8 speed = ide_max_dma_mode(drive);
 
 	if (!speed)
 		return 0;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c9375c863584..23ab4dc05009 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -717,11 +717,8 @@ typedef struct hwif_s {
 	int	(*quirkproc)(ide_drive_t *);
 	/* driver soft-power interface */
 	int	(*busproc)(ide_drive_t *, int);
-//	/* host rate limiter */
-//	u8	(*ratemask)(ide_drive_t *);
-//	/* device rate limiter */
-//	u8	(*ratefilter)(ide_drive_t *, u8);
 #endif
+	u8 (*udma_filter)(ide_drive_t *);
 
 	void (*ata_input_data)(ide_drive_t *, void *, u32);
 	void (*ata_output_data)(ide_drive_t *, void *, u32);
@@ -1279,6 +1276,7 @@ int ide_in_drive_list(struct hd_driveid *, const struct drive_list_entry *);
 int __ide_dma_bad_drive(ide_drive_t *);
 int __ide_dma_good_drive(ide_drive_t *);
 int ide_use_dma(ide_drive_t *);
+u8 ide_max_dma_mode(ide_drive_t *);
 void ide_dma_off(ide_drive_t *);
 void ide_dma_verbose(ide_drive_t *);
 int ide_set_dma(ide_drive_t *);
@@ -1305,6 +1303,7 @@ extern int __ide_dma_timeout(ide_drive_t *);
 
 #else
 static inline int ide_use_dma(ide_drive_t *drive) { return 0; }
+static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; }
 static inline void ide_dma_off(ide_drive_t *drive) { ; }
 static inline void ide_dma_verbose(ide_drive_t *drive) { ; }
 static inline int ide_set_dma(ide_drive_t *drive) { return 1; }
@@ -1349,8 +1348,7 @@ static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data)
 }
 
 /* ide-lib.c */
-extern u8 ide_dma_speed(ide_drive_t *drive, u8 mode);
-extern u8 ide_rate_filter(u8 mode, u8 speed); 
+u8 ide_rate_filter(ide_drive_t *, u8);
 extern int ide_dma_enable(ide_drive_t *drive);
 extern char *ide_xfer_verbose(u8 xfer_rate);
 extern void ide_toggle_bounce(ide_drive_t *drive, int on);
-- 
cgit v1.2.3


From 29e744d088e3555f4efbdf390f01088dd66993b6 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:09 +0200
Subject: ide: add ide_tune_dma() helper

After reworking the code responsible for selecting the best DMA
transfer mode it is now possible to add generic ide_tune_dma() helper.

Convert some IDE PCI host drivers to use it (the ones left need more work).

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-dma.c      | 20 ++++++++++++++++++++
 drivers/ide/pci/aec62xx.c  | 13 +------------
 drivers/ide/pci/atiixp.c   | 22 +---------------------
 drivers/ide/pci/cs5535.c   | 15 +--------------
 drivers/ide/pci/hpt34x.c   | 20 +-------------------
 drivers/ide/pci/hpt366.c   | 20 +-------------------
 drivers/ide/pci/it8213.c   | 21 +--------------------
 drivers/ide/pci/jmicron.c  | 21 +--------------------
 drivers/ide/pci/piix.c     | 26 +-------------------------
 drivers/ide/pci/sis5513.c  | 21 +--------------------
 drivers/ide/pci/slc90e66.c | 13 +------------
 drivers/ide/pci/tc86c001.c | 13 +------------
 drivers/ide/pci/triflex.c  | 13 +------------
 include/linux/ide.h        |  2 ++
 14 files changed, 34 insertions(+), 206 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index f28fabb791fe..5fe85191d49c 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -779,6 +779,26 @@ u8 ide_max_dma_mode(ide_drive_t *drive)
 
 EXPORT_SYMBOL_GPL(ide_max_dma_mode);
 
+int ide_tune_dma(ide_drive_t *drive)
+{
+	u8 speed;
+
+	/* TODO: use only ide_max_dma_mode() */
+	if (!ide_use_dma(drive))
+		return 0;
+
+	speed = ide_max_dma_mode(drive);
+
+	if (!speed)
+		return 0;
+
+	drive->hwif->speedproc(drive, speed);
+
+	return ide_dma_enable(drive);
+}
+
+EXPORT_SYMBOL_GPL(ide_tune_dma);
+
 void ide_dma_verbose(ide_drive_t *drive)
 {
 	struct hd_driveid *id	= drive->id;
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index 099539e8c7a3..b173bc66ce1e 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -155,17 +155,6 @@ static int aec62xx_tune_chipset (ide_drive_t *drive, u8 speed)
 	}
 }
 
-static int config_chipset_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!(speed))
-		return 0;
-
-	(void) aec62xx_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio)
 {
 	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
@@ -174,7 +163,7 @@ static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio)
 
 static int aec62xx_config_drive_xfer_rate (ide_drive_t *drive)
 {
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index f7e80d6076d7..0e52ad722a72 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -206,26 +206,6 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed)
 	return ide_config_drive_speed(drive, speed);
 }
 
-/**
- *	atiixp_config_drive_for_dma	-	configure drive for DMA
- *	@drive: IDE drive to configure
- *
- *	Set up a ATIIXP interface channel for the best available speed.
- *	We prefer UDMA if it is available and then MWDMA. If DMA is
- *	not available we switch to PIO and return 0.
- */
-
-static int atiixp_config_drive_for_dma(ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	(void) atiixp_speedproc(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 /**
  *	atiixp_dma_check	-	set up an IDE device
  *	@drive: IDE drive to configure
@@ -240,7 +220,7 @@ static int atiixp_dma_check(ide_drive_t *drive)
 
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && atiixp_config_drive_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive)) {
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 66a101e470d0..41925c47ef05 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -164,26 +164,13 @@ static void cs5535_tuneproc(ide_drive_t *drive, u8 xferspeed)
 	cs5535_set_speed(drive, xferspeed);
 }
 
-static int cs5535_config_drive_for_dma(ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	/* If no DMA speed was available then let dma_check hit pio */
-	if (!speed) {
-		return 0;
-	}
-
-	cs5535_set_drive(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int cs5535_dma_check(ide_drive_t *drive)
 {
 	u8 speed;
 
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && cs5535_config_drive_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive)) {
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 473e1b33dbf7..2c24c3de8846 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -84,29 +84,11 @@ static void hpt34x_tune_drive (ide_drive_t *drive, u8 pio)
 	(void) hpt34x_tune_chipset(drive, (XFER_PIO_0 + pio));
 }
 
-/*
- * This allows the configuration of ide_pci chipset registers
- * for cards that learn about the drive's UDMA, DMA, PIO capabilities
- * after the drive is reported by the OS.  Initially for designed for
- * HPT343 UDMA chipset by HighPoint|Triones Technologies, Inc.
- */
-
-static int config_chipset_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!(speed))
-		return 0;
-
-	(void) hpt34x_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int hpt34x_config_drive_xfer_rate (ide_drive_t *drive)
 {
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 #ifndef CONFIG_HPT34X_AUTODMA
 		return -1;
 #else
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index de5ad9c35dc6..fcbc5605b38e 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -669,24 +669,6 @@ static void hpt3xx_tune_drive(ide_drive_t *drive, u8 pio)
 	(void) hpt3xx_tune_chipset (drive, XFER_PIO_0 + pio);
 }
 
-/*
- * This allows the configuration of ide_pci chipset registers
- * for cards that learn about the drive's UDMA, DMA, PIO capabilities
- * after the drive is reported by the OS.  Initially designed for
- * HPT366 UDMA chipset by HighPoint|Triones Technologies, Inc.
- *
- */
-static int config_chipset_for_dma(ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	(void) hpt3xx_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int hpt3xx_quirkproc(ide_drive_t *drive)
 {
 	struct hd_driveid *id	= drive->id;
@@ -741,7 +723,7 @@ static int hpt366_config_drive_xfer_rate(ide_drive_t *drive)
 {
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index 02b56cb7bb1b..c04a02687b95 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -197,25 +197,6 @@ static int it8213_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	return ide_config_drive_speed(drive, speed);
 }
 
-/*
- *	config_chipset_for_dma	-	configure for DMA
- *	@drive: drive to configure
- *
- *	Called by the IDE layer when it wants the timings set up.
- */
-
-static int config_chipset_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	it8213_tune_chipset(drive, speed);
-
-	return ide_dma_enable(drive);
-}
-
 /**
  *	it8213_configure_drive_for_dma	-	set up for DMA transfers
  *	@drive: drive we are going to set up
@@ -230,7 +211,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive)
 {
 	u8 pio;
 
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	pio = ide_get_best_pio_mode(drive, 255, 4, NULL);
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index dbb3c199cba9..76ed25147229 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -118,25 +118,6 @@ static int jmicron_tune_chipset (ide_drive_t *drive, byte xferspeed)
 	return ide_config_drive_speed(drive, speed);
 }
 
-/**
- *	config_chipset_for_dma	-	configure for DMA
- *	@drive: drive to configure
- *
- *	As the JMicron snoops for timings all we actually need to do is
- *	make sure we don't set an invalid mode.
- */
-
-static int config_chipset_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	jmicron_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 /**
  *	jmicron_configure_drive_for_dma	-	set up for DMA transfers
  *	@drive: drive we are going to set up
@@ -147,7 +128,7 @@ static int config_chipset_for_dma (ide_drive_t *drive)
 
 static int jmicron_config_drive_for_dma (ide_drive_t *drive)
 {
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	config_jmicron_chipset_for_pio(drive, 1);
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 84d3938a430a..8b219dd63024 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -303,30 +303,6 @@ static int piix_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	return ide_config_drive_speed(drive, speed);
 }
 
-/**
- *	piix_config_drive_for_dma	-	configure drive for DMA
- *	@drive: IDE drive to configure
- *
- *	Set up a PIIX interface channel for the best available speed.
- *	We prefer UDMA if it is available and then MWDMA.  If DMA is
- *	not available we switch to PIO and return 0.
- */
- 
-static int piix_config_drive_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	/*
-	 * If no DMA speed was available or the chipset has DMA bugs
-	 * then disable DMA and use PIO
-	 */
-	if (!speed)
-		return 0;
-
-	(void) piix_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 /**
  *	piix_config_drive_xfer_rate	-	set up an IDE device
  *	@drive: IDE drive to configure
@@ -339,7 +315,7 @@ static int piix_config_drive_xfer_rate (ide_drive_t *drive)
 {
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && piix_config_drive_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 83c80ed73e99..41953fe4fa6b 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -638,32 +638,13 @@ static void sis5513_tune_drive (ide_drive_t *drive, u8 pio)
 	(void) config_chipset_for_pio(drive, pio);
 }
 
-/*
- * ((id->hw_config & 0x4000|0x2000) && (HWIF(drive)->udma_four))
- */
-static int config_chipset_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-#ifdef DEBUG
-	printk("SIS5513: config_chipset_for_dma, drive %d, ultra %x\n",
-	       drive->dn, drive->id->dma_ultra);
-#endif
-
-	if (!(speed))
-		return 0;
-
-	sis5513_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int sis5513_config_xfer_rate(ide_drive_t *drive)
 {
 	config_art_rwp_pio(drive, 5);
 
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 9e95a5cbf984..c40f291f91e0 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -160,22 +160,11 @@ static int slc90e66_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 	return ide_config_drive_speed(drive, speed);
 }
 
-static int slc90e66_config_drive_for_dma (ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	(void) slc90e66_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int slc90e66_config_drive_xfer_rate (ide_drive_t *drive)
 {
 	drive->init_speed = 0;
 
-	if (ide_use_dma(drive) && slc90e66_config_drive_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index 168f035caa3f..cee619bb2eaf 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -167,20 +167,9 @@ static int tc86c001_busproc(ide_drive_t *drive, int state)
 	return 0;
 }
 
-static int config_chipset_for_dma(ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	(void) tc86c001_tune_chipset(drive, speed);
-	return ide_dma_enable(drive);
-}
-
 static int tc86c001_config_drive_xfer_rate(ide_drive_t *drive)
 {
-	if (ide_use_dma(drive) && config_chipset_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	if (ide_use_fast_pio(drive))
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index 8a877235b949..35e8c612638f 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -100,20 +100,9 @@ static void triflex_tune_drive(ide_drive_t *drive, u8 pio)
 	(void) triflex_tune_chipset(drive, (XFER_PIO_0 + use_pio));
 }
 
-static int triflex_config_drive_for_dma(ide_drive_t *drive)
-{
-	u8 speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	(void) triflex_tune_chipset(drive, speed);
-	 return ide_dma_enable(drive);
-}
-
 static int triflex_config_drive_xfer_rate(ide_drive_t *drive)
 {
-	if (ide_use_dma(drive) && triflex_config_drive_for_dma(drive))
+	if (ide_tune_dma(drive))
 		return 0;
 
 	triflex_tune_drive(drive, 255);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 23ab4dc05009..d03fa2d5d75a 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1277,6 +1277,7 @@ int __ide_dma_bad_drive(ide_drive_t *);
 int __ide_dma_good_drive(ide_drive_t *);
 int ide_use_dma(ide_drive_t *);
 u8 ide_max_dma_mode(ide_drive_t *);
+int ide_tune_dma(ide_drive_t *);
 void ide_dma_off(ide_drive_t *);
 void ide_dma_verbose(ide_drive_t *);
 int ide_set_dma(ide_drive_t *);
@@ -1304,6 +1305,7 @@ extern int __ide_dma_timeout(ide_drive_t *);
 #else
 static inline int ide_use_dma(ide_drive_t *drive) { return 0; }
 static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; }
+static inline int ide_tune_dma(ide_drive_t *drive) { return 0; }
 static inline void ide_dma_off(ide_drive_t *drive) { ; }
 static inline void ide_dma_verbose(ide_drive_t *drive) { ; }
 static inline int ide_set_dma(ide_drive_t *drive) { return 1; }
-- 
cgit v1.2.3


From ecfd80e4a514123070b4cfb674b817ba75055df2 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:09 +0200
Subject: ide: make /proc/ide/ optional

All important information/features should be already available through
sysfs and ioctl interfaces.

Add CONFIG_IDE_PROC_FS (CONFIG_SCSI_PROC_FS rip-off) config option,
disabling it makes IDE driver ~5 kB smaller (on x86-32).

While at it add CONFIG_PROC_FS=n versions of proc_ide_{create,destroy}()
and remove no longer needed #ifdefs.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig        | 11 +++++++++++
 drivers/ide/Makefile       |  2 +-
 drivers/ide/ide-cd.c       |  4 ++--
 drivers/ide/ide-disk.c     |  4 ++--
 drivers/ide/ide-floppy.c   |  4 ++--
 drivers/ide/ide-tape.c     |  2 +-
 drivers/ide/ide.c          | 15 ++++++---------
 drivers/ide/pci/alim15x3.c |  8 ++++----
 drivers/ide/pci/amd74xx.c  |  6 +++---
 drivers/ide/pci/cmd64x.c   |  8 ++++----
 drivers/ide/pci/sis5513.c  |  6 +++---
 drivers/scsi/ide-scsi.c    |  2 +-
 include/linux/ide.h        | 14 ++++++++------
 13 files changed, 48 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 5bdf64b77913..a678bbecb489 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -291,6 +291,17 @@ config IDE_TASK_IOCTL
 
 	  If you are unsure, say N here.
 
+config IDE_PROC_FS
+	bool "legacy /proc/ide/ support"
+	depends on IDE && PROC_FS
+	default y
+	help
+	  This option enables support for the various files in
+	  /proc/ide.  In Linux 2.6 this has been superseded by
+	  files in sysfs but many legacy applications rely on this.
+
+	  If unsure say Y.
+
 comment "IDE chipset support/bugfixes"
 
 config IDE_GENERIC
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index d9f029e8ff74..75dc6969e0a7 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -20,7 +20,7 @@ ide-core-$(CONFIG_BLK_DEV_CMD640)	+= pci/cmd640.o
 # Core IDE code - must come before legacy
 ide-core-$(CONFIG_BLK_DEV_IDEPCI)	+= setup-pci.o
 ide-core-$(CONFIG_BLK_DEV_IDEDMA)	+= ide-dma.o
-ide-core-$(CONFIG_PROC_FS)		+= ide-proc.o
+ide-core-$(CONFIG_IDE_PROC_FS)		+= ide-proc.o
 ide-core-$(CONFIG_BLK_DEV_IDEPNP)	+= ide-pnp.o
 ide-core-$(CONFIG_BLK_DEV_IDEACPI)	+= ide-acpi.o
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 638becda81c6..c3a0079789fd 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -3274,7 +3274,7 @@ int ide_cdrom_setup (ide_drive_t *drive)
 	return 0;
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 static
 sector_t ide_cdrom_capacity (ide_drive_t *drive)
 {
@@ -3321,7 +3321,7 @@ static void ide_cd_release(struct kref *kref)
 
 static int ide_cd_probe(ide_drive_t *);
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 static int proc_idecd_read_capacity
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 37aa6ddd9702..e7bc4d35c40c 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -559,7 +559,7 @@ static sector_t idedisk_capacity (ide_drive_t *drive)
 	return drive->capacity64 - drive->sect0;
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 
 static int smart_enable(ide_drive_t *drive)
 {
@@ -683,7 +683,7 @@ static ide_proc_entry_t idedisk_proc[] = {
 
 #define	idedisk_proc	NULL
 
-#endif	/* CONFIG_PROC_FS */
+#endif	/* CONFIG_IDE_PROC_FS */
 
 static void idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 {
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 57cd21c5b2c1..84d398f71e0f 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -1892,7 +1892,7 @@ static void ide_floppy_release(struct kref *kref)
 	kfree(floppy);
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 
 static int proc_idefloppy_read_capacity
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
@@ -1914,7 +1914,7 @@ static ide_proc_entry_t idefloppy_proc[] = {
 
 #define	idefloppy_proc	NULL
 
-#endif	/* CONFIG_PROC_FS */
+#endif	/* CONFIG_IDE_PROC_FS */
 
 static int ide_floppy_probe(ide_drive_t *);
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4e59239fef75..ad306ae24e24 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -4730,7 +4730,7 @@ static void ide_tape_release(struct kref *kref)
 	kfree(tape);
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 
 static int proc_idetape_read_name
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 73f752142f9e..7a96f6f07494 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -350,7 +350,7 @@ static int ide_system_bus_speed(void)
 	return system_bus_speed;
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 struct proc_dir_entry *proc_ide_root;
 #endif
 
@@ -1892,7 +1892,7 @@ static void __init probe_for_hwifs (void)
 
 void ide_register_subdriver(ide_drive_t *drive, ide_driver_t *driver)
 {
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 	ide_add_proc_entries(drive->proc, driver->proc, drive);
 #endif
 }
@@ -1914,8 +1914,8 @@ EXPORT_SYMBOL(ide_register_subdriver);
 void ide_unregister_subdriver(ide_drive_t *drive, ide_driver_t *driver)
 {
 	unsigned long flags;
-	
-#ifdef CONFIG_PROC_FS
+
+#ifdef CONFIG_IDE_PROC_FS
 	ide_remove_proc_entries(drive->proc, driver->proc);
 #endif
 	down(&ide_setting_sem);
@@ -2069,7 +2069,7 @@ static int __init ide_init(void)
 
 	init_ide_data();
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 	proc_ide_root = proc_mkdir("ide", NULL);
 #endif
 
@@ -2099,9 +2099,8 @@ static int __init ide_init(void)
 	probe_for_hwifs();
 	initializing = 0;
 
-#ifdef CONFIG_PROC_FS
 	proc_ide_create();
-#endif
+
 	return 0;
 }
 
@@ -2141,9 +2140,7 @@ void __exit cleanup_module (void)
 	pnpide_exit();
 #endif
 
-#ifdef CONFIG_PROC_FS
 	proc_ide_destroy();
-#endif
 
 	bus_unregister(&ide_bus_type);
 }
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 76643b626bdd..428efdae0c7b 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -50,7 +50,7 @@ static u8 m5229_revision;
 static u8 chip_is_1543c_e;
 static struct pci_dev *isa_dev;
 
-#if defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
 
@@ -278,7 +278,7 @@ static int ali_get_info (char *buffer, char **addr, off_t offset, int count)
 
 	return p-buffer; /* => must be less than 4k! */
 }
-#endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */
+#endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */
 
 /**
  *	ali15x3_tune_pio	-	set up chipset for PIO mode
@@ -566,13 +566,13 @@ static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const c
 
 	isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
 
-#if defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 	if (!ali_proc) {
 		ali_proc = 1;
 		bmide_dev = dev;
 		ide_pci_create_host_proc("ali", ali_get_info);
 	}
-#endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */
+#endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */
 
 	local_irq_save(flags);
 
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 7989bdd842a2..becb1a5648b0 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -92,7 +92,7 @@ static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3
  * AMD /proc entry.
  */
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
@@ -402,14 +402,14 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, const ch
  * Register /proc/ide/amd74xx entry
  */
 
-#if defined(DISPLAY_AMD_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_AMD_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
         if (!amd74xx_proc) {
                 amd_base = pci_resource_start(dev, 4);
                 bmide_dev = dev;
 		ide_pci_create_host_proc("amd74xx", amd74xx_get_info);
                 amd74xx_proc = 1;
         }
-#endif /* DISPLAY_AMD_TIMINGS && CONFIG_PROC_FS */
+#endif /* DISPLAY_AMD_TIMINGS && CONFIG_IDE_PROC_FS */
 
 	return dev->irq;
 }
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 19f5ac1f866c..61ea96b5555c 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -74,7 +74,7 @@
 #define UDIDETCR1	0x7B
 #define DTPR1		0x7C
 
-#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
 
@@ -165,7 +165,7 @@ static int cmd64x_get_info (char *buffer, char **addr, off_t offset, int count)
 	return p-buffer;	/* => must be less than 4k! */
 }
 
-#endif	/* defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_PROC_FS) */
+#endif	/* defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */
 
 static u8 quantize_timing(int timing, int quant)
 {
@@ -548,7 +548,7 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha
 	(void) pci_write_config_byte(dev, UDIDETCR0, 0xf0);
 #endif /* CONFIG_PPC */
 
-#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 
 	cmd_devs[n_cmd_devs++] = dev;
 
@@ -556,7 +556,7 @@ static unsigned int __devinit init_chipset_cmd64x(struct pci_dev *dev, const cha
 		cmd64x_proc = 1;
 		ide_pci_create_host_proc("cmd64x", cmd64x_get_info);
 	}
-#endif /* DISPLAY_CMD64X_TIMINGS && CONFIG_PROC_FS */
+#endif /* DISPLAY_CMD64X_TIMINGS && CONFIG_IDE_PROC_FS */
 
 	return 0;
 }
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 41953fe4fa6b..2bde1b92784a 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -191,7 +191,7 @@ static char* chipset_capability[] = {
 	"ATA 133 (1st gen)", "ATA 133 (2nd gen)"
 };
 
-#if defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
 
@@ -426,7 +426,7 @@ static int sis_get_info (char *buffer, char **addr, off_t offset, int count)
 
 	return len > count ? count : len;
 }
-#endif /* defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_PROC_FS) */
+#endif /* defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */
 
 /*
  * Configuration functions
@@ -797,7 +797,7 @@ static unsigned int __devinit init_chipset_sis5513 (struct pci_dev *dev, const c
 				break;
 		}
 
-#if defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_PROC_FS)
+#if defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_IDE_PROC_FS)
 		if (!sis_proc) {
 			sis_proc = 1;
 			bmide_dev = dev;
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 2b5b8a93bc10..531df3a29c47 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -770,7 +770,7 @@ static void ide_scsi_remove(ide_drive_t *drive)
 
 static int ide_scsi_probe(ide_drive_t *);
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 static ide_proc_entry_t idescsi_proc[] = {
 	{ "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
 	{ NULL, 0, NULL, NULL }
diff --git a/include/linux/ide.h b/include/linux/ide.h
index d03fa2d5d75a..697c39dd66a1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -912,15 +912,15 @@ typedef struct {
 	write_proc_t	*write_proc;
 } ide_proc_entry_t;
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IDE_PROC_FS
 extern struct proc_dir_entry *proc_ide_root;
 
-extern void proc_ide_create(void);
-extern void proc_ide_destroy(void);
-extern void create_proc_ide_interfaces(void);
+void proc_ide_create(void);
+void proc_ide_destroy(void);
+void create_proc_ide_interfaces(void);
 void destroy_proc_ide_interface(ide_hwif_t *);
-extern void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *);
-extern void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *);
+void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *);
+void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *);
 read_proc_t proc_ide_read_capacity;
 read_proc_t proc_ide_read_geometry;
 
@@ -944,6 +944,8 @@ void ide_pci_create_host_proc(const char *, get_info_t *);
 	return len;			\
 }
 #else
+static inline void proc_ide_create(void) { ; }
+static inline void proc_ide_destroy(void) { ; }
 static inline void create_proc_ide_interfaces(void) { ; }
 static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; }
 #define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0;
-- 
cgit v1.2.3


From 1497943ee692aa7519fa972d0e3a339649bf3a96 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:10 +0200
Subject: ide: split off ioctl handling from IDE settings (v2)

* do write permission and min/max checks in ide_procset_t functions

* ide-disk.c: drive->id is always available so cleanup "multcount" setting
  accordingly

* ide-disk.c: "address" setting was incorrectly defined as type TYPE_INTA,
  fix it by using type TYPE_BYTE and updating ide_drive_t->adressing field,
  the bug didn't trigger because this IDE setting uses custom ->set function

* ide.c: add set_ksettings() for handling HDIO_SET_KEEPSETTINGS ioctl

* ide.c: add set_unmaskirq() for handling HDIO_SET_UNMASKINTR ioctl

* handle ioctls directly in generic_ide_ioclt() and idedisk_ioctl()
  instead of using IDE settings to deal with them

* remove no longer needed ide_find_setting_by_ioctl() and {read,write}_ioctl
  fields from ide_settings_t, also remove now unused TYPE_INTA handling

v2:
* add missing EXPORT_SYMBOL_GPL(ide_setting_sem) needed now for ide-disk

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c     |   2 +-
 drivers/ide/ide-disk.c   |  81 ++++++++++++++++++----
 drivers/ide/ide-floppy.c |  10 +--
 drivers/ide/ide-tape.c   |  30 ++++----
 drivers/ide/ide.c        | 176 ++++++++++++++++++++++++++++-------------------
 drivers/scsi/ide-scsi.c  |  12 ++--
 include/linux/ide.h      |  16 ++---
 7 files changed, 203 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index c3a0079789fd..29408cfd9869 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -3061,7 +3061,7 @@ int ide_cdrom_probe_capabilities (ide_drive_t *drive)
 
 static void ide_cdrom_add_settings(ide_drive_t *drive)
 {
-	ide_add_setting(drive,	"dsc_overlap",		SETTING_RW, -1, -1, TYPE_BYTE, 0, 1, 1,	1, &drive->dsc_overlap, NULL);
+	ide_add_setting(drive, "dsc_overlap", SETTING_RW, TYPE_BYTE, 0, 1, 1, 1, &drive->dsc_overlap, NULL);
 }
 
 /*
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index e7bc4d35c40c..fb162cb3ebf5 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -737,6 +737,9 @@ static int set_multcount(ide_drive_t *drive, int arg)
 {
 	struct request rq;
 
+	if (arg < 0 || arg > drive->id->max_multsect)
+		return -EINVAL;
+
 	if (drive->special.b.set_multmode)
 		return -EBUSY;
 	ide_init_drive_cmd (&rq);
@@ -749,6 +752,9 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
 static int set_nowerr(ide_drive_t *drive, int arg)
 {
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
 	if (ide_spin_wait_hwgroup(drive))
 		return -EBUSY;
 	drive->nowerr = arg;
@@ -800,6 +806,9 @@ static int write_cache(ide_drive_t *drive, int arg)
 	ide_task_t args;
 	int err = 1;
 
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
 	if (ide_id_has_flush_cache(drive->id)) {
 		memset(&args, 0, sizeof(ide_task_t));
 		args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ?
@@ -835,6 +844,9 @@ static int set_acoustic (ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
 
+	if (arg < 0 || arg > 254)
+		return -EINVAL;
+
 	memset(&args, 0, sizeof(ide_task_t));
 	args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ? SETFEATURES_EN_AAM :
 							  SETFEATURES_DIS_AAM;
@@ -855,6 +867,9 @@ static int set_acoustic (ide_drive_t *drive, int arg)
  */
 static int set_lba_addressing(ide_drive_t *drive, int arg)
 {
+	if (arg < 0 || arg > 2)
+		return -EINVAL;
+
 	drive->addressing =  0;
 
 	if (HWIF(drive)->no_lba48)
@@ -870,18 +885,18 @@ static void idedisk_add_settings(ide_drive_t *drive)
 {
 	struct hd_driveid *id = drive->id;
 
-	ide_add_setting(drive,	"bios_cyl",		SETTING_RW,					-1,			-1,			TYPE_INT,	0,	65535,				1,	1,	&drive->bios_cyl,		NULL);
-	ide_add_setting(drive,	"bios_head",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	255,				1,	1,	&drive->bios_head,		NULL);
-	ide_add_setting(drive,	"bios_sect",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	63,				1,	1,	&drive->bios_sect,		NULL);
-	ide_add_setting(drive,	"address",		SETTING_RW,					HDIO_GET_ADDRESS,	HDIO_SET_ADDRESS,	TYPE_INTA,	0,	2,				1,	1,	&drive->addressing,	set_lba_addressing);
-	ide_add_setting(drive,	"bswap",		SETTING_READ,					-1,			-1,			TYPE_BYTE,	0,	1,				1,	1,	&drive->bswap,			NULL);
-	ide_add_setting(drive,	"multcount",		id ? SETTING_RW : SETTING_READ,			HDIO_GET_MULTCOUNT,	HDIO_SET_MULTCOUNT,	TYPE_BYTE,	0,	id ? id->max_multsect : 0,	1,	1,	&drive->mult_count,		set_multcount);
-	ide_add_setting(drive,	"nowerr",		SETTING_RW,					HDIO_GET_NOWERR,	HDIO_SET_NOWERR,	TYPE_BYTE,	0,	1,				1,	1,	&drive->nowerr,			set_nowerr);
-	ide_add_setting(drive,	"lun",			SETTING_RW,					-1,			-1,			TYPE_INT,	0,	7,				1,	1,	&drive->lun,			NULL);
-	ide_add_setting(drive,	"wcache",		SETTING_RW,					HDIO_GET_WCACHE,	HDIO_SET_WCACHE,	TYPE_BYTE,	0,	1,				1,	1,	&drive->wcache,			write_cache);
-	ide_add_setting(drive,	"acoustic",		SETTING_RW,					HDIO_GET_ACOUSTIC,	HDIO_SET_ACOUSTIC,	TYPE_BYTE,	0,	254,				1,	1,	&drive->acoustic,		set_acoustic);
- 	ide_add_setting(drive,	"failures",		SETTING_RW,					-1,			-1,			TYPE_INT,	0,	65535,				1,	1,	&drive->failures,		NULL);
- 	ide_add_setting(drive,	"max_failures",		SETTING_RW,					-1,			-1,			TYPE_INT,	0,	65535,				1,	1,	&drive->max_failures,		NULL);
+	ide_add_setting(drive,	"bios_cyl",	SETTING_RW,	TYPE_INT,	0,	65535,			1,	1,	&drive->bios_cyl,	NULL);
+	ide_add_setting(drive,	"bios_head",	SETTING_RW,	TYPE_BYTE,	0,	255,			1,	1,	&drive->bios_head,	NULL);
+	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	TYPE_BYTE,	0,	63,			1,	1,	&drive->bios_sect,	NULL);
+	ide_add_setting(drive,	"address",	SETTING_RW,	TYPE_BYTE,	0,	2,			1,	1,	&drive->addressing,	set_lba_addressing);
+	ide_add_setting(drive,	"bswap",	SETTING_READ,	TYPE_BYTE,	0,	1,			1,	1,	&drive->bswap,		NULL);
+	ide_add_setting(drive,	"multcount",	SETTING_RW,	TYPE_BYTE,	0,	id->max_multsect,	1,	1,	&drive->mult_count,	set_multcount);
+	ide_add_setting(drive,	"nowerr",	SETTING_RW,	TYPE_BYTE,	0,	1,			1,	1,	&drive->nowerr,		set_nowerr);
+	ide_add_setting(drive,	"lun",		SETTING_RW,	TYPE_INT,	0,	7,			1,	1,	&drive->lun,		NULL);
+	ide_add_setting(drive,	"wcache",	SETTING_RW,	TYPE_BYTE,	0,	1,			1,	1,	&drive->wcache,		write_cache);
+	ide_add_setting(drive,	"acoustic",	SETTING_RW,	TYPE_BYTE,	0,	254,			1,	1,	&drive->acoustic,	set_acoustic);
+ 	ide_add_setting(drive,	"failures",	SETTING_RW,	TYPE_INT,	0,	65535,			1,	1,	&drive->failures,	NULL);
+ 	ide_add_setting(drive,	"max_failures",	SETTING_RW,	TYPE_INT,	0,	65535,			1,	1,	&drive->max_failures,	NULL);
 }
 
 static void idedisk_setup (ide_drive_t *drive)
@@ -1140,9 +1155,49 @@ static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 static int idedisk_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
+	unsigned long flags;
 	struct block_device *bdev = inode->i_bdev;
 	struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
-	return generic_ide_ioctl(idkp->drive, file, bdev, cmd, arg);
+	ide_drive_t *drive = idkp->drive;
+	int err, (*setfunc)(ide_drive_t *, int);
+	u8 *val;
+
+	switch (cmd) {
+	case HDIO_GET_ADDRESS:	 val = &drive->addressing;	goto read_val;
+	case HDIO_GET_MULTCOUNT: val = &drive->mult_count;	goto read_val;
+	case HDIO_GET_NOWERR:	 val = &drive->nowerr;		goto read_val;
+	case HDIO_GET_WCACHE:	 val = &drive->wcache;		goto read_val;
+	case HDIO_GET_ACOUSTIC:	 val = &drive->acoustic;	goto read_val;
+	case HDIO_SET_ADDRESS:	 setfunc = set_lba_addressing;	goto set_val;
+	case HDIO_SET_MULTCOUNT: setfunc = set_multcount;	goto set_val;
+	case HDIO_SET_NOWERR:	 setfunc = set_nowerr;		goto set_val;
+	case HDIO_SET_WCACHE:	 setfunc = write_cache;		goto set_val;
+	case HDIO_SET_ACOUSTIC:	 setfunc = set_acoustic;	goto set_val;
+	}
+
+	return generic_ide_ioctl(drive, file, bdev, cmd, arg);
+
+read_val:
+	down(&ide_setting_sem);
+	spin_lock_irqsave(&ide_lock, flags);
+	err = *val;
+	spin_unlock_irqrestore(&ide_lock, flags);
+	up(&ide_setting_sem);
+	return err >= 0 ? put_user(err, (long __user *)arg) : err;
+
+set_val:
+	if (bdev != bdev->bd_contains)
+		err = -EINVAL;
+	else {
+		if (!capable(CAP_SYS_ADMIN))
+			err = -EACCES;
+		else {
+			down(&ide_setting_sem);
+			err = setfunc(drive, arg);
+			up(&ide_setting_sem);
+		}
+	}
+	return err;
 }
 
 static int idedisk_media_changed(struct gendisk *disk)
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 84d398f71e0f..38fe45cf4019 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -1816,12 +1816,12 @@ static void idefloppy_add_settings(ide_drive_t *drive)
 	idefloppy_floppy_t *floppy = drive->driver_data;
 
 /*
- *			drive	setting name	read/write	ioctl	ioctl		data type	min	max	mul_factor	div_factor	data pointer		set function
+ *			drive	setting name	read/write	data type	min	max	mul_factor	div_factor	data pointer		set function
  */
-	ide_add_setting(drive,	"bios_cyl",		SETTING_RW,					-1,			-1,			TYPE_INT,	0,	1023,				1,	1,	&drive->bios_cyl,		NULL);
-	ide_add_setting(drive,	"bios_head",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	255,				1,	1,	&drive->bios_head,		NULL);
-	ide_add_setting(drive,	"bios_sect",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	63,				1,	1,	&drive->bios_sect,		NULL);
-	ide_add_setting(drive,	"ticks",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	255,				1,	1,	&floppy->ticks,		NULL);
+	ide_add_setting(drive,	"bios_cyl",	SETTING_RW,	TYPE_INT,	0,	1023,		1,		1,	&drive->bios_cyl,	NULL);
+	ide_add_setting(drive,	"bios_head",	SETTING_RW,	TYPE_BYTE,	0,	255,		1,		1,	&drive->bios_head,	NULL);
+	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	TYPE_BYTE,	0,	63,		1,		1,	&drive->bios_sect,	NULL);
+	ide_add_setting(drive,	"ticks",	SETTING_RW,	TYPE_BYTE,	0,	255,		1,		1,	&floppy->ticks,		NULL);
 }
 
 /*
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index ad306ae24e24..fa0d22de37a7 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -4566,22 +4566,22 @@ static void idetape_add_settings (ide_drive_t *drive)
 	idetape_tape_t *tape = drive->driver_data;
 
 /*
- *			drive	setting name	read/write	ioctl	ioctl		data type	min			max			mul_factor			div_factor			data pointer				set function
+ *			drive	setting name		read/write	data type	min			max			mul_factor			div_factor	data pointer				set function
  */
-	ide_add_setting(drive,	"buffer",	SETTING_READ,	-1,	-1,		TYPE_SHORT,	0,			0xffff,			1,				2,				&tape->capabilities.buffer_size,	NULL);
-	ide_add_setting(drive,	"pipeline_min",	SETTING_RW,	-1,	-1,		TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,				&tape->min_pipeline,			NULL);
-	ide_add_setting(drive,	"pipeline",	SETTING_RW,	-1,	-1,		TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,				&tape->max_stages,			NULL);
-	ide_add_setting(drive,	"pipeline_max",	SETTING_RW,	-1,	-1,		TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,				&tape->max_pipeline,			NULL);
-	ide_add_setting(drive,	"pipeline_used",SETTING_READ,	-1,	-1,		TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,				&tape->nr_stages,			NULL);
-	ide_add_setting(drive,	"pipeline_pending",SETTING_READ,-1,	-1,		TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,				&tape->nr_pending_stages,		NULL);
-	ide_add_setting(drive,	"speed",	SETTING_READ,	-1,	-1,		TYPE_SHORT,	0,			0xffff,			1,				1,				&tape->capabilities.speed,		NULL);
-	ide_add_setting(drive,	"stage",	SETTING_READ,	-1,	-1,		TYPE_INT,	0,			0xffff,			1,				1024,				&tape->stage_size,			NULL);
-	ide_add_setting(drive,	"tdsc",		SETTING_RW,	-1,	-1,		TYPE_INT,	IDETAPE_DSC_RW_MIN,	IDETAPE_DSC_RW_MAX,	1000,				HZ,				&tape->best_dsc_rw_frequency,		NULL);
-	ide_add_setting(drive,	"dsc_overlap",	SETTING_RW,	-1,	-1,		TYPE_BYTE,	0,			1,			1,				1,				&drive->dsc_overlap,			NULL);
-	ide_add_setting(drive,	"pipeline_head_speed_c",SETTING_READ,	-1,	-1,	TYPE_INT,	0,			0xffff,			1,				1,				&tape->controlled_pipeline_head_speed,	NULL);
-	ide_add_setting(drive,	"pipeline_head_speed_u",SETTING_READ,	-1,	-1,	TYPE_INT,	0,			0xffff,			1,				1,				&tape->uncontrolled_pipeline_head_speed,	NULL);
-	ide_add_setting(drive,	"avg_speed",	SETTING_READ,	-1,	-1,		TYPE_INT,	0,			0xffff,			1,				1,				&tape->avg_speed,		NULL);
-	ide_add_setting(drive,	"debug_level",SETTING_RW,	-1,	-1,		TYPE_INT,	0,			0xffff,			1,				1,				&tape->debug_level,		NULL);
+	ide_add_setting(drive,	"buffer",		SETTING_READ,	TYPE_SHORT,	0,			0xffff,			1,				2,		&tape->capabilities.buffer_size,	NULL);
+	ide_add_setting(drive,	"pipeline_min",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->min_pipeline,			NULL);
+	ide_add_setting(drive,	"pipeline",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->max_stages,			NULL);
+	ide_add_setting(drive,	"pipeline_max",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->max_pipeline,			NULL);
+	ide_add_setting(drive,	"pipeline_used",	SETTING_READ,	TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,		&tape->nr_stages,			NULL);
+	ide_add_setting(drive,	"pipeline_pending",	SETTING_READ,	TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,		&tape->nr_pending_stages,		NULL);
+	ide_add_setting(drive,	"speed",		SETTING_READ,	TYPE_SHORT,	0,			0xffff,			1,				1,		&tape->capabilities.speed,		NULL);
+	ide_add_setting(drive,	"stage",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1024,		&tape->stage_size,			NULL);
+	ide_add_setting(drive,	"tdsc",			SETTING_RW,	TYPE_INT,	IDETAPE_DSC_RW_MIN,	IDETAPE_DSC_RW_MAX,	1000,				HZ,		&tape->best_dsc_rw_frequency,		NULL);
+	ide_add_setting(drive,	"dsc_overlap",		SETTING_RW,	TYPE_BYTE,	0,			1,			1,				1,		&drive->dsc_overlap,			NULL);
+	ide_add_setting(drive,	"pipeline_head_speed_c",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->controlled_pipeline_head_speed,	NULL);
+	ide_add_setting(drive,	"pipeline_head_speed_u",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->uncontrolled_pipeline_head_speed,NULL);
+	ide_add_setting(drive,	"avg_speed",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->avg_speed,			NULL);
+	ide_add_setting(drive,	"debug_level",		SETTING_RW,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->debug_level,			NULL);
 }
 
 /*
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 7a96f6f07494..8793a960210c 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -823,13 +823,13 @@ EXPORT_SYMBOL(ide_register_hw);
 
 DECLARE_MUTEX(ide_setting_sem);
 
+EXPORT_SYMBOL_GPL(ide_setting_sem);
+
 /**
  *	__ide_add_setting	-	add an ide setting option
  *	@drive: drive to use
  *	@name: setting name
  *	@rw: true if the function is read write
- *	@read_ioctl: function to call on read
- *	@write_ioctl: function to call on write
  *	@data_type: type of data
  *	@min: range minimum
  *	@max: range maximum
@@ -850,7 +850,7 @@ DECLARE_MUTEX(ide_setting_sem);
  *	remove.
  */
 
-static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int read_ioctl, int write_ioctl, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set, int auto_remove)
+static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set, int auto_remove)
 {
 	ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
 
@@ -863,8 +863,6 @@ static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int r
 		goto abort;
 	strcpy(setting->name, name);
 	setting->rw = rw;
-	setting->read_ioctl = read_ioctl;
-	setting->write_ioctl = write_ioctl;
 	setting->data_type = data_type;
 	setting->min = min;
 	setting->max = max;
@@ -885,9 +883,9 @@ abort:
 	return -1;
 }
 
-int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int read_ioctl, int write_ioctl, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set)
+int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set)
 {
-	return __ide_add_setting(drive, name, rw, read_ioctl, write_ioctl, data_type, min, max, mul_factor, div_factor, data, set, 1);
+	return __ide_add_setting(drive, name, rw, data_type, min, max, mul_factor, div_factor, data, set, 1);
 }
 
 EXPORT_SYMBOL(ide_add_setting);
@@ -918,29 +916,6 @@ static void __ide_remove_setting (ide_drive_t *drive, char *name)
 	kfree(setting);
 }
 
-/**
- *	ide_find_setting_by_ioctl	-	find a drive specific ioctl
- *	@drive: drive to scan
- *	@cmd: ioctl command to handle
- *
- *	Scan's the device setting table for a matching entry and returns
- *	this or NULL if no entry is found. The caller must hold the
- *	setting semaphore
- */
- 
-static ide_settings_t *ide_find_setting_by_ioctl (ide_drive_t *drive, int cmd)
-{
-	ide_settings_t *setting = drive->settings;
-
-	while (setting) {
-		if (setting->read_ioctl == cmd || setting->write_ioctl == cmd)
-			break;
-		setting = setting->next;
-	}
-	
-	return setting;
-}
-
 /**
  *	ide_find_setting_by_name	-	find a drive specific setting
  *	@drive: drive to scan
@@ -1014,7 +989,6 @@ int ide_read_setting (ide_drive_t *drive, ide_settings_t *setting)
 				val = *((u16 *) setting->data);
 				break;
 			case TYPE_INT:
-			case TYPE_INTA:
 				val = *((u32 *) setting->data);
 				break;
 		}
@@ -1076,17 +1050,14 @@ EXPORT_SYMBOL(ide_spin_wait_hwgroup);
 
 int ide_write_setting (ide_drive_t *drive, ide_settings_t *setting, int val)
 {
-	int i;
-	u32 *p;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
+	if (setting->set)
+		return setting->set(drive, val);
 	if (!(setting->rw & SETTING_WRITE))
 		return -EPERM;
 	if (val < setting->min || val > setting->max)
 		return -EINVAL;
-	if (setting->set)
-		return setting->set(drive, val);
 	if (ide_spin_wait_hwgroup(drive))
 		return -EBUSY;
 	switch (setting->data_type) {
@@ -1099,11 +1070,6 @@ int ide_write_setting (ide_drive_t *drive, ide_settings_t *setting, int val)
 		case TYPE_INT:
 			*((u32 *) setting->data) = val;
 			break;
-		case TYPE_INTA:
-			p = (u32 *) setting->data;
-			for (i = 0; i < 1 << PARTN_BITS; i++, p++)
-				*p = val;
-			break;
 	}
 	spin_unlock_irq(&ide_lock);
 	return 0;
@@ -1111,6 +1077,12 @@ int ide_write_setting (ide_drive_t *drive, ide_settings_t *setting, int val)
 
 static int set_io_32bit(ide_drive_t *drive, int arg)
 {
+	if (drive->no_io_32bit)
+		return -EPERM;
+
+	if (arg < 0 || arg > 1 + (SUPPORT_VLB_SYNC << 1))
+		return -EINVAL;
+
 	drive->io_32bit = arg;
 #ifdef CONFIG_BLK_DEV_DTC2278
 	if (HWIF(drive)->chipset == ide_dtc2278)
@@ -1119,12 +1091,28 @@ static int set_io_32bit(ide_drive_t *drive, int arg)
 	return 0;
 }
 
+static int set_ksettings(ide_drive_t *drive, int arg)
+{
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
+	if (ide_spin_wait_hwgroup(drive))
+		return -EBUSY;
+	drive->keep_settings = arg;
+	spin_unlock_irq(&ide_lock);
+
+	return 0;
+}
+
 static int set_using_dma (ide_drive_t *drive, int arg)
 {
 #ifdef CONFIG_BLK_DEV_IDEDMA
 	ide_hwif_t *hwif = drive->hwif;
 	int err = -EPERM;
 
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
 	if (!drive->id || !(drive->id->capability & 1))
 		goto out;
 
@@ -1157,6 +1145,9 @@ static int set_using_dma (ide_drive_t *drive, int arg)
 out:
 	return err;
 #else
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
 	return -EPERM;
 #endif
 }
@@ -1165,6 +1156,9 @@ static int set_pio_mode (ide_drive_t *drive, int arg)
 {
 	struct request rq;
 
+	if (arg < 0 || arg > 255)
+		return -EINVAL;
+
 	if (!HWIF(drive)->tuneproc)
 		return -ENOSYS;
 	if (drive->special.b.set_tune)
@@ -1176,9 +1170,30 @@ static int set_pio_mode (ide_drive_t *drive, int arg)
 	return 0;
 }
 
+static int set_unmaskirq(ide_drive_t *drive, int arg)
+{
+	if (drive->no_unmask)
+		return -EPERM;
+
+	if (arg < 0 || arg > 1)
+		return -EINVAL;
+
+	if (ide_spin_wait_hwgroup(drive))
+		return -EBUSY;
+	drive->unmask = arg;
+	spin_unlock_irq(&ide_lock);
+
+	return 0;
+}
+
 static int set_xfer_rate (ide_drive_t *drive, int arg)
 {
-	int err = ide_wait_cmd(drive,
+	int err;
+
+	if (arg < 0 || arg > 70)
+		return -EINVAL;
+
+	err = ide_wait_cmd(drive,
 			WIN_SETFEATURES, (u8) arg,
 			SETFEATURES_XFER, 0, NULL);
 
@@ -1193,25 +1208,24 @@ static int set_xfer_rate (ide_drive_t *drive, int arg)
  *	ide_add_generic_settings	-	generic ide settings
  *	@drive: drive being configured
  *
- *	Add the generic parts of the system settings to the /proc files and
- *	ioctls for this IDE device. The caller must not be holding the
- *	ide_setting_sem.
+ *	Add the generic parts of the system settings to the /proc files.
+ *	The caller must not be holding the ide_setting_sem.
  */
 
 void ide_add_generic_settings (ide_drive_t *drive)
 {
 /*
- *			  drive		setting name		read/write access				read ioctl		write ioctl		data type	min	max				mul_factor	div_factor	data pointer			set function
+ *			  drive		setting name		read/write access				data type	min	max				mul_factor	div_factor	data pointer			set function
  */
-	__ide_add_setting(drive,	"io_32bit",		drive->no_io_32bit ? SETTING_READ : SETTING_RW,	HDIO_GET_32BIT,		HDIO_SET_32BIT,		TYPE_BYTE,	0,	1 + (SUPPORT_VLB_SYNC << 1),	1,		1,		&drive->io_32bit,		set_io_32bit,	0);
-	__ide_add_setting(drive,	"keepsettings",		SETTING_RW,					HDIO_GET_KEEPSETTINGS,	HDIO_SET_KEEPSETTINGS,	TYPE_BYTE,	0,	1,				1,		1,		&drive->keep_settings,		NULL,		0);
-	__ide_add_setting(drive,	"nice1",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	1,				1,		1,		&drive->nice1,			NULL,		0);
-	__ide_add_setting(drive,	"pio_mode",		SETTING_WRITE,					-1,			HDIO_SET_PIO_MODE,	TYPE_BYTE,	0,	255,				1,		1,		NULL,				set_pio_mode,	0);
-	__ide_add_setting(drive,	"unmaskirq",		drive->no_unmask ? SETTING_READ : SETTING_RW,	HDIO_GET_UNMASKINTR,	HDIO_SET_UNMASKINTR,	TYPE_BYTE,	0,	1,				1,		1,		&drive->unmask,			NULL,		0);
-	__ide_add_setting(drive,	"using_dma",		SETTING_RW,					HDIO_GET_DMA,		HDIO_SET_DMA,		TYPE_BYTE,	0,	1,				1,		1,		&drive->using_dma,		set_using_dma,	0);
-	__ide_add_setting(drive,	"init_speed",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	70,				1,		1,		&drive->init_speed,		NULL,		0);
-	__ide_add_setting(drive,	"current_speed",	SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	70,				1,		1,		&drive->current_speed,		set_xfer_rate,	0);
-	__ide_add_setting(drive,	"number",		SETTING_RW,					-1,			-1,			TYPE_BYTE,	0,	3,				1,		1,		&drive->dn,			NULL,		0);
+	__ide_add_setting(drive,	"io_32bit",		drive->no_io_32bit ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1 + (SUPPORT_VLB_SYNC << 1),	1,		1,		&drive->io_32bit,		set_io_32bit,	0);
+	__ide_add_setting(drive,	"keepsettings",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->keep_settings,		NULL,		0);
+	__ide_add_setting(drive,	"nice1",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->nice1,			NULL,		0);
+	__ide_add_setting(drive,	"pio_mode",		SETTING_WRITE,					TYPE_BYTE,	0,	255,				1,		1,		NULL,				set_pio_mode,	0);
+	__ide_add_setting(drive,	"unmaskirq",		drive->no_unmask ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1,				1,		1,		&drive->unmask,			NULL,		0);
+	__ide_add_setting(drive,	"using_dma",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->using_dma,		set_using_dma,	0);
+	__ide_add_setting(drive,	"init_speed",		SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->init_speed,		NULL,		0);
+	__ide_add_setting(drive,	"current_speed",	SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->current_speed,		set_xfer_rate,	0);
+	__ide_add_setting(drive,	"number",		SETTING_RW,					TYPE_BYTE,	0,	3,				1,		1,		&drive->dn,			NULL,		0);
 }
 
 /**
@@ -1283,27 +1297,23 @@ static int generic_ide_resume(struct device *dev)
 int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device *bdev,
 			unsigned int cmd, unsigned long arg)
 {
-	ide_settings_t *setting;
+	unsigned long flags;
 	ide_driver_t *drv;
-	int err = 0;
 	void __user *p = (void __user *)arg;
+	int err = 0, (*setfunc)(ide_drive_t *, int);
+	u8 *val;
 
-	down(&ide_setting_sem);
-	if ((setting = ide_find_setting_by_ioctl(drive, cmd)) != NULL) {
-		if (cmd == setting->read_ioctl) {
-			err = ide_read_setting(drive, setting);
-			up(&ide_setting_sem);
-			return err >= 0 ? put_user(err, (long __user *)arg) : err;
-		} else {
-			if (bdev != bdev->bd_contains)
-				err = -EINVAL;
-			else
-				err = ide_write_setting(drive, setting, arg);
-			up(&ide_setting_sem);
-			return err;
-		}
+	switch (cmd) {
+	case HDIO_GET_32BIT:	    val = &drive->io_32bit;	 goto read_val;
+	case HDIO_GET_KEEPSETTINGS: val = &drive->keep_settings; goto read_val;
+	case HDIO_GET_UNMASKINTR:   val = &drive->unmask;	 goto read_val;
+	case HDIO_GET_DMA:	    val = &drive->using_dma;	 goto read_val;
+	case HDIO_SET_32BIT:	    setfunc = set_io_32bit;	 goto set_val;
+	case HDIO_SET_KEEPSETTINGS: setfunc = set_ksettings;	 goto set_val;
+	case HDIO_SET_PIO_MODE:	    setfunc = set_pio_mode;	 goto set_val;
+	case HDIO_SET_UNMASKINTR:   setfunc = set_unmaskirq;	 goto set_val;
+	case HDIO_SET_DMA:	    setfunc = set_using_dma;	 goto set_val;
 	}
-	up(&ide_setting_sem);
 
 	switch (cmd) {
 		case HDIO_OBSOLETE_IDENTITY:
@@ -1432,6 +1442,28 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 		default:
 			return -EINVAL;
 	}
+
+read_val:
+	down(&ide_setting_sem);
+	spin_lock_irqsave(&ide_lock, flags);
+	err = *val;
+	spin_unlock_irqrestore(&ide_lock, flags);
+	up(&ide_setting_sem);
+	return err >= 0 ? put_user(err, (long __user *)arg) : err;
+
+set_val:
+	if (bdev != bdev->bd_contains)
+		err = -EINVAL;
+	else {
+		if (!capable(CAP_SYS_ADMIN))
+			err = -EACCES;
+		else {
+			down(&ide_setting_sem);
+			err = setfunc(drive, arg);
+			up(&ide_setting_sem);
+		}
+	}
+	return err;
 }
 
 EXPORT_SYMBOL(generic_ide_ioctl);
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 531df3a29c47..4388b8ab69a1 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -726,13 +726,13 @@ static void idescsi_add_settings(ide_drive_t *drive)
 	idescsi_scsi_t *scsi = drive_to_idescsi(drive);
 
 /*
- *			drive	setting name	read/write	ioctl	ioctl		data type	min	max	mul_factor	div_factor	data pointer		set function
+ *			drive	setting name	read/write	data type	min	max	mul_factor	div_factor	data pointer		set function
  */
-	ide_add_setting(drive,	"bios_cyl",	SETTING_RW,	-1,	-1,		TYPE_INT,	0,	1023,	1,		1,		&drive->bios_cyl,	NULL);
-	ide_add_setting(drive,	"bios_head",	SETTING_RW,	-1,	-1,		TYPE_BYTE,	0,	255,	1,		1,		&drive->bios_head,	NULL);
-	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	-1,	-1,		TYPE_BYTE,	0,	63,	1,		1,		&drive->bios_sect,	NULL);
-	ide_add_setting(drive,	"transform",	SETTING_RW,	-1,	-1,		TYPE_INT,	0,	3,	1,		1,		&scsi->transform,	NULL);
-	ide_add_setting(drive,	"log",		SETTING_RW,	-1,	-1,		TYPE_INT,	0,	1,	1,		1,		&scsi->log,		NULL);
+	ide_add_setting(drive,	"bios_cyl",	SETTING_RW,	TYPE_INT,	0,	1023,	1,		1,		&drive->bios_cyl,	NULL);
+	ide_add_setting(drive,	"bios_head",	SETTING_RW,	TYPE_BYTE,	0,	255,	1,		1,		&drive->bios_head,	NULL);
+	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	TYPE_BYTE,	0,	63,	1,		1,		&drive->bios_sect,	NULL);
+	ide_add_setting(drive,	"transform",	SETTING_RW,	TYPE_INT,	0,	3,	1,		1,		&scsi->transform,	NULL);
+	ide_add_setting(drive,	"log",		SETTING_RW,	TYPE_INT,	0,	1,	1,		1,		&scsi->log,		NULL);
 }
 
 /*
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 697c39dd66a1..591a0b55e31c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -601,16 +601,11 @@ typedef struct ide_drive_s {
 	unsigned remap_0_to_1	: 1;	/* 0=noremap, 1=remap 0->1 (for EZDrive) */
 	unsigned blocked        : 1;	/* 1=powermanagment told us not to do anything, so sleep nicely */
 	unsigned vdma		: 1;	/* 1=doing PIO over DMA 0=doing normal DMA */
-	unsigned addressing;		/*      : 3;
-					 *  0=28-bit
-					 *  1=48-bit
-					 *  2=48-bit doing 28-bit
-					 *  3=64-bit
-					 */
 	unsigned scsi		: 1;	/* 0=default, 1=ide-scsi emulation */
 	unsigned sleeping	: 1;	/* 1=sleeping & sleep field valid */
 	unsigned post_reset	: 1;
 
+	u8	addressing;	/* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */
         u8	quirk_list;	/* considered quirky, set for a specific host */
         u8	init_speed;	/* transfer rate set at boot */
         u8	current_speed;	/* current transfer rate set */
@@ -870,9 +865,8 @@ typedef struct hwgroup_s {
  */
 
 #define TYPE_INT	0
-#define TYPE_INTA	1
-#define TYPE_BYTE	2
-#define TYPE_SHORT	3
+#define TYPE_BYTE	1
+#define TYPE_SHORT	2
 
 #define SETTING_READ	(1 << 0)
 #define SETTING_WRITE	(1 << 1)
@@ -882,8 +876,6 @@ typedef int (ide_procset_t)(ide_drive_t *, int);
 typedef struct ide_settings_s {
 	char			*name;
 	int			rw;
-	int			read_ioctl;
-	int			write_ioctl;
 	int			data_type;
 	int			min;
 	int			max;
@@ -896,7 +888,7 @@ typedef struct ide_settings_s {
 } ide_settings_t;
 
 extern struct semaphore ide_setting_sem;
-extern int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int read_ioctl, int write_ioctl, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set);
+int ide_add_setting(ide_drive_t *, const char *, int, int, int, int, int, int, void *, ide_procset_t *set);
 extern ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name);
 extern int ide_read_setting(ide_drive_t *t, ide_settings_t *setting);
 extern int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val);
-- 
cgit v1.2.3


From 7662d046df09e80680b77b68de896beab45e675e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:10 +0200
Subject: ide: move IDE settings handling to ide-proc.c

* move
	__ide_add_setting()
	ide_add_setting()
	__ide_remove_setting()
	auto_remove_settings()
	ide_find_setting_by_name()
	ide_read_setting()
	ide_write_setting()
	set_xfer_rate()
	ide_add_generic_settings()
	ide_register_subdriver()
	ide_unregister_subdriver()

  from ide.c to ide-proc.c

* set_{io_32bit,pio_mode,using_dma}() cannot be marked static now, fix it

* rename ide_[un]register_subdriver() to ide_proc_[un]register_driver(),
  update device drivers to use new names

* add CONFIG_IDE_PROC_FS=n versions of ide_proc_[un]register_driver()
  and ide_add_generic_settings()

* make ide_find_setting_by_name(), ide_{read,write}_setting()
  and ide_{add,remove}_proc_entries() static

* cover IDE settings code in device drivers with CONFIG_IDE_PROC_FS #ifdef,
  also while at it cover with CONFIG_IDE_PROC_FS #ifdef ide_driver_t.proc

* remove bogus comment from ide.h

* cover with CONFIG_IDE_PROC_FS #ifdef .proc and .settings in ide_drive_t

Besides saner code this patch results in the IDE core smaller by ~2 kB
(on x86-32) and IDE disk driver by ~1 kB (ditto) when CONFIG_IDE_PROC_FS=n.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c     |  14 ++-
 drivers/ide/ide-disk.c   |  16 +--
 drivers/ide/ide-floppy.c |  16 +--
 drivers/ide/ide-proc.c   | 310 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/ide/ide-tape.c   |  17 +--
 drivers/ide/ide.c        | 313 +----------------------------------------------
 drivers/scsi/ide-scsi.c  |  16 ++-
 include/linux/ide.h      |  39 +++---
 8 files changed, 378 insertions(+), 363 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 29408cfd9869..252ab8295edf 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -3059,10 +3059,14 @@ int ide_cdrom_probe_capabilities (ide_drive_t *drive)
 	return nslots;
 }
 
+#ifdef CONFIG_IDE_PROC_FS
 static void ide_cdrom_add_settings(ide_drive_t *drive)
 {
 	ide_add_setting(drive, "dsc_overlap", SETTING_RW, TYPE_BYTE, 0, 1, 1, 1, &drive->dsc_overlap, NULL);
 }
+#else
+static inline void ide_cdrom_add_settings(ide_drive_t *drive) { ; }
+#endif
 
 /*
  * standard prep_rq_fn that builds 10 byte cmds
@@ -3291,7 +3295,7 @@ static void ide_cd_remove(ide_drive_t *drive)
 {
 	struct cdrom_info *info = drive->driver_data;
 
-	ide_unregister_subdriver(drive, info->driver);
+	ide_proc_unregister_driver(drive, info->driver);
 
 	del_gendisk(info->disk);
 
@@ -3336,8 +3340,6 @@ static ide_proc_entry_t idecd_proc[] = {
 	{ "capacity", S_IFREG|S_IRUGO, proc_idecd_read_capacity, NULL },
 	{ NULL, 0, NULL, NULL }
 };
-#else
-# define idecd_proc	NULL
 #endif
 
 static ide_driver_t ide_cdrom_driver = {
@@ -3355,7 +3357,9 @@ static ide_driver_t ide_cdrom_driver = {
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 	.abort			= __ide_abort,
+#ifdef CONFIG_IDE_PROC_FS
 	.proc			= idecd_proc,
+#endif
 };
 
 static int idecd_open(struct inode * inode, struct file * file)
@@ -3517,7 +3521,7 @@ static int ide_cd_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	ide_register_subdriver(drive, &ide_cdrom_driver);
+	ide_proc_register_driver(drive, &ide_cdrom_driver);
 
 	kref_init(&info->kref);
 
@@ -3534,7 +3538,7 @@ static int ide_cd_probe(ide_drive_t *drive)
 	g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
 	if (ide_cdrom_setup(drive)) {
 		struct cdrom_device_info *devinfo = &info->devinfo;
-		ide_unregister_subdriver(drive, &ide_cdrom_driver);
+		ide_proc_unregister_driver(drive, &ide_cdrom_driver);
 		kfree(info->buffer);
 		kfree(info->toc);
 		kfree(info->changer_info);
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index fb162cb3ebf5..7fff773f2df7 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -560,7 +560,6 @@ static sector_t idedisk_capacity (ide_drive_t *drive)
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-
 static int smart_enable(ide_drive_t *drive)
 {
 	ide_task_t args;
@@ -678,11 +677,6 @@ static ide_proc_entry_t idedisk_proc[] = {
 	{ "smart_thresholds",	S_IFREG|S_IRUSR,	proc_idedisk_read_smart_thresholds,	NULL },
 	{ NULL, 0, NULL, NULL }
 };
-
-#else
-
-#define	idedisk_proc	NULL
-
 #endif	/* CONFIG_IDE_PROC_FS */
 
 static void idedisk_prepare_flush(request_queue_t *q, struct request *rq)
@@ -881,6 +875,7 @@ static int set_lba_addressing(ide_drive_t *drive, int arg)
 	return 0;
 }
 
+#ifdef CONFIG_IDE_PROC_FS
 static void idedisk_add_settings(ide_drive_t *drive)
 {
 	struct hd_driveid *id = drive->id;
@@ -898,6 +893,9 @@ static void idedisk_add_settings(ide_drive_t *drive)
  	ide_add_setting(drive,	"failures",	SETTING_RW,	TYPE_INT,	0,	65535,			1,	1,	&drive->failures,	NULL);
  	ide_add_setting(drive,	"max_failures",	SETTING_RW,	TYPE_INT,	0,	65535,			1,	1,	&drive->max_failures,	NULL);
 }
+#else
+static inline void idedisk_add_settings(ide_drive_t *drive) { ; }
+#endif
 
 static void idedisk_setup (ide_drive_t *drive)
 {
@@ -1016,7 +1014,7 @@ static void ide_disk_remove(ide_drive_t *drive)
 	struct ide_disk_obj *idkp = drive->driver_data;
 	struct gendisk *g = idkp->disk;
 
-	ide_unregister_subdriver(drive, idkp->driver);
+	ide_proc_unregister_driver(drive, idkp->driver);
 
 	del_gendisk(g);
 
@@ -1081,7 +1079,9 @@ static ide_driver_t idedisk_driver = {
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 	.abort			= __ide_abort,
+#ifdef CONFIG_IDE_PROC_FS
 	.proc			= idedisk_proc,
+#endif
 };
 
 static int idedisk_open(struct inode *inode, struct file *filp)
@@ -1257,7 +1257,7 @@ static int ide_disk_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	ide_register_subdriver(drive, &idedisk_driver);
+	ide_proc_register_driver(drive, &idedisk_driver);
 
 	kref_init(&idkp->kref);
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 38fe45cf4019..f429be88c4f9 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -1811,6 +1811,7 @@ static int idefloppy_identify_device (ide_drive_t *drive,struct hd_driveid *id)
 	return 0;
 }
 
+#ifdef CONFIG_IDE_PROC_FS
 static void idefloppy_add_settings(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
@@ -1823,6 +1824,9 @@ static void idefloppy_add_settings(ide_drive_t *drive)
 	ide_add_setting(drive,	"bios_sect",	SETTING_RW,	TYPE_BYTE,	0,	63,		1,		1,	&drive->bios_sect,	NULL);
 	ide_add_setting(drive,	"ticks",	SETTING_RW,	TYPE_BYTE,	0,	255,		1,		1,	&floppy->ticks,		NULL);
 }
+#else
+static inline void idefloppy_add_settings(ide_drive_t *drive) { ; }
+#endif
 
 /*
  *	Driver initialization.
@@ -1873,7 +1877,7 @@ static void ide_floppy_remove(ide_drive_t *drive)
 	idefloppy_floppy_t *floppy = drive->driver_data;
 	struct gendisk *g = floppy->disk;
 
-	ide_unregister_subdriver(drive, floppy->driver);
+	ide_proc_unregister_driver(drive, floppy->driver);
 
 	del_gendisk(g);
 
@@ -1893,7 +1897,6 @@ static void ide_floppy_release(struct kref *kref)
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-
 static int proc_idefloppy_read_capacity
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
@@ -1909,11 +1912,6 @@ static ide_proc_entry_t idefloppy_proc[] = {
 	{ "geometry",	S_IFREG|S_IRUGO,	proc_ide_read_geometry,	NULL },
 	{ NULL, 0, NULL, NULL }
 };
-
-#else
-
-#define	idefloppy_proc	NULL
-
 #endif	/* CONFIG_IDE_PROC_FS */
 
 static int ide_floppy_probe(ide_drive_t *);
@@ -1933,7 +1931,9 @@ static ide_driver_t idefloppy_driver = {
 	.end_request		= idefloppy_do_end_request,
 	.error			= __ide_error,
 	.abort			= __ide_abort,
+#ifdef CONFIG_IDE_PROC_FS
 	.proc			= idefloppy_proc,
+#endif
 };
 
 static int idefloppy_open(struct inode *inode, struct file *filp)
@@ -2159,7 +2159,7 @@ static int ide_floppy_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	ide_register_subdriver(drive, &idefloppy_driver);
+	ide_proc_register_driver(drive, &idefloppy_driver);
 
 	kref_init(&floppy->kref);
 
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index a9e0b30fb1f2..949a6f609d84 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -3,6 +3,8 @@
  *
  *  Copyright (C) 1997-1998	Mark Lord
  *  Copyright (C) 2003		Red Hat <alan@redhat.com>
+ *
+ *  Some code was moved here from ide.c, see it for original copyrights.
  */
 
 /*
@@ -121,6 +123,265 @@ static int proc_ide_read_identify
 	PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
 }
 
+/**
+ *	__ide_add_setting	-	add an ide setting option
+ *	@drive: drive to use
+ *	@name: setting name
+ *	@rw: true if the function is read write
+ *	@data_type: type of data
+ *	@min: range minimum
+ *	@max: range maximum
+ *	@mul_factor: multiplication scale
+ *	@div_factor: divison scale
+ *	@data: private data field
+ *	@set: setting
+ *	@auto_remove: setting auto removal flag
+ *
+ *	Removes the setting named from the device if it is present.
+ *	The function takes the settings_lock to protect against
+ *	parallel changes. This function must not be called from IRQ
+ *	context. Returns 0 on success or -1 on failure.
+ *
+ *	BUGS: This code is seriously over-engineered. There is also
+ *	magic about how the driver specific features are setup. If
+ *	a driver is attached we assume the driver settings are auto
+ *	remove.
+ */
+
+static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set, int auto_remove)
+{
+	ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
+
+	down(&ide_setting_sem);
+	while ((*p) && strcmp((*p)->name, name) < 0)
+		p = &((*p)->next);
+	if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL)
+		goto abort;
+	if ((setting->name = kmalloc(strlen(name) + 1, GFP_KERNEL)) == NULL)
+		goto abort;
+	strcpy(setting->name, name);
+	setting->rw = rw;
+	setting->data_type = data_type;
+	setting->min = min;
+	setting->max = max;
+	setting->mul_factor = mul_factor;
+	setting->div_factor = div_factor;
+	setting->data = data;
+	setting->set = set;
+
+	setting->next = *p;
+	if (auto_remove)
+		setting->auto_remove = 1;
+	*p = setting;
+	up(&ide_setting_sem);
+	return 0;
+abort:
+	up(&ide_setting_sem);
+	kfree(setting);
+	return -1;
+}
+
+int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set)
+{
+	return __ide_add_setting(drive, name, rw, data_type, min, max, mul_factor, div_factor, data, set, 1);
+}
+
+EXPORT_SYMBOL(ide_add_setting);
+
+/**
+ *	__ide_remove_setting	-	remove an ide setting option
+ *	@drive: drive to use
+ *	@name: setting name
+ *
+ *	Removes the setting named from the device if it is present.
+ *	The caller must hold the setting semaphore.
+ */
+
+static void __ide_remove_setting (ide_drive_t *drive, char *name)
+{
+	ide_settings_t **p, *setting;
+
+	p = (ide_settings_t **) &drive->settings;
+
+	while ((*p) && strcmp((*p)->name, name))
+		p = &((*p)->next);
+	if ((setting = (*p)) == NULL)
+		return;
+
+	(*p) = setting->next;
+
+	kfree(setting->name);
+	kfree(setting);
+}
+
+/**
+ *	auto_remove_settings	-	remove driver specific settings
+ *	@drive: drive
+ *
+ *	Automatically remove all the driver specific settings for this
+ *	drive. This function may not be called from IRQ context. The
+ *	caller must hold ide_setting_sem.
+ */
+
+static void auto_remove_settings (ide_drive_t *drive)
+{
+	ide_settings_t *setting;
+repeat:
+	setting = drive->settings;
+	while (setting) {
+		if (setting->auto_remove) {
+			__ide_remove_setting(drive, setting->name);
+			goto repeat;
+		}
+		setting = setting->next;
+	}
+}
+
+/**
+ *	ide_find_setting_by_name	-	find a drive specific setting
+ *	@drive: drive to scan
+ *	@name: setting name
+ *
+ *	Scan's the device setting table for a matching entry and returns
+ *	this or NULL if no entry is found. The caller must hold the
+ *	setting semaphore
+ */
+
+static ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name)
+{
+	ide_settings_t *setting = drive->settings;
+
+	while (setting) {
+		if (strcmp(setting->name, name) == 0)
+			break;
+		setting = setting->next;
+	}
+	return setting;
+}
+
+/**
+ *	ide_read_setting	-	read an IDE setting
+ *	@drive: drive to read from
+ *	@setting: drive setting
+ *
+ *	Read a drive setting and return the value. The caller
+ *	must hold the ide_setting_sem when making this call.
+ *
+ *	BUGS: the data return and error are the same return value
+ *	so an error -EINVAL and true return of the same value cannot
+ *	be told apart
+ */
+
+static int ide_read_setting(ide_drive_t *drive, ide_settings_t *setting)
+{
+	int		val = -EINVAL;
+	unsigned long	flags;
+
+	if ((setting->rw & SETTING_READ)) {
+		spin_lock_irqsave(&ide_lock, flags);
+		switch(setting->data_type) {
+			case TYPE_BYTE:
+				val = *((u8 *) setting->data);
+				break;
+			case TYPE_SHORT:
+				val = *((u16 *) setting->data);
+				break;
+			case TYPE_INT:
+				val = *((u32 *) setting->data);
+				break;
+		}
+		spin_unlock_irqrestore(&ide_lock, flags);
+	}
+	return val;
+}
+
+/**
+ *	ide_write_setting	-	read an IDE setting
+ *	@drive: drive to read from
+ *	@setting: drive setting
+ *	@val: value
+ *
+ *	Write a drive setting if it is possible. The caller
+ *	must hold the ide_setting_sem when making this call.
+ *
+ *	BUGS: the data return and error are the same return value
+ *	so an error -EINVAL and true return of the same value cannot
+ *	be told apart
+ *
+ *	FIXME:  This should be changed to enqueue a special request
+ *	to the driver to change settings, and then wait on a sema for completion.
+ *	The current scheme of polling is kludgy, though safe enough.
+ */
+
+static int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (setting->set)
+		return setting->set(drive, val);
+	if (!(setting->rw & SETTING_WRITE))
+		return -EPERM;
+	if (val < setting->min || val > setting->max)
+		return -EINVAL;
+	if (ide_spin_wait_hwgroup(drive))
+		return -EBUSY;
+	switch (setting->data_type) {
+		case TYPE_BYTE:
+			*((u8 *) setting->data) = val;
+			break;
+		case TYPE_SHORT:
+			*((u16 *) setting->data) = val;
+			break;
+		case TYPE_INT:
+			*((u32 *) setting->data) = val;
+			break;
+	}
+	spin_unlock_irq(&ide_lock);
+	return 0;
+}
+
+static int set_xfer_rate (ide_drive_t *drive, int arg)
+{
+	int err;
+
+	if (arg < 0 || arg > 70)
+		return -EINVAL;
+
+	err = ide_wait_cmd(drive,
+			WIN_SETFEATURES, (u8) arg,
+			SETFEATURES_XFER, 0, NULL);
+
+	if (!err && arg) {
+		ide_set_xfer_rate(drive, (u8) arg);
+		ide_driveid_update(drive);
+	}
+	return err;
+}
+
+/**
+ *	ide_add_generic_settings	-	generic ide settings
+ *	@drive: drive being configured
+ *
+ *	Add the generic parts of the system settings to the /proc files.
+ *	The caller must not be holding the ide_setting_sem.
+ */
+
+void ide_add_generic_settings (ide_drive_t *drive)
+{
+/*
+ *			  drive		setting name		read/write access				data type	min	max				mul_factor	div_factor	data pointer			set function
+ */
+	__ide_add_setting(drive,	"io_32bit",		drive->no_io_32bit ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1 + (SUPPORT_VLB_SYNC << 1),	1,		1,		&drive->io_32bit,		set_io_32bit,	0);
+	__ide_add_setting(drive,	"keepsettings",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->keep_settings,		NULL,		0);
+	__ide_add_setting(drive,	"nice1",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->nice1,			NULL,		0);
+	__ide_add_setting(drive,	"pio_mode",		SETTING_WRITE,					TYPE_BYTE,	0,	255,				1,		1,		NULL,				set_pio_mode,	0);
+	__ide_add_setting(drive,	"unmaskirq",		drive->no_unmask ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1,				1,		1,		&drive->unmask,			NULL,		0);
+	__ide_add_setting(drive,	"using_dma",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->using_dma,		set_using_dma,	0);
+	__ide_add_setting(drive,	"init_speed",		SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->init_speed,		NULL,		0);
+	__ide_add_setting(drive,	"current_speed",	SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->current_speed,		set_xfer_rate,	0);
+	__ide_add_setting(drive,	"number",		SETTING_RW,					TYPE_BYTE,	0,	3,				1,		1,		&drive->dn,			NULL,		0);
+}
+
 static void proc_ide_settings_warn(void)
 {
 	static int warned = 0;
@@ -399,7 +660,7 @@ static ide_proc_entry_t generic_drive_entries[] = {
 	{ NULL,	0, NULL, NULL }
 };
 
-void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data)
+static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data)
 {
 	struct proc_dir_entry *ent;
 
@@ -415,7 +676,7 @@ void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void
 	}
 }
 
-void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p)
+static void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p)
 {
 	if (!dir || !p)
 		return;
@@ -425,6 +686,51 @@ void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p)
 	}
 }
 
+void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver)
+{
+	ide_add_proc_entries(drive->proc, driver->proc, drive);
+}
+
+EXPORT_SYMBOL(ide_proc_register_driver);
+
+/**
+ *	ide_proc_unregister_driver	-	remove driver specific data
+ *	@drive: drive
+ *	@driver: driver
+ *
+ *	Clean up the driver specific /proc files and IDE settings
+ *	for a given drive.
+ *
+ *	Takes ide_setting_sem and ide_lock.
+ *	Caller must hold none of the locks.
+ */
+
+void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
+{
+	unsigned long flags;
+
+	ide_remove_proc_entries(drive->proc, driver->proc);
+
+	down(&ide_setting_sem);
+	spin_lock_irqsave(&ide_lock, flags);
+	/*
+	 * ide_setting_sem protects the settings list
+	 * ide_lock protects the use of settings
+	 *
+	 * so we need to hold both, ide_settings_sem because we want to
+	 * modify the settings list, and ide_lock because we cannot take
+	 * a setting out that is being used.
+	 *
+	 * OTOH both ide_{read,write}_setting are only ever used under
+	 * ide_setting_sem.
+	 */
+	auto_remove_settings(drive);
+	spin_unlock_irqrestore(&ide_lock, flags);
+	up(&ide_setting_sem);
+}
+
+EXPORT_SYMBOL(ide_proc_unregister_driver);
+
 static void create_proc_ide_drives(ide_hwif_t *hwif)
 {
 	int	d;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fa0d22de37a7..e82bfa5e0ab8 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -4561,6 +4561,8 @@ static void idetape_get_blocksize_from_block_descriptor(ide_drive_t *drive)
 	printk(KERN_INFO "ide-tape: Adjusted block size - %d\n", tape->tape_block_size);
 #endif /* IDETAPE_DEBUG_INFO */
 }
+
+#ifdef CONFIG_IDE_PROC_FS
 static void idetape_add_settings (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -4583,6 +4585,9 @@ static void idetape_add_settings (ide_drive_t *drive)
 	ide_add_setting(drive,	"avg_speed",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->avg_speed,			NULL);
 	ide_add_setting(drive,	"debug_level",		SETTING_RW,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->debug_level,			NULL);
 }
+#else
+static inline void idetape_add_settings(ide_drive_t *drive) { ; }
+#endif
 
 /*
  *	ide_setup is called to:
@@ -4703,7 +4708,7 @@ static void ide_tape_remove(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	ide_unregister_subdriver(drive, tape->driver);
+	ide_proc_unregister_driver(drive, tape->driver);
 
 	ide_unregister_region(tape->disk);
 
@@ -4731,7 +4736,6 @@ static void ide_tape_release(struct kref *kref)
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-
 static int proc_idetape_read_name
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
@@ -4749,11 +4753,6 @@ static ide_proc_entry_t idetape_proc[] = {
 	{ "name",	S_IFREG|S_IRUGO,	proc_idetape_read_name,	NULL },
 	{ NULL, 0, NULL, NULL }
 };
-
-#else
-
-#define	idetape_proc	NULL
-
 #endif
 
 static int ide_tape_probe(ide_drive_t *);
@@ -4773,7 +4772,9 @@ static ide_driver_t idetape_driver = {
 	.end_request		= idetape_end_request,
 	.error			= __ide_error,
 	.abort			= __ide_abort,
+#ifdef CONFIG_IDE_PROC_FS
 	.proc			= idetape_proc,
+#endif
 };
 
 /*
@@ -4864,7 +4865,7 @@ static int ide_tape_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	ide_register_subdriver(drive, &idetape_driver);
+	ide_proc_register_driver(drive, &idetape_driver);
 
 	kref_init(&tape->kref);
 
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 8793a960210c..614c5fd43cd2 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -825,178 +825,6 @@ DECLARE_MUTEX(ide_setting_sem);
 
 EXPORT_SYMBOL_GPL(ide_setting_sem);
 
-/**
- *	__ide_add_setting	-	add an ide setting option
- *	@drive: drive to use
- *	@name: setting name
- *	@rw: true if the function is read write
- *	@data_type: type of data
- *	@min: range minimum
- *	@max: range maximum
- *	@mul_factor: multiplication scale
- *	@div_factor: divison scale
- *	@data: private data field
- *	@set: setting
- *	@auto_remove: setting auto removal flag
- *
- *	Removes the setting named from the device if it is present.
- *	The function takes the settings_lock to protect against 
- *	parallel changes. This function must not be called from IRQ
- *	context. Returns 0 on success or -1 on failure.
- *
- *	BUGS: This code is seriously over-engineered. There is also
- *	magic about how the driver specific features are setup. If
- *	a driver is attached we assume the driver settings are auto
- *	remove.
- */
-
-static int __ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set, int auto_remove)
-{
-	ide_settings_t **p = (ide_settings_t **) &drive->settings, *setting = NULL;
-
-	down(&ide_setting_sem);
-	while ((*p) && strcmp((*p)->name, name) < 0)
-		p = &((*p)->next);
-	if ((setting = kzalloc(sizeof(*setting), GFP_KERNEL)) == NULL)
-		goto abort;
-	if ((setting->name = kmalloc(strlen(name) + 1, GFP_KERNEL)) == NULL)
-		goto abort;
-	strcpy(setting->name, name);
-	setting->rw = rw;
-	setting->data_type = data_type;
-	setting->min = min;
-	setting->max = max;
-	setting->mul_factor = mul_factor;
-	setting->div_factor = div_factor;
-	setting->data = data;
-	setting->set = set;
-	
-	setting->next = *p;
-	if (auto_remove)
-		setting->auto_remove = 1;
-	*p = setting;
-	up(&ide_setting_sem);
-	return 0;
-abort:
-	up(&ide_setting_sem);
-	kfree(setting);
-	return -1;
-}
-
-int ide_add_setting(ide_drive_t *drive, const char *name, int rw, int data_type, int min, int max, int mul_factor, int div_factor, void *data, ide_procset_t *set)
-{
-	return __ide_add_setting(drive, name, rw, data_type, min, max, mul_factor, div_factor, data, set, 1);
-}
-
-EXPORT_SYMBOL(ide_add_setting);
-
-/**
- *	__ide_remove_setting	-	remove an ide setting option
- *	@drive: drive to use
- *	@name: setting name
- *
- *	Removes the setting named from the device if it is present.
- *	The caller must hold the setting semaphore.
- */
- 
-static void __ide_remove_setting (ide_drive_t *drive, char *name)
-{
-	ide_settings_t **p, *setting;
-
-	p = (ide_settings_t **) &drive->settings;
-
-	while ((*p) && strcmp((*p)->name, name))
-		p = &((*p)->next);
-	if ((setting = (*p)) == NULL)
-		return;
-
-	(*p) = setting->next;
-	
-	kfree(setting->name);
-	kfree(setting);
-}
-
-/**
- *	ide_find_setting_by_name	-	find a drive specific setting
- *	@drive: drive to scan
- *	@name: setting name
- *
- *	Scan's the device setting table for a matching entry and returns
- *	this or NULL if no entry is found. The caller must hold the
- *	setting semaphore
- */
- 
-ide_settings_t *ide_find_setting_by_name (ide_drive_t *drive, char *name)
-{
-	ide_settings_t *setting = drive->settings;
-
-	while (setting) {
-		if (strcmp(setting->name, name) == 0)
-			break;
-		setting = setting->next;
-	}
-	return setting;
-}
-
-/**
- *	auto_remove_settings	-	remove driver specific settings
- *	@drive: drive
- *
- *	Automatically remove all the driver specific settings for this
- *	drive. This function may not be called from IRQ context. The
- *	caller must hold ide_setting_sem.
- */
- 
-static void auto_remove_settings (ide_drive_t *drive)
-{
-	ide_settings_t *setting;
-repeat:
-	setting = drive->settings;
-	while (setting) {
-		if (setting->auto_remove) {
-			__ide_remove_setting(drive, setting->name);
-			goto repeat;
-		}
-		setting = setting->next;
-	}
-}
-
-/**
- *	ide_read_setting	-	read an IDE setting
- *	@drive: drive to read from
- *	@setting: drive setting
- *
- *	Read a drive setting and return the value. The caller
- *	must hold the ide_setting_sem when making this call.
- *
- *	BUGS: the data return and error are the same return value
- *	so an error -EINVAL and true return of the same value cannot
- *	be told apart
- */
- 
-int ide_read_setting (ide_drive_t *drive, ide_settings_t *setting)
-{
-	int		val = -EINVAL;
-	unsigned long	flags;
-
-	if ((setting->rw & SETTING_READ)) {
-		spin_lock_irqsave(&ide_lock, flags);
-		switch(setting->data_type) {
-			case TYPE_BYTE:
-				val = *((u8 *) setting->data);
-				break;
-			case TYPE_SHORT:
-				val = *((u16 *) setting->data);
-				break;
-			case TYPE_INT:
-				val = *((u32 *) setting->data);
-				break;
-		}
-		spin_unlock_irqrestore(&ide_lock, flags);
-	}
-	return val;
-}
-
 /**
  *	ide_spin_wait_hwgroup	-	wait for group
  *	@drive: drive in the group
@@ -1030,52 +858,7 @@ int ide_spin_wait_hwgroup (ide_drive_t *drive)
 
 EXPORT_SYMBOL(ide_spin_wait_hwgroup);
 
-/**
- *	ide_write_setting	-	read an IDE setting
- *	@drive: drive to read from
- *	@setting: drive setting
- *	@val: value
- *
- *	Write a drive setting if it is possible. The caller
- *	must hold the ide_setting_sem when making this call.
- *
- *	BUGS: the data return and error are the same return value
- *	so an error -EINVAL and true return of the same value cannot
- *	be told apart
- *
- *	FIXME:  This should be changed to enqueue a special request
- *	to the driver to change settings, and then wait on a sema for completion.
- *	The current scheme of polling is kludgy, though safe enough.
- */
-
-int ide_write_setting (ide_drive_t *drive, ide_settings_t *setting, int val)
-{
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (setting->set)
-		return setting->set(drive, val);
-	if (!(setting->rw & SETTING_WRITE))
-		return -EPERM;
-	if (val < setting->min || val > setting->max)
-		return -EINVAL;
-	if (ide_spin_wait_hwgroup(drive))
-		return -EBUSY;
-	switch (setting->data_type) {
-		case TYPE_BYTE:
-			*((u8 *) setting->data) = val;
-			break;
-		case TYPE_SHORT:
-			*((u16 *) setting->data) = val;
-			break;
-		case TYPE_INT:
-			*((u32 *) setting->data) = val;
-			break;
-	}
-	spin_unlock_irq(&ide_lock);
-	return 0;
-}
-
-static int set_io_32bit(ide_drive_t *drive, int arg)
+int set_io_32bit(ide_drive_t *drive, int arg)
 {
 	if (drive->no_io_32bit)
 		return -EPERM;
@@ -1104,7 +887,7 @@ static int set_ksettings(ide_drive_t *drive, int arg)
 	return 0;
 }
 
-static int set_using_dma (ide_drive_t *drive, int arg)
+int set_using_dma(ide_drive_t *drive, int arg)
 {
 #ifdef CONFIG_BLK_DEV_IDEDMA
 	ide_hwif_t *hwif = drive->hwif;
@@ -1152,7 +935,7 @@ out:
 #endif
 }
 
-static int set_pio_mode (ide_drive_t *drive, int arg)
+int set_pio_mode(ide_drive_t *drive, int arg)
 {
 	struct request rq;
 
@@ -1186,48 +969,6 @@ static int set_unmaskirq(ide_drive_t *drive, int arg)
 	return 0;
 }
 
-static int set_xfer_rate (ide_drive_t *drive, int arg)
-{
-	int err;
-
-	if (arg < 0 || arg > 70)
-		return -EINVAL;
-
-	err = ide_wait_cmd(drive,
-			WIN_SETFEATURES, (u8) arg,
-			SETFEATURES_XFER, 0, NULL);
-
-	if (!err && arg) {
-		ide_set_xfer_rate(drive, (u8) arg);
-		ide_driveid_update(drive);
-	}
-	return err;
-}
-
-/**
- *	ide_add_generic_settings	-	generic ide settings
- *	@drive: drive being configured
- *
- *	Add the generic parts of the system settings to the /proc files.
- *	The caller must not be holding the ide_setting_sem.
- */
-
-void ide_add_generic_settings (ide_drive_t *drive)
-{
-/*
- *			  drive		setting name		read/write access				data type	min	max				mul_factor	div_factor	data pointer			set function
- */
-	__ide_add_setting(drive,	"io_32bit",		drive->no_io_32bit ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1 + (SUPPORT_VLB_SYNC << 1),	1,		1,		&drive->io_32bit,		set_io_32bit,	0);
-	__ide_add_setting(drive,	"keepsettings",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->keep_settings,		NULL,		0);
-	__ide_add_setting(drive,	"nice1",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->nice1,			NULL,		0);
-	__ide_add_setting(drive,	"pio_mode",		SETTING_WRITE,					TYPE_BYTE,	0,	255,				1,		1,		NULL,				set_pio_mode,	0);
-	__ide_add_setting(drive,	"unmaskirq",		drive->no_unmask ? SETTING_READ : SETTING_RW,	TYPE_BYTE,	0,	1,				1,		1,		&drive->unmask,			NULL,		0);
-	__ide_add_setting(drive,	"using_dma",		SETTING_RW,					TYPE_BYTE,	0,	1,				1,		1,		&drive->using_dma,		set_using_dma,	0);
-	__ide_add_setting(drive,	"init_speed",		SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->init_speed,		NULL,		0);
-	__ide_add_setting(drive,	"current_speed",	SETTING_RW,					TYPE_BYTE,	0,	70,				1,		1,		&drive->current_speed,		set_xfer_rate,	0);
-	__ide_add_setting(drive,	"number",		SETTING_RW,					TYPE_BYTE,	0,	3,				1,		1,		&drive->dn,			NULL,		0);
-}
-
 /**
  *	system_bus_clock	-	clock guess
  *
@@ -1922,54 +1663,6 @@ static void __init probe_for_hwifs (void)
 #endif
 }
 
-void ide_register_subdriver(ide_drive_t *drive, ide_driver_t *driver)
-{
-#ifdef CONFIG_IDE_PROC_FS
-	ide_add_proc_entries(drive->proc, driver->proc, drive);
-#endif
-}
-
-EXPORT_SYMBOL(ide_register_subdriver);
-
-/**
- *	ide_unregister_subdriver	-	disconnect drive from driver
- *	@drive: drive to unplug
- *	@driver: driver
- *
- *	Disconnect a drive from the driver it was attached to and then
- *	clean up the various proc files and other objects attached to it.
- *
- *	Takes ide_setting_sem and ide_lock.
- *	Caller must hold none of the locks.
- */
-
-void ide_unregister_subdriver(ide_drive_t *drive, ide_driver_t *driver)
-{
-	unsigned long flags;
-
-#ifdef CONFIG_IDE_PROC_FS
-	ide_remove_proc_entries(drive->proc, driver->proc);
-#endif
-	down(&ide_setting_sem);
-	spin_lock_irqsave(&ide_lock, flags);
-	/*
-	 * ide_setting_sem protects the settings list
-	 * ide_lock protects the use of settings
-	 *
-	 * so we need to hold both, ide_settings_sem because we want to
-	 * modify the settings list, and ide_lock because we cannot take
-	 * a setting out that is being used.
-	 *
-	 * OTOH both ide_{read,write}_setting are only ever used under
-	 * ide_setting_sem.
-	 */
-	auto_remove_settings(drive);
-	spin_unlock_irqrestore(&ide_lock, flags);
-	up(&ide_setting_sem);
-}
-
-EXPORT_SYMBOL(ide_unregister_subdriver);
-
 /*
  * Probe module
  */
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 4388b8ab69a1..8263f752809d 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -721,6 +721,7 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *r
 	return ide_stopped;
 }
 
+#ifdef CONFIG_IDE_PROC_FS
 static void idescsi_add_settings(ide_drive_t *drive)
 {
 	idescsi_scsi_t *scsi = drive_to_idescsi(drive);
@@ -734,6 +735,9 @@ static void idescsi_add_settings(ide_drive_t *drive)
 	ide_add_setting(drive,	"transform",	SETTING_RW,	TYPE_INT,	0,	3,	1,		1,		&scsi->transform,	NULL);
 	ide_add_setting(drive,	"log",		SETTING_RW,	TYPE_INT,	0,	1,	1,		1,		&scsi->log,		NULL);
 }
+#else
+static inline void idescsi_add_settings(ide_drive_t *drive) { ; }
+#endif
 
 /*
  *	Driver initialization.
@@ -756,7 +760,7 @@ static void ide_scsi_remove(ide_drive_t *drive)
 	struct ide_scsi_obj *scsi = scsihost_to_idescsi(scsihost);
 	struct gendisk *g = scsi->disk;
 
-	ide_unregister_subdriver(drive, scsi->driver);
+	ide_proc_unregister_driver(drive, scsi->driver);
 
 	ide_unregister_region(g);
 
@@ -775,8 +779,6 @@ static ide_proc_entry_t idescsi_proc[] = {
 	{ "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
 	{ NULL, 0, NULL, NULL }
 };
-#else
-# define idescsi_proc	NULL
 #endif
 
 static ide_driver_t idescsi_driver = {
@@ -790,11 +792,13 @@ static ide_driver_t idescsi_driver = {
 	.version		= IDESCSI_VERSION,
 	.media			= ide_scsi,
 	.supports_dsc_overlap	= 0,
-	.proc			= idescsi_proc,
 	.do_request		= idescsi_do_request,
 	.end_request		= idescsi_end_request,
 	.error                  = idescsi_atapi_error,
 	.abort                  = idescsi_atapi_abort,
+#ifdef CONFIG_IDE_PROC_FS
+	.proc			= idescsi_proc,
+#endif
 };
 
 static int idescsi_ide_open(struct inode *inode, struct file *filp)
@@ -1153,7 +1157,7 @@ static int ide_scsi_probe(ide_drive_t *drive)
 	idescsi->host = host;
 	idescsi->disk = g;
 	g->private_data = &idescsi->driver;
-	ide_register_subdriver(drive, &idescsi_driver);
+	ide_proc_register_driver(drive, &idescsi_driver);
 	err = 0;
 	idescsi_setup(drive, idescsi);
 	g->fops = &idescsi_ops;
@@ -1165,7 +1169,7 @@ static int ide_scsi_probe(ide_drive_t *drive)
 	}
 	/* fall through on error */
 	ide_unregister_region(g);
-	ide_unregister_subdriver(drive, &idescsi_driver);
+	ide_proc_unregister_driver(drive, &idescsi_driver);
 
 	put_disk(g);
 out_host_put:
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 591a0b55e31c..477b8c6be727 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -559,9 +559,10 @@ typedef struct ide_drive_s {
 	struct ide_drive_s 	*next;	/* circular list of hwgroup drives */
 	void		*driver_data;	/* extra driver data */
 	struct hd_driveid	*id;	/* drive model identification info */
+#ifdef CONFIG_IDE_PROC_FS
 	struct proc_dir_entry *proc;	/* /proc/ide/ directory entry */
 	struct ide_settings_s *settings;/* /proc/ide/ drive settings */
-
+#endif
 	struct hwif_s		*hwif;	/* actually (ide_hwif_t *) */
 
 	unsigned long sleep;		/* sleep until this time */
@@ -858,8 +859,15 @@ typedef struct hwgroup_s {
 	unsigned char cmd_buf[4];
 } ide_hwgroup_t;
 
-/* structure attached to the request for IDE_TASK_CMDS */
+typedef struct ide_driver_s ide_driver_t;
+
+extern struct semaphore ide_setting_sem;
 
+int set_io_32bit(ide_drive_t *, int);
+int set_pio_mode(ide_drive_t *, int);
+int set_using_dma(ide_drive_t *, int);
+
+#ifdef CONFIG_IDE_PROC_FS
 /*
  * configurable drive settings
  */
@@ -887,12 +895,7 @@ typedef struct ide_settings_s {
 	struct ide_settings_s	*next;
 } ide_settings_t;
 
-extern struct semaphore ide_setting_sem;
 int ide_add_setting(ide_drive_t *, const char *, int, int, int, int, int, int, void *, ide_procset_t *set);
-extern ide_settings_t *ide_find_setting_by_name(ide_drive_t *drive, char *name);
-extern int ide_read_setting(ide_drive_t *t, ide_settings_t *setting);
-extern int ide_write_setting(ide_drive_t *drive, ide_settings_t *setting, int val);
-extern void ide_add_generic_settings(ide_drive_t *drive);
 
 /*
  * /proc/ide interface
@@ -904,15 +907,17 @@ typedef struct {
 	write_proc_t	*write_proc;
 } ide_proc_entry_t;
 
-#ifdef CONFIG_IDE_PROC_FS
 extern struct proc_dir_entry *proc_ide_root;
 
 void proc_ide_create(void);
 void proc_ide_destroy(void);
 void create_proc_ide_interfaces(void);
 void destroy_proc_ide_interface(ide_hwif_t *);
-void ide_add_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *, void *);
-void ide_remove_proc_entries(struct proc_dir_entry *, ide_proc_entry_t *);
+void ide_proc_register_driver(ide_drive_t *, ide_driver_t *);
+void ide_proc_unregister_driver(ide_drive_t *, ide_driver_t *);
+
+void ide_add_generic_settings(ide_drive_t *);
+
 read_proc_t proc_ide_read_capacity;
 read_proc_t proc_ide_read_geometry;
 
@@ -940,6 +945,9 @@ static inline void proc_ide_create(void) { ; }
 static inline void proc_ide_destroy(void) { ; }
 static inline void create_proc_ide_interfaces(void) { ; }
 static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; }
+static inline void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
+static inline void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
+static inline void ide_add_generic_settings(ide_drive_t *drive) { ; }
 #define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0;
 #endif
 
@@ -982,7 +990,7 @@ enum {
  * The gendriver.owner field should be set to the module owner of this driver.
  * The gendriver.name field should be set to the name of this driver
  */
-typedef struct ide_driver_s {
+struct ide_driver_s {
 	const char			*version;
 	u8				media;
 	unsigned supports_dsc_overlap	: 1;
@@ -990,12 +998,14 @@ typedef struct ide_driver_s {
 	int		(*end_request)(ide_drive_t *, int, int);
 	ide_startstop_t	(*error)(ide_drive_t *, struct request *rq, u8, u8);
 	ide_startstop_t	(*abort)(ide_drive_t *, struct request *rq);
-	ide_proc_entry_t	*proc;
 	struct device_driver	gen_driver;
 	int		(*probe)(ide_drive_t *);
 	void		(*remove)(ide_drive_t *);
 	void		(*shutdown)(ide_drive_t *);
-} ide_driver_t;
+#ifdef CONFIG_IDE_PROC_FS
+	ide_proc_entry_t	*proc;
+#endif
+};
 
 #define to_ide_driver(drv) container_of(drv, ide_driver_t, gen_driver)
 
@@ -1205,9 +1215,6 @@ extern void default_hwif_iops(ide_hwif_t *);
 extern void default_hwif_mmiops(ide_hwif_t *);
 extern void default_hwif_transport(ide_hwif_t *);
 
-void ide_register_subdriver(ide_drive_t *, ide_driver_t *);
-void ide_unregister_subdriver(ide_drive_t *, ide_driver_t *);
-
 #define ON_BOARD		1
 #define NEVER_BOARD		0
 
-- 
cgit v1.2.3


From 7f8f48af0861c38c28d4abd550102643e0ea9e6a Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:10 +0200
Subject: ide: cable detection fixes (take 2)

Tejun's recent eighty_ninty_three() fix has inspired me to do more thorough
review of the cable detection code...

* print user-friendly warning about limiting the maximum transfer speed
  to UDMA33 (and the reason behind it) when 80-wire cable is not detected,
  also while at it cleanup eighty_ninty_three() a bit

* use eighty_ninty_three() in ide_ata66_check(), this actually fixes 3 bugs:
  - bit 14 (word 93 validity check) == 1 && bit 13 (80-wire cable test) == 1
    were used as 80-wire cable present test for CONFIG_IDEDMA_IVB=n case
    (please see FIXME comment in eighty_ninty_three() for more details)
  - CONFIG_IDEDMA_IVB=y/n cases were interchanged
  - check for SATA devices was missing

* remove private cable warnings from pdc_202xx{old,new} drivers now that core
  code provides this functionality (plus, in pdc202xx_new case the test could
  give false warnings for ATAPI devices because pdc202xx_new driver doesn't
  even support ATAPI DMA)

Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-iops.c         | 53 +++++++++++++++++++++++-------------------
 drivers/ide/ide-lib.c          | 11 +++++++--
 drivers/ide/pci/pdc202xx_new.c | 16 -------------
 drivers/ide/pci/pdc202xx_old.c | 20 +---------------
 include/linux/ide.h            |  1 +
 5 files changed, 40 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index ed6128f6cd98..f0be5f665a0e 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -571,25 +571,40 @@ EXPORT_SYMBOL(ide_wait_stat);
  */
 u8 eighty_ninty_three (ide_drive_t *drive)
 {
-	if(HWIF(drive)->udma_four == 0)
-		return 0;
+	ide_hwif_t *hwif = drive->hwif;
+	struct hd_driveid *id = drive->id;
+
+	if (hwif->udma_four == 0)
+		goto no_80w;
 
 	/* Check for SATA but only if we are ATA5 or higher */
-	if (drive->id->hw_config == 0 && (drive->id->major_rev_num & 0x7FE0))
+	if (id->hw_config == 0 && (id->major_rev_num & 0x7FE0))
 		return 1;
-	if (!(drive->id->hw_config & 0x6000))
-		return 0;
-#ifndef CONFIG_IDEDMA_IVB
-	if(!(drive->id->hw_config & 0x4000))
-		return 0;
-#endif /* CONFIG_IDEDMA_IVB */
+
 	/*
 	 * FIXME:
 	 * - change master/slave IDENTIFY order
 	 * - force bit13 (80c cable present) check
 	 *   (unless the slave device is pre-ATA3)
 	 */
-	return 1;
+#ifndef CONFIG_IDEDMA_IVB
+	if (id->hw_config & 0x4000)
+#else
+	if (id->hw_config & 0x6000)
+#endif
+		return 1;
+
+no_80w:
+	if (drive->udma33_warned == 1)
+		return 0;
+
+	printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, "
+			    "limiting max speed to UDMA33\n",
+			    drive->name, hwif->udma_four ? "drive" : "host");
+
+	drive->udma33_warned = 1;
+
+	return 0;
 }
 
 int ide_ata66_check (ide_drive_t *drive, ide_task_t *args)
@@ -597,23 +612,13 @@ int ide_ata66_check (ide_drive_t *drive, ide_task_t *args)
 	if ((args->tfRegister[IDE_COMMAND_OFFSET] == WIN_SETFEATURES) &&
 	    (args->tfRegister[IDE_SECTOR_OFFSET] > XFER_UDMA_2) &&
 	    (args->tfRegister[IDE_FEATURE_OFFSET] == SETFEATURES_XFER)) {
-#ifndef CONFIG_IDEDMA_IVB
-		if ((drive->id->hw_config & 0x6000) == 0) {
-#else /* !CONFIG_IDEDMA_IVB */
-		if (((drive->id->hw_config & 0x2000) == 0) ||
-		    ((drive->id->hw_config & 0x4000) == 0)) {
-#endif /* CONFIG_IDEDMA_IVB */
-			printk("%s: Speed warnings UDMA 3/4/5 is not "
-				"functional.\n", drive->name);
-			return 1;
-		}
-		if (!HWIF(drive)->udma_four) {
-			printk("%s: Speed warnings UDMA 3/4/5 is not "
-				"functional.\n",
-				HWIF(drive)->name);
+		if (eighty_ninty_three(drive) == 0) {
+			printk(KERN_WARNING "%s: UDMA speeds >UDMA33 cannot "
+					    "be set\n", drive->name);
 			return 1;
 		}
 	}
+
 	return 0;
 }
 
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 4557fc5a3ea3..3be3c69383f2 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -88,8 +88,15 @@ u8 ide_rate_filter(ide_drive_t *drive, u8 speed)
 	if (hwif->udma_filter)
 		mask = hwif->udma_filter(drive);
 
-	if ((mask & 0x78) && (eighty_ninty_three(drive) == 0))
-		mask &= 0x07;
+	/*
+	 * TODO: speed > XFER_UDMA_2 extra check is needed to avoid false
+	 * cable warning from eighty_ninty_three(), moving ide_rate_filter()
+	 * calls from ->speedproc to core code will make this hack go away
+	 */
+	if (speed > XFER_UDMA_2) {
+		if ((mask & 0x78) && (eighty_ninty_three(drive) == 0))
+			mask &= 0x07;
+	}
 
 	if (mask)
 		mode = fls(mask) - 1 + XFER_UDMA_0;
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index 772ca4007de4..65b1e124edf7 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -37,8 +37,6 @@
 #include <asm/pci-bridge.h>
 #endif
 
-#define PDC202_DEBUG_CABLE	0
-
 #undef DEBUG
 
 #ifdef DEBUG
@@ -234,17 +232,8 @@ static int config_chipset_for_dma(ide_drive_t *drive)
 {
 	struct hd_driveid *id	= drive->id;
 	ide_hwif_t *hwif	= HWIF(drive);
-	u8 ultra_66		= (id->dma_ultra & 0x0078) ? 1 : 0;
-	u8 cable		= pdcnew_cable_detect(hwif);
 	u8 speed;
 
-	if (ultra_66 && cable) {
-		printk(KERN_WARNING "Warning: %s channel "
-		       "requires an 80-pin cable for operation.\n",
-		       hwif->channel ? "Secondary" : "Primary");
-		printk(KERN_WARNING "%s reduced to Ultra33 mode.\n", drive->name);
-	}
-
 	if (id->capability & 4) {
 		/*
 		 * Set IORDY_EN & PREFETCH_EN (this seems to have
@@ -547,11 +536,6 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif)
 	if (!noautodma)
 		hwif->autodma = 1;
 	hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
-
-#if PDC202_DEBUG_CABLE
-	printk(KERN_DEBUG "%s: %s-pin cable\n",
-		hwif->name, hwif->udma_four ? "80" : "40");
-#endif /* PDC202_DEBUG_CABLE */
 }
 
 static int __devinit init_setup_pdcnew(struct pci_dev *dev, ide_pci_device_t *d)
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 207a6191fac7..7146fe3f6ba7 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -46,7 +46,6 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 
-#define PDC202_DEBUG_CABLE		0
 #define PDC202XX_DEBUG_DRIVE_INFO	0
 
 static const char *pdc_quirk_drives[] = {
@@ -238,20 +237,7 @@ static int config_chipset_for_dma (ide_drive_t *drive)
 	u32 drive_conf		= 0;
 	u8 drive_pci		= 0x60 + (drive->dn << 2);
 	u8 test1 = 0, test2 = 0, speed = -1;
-	u8 AP = 0, cable = 0;
-
-	u8 ultra_66		= ((id->dma_ultra & 0x0010) ||
-				   (id->dma_ultra & 0x0008)) ? 1 : 0;
-
-	if (dev->device != PCI_DEVICE_ID_PROMISE_20246)
-		cable = pdc202xx_old_cable_detect(hwif);
-	else
-		ultra_66 = 0;
-
-	if (ultra_66 && cable) {
-		printk(KERN_WARNING "Warning: %s channel requires an 80-pin cable for operation.\n", hwif->channel ? "Secondary":"Primary");
-		printk(KERN_WARNING "%s reduced to Ultra33 mode.\n", drive->name);
-	}
+	u8 AP = 0;
 
 	if (dev->device != PCI_DEVICE_ID_PROMISE_20246)
 		pdc_old_disable_66MHz_clock(drive->hwif);
@@ -477,10 +463,6 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif)
 	if (!noautodma)
 		hwif->autodma = 1;
 	hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma;
-#if PDC202_DEBUG_CABLE
-	printk(KERN_DEBUG "%s: %s-pin cable\n",
-		hwif->name, hwif->udma_four ? "80" : "40");
-#endif /* PDC202_DEBUG_CABLE */	
 }
 
 static void __devinit init_dma_pdc202xx(ide_hwif_t *hwif, unsigned long dmabase)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 477b8c6be727..ca924b295c2e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -605,6 +605,7 @@ typedef struct ide_drive_s {
 	unsigned scsi		: 1;	/* 0=default, 1=ide-scsi emulation */
 	unsigned sleeping	: 1;	/* 1=sleeping & sleep field valid */
 	unsigned post_reset	: 1;
+	unsigned udma33_warned	: 1;
 
 	u8	addressing;	/* 0=28-bit, 1=48-bit, 2=48-bit doing 28-bit */
         u8	quirk_list;	/* considered quirky, set for a specific host */
-- 
cgit v1.2.3


From 869c56ee9de1b72cd3f8ab9cdfbd3601e55c61f2 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:10 +0200
Subject: ide: add "initializing" argument to ide_register_hw()

Add "initializing" argument to ide_register_hw() and use it instead of ide.c
wide variable of the same name.  Update all users of ide_register_hw()
accordingly.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/arm/bast-ide.c     |  2 +-
 drivers/ide/arm/ide_arm.c      |  2 +-
 drivers/ide/cris/ide-cris.c    |  2 +-
 drivers/ide/h8300/ide-h8300.c  |  2 +-
 drivers/ide/ide-pnp.c          |  2 +-
 drivers/ide/ide.c              | 16 +++++++---------
 drivers/ide/legacy/buddha.c    |  2 +-
 drivers/ide/legacy/falconide.c |  2 +-
 drivers/ide/legacy/gayle.c     |  2 +-
 drivers/ide/legacy/ide-cs.c    |  2 +-
 drivers/ide/legacy/macide.c    |  6 +++---
 drivers/ide/legacy/q40ide.c    |  2 +-
 drivers/ide/pci/delkin_cb.c    |  2 +-
 drivers/macintosh/mediabay.c   |  2 +-
 include/linux/ide.h            |  5 +++--
 15 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/arm/bast-ide.c b/drivers/ide/arm/bast-ide.c
index 9d474e5fd8dc..f7449d04114a 100644
--- a/drivers/ide/arm/bast-ide.c
+++ b/drivers/ide/arm/bast-ide.c
@@ -45,7 +45,7 @@ bastide_register(unsigned int base, unsigned int aux, int irq,
 	hw.io_ports[IDE_CONTROL_OFFSET] = aux + (6 * 0x20);
 	hw.irq = irq;
 
-	ide_register_hw(&hw, hwif);
+	ide_register_hw(&hw, 0, hwif);
 
 	return 0;
 }
diff --git a/drivers/ide/arm/ide_arm.c b/drivers/ide/arm/ide_arm.c
index 23488c4d1fcd..a3d6744e870a 100644
--- a/drivers/ide/arm/ide_arm.c
+++ b/drivers/ide/arm/ide_arm.c
@@ -38,6 +38,6 @@ void __init ide_arm_init(void)
 		memset(&hw, 0, sizeof(hw));
 		ide_std_init_ports(&hw, IDE_ARM_IO, IDE_ARM_IO + 0x206);
 		hw.irq = IDE_ARM_IRQ;
-		ide_register_hw(&hw, NULL);
+		ide_register_hw(&hw, 1, NULL);
 	}
 }
diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index 9691d089fce8..c04cb25a01ff 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -796,7 +796,7 @@ init_e100_ide (void)
 		                ide_offsets,
 		                0, 0, cris_ide_ack_intr,
 		                ide_default_irq(0));
-		ide_register_hw(&hw, &hwif);
+		ide_register_hw(&hw, 1, &hwif);
 		hwif->mmio = 1;
 		hwif->chipset = ide_etrax100;
 		hwif->tuneproc = &tune_cris_ide;
diff --git a/drivers/ide/h8300/ide-h8300.c b/drivers/ide/h8300/ide-h8300.c
index 88750a300337..6d26ad7360d5 100644
--- a/drivers/ide/h8300/ide-h8300.c
+++ b/drivers/ide/h8300/ide-h8300.c
@@ -101,7 +101,7 @@ void __init h8300_ide_init(void)
 	hw_setup(&hw);
 
 	/* register if */
-	idx = ide_register_hw(&hw, &hwif);
+	idx = ide_register_hw(&hw, 1, &hwif);
 	if (idx == -1) {
 		printk(KERN_ERR "ide-h8300: IDE I/F register failed\n");
 		return;
diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c
index 98410ca044cf..2b8009c50e91 100644
--- a/drivers/ide/ide-pnp.c
+++ b/drivers/ide/ide-pnp.c
@@ -42,7 +42,7 @@ static int idepnp_probe(struct pnp_dev * dev, const struct pnp_device_id *dev_id
 	hw.irq = pnp_irq(dev, 0);
 	hw.dma = NO_DMA;
 
-	index = ide_register_hw(&hw, &hwif);
+	index = ide_register_hw(&hw, 1, &hwif);
 
 	if (index != -1) {
 	    	printk(KERN_INFO "ide%d: generic PnP IDE interface\n", index);
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 614c5fd43cd2..0fc532850bbe 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -168,7 +168,6 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR,
 
 static int idebus_parameter;	/* holds the "idebus=" parameter */
 static int system_bus_speed;	/* holds what we think is VESA/PCI bus speed */
-static int initializing;	/* set while initializing built-in drivers */
 
 DECLARE_MUTEX(ide_cfg_sem);
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock);
@@ -302,9 +301,7 @@ static void __init init_ide_data (void)
 #endif
 	}
 #ifdef CONFIG_IDE_ARM
-	initializing = 1;
 	ide_arm_init();
-	initializing = 0;
 #endif
 }
 
@@ -749,6 +746,7 @@ void ide_setup_ports (	hw_regs_t *hw,
 /**
  *	ide_register_hw_with_fixup	-	register IDE interface
  *	@hw: hardware registers
+ *	@initializing: set while initializing built-in drivers
  *	@hwifp: pointer to returned hwif
  *	@fixup: fixup function
  *
@@ -758,7 +756,9 @@ void ide_setup_ports (	hw_regs_t *hw,
  *	Returns -1 on error.
  */
 
-int ide_register_hw_with_fixup(hw_regs_t *hw, ide_hwif_t **hwifp, void(*fixup)(ide_hwif_t *hwif))
+int ide_register_hw_with_fixup(hw_regs_t *hw, int initializing,
+			       ide_hwif_t **hwifp,
+			       void(*fixup)(ide_hwif_t *hwif))
 {
 	int index, retry = 1;
 	ide_hwif_t *hwif;
@@ -810,9 +810,9 @@ found:
 
 EXPORT_SYMBOL(ide_register_hw_with_fixup);
 
-int ide_register_hw(hw_regs_t *hw, ide_hwif_t **hwifp)
+int ide_register_hw(hw_regs_t *hw, int initializing, ide_hwif_t **hwifp)
 {
-	return ide_register_hw_with_fixup(hw, hwifp, NULL);
+	return ide_register_hw_with_fixup(hw, initializing, hwifp, NULL);
 }
 
 EXPORT_SYMBOL(ide_register_hw);
@@ -1108,7 +1108,7 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 			ide_init_hwif_ports(&hw, (unsigned long) args[0],
 					    (unsigned long) args[1], NULL);
 			hw.irq = args[2];
-			if (ide_register_hw(&hw, NULL) == -1)
+			if (ide_register_hw(&hw, 0, NULL) == -1)
 				return -EIO;
 			return 0;
 		}
@@ -1819,10 +1819,8 @@ static int __init ide_init(void)
 		(void)qd65xx_init();
 #endif
 
-	initializing = 1;
 	/* Probe for special PCI and other "known" interface chipsets. */
 	probe_for_hwifs();
-	initializing = 0;
 
 	proc_ide_create();
 
diff --git a/drivers/ide/legacy/buddha.c b/drivers/ide/legacy/buddha.c
index 1ed224a01f79..101aee1711c4 100644
--- a/drivers/ide/legacy/buddha.c
+++ b/drivers/ide/legacy/buddha.c
@@ -213,7 +213,7 @@ fail_base2:
 						IRQ_AMIGA_PORTS);
 			}	
 			
-			index = ide_register_hw(&hw, &hwif);
+			index = ide_register_hw(&hw, 1, &hwif);
 			if (index != -1) {
 				hwif->mmio = 1;
 				printk("ide%d: ", index);
diff --git a/drivers/ide/legacy/falconide.c b/drivers/ide/legacy/falconide.c
index a9f2cd5bb81e..e1e9d9d6893f 100644
--- a/drivers/ide/legacy/falconide.c
+++ b/drivers/ide/legacy/falconide.c
@@ -70,7 +70,7 @@ void __init falconide_init(void)
 			0, 0, NULL,
 //			falconide_iops,
 			IRQ_MFP_IDE);
-	index = ide_register_hw(&hw, NULL);
+	index = ide_register_hw(&hw, 1, NULL);
 
 	if (index != -1)
 	    printk("ide%d: Falcon IDE interface\n", index);
diff --git a/drivers/ide/legacy/gayle.c b/drivers/ide/legacy/gayle.c
index dcfadbbf55d8..0830a021bbb6 100644
--- a/drivers/ide/legacy/gayle.c
+++ b/drivers/ide/legacy/gayle.c
@@ -165,7 +165,7 @@ found:
 //			&gayle_iops,
 			IRQ_AMIGA_PORTS);
 
-	index = ide_register_hw(&hw, &hwif);
+	index = ide_register_hw(&hw, 1, &hwif);
 	if (index != -1) {
 	    hwif->mmio = 1;
 	    switch (i) {
diff --git a/drivers/ide/legacy/ide-cs.c b/drivers/ide/legacy/ide-cs.c
index c6522a64d7ec..2f3977f195b7 100644
--- a/drivers/ide/legacy/ide-cs.c
+++ b/drivers/ide/legacy/ide-cs.c
@@ -153,7 +153,7 @@ static int idecs_register(unsigned long io, unsigned long ctl, unsigned long irq
     hw.irq = irq;
     hw.chipset = ide_pci;
     hw.dev = &handle->dev;
-    return ide_register_hw_with_fixup(&hw, NULL, ide_undecoded_slave);
+    return ide_register_hw_with_fixup(&hw, 0, NULL, ide_undecoded_slave);
 }
 
 /*======================================================================
diff --git a/drivers/ide/legacy/macide.c b/drivers/ide/legacy/macide.c
index 4c0079ad52ac..c211fc78345d 100644
--- a/drivers/ide/legacy/macide.c
+++ b/drivers/ide/legacy/macide.c
@@ -102,21 +102,21 @@ void macide_init(void)
 				0, 0, macide_ack_intr,
 //				quadra_ide_iops,
 				IRQ_NUBUS_F);
-		index = ide_register_hw(&hw, &hwif);
+		index = ide_register_hw(&hw, 1, &hwif);
 		break;
 	case MAC_IDE_PB:
 		ide_setup_ports(&hw, IDE_BASE, macide_offsets,
 				0, 0, macide_ack_intr,
 //				macide_pb_iops,
 				IRQ_NUBUS_C);
-		index = ide_register_hw(&hw, &hwif);
+		index = ide_register_hw(&hw, 1, &hwif);
 		break;
 	case MAC_IDE_BABOON:
 		ide_setup_ports(&hw, BABOON_BASE, macide_offsets,
 				0, 0, NULL,
 //				macide_baboon_iops,
 				IRQ_BABOON_1);
-		index = ide_register_hw(&hw, &hwif);
+		index = ide_register_hw(&hw, 1, &hwif);
 		if (index == -1) break;
 		if (macintosh_config->ident == MAC_MODEL_PB190) {
 
diff --git a/drivers/ide/legacy/q40ide.c b/drivers/ide/legacy/q40ide.c
index 74f08124eabb..e628a983ce33 100644
--- a/drivers/ide/legacy/q40ide.c
+++ b/drivers/ide/legacy/q40ide.c
@@ -142,7 +142,7 @@ void q40ide_init(void)
 			0, NULL,
 //			m68kide_iops,
 			q40ide_default_irq(pcide_bases[i]));
-	index = ide_register_hw(&hw, &hwif);
+	index = ide_register_hw(&hw, 1, &hwif);
 	// **FIXME**
 	if (index != -1)
 		hwif->mmio = 1;
diff --git a/drivers/ide/pci/delkin_cb.c b/drivers/ide/pci/delkin_cb.c
index dd7ec37fdeab..46f4a888c037 100644
--- a/drivers/ide/pci/delkin_cb.c
+++ b/drivers/ide/pci/delkin_cb.c
@@ -80,7 +80,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id)
 	hw.irq = dev->irq;
 	hw.chipset = ide_pci;		/* this enables IRQ sharing */
 
-	rc = ide_register_hw_with_fixup(&hw, &hwif, ide_undecoded_slave);
+	rc = ide_register_hw_with_fixup(&hw, 0, &hwif, ide_undecoded_slave);
 	if (rc < 0) {
 		printk(KERN_ERR "delkin_cb: ide_register_hw failed (%d)\n", rc);
 		pci_disable_device(dev);
diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c
index 0acf2f7fd9d7..c803d2bba65d 100644
--- a/drivers/macintosh/mediabay.c
+++ b/drivers/macintosh/mediabay.c
@@ -563,7 +563,7 @@ static void media_bay_step(int i)
 				ide_init_hwif_ports(&hw, (unsigned long) bay->cd_base, (unsigned long) 0, NULL);
 				hw.irq = bay->cd_irq;
 				hw.chipset = ide_pmac;
-				bay->cd_index = ide_register_hw(&hw, NULL);
+				bay->cd_index = ide_register_hw(&hw, 0, NULL);
 				pmu_resume();
 			}
 			if (bay->cd_index == -1) {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ca924b295c2e..bdb97655ef61 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -223,8 +223,9 @@ typedef struct hw_regs_s {
 /*
  * Register new hardware with ide
  */
-int ide_register_hw(hw_regs_t *hw, struct hwif_s **hwifp);
-int ide_register_hw_with_fixup(hw_regs_t *, struct hwif_s **, void (*)(struct hwif_s *));
+int ide_register_hw(hw_regs_t *, int, struct hwif_s **);
+int ide_register_hw_with_fixup(hw_regs_t *, int, struct hwif_s **,
+			       void (*)(struct hwif_s *));
 
 /*
  * Set up hw_regs_t structure before calling ide_register_hw (optional)
-- 
cgit v1.2.3


From 5cbf79cdb37be2aa2a1b4fa94144526b14557060 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:11 +0200
Subject: ide: add ide_proc_register_port()

* create_proc_ide_interfaces() tries to add /proc entries for every probed
  and initialized IDE port, replace it by ide_proc_register_port() which does
  it only for the given port (also rename destroy_proc_ide_interface() to
  ide_proc_unregister_port() for consistency)

* convert {create,destroy}_proc_ide_interface[s]() users to use new functions

* pmac driver depended on proc_ide_create() to add /proc port entries, fix it

* au1xxx-ide, swarm and cs5520 drivers depended indirectly on ide-generic
  driver (CONFIG_IDE_GENERIC=y) to add port /proc entries, fix them

* there is now no need to add /proc entries for IDE ports in proc_ide_create()
  so don't do it

* proc_ide_create() needs now to be called before drivers are probed - fix it,
  while at it make proc_ide_create() create /proc "ide" directory

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/arm/icside.c      |  7 +++++--
 drivers/ide/arm/rapide.c      |  2 +-
 drivers/ide/ide-generic.c     |  2 --
 drivers/ide/ide-probe.c       |  3 +++
 drivers/ide/ide-proc.c        | 34 +++++++++++++++++-----------------
 drivers/ide/ide.c             | 14 +++-----------
 drivers/ide/legacy/ali14xx.c  |  3 ++-
 drivers/ide/legacy/dtc2278.c  |  3 ++-
 drivers/ide/legacy/ht6560b.c  |  3 ++-
 drivers/ide/legacy/qd65xx.c   |  7 ++++---
 drivers/ide/legacy/umc8672.c  |  3 ++-
 drivers/ide/mips/au1xxx-ide.c |  3 +++
 drivers/ide/mips/swarm.c      |  3 +++
 drivers/ide/pci/cs5520.c      | 20 ++++++++++++++++----
 drivers/ide/pci/sgiioc4.c     |  2 +-
 drivers/ide/ppc/pmac.c        |  2 ++
 drivers/ide/setup-pci.c       | 25 +++++++++++++++++++++----
 include/linux/ide.h           | 10 ++++------
 18 files changed, 91 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index f383ace2a087..1fe0457243db 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -591,7 +591,8 @@ icside_register_v5(struct icside_state *state, struct expansion_card *ec)
 	state->hwif[0] = hwif;
 
 	probe_hwif_init(hwif);
-	create_proc_ide_interfaces();
+
+	ide_proc_register_port(hwif);
 
 	return 0;
 }
@@ -679,7 +680,9 @@ icside_register_v6(struct icside_state *state, struct expansion_card *ec)
 
 	probe_hwif_init(hwif);
 	probe_hwif_init(mate);
-	create_proc_ide_interfaces();
+
+	ide_proc_register_port(hwif);
+	ide_proc_register_port(mate);
 
 	return 0;
 
diff --git a/drivers/ide/arm/rapide.c b/drivers/ide/arm/rapide.c
index 9c6c49fdd2b1..890ea3fac3c6 100644
--- a/drivers/ide/arm/rapide.c
+++ b/drivers/ide/arm/rapide.c
@@ -76,7 +76,7 @@ rapide_probe(struct expansion_card *ec, const struct ecard_id *id)
 		hwif->gendev.parent = &ec->dev;
 		hwif->noprobe = 0;
 		probe_hwif_init(hwif);
-		create_proc_ide_interfaces();
+		ide_proc_register_port(hwif);
 		ecard_set_drvdata(ec, hwif);
 		goto out;
 	}
diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index 99fd56151131..0f72b98d727f 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -22,8 +22,6 @@ static int __init ide_generic_init(void)
 	if (ide_hwifs[0].io_ports[IDE_DATA_OFFSET])
 		ide_release_lock();	/* for atari only */
 
-	create_proc_ide_interfaces();
-
 	return 0;
 }
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 8f15c23aa70d..3cebed77f55d 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1427,6 +1427,9 @@ int ideprobe_init (void)
 				}
 		}
 	}
+	for (index = 0; index < MAX_HWIFS; ++index)
+		if (probe[index])
+			ide_proc_register_port(&ide_hwifs[index]);
 	return 0;
 }
 
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 949a6f609d84..d50bd996ff22 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -39,6 +39,8 @@
 
 #include <asm/io.h>
 
+static struct proc_dir_entry *proc_ide_root;
+
 static int proc_ide_read_imodel
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
@@ -783,26 +785,24 @@ static ide_proc_entry_t hwif_entries[] = {
 	{ NULL,	0, NULL, NULL }
 };
 
-void create_proc_ide_interfaces(void)
+void ide_proc_register_port(ide_hwif_t *hwif)
 {
-	int	h;
+	if (!hwif->present)
+		return;
 
-	for (h = 0; h < MAX_HWIFS; h++) {
-		ide_hwif_t *hwif = &ide_hwifs[h];
+	if (!hwif->proc) {
+		hwif->proc = proc_mkdir(hwif->name, proc_ide_root);
 
-		if (!hwif->present)
-			continue;
-		if (!hwif->proc) {
-			hwif->proc = proc_mkdir(hwif->name, proc_ide_root);
-			if (!hwif->proc)
-				return;
-			ide_add_proc_entries(hwif->proc, hwif_entries, hwif);
-		}
-		create_proc_ide_drives(hwif);
+		if (!hwif->proc)
+			return;
+
+		ide_add_proc_entries(hwif->proc, hwif_entries, hwif);
 	}
+
+	create_proc_ide_drives(hwif);
 }
 
-EXPORT_SYMBOL(create_proc_ide_interfaces);
+EXPORT_SYMBOL_GPL(ide_proc_register_port);
 
 #ifdef CONFIG_BLK_DEV_IDEPCI
 void ide_pci_create_host_proc(const char *name, get_info_t *get_info)
@@ -813,7 +813,7 @@ void ide_pci_create_host_proc(const char *name, get_info_t *get_info)
 EXPORT_SYMBOL_GPL(ide_pci_create_host_proc);
 #endif
 
-void destroy_proc_ide_interface(ide_hwif_t *hwif)
+void ide_proc_unregister_port(ide_hwif_t *hwif)
 {
 	if (hwif->proc) {
 		destroy_proc_ide_drives(hwif);
@@ -860,11 +860,11 @@ void proc_ide_create(void)
 {
 	struct proc_dir_entry *entry;
 
+	proc_ide_root = proc_mkdir("ide", NULL);
+
 	if (!proc_ide_root)
 		return;
 
-	create_proc_ide_interfaces();
-
 	entry = create_proc_entry("drivers", 0, proc_ide_root);
 	if (entry)
 		entry->proc_fops = &ide_drivers_operations;
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 0fc532850bbe..038f2610e734 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -347,10 +347,6 @@ static int ide_system_bus_speed(void)
 	return system_bus_speed;
 }
 
-#ifdef CONFIG_IDE_PROC_FS
-struct proc_dir_entry *proc_ide_root;
-#endif
-
 static struct resource* hwif_request_region(ide_hwif_t *hwif,
 					    unsigned long addr, int num)
 {
@@ -594,7 +590,7 @@ void ide_unregister(unsigned int index)
 
 	spin_unlock_irq(&ide_lock);
 
-	destroy_proc_ide_interface(hwif);
+	ide_proc_unregister_port(hwif);
 
 	hwgroup = hwif->hwgroup;
 	/*
@@ -799,7 +795,7 @@ found:
 
 	if (!initializing) {
 		probe_hwif_init_with_fixup(hwif, fixup);
-		create_proc_ide_interfaces();
+		ide_proc_register_port(hwif);
 	}
 
 	if (hwifp)
@@ -1794,9 +1790,7 @@ static int __init ide_init(void)
 
 	init_ide_data();
 
-#ifdef CONFIG_IDE_PROC_FS
-	proc_ide_root = proc_mkdir("ide", NULL);
-#endif
+	proc_ide_create();
 
 #ifdef CONFIG_BLK_DEV_ALI14XX
 	if (probe_ali14xx)
@@ -1822,8 +1816,6 @@ static int __init ide_init(void)
 	/* Probe for special PCI and other "known" interface chipsets. */
 	probe_for_hwifs();
 
-	proc_ide_create();
-
 	return 0;
 }
 
diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c
index 91961aa03047..df17ed68c0bc 100644
--- a/drivers/ide/legacy/ali14xx.c
+++ b/drivers/ide/legacy/ali14xx.c
@@ -223,7 +223,8 @@ static int __init ali14xx_probe(void)
 	probe_hwif_init(hwif);
 	probe_hwif_init(mate);
 
-	create_proc_ide_interfaces();
+	ide_proc_register_port(hwif);
+	ide_proc_register_port(mate);
 
 	return 0;
 }
diff --git a/drivers/ide/legacy/dtc2278.c b/drivers/ide/legacy/dtc2278.c
index 0219ffa64db6..36a3f0ac6162 100644
--- a/drivers/ide/legacy/dtc2278.c
+++ b/drivers/ide/legacy/dtc2278.c
@@ -138,7 +138,8 @@ static int __init dtc2278_probe(void)
 	probe_hwif_init(hwif);
 	probe_hwif_init(mate);
 
-	create_proc_ide_interfaces();
+	ide_proc_register_port(hwif);
+	ide_proc_register_port(mate);
 
 	return 0;
 }
diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c
index a2832643c522..c8f353b1296f 100644
--- a/drivers/ide/legacy/ht6560b.c
+++ b/drivers/ide/legacy/ht6560b.c
@@ -357,7 +357,8 @@ int __init ht6560b_init(void)
 	probe_hwif_init(hwif);
 	probe_hwif_init(mate);
 
-	create_proc_ide_interfaces();
+	ide_proc_register_port(hwif);
+	ide_proc_register_port(mate);
 
 	return 0;
 
diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c
index 2fb8f50f1293..d1414a75b523 100644
--- a/drivers/ide/legacy/qd65xx.c
+++ b/drivers/ide/legacy/qd65xx.c
@@ -427,7 +427,7 @@ static int __init qd_probe(int base)
 		qd_setup(hwif, base, config, QD6500_DEF_DATA, QD6500_DEF_DATA,
 			 &qd6500_tune_drive);
 
-		create_proc_ide_interfaces();
+		ide_proc_register_port(hwif);
 
 		return 1;
 	}
@@ -459,7 +459,7 @@ static int __init qd_probe(int base)
 				 &qd6580_tune_drive);
 			qd_write_reg(QD_DEF_CONTR,QD_CONTROL_PORT);
 
-			create_proc_ide_interfaces();
+			ide_proc_register_port(hwif);
 
 			return 1;
 		} else {
@@ -479,7 +479,8 @@ static int __init qd_probe(int base)
 				 &qd6580_tune_drive);
 			qd_write_reg(QD_DEF_CONTR,QD_CONTROL_PORT);
 
-			create_proc_ide_interfaces();
+			ide_proc_register_port(hwif);
+			ide_proc_register_port(mate);
 
 			return 0; /* no other qd65xx possible */
 		}
diff --git a/drivers/ide/legacy/umc8672.c b/drivers/ide/legacy/umc8672.c
index ca7974455578..ddc403a0bd82 100644
--- a/drivers/ide/legacy/umc8672.c
+++ b/drivers/ide/legacy/umc8672.c
@@ -160,7 +160,8 @@ static int __init umc8672_probe(void)
 	probe_hwif_init(hwif);
 	probe_hwif_init(mate);
 
-	create_proc_ide_interfaces();
+	ide_proc_register_port(hwif);
+	ide_proc_register_port(mate);
 
 	return 0;
 }
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index d54d9fe92a7d..ca95e990862e 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -760,6 +760,9 @@ static int au_ide_probe(struct device *dev)
 #endif
 
 	probe_hwif_init(hwif);
+
+	ide_proc_register_port(hwif);
+
 	dev_set_drvdata(dev, hwif);
 
 	printk(KERN_INFO "Au1xxx IDE(builtin) configured for %s\n", mode );
diff --git a/drivers/ide/mips/swarm.c b/drivers/ide/mips/swarm.c
index 81fa06851b27..6e935d7c63fd 100644
--- a/drivers/ide/mips/swarm.c
+++ b/drivers/ide/mips/swarm.c
@@ -129,6 +129,9 @@ static int __devinit swarm_ide_probe(struct device *dev)
 	hwif->irq = hwif->hw.irq;
 
 	probe_hwif_init(hwif);
+
+	ide_proc_register_port(hwif);
+
 	dev_set_drvdata(dev, hwif);
 
 	return 0;
diff --git a/drivers/ide/pci/cs5520.c b/drivers/ide/pci/cs5520.c
index 400859a839f7..3b88a3a56116 100644
--- a/drivers/ide/pci/cs5520.c
+++ b/drivers/ide/pci/cs5520.c
@@ -213,6 +213,7 @@ static ide_pci_device_t cyrix_chipsets[] __devinitdata = {
  
 static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_hwif_t *hwif = NULL, *mate = NULL;
 	ata_index_t index;
 	ide_pci_device_t *d = &cyrix_chipsets[id->driver_data];
 
@@ -239,10 +240,21 @@ static int __devinit cs5520_init_one(struct pci_dev *dev, const struct pci_devic
 
 	ide_pci_setup_ports(dev, d, 14, &index);
 
-	if((index.b.low & 0xf0) != 0xf0)
-		probe_hwif_init(&ide_hwifs[index.b.low]);
-	if((index.b.high & 0xf0) != 0xf0)
-		probe_hwif_init(&ide_hwifs[index.b.high]);
+	if ((index.b.low & 0xf0) != 0xf0)
+		hwif = &ide_hwifs[index.b.low];
+	if ((index.b.high & 0xf0) != 0xf0)
+		mate = &ide_hwifs[index.b.high];
+
+	if (hwif)
+		probe_hwif_init(hwif);
+	if (mate)
+		probe_hwif_init(mate);
+
+	if (hwif)
+		ide_proc_register_port(hwif);
+	if (mate)
+		ide_proc_register_port(mate);
+
 	return 0;
 }
 
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index fd09b295a69d..d3185e29a38e 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -692,7 +692,7 @@ sgiioc4_ide_setup_pci_device(struct pci_dev *dev, ide_pci_device_t * d)
 		return -EIO;
 
 	/* Create /proc/ide entries */
-	create_proc_ide_interfaces();
+	ide_proc_register_port(hwif);
 
 	return 0;
 }
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index a49ebe44babd..45fc36f0f219 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1276,6 +1276,8 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
 	/* We probe the hwif now */
 	probe_hwif_init(hwif);
 
+	ide_proc_register_port(hwif);
+
 	return 0;
 }
 
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index 118fb3205ca8..892cda755782 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -702,6 +702,7 @@ out:
 
 int ide_setup_pci_device(struct pci_dev *dev, ide_pci_device_t *d)
 {
+	ide_hwif_t *hwif = NULL, *mate = NULL;
 	ata_index_t index_list;
 	int ret;
 
@@ -710,11 +711,19 @@ int ide_setup_pci_device(struct pci_dev *dev, ide_pci_device_t *d)
 		goto out;
 
 	if ((index_list.b.low & 0xf0) != 0xf0)
-		probe_hwif_init_with_fixup(&ide_hwifs[index_list.b.low], d->fixup);
+		hwif = &ide_hwifs[index_list.b.low];
 	if ((index_list.b.high & 0xf0) != 0xf0)
-		probe_hwif_init_with_fixup(&ide_hwifs[index_list.b.high], d->fixup);
+		mate = &ide_hwifs[index_list.b.high];
 
-	create_proc_ide_interfaces();
+	if (hwif)
+		probe_hwif_init_with_fixup(hwif, d->fixup);
+	if (mate)
+		probe_hwif_init_with_fixup(mate, d->fixup);
+
+	if (hwif)
+		ide_proc_register_port(hwif);
+	if (mate)
+		ide_proc_register_port(mate);
 out:
 	return ret;
 }
@@ -748,7 +757,15 @@ int ide_setup_pci_devices(struct pci_dev *dev1, struct pci_dev *dev2,
 		}
 	}
 
-	create_proc_ide_interfaces();
+	for (i = 0; i < 2; i++) {
+		u8 idx[2] = { index_list[i].b.low, index_list[i].b.high };
+		int j;
+
+		for (j = 0; j < 2; j++) {
+			if ((idx[j] & 0xf0) != 0xf0)
+				ide_proc_register_port(ide_hwifs + idx[j]);
+		}
+	}
 out:
 	return ret;
 }
diff --git a/include/linux/ide.h b/include/linux/ide.h
index bdb97655ef61..52d482a16dd9 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -909,12 +909,10 @@ typedef struct {
 	write_proc_t	*write_proc;
 } ide_proc_entry_t;
 
-extern struct proc_dir_entry *proc_ide_root;
-
 void proc_ide_create(void);
 void proc_ide_destroy(void);
-void create_proc_ide_interfaces(void);
-void destroy_proc_ide_interface(ide_hwif_t *);
+void ide_proc_register_port(ide_hwif_t *);
+void ide_proc_unregister_port(ide_hwif_t *);
 void ide_proc_register_driver(ide_drive_t *, ide_driver_t *);
 void ide_proc_unregister_driver(ide_drive_t *, ide_driver_t *);
 
@@ -945,8 +943,8 @@ void ide_pci_create_host_proc(const char *, get_info_t *);
 #else
 static inline void proc_ide_create(void) { ; }
 static inline void proc_ide_destroy(void) { ; }
-static inline void create_proc_ide_interfaces(void) { ; }
-static inline void destroy_proc_ide_interface(ide_hwif_t *hwif) { ; }
+static inline void ide_proc_register_port(ide_hwif_t *hwif) { ; }
+static inline void ide_proc_unregister_port(ide_hwif_t *hwif) { ; }
 static inline void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
 static inline void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { ; }
 static inline void ide_add_generic_settings(ide_drive_t *drive) { ; }
-- 
cgit v1.2.3


From 6d208b39c45edee5def6c201fcd51561c5a39828 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 10 May 2007 00:01:11 +0200
Subject: ide: legacy PCI bus order probing fixes

IDE PCI host drivers should register themselves with IDE core only when
IDE driver is built-in, otherwise (IDE driver is modular and thus IDE PCI
host drivers are also modular) the code has no effect and just complicates
the probing.

Fix it by adding new config option CONFIG_IDEPCI_PCIBUS (defined only when
needed and invisible to the user) and covering by #ifdef/#endif the code
in question.  It turned out that "ide=reverse" was silently accepted but did
nothing in case when IDE driver was modular, this is fixed now.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig     |  3 +++
 drivers/ide/ide.c       | 10 +++++-----
 drivers/ide/setup-pci.c |  2 ++
 include/linux/ide.h     |  5 +++++
 4 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index a678bbecb489..1d06b415ede9 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -371,6 +371,9 @@ config IDEPCI_SHARE_IRQ
 	  It is safe to say Y to this question, in most cases.
 	  If unsure, say N.
 
+config IDEPCI_PCIBUS_ORDER
+	def_bool PCI && BLK_DEV_IDE=y && BLK_DEV_IDEPCI
+
 config BLK_DEV_OFFBOARD
 	bool "Boot off-board chipsets first support"
 	depends on PCI && BLK_DEV_IDEPCI
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 038f2610e734..f2b547ff7722 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -172,7 +172,7 @@ static int system_bus_speed;	/* holds what we think is VESA/PCI bus speed */
 DECLARE_MUTEX(ide_cfg_sem);
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock);
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
+#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
 static int ide_scan_direction; /* THIS was formerly 2.2.x pci=reverse */
 #endif
 
@@ -1333,13 +1333,13 @@ static int __init ide_setup(char *s)
 		return 1;
 	}
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
+#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
 	if (!strcmp(s, "ide=reverse")) {
 		ide_scan_direction = 1;
 		printk(" : Enabled support for IDE inverse scan order.\n");
 		return 1;
 	}
-#endif /* CONFIG_BLK_DEV_IDEPCI */
+#endif
 
 #ifdef CONFIG_BLK_DEV_IDEACPI
 	if (!strcmp(s, "ide=noacpi")) {
@@ -1599,9 +1599,9 @@ extern void __init h8300_ide_init(void);
  */
 static void __init probe_for_hwifs (void)
 {
-#ifdef CONFIG_BLK_DEV_IDEPCI
+#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
 	ide_scan_pcibus(ide_scan_direction);
-#endif /* CONFIG_BLK_DEV_IDEPCI */
+#endif
 
 #ifdef CONFIG_ETRAX_IDE
 	{
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index 892cda755782..67035ba4bf5e 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -772,6 +772,7 @@ out:
 
 EXPORT_SYMBOL_GPL(ide_setup_pci_devices);
 
+#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
 /*
  *	Module interfaces
  */
@@ -878,3 +879,4 @@ void __init ide_scan_pcibus (int scan_direction)
 		__pci_register_driver(d, d->driver.owner, d->driver.mod_name);
 	}
 }
+#endif
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 52d482a16dd9..df4e6a510310 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1205,9 +1205,14 @@ void ide_init_disk(struct gendisk *, ide_drive_t *);
 
 extern int ideprobe_init(void);
 
+#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
 extern void ide_scan_pcibus(int scan_direction) __init;
 extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *owner, const char *mod_name);
 #define ide_pci_register_driver(d) __ide_pci_register_driver(d, THIS_MODULE, KBUILD_MODNAME)
+#else
+#define ide_pci_register_driver(d) pci_register_driver(d)
+#endif
+
 void ide_pci_setup_ports(struct pci_dev *, struct ide_pci_device_s *, int, ata_index_t *);
 extern void ide_setup_pci_noise (struct pci_dev *dev, struct ide_pci_device_s *d);
 
-- 
cgit v1.2.3


From 44ce6294d07555c3d313757105fd44b78208407f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Wed, 9 May 2007 18:51:36 -0700
Subject: Revert "md: improve partition detection in md array"

This reverts commit 5b479c91da90eef605f851508744bfe8269591a0.

Quoth Neil Brown:

  "It causes an oops when auto-detecting raid arrays, and it doesn't
   seem easy to fix.

   The array may not be 'open' when do_md_run is called, so
   bdev->bd_disk might be NULL, so bd_set_size can oops.

   This whole approach of opening an md device before it has been
   assembled just seems to get more and more painful.  I think I'm going
   to have to come up with something clever to provide both backward
   comparability with usage expectation, and sane integration into the
   rest of the kernel."

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c           | 26 ++++++++++++++++++--------
 drivers/md/raid1.c        |  1 +
 drivers/md/raid5.c        |  2 ++
 include/linux/raid/md_k.h |  1 +
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2901d0c0ee9e..65814b0340cb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3104,7 +3104,6 @@ static int do_md_run(mddev_t * mddev)
 	struct gendisk *disk;
 	struct mdk_personality *pers;
 	char b[BDEVNAME_SIZE];
-	struct block_device *bdev;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -3332,13 +3331,7 @@ static int do_md_run(mddev_t * mddev)
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
-	bdev = bdget_disk(mddev->gendisk, 0);
-	if (bdev) {
-		bd_set_size(bdev, mddev->array_size << 1);
-		blkdev_ioctl(bdev->bd_inode, NULL, BLKRRPART, 0);
-		bdput(bdev);
-	}
-
+	mddev->changed = 1;
 	md_new_event(mddev);
 	kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
 	return 0;
@@ -3460,6 +3453,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 			mddev->pers = NULL;
 
 			set_capacity(disk, 0);
+			mddev->changed = 1;
 
 			if (mddev->ro)
 				mddev->ro = 0;
@@ -4599,6 +4593,20 @@ static int md_release(struct inode *inode, struct file * file)
 	return 0;
 }
 
+static int md_media_changed(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	mddev->changed = 0;
+	return 0;
+}
 static struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
@@ -4606,6 +4614,8 @@ static struct block_device_operations md_fops =
 	.release	= md_release,
 	.ioctl		= md_ioctl,
 	.getgeo		= md_getgeo,
+	.media_changed	= md_media_changed,
+	.revalidate_disk= md_revalidate,
 };
 
 static int md_thread(void * arg)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1b7130cad21f..97ee870b265d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2063,6 +2063,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 */
 	mddev->array_size = sectors>>1;
 	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->changed = 1;
 	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a72e70ad0975..061375ee6592 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3864,6 +3864,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
 	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
 	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->changed = 1;
 	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3998,6 +3999,7 @@ static void end_reshape(raid5_conf_t *conf)
 		conf->mddev->array_size = conf->mddev->size *
 			(conf->raid_disks - conf->max_degraded);
 		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		conf->mddev->changed = 1;
 
 		bdev = bdget_disk(conf->mddev->gendisk, 0);
 		if (bdev) {
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index a121f36f4437..de72c49747c8 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -201,6 +201,7 @@ struct mddev_s
 	struct mutex			reconfig_mutex;
 	atomic_t			active;
 
+	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-- 
cgit v1.2.3