From 2526c151c31358aec66b63921dd712bbec5ee0cb Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Fri, 9 Jan 2009 15:49:06 -0700 Subject: drivers/of: Add the of_find_i2c_device_by_node function. The of_find_i2c_device_by_node function allows you to follow a reference in the device tree to an i2c device node and then locate the linux device instantiated by the device tree. Example use: an I2S bus driver finding the i2c_device instance for a codec described by a device tree node. This was waiting for Anton's i2c patches that were just added. Signed-off-by: Jon Smirl Signed-off-by: Grant Likely --- include/linux/of_i2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_i2c.h b/include/linux/of_i2c.h index bd2a870ec296..34974b5a76f7 100644 --- a/include/linux/of_i2c.h +++ b/include/linux/of_i2c.h @@ -17,4 +17,7 @@ void of_register_i2c_devices(struct i2c_adapter *adap, struct device_node *adap_node); +/* must call put_device() when done with returned i2c_client device */ +struct i2c_client *of_find_i2c_device_by_node(struct device_node *node); + #endif /* __LINUX_OF_I2C_H */ -- cgit v1.2.3 From f52046b14b1e1a8a02ae48d0c69d39c5e204644f Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:01 +0100 Subject: mfd: PCF50633 core driver This patch implements the core of the PCF50633 driver. This core driver has generic register read/write functions and does interrupt management for its sub devices. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 9 + drivers/mfd/Makefile | 2 + drivers/mfd/pcf50633-core.c | 710 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/core.h | 218 ++++++++++++ 4 files changed, 939 insertions(+) create mode 100644 drivers/mfd/pcf50633-core.c create mode 100644 include/linux/mfd/pcf50633/core.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 416f9e7286ba..3ac32e45b03b 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -217,6 +217,15 @@ config MFD_WM8350_I2C I2C as the control interface. Additional options must be selected to enable support for the functionality of the chip. +config MFD_PCF50633 + tristate "Support for NXP PCF50633" + depends on I2C + help + Say yes here if you have NXP PCF50633 chip on your board. + This core driver provides register access and IRQ handling + facilities, and registers devices for the various functions + so that function-specific drivers can bind to them. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 0c9418b36c26..23d4d10981b3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -37,3 +37,5 @@ endif obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o + +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c new file mode 100644 index 000000000000..24508e28e3fb --- /dev/null +++ b/drivers/mfd/pcf50633-core.c @@ -0,0 +1,710 @@ +/* NXP PCF50633 Power Management Unit (PMU) driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Harald Welte + * Balaji Rao + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Two MBCS registers used during cold start */ +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_MBCS1_USBPRES 0x01 +#define PCF50633_MBCS1_ADAPTPRES 0x01 + +static int __pcf50633_read(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_read_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error reading %d regs at %d\n", num, reg); + + return ret; +} + +static int __pcf50633_write(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_write_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error writing %d regs at %d\n", num, reg); + + return ret; + +} + +/* Read a block of upto 32 regs */ +int pcf50633_read_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_read_block); + +/* Write a block of upto 32 regs */ +int pcf50633_write_block(struct pcf50633 *pcf , u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_write_block); + +u8 pcf50633_reg_read(struct pcf50633 *pcf, u8 reg) +{ + u8 val; + + mutex_lock(&pcf->lock); + __pcf50633_read(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_read); + +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_write); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val) +{ + int ret; + u8 tmp; + + val &= mask; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~mask; + tmp |= val; + ret = __pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_set_bit_mask); + +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + u8 tmp; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~val; + ret = __pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_clear_bits); + +/* sysfs attributes */ +static ssize_t show_dump_regs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + u8 dump[16]; + int n, n1, idx = 0; + char *buf1 = buf; + static u8 address_no_read[] = { /* must be ascending */ + PCF50633_REG_INT1, + PCF50633_REG_INT2, + PCF50633_REG_INT3, + PCF50633_REG_INT4, + PCF50633_REG_INT5, + 0 /* terminator */ + }; + + for (n = 0; n < 256; n += sizeof(dump)) { + for (n1 = 0; n1 < sizeof(dump); n1++) + if (n == address_no_read[idx]) { + idx++; + dump[n1] = 0x00; + } else + dump[n1] = pcf50633_reg_read(pcf, n + n1); + + hex_dump_to_buffer(dump, sizeof(dump), 16, 1, buf1, 128, 0); + buf1 += strlen(buf1); + *buf1++ = '\n'; + *buf1 = '\0'; + } + + return buf1 - buf; +} +static DEVICE_ATTR(dump_regs, 0400, show_dump_regs, NULL); + +static ssize_t show_resume_reason(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + int n; + + n = sprintf(buf, "%02x%02x%02x%02x%02x\n", + pcf->resume_reason[0], + pcf->resume_reason[1], + pcf->resume_reason[2], + pcf->resume_reason[3], + pcf->resume_reason[4]); + + return n; +} +static DEVICE_ATTR(resume_reason, 0400, show_resume_reason, NULL); + +static struct attribute *pcf_sysfs_entries[] = { + &dev_attr_dump_regs.attr, + &dev_attr_resume_reason.attr, + NULL, +}; + +static struct attribute_group pcf_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf_sysfs_entries, +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ || !handler) + return -EINVAL; + + if (WARN_ON(pcf->irq_handler[irq].handler)) + return -EBUSY; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = handler; + pcf->irq_handler[irq].data = data; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_register_irq); + +int pcf50633_free_irq(struct pcf50633 *pcf, int irq) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ) + return -EINVAL; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = NULL; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_free_irq); + +static int __pcf50633_irq_mask_set(struct pcf50633 *pcf, int irq, u8 mask) +{ + u8 reg, bits, tmp; + int ret = 0, idx; + + idx = irq >> 3; + reg = PCF50633_REG_INT1M + idx; + bits = 1 << (irq & 0x07); + + mutex_lock(&pcf->lock); + + if (mask) { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp |= bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + pcf->mask_regs[idx] |= bits; + } else { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + } +out: + mutex_unlock(&pcf->lock); + + return ret; +} + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Masking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 1); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask); + +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Unmasking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 0); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_unmask); + +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq) +{ + u8 reg, bits; + + reg = irq >> 3; + bits = 1 << (irq & 0x07); + + return pcf->mask_regs[reg] & bits; +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask_get); + +static void pcf50633_irq_call_handler(struct pcf50633 *pcf, int irq) +{ + if (pcf->irq_handler[irq].handler) + pcf->irq_handler[irq].handler(irq, pcf->irq_handler[irq].data); +} + +/* Maximum amount of time ONKEY is held before emergency action is taken */ +#define PCF50633_ONKEY1S_TIMEOUT 8 + +static void pcf50633_irq_worker(struct work_struct *work) +{ + struct pcf50633 *pcf; + int ret, i, j; + u8 pcf_int[5], chgstat; + + pcf = container_of(work, struct pcf50633, irq_work); + + /* Read the 5 INT regs in one transaction */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1, + ARRAY_SIZE(pcf_int), pcf_int); + if (ret != ARRAY_SIZE(pcf_int)) { + dev_err(pcf->dev, "Error reading INT registers\n"); + + /* + * If this doesn't ACK the interrupt to the chip, we'll be + * called once again as we're level triggered. + */ + goto out; + } + + /* We immediately read the usb and adapter status. We thus make sure + * only of USBINS/USBREM IRQ handlers are called */ + if (pcf_int[0] & (PCF50633_INT1_USBINS | PCF50633_INT1_USBREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_USBREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_USBINS); + } + + /* Make sure only one of ADPINS or ADPREM is set */ + if (pcf_int[0] & (PCF50633_INT1_ADPINS | PCF50633_INT1_ADPREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPINS); + } + + dev_dbg(pcf->dev, "INT1=0x%02x INT2=0x%02x INT3=0x%02x " + "INT4=0x%02x INT5=0x%02x\n", pcf_int[0], + pcf_int[1], pcf_int[2], pcf_int[3], pcf_int[4]); + + /* Some revisions of the chip don't have a 8s standby mode on + * ONKEY1S press. We try to manually do it in such cases. */ + if ((pcf_int[0] & PCF50633_INT1_SECOND) && pcf->onkey1s_held) { + dev_info(pcf->dev, "ONKEY1S held for %d secs\n", + pcf->onkey1s_held); + if (pcf->onkey1s_held++ == PCF50633_ONKEY1S_TIMEOUT) + if (pcf->pdata->force_shutdown) + pcf->pdata->force_shutdown(pcf); + } + + if (pcf_int[2] & PCF50633_INT3_ONKEY1S) { + dev_info(pcf->dev, "ONKEY1S held\n"); + pcf->onkey1s_held = 1 ; + + /* Unmask IRQ_SECOND */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT1M, + PCF50633_INT1_SECOND); + + /* Unmask IRQ_ONKEYR */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR); + } + + if ((pcf_int[1] & PCF50633_INT2_ONKEYR) && pcf->onkey1s_held) { + pcf->onkey1s_held = 0; + + /* Mask SECOND and ONKEYR interrupts */ + if (pcf->mask_regs[0] & PCF50633_INT1_SECOND) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT1M, + PCF50633_INT1_SECOND, + PCF50633_INT1_SECOND); + + if (pcf->mask_regs[1] & PCF50633_INT2_ONKEYR) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR, + PCF50633_INT2_ONKEYR); + } + + /* Have we just resumed ? */ + if (pcf->is_suspended) { + pcf->is_suspended = 0; + + /* Set the resume reason filtering out non resumers */ + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) + pcf->resume_reason[i] = pcf_int[i] & + pcf->pdata->resumers[i]; + + /* Make sure we don't pass on any ONKEY events to + * userspace now */ + pcf_int[1] &= ~(PCF50633_INT2_ONKEYR | PCF50633_INT2_ONKEYF); + } + + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) { + /* Unset masked interrupts */ + pcf_int[i] &= ~pcf->mask_regs[i]; + + for (j = 0; j < 8 ; j++) + if (pcf_int[i] & (1 << j)) + pcf50633_irq_call_handler(pcf, (i * 8) + j); + } + +out: + put_device(pcf->dev); + enable_irq(pcf->irq); +} + +static irqreturn_t pcf50633_irq(int irq, void *data) +{ + struct pcf50633 *pcf = data; + + dev_dbg(pcf->dev, "pcf50633_irq\n"); + + get_device(pcf->dev); + disable_irq(pcf->irq); + schedule_work(&pcf->irq_work); + + return IRQ_HANDLED; +} + +static void +pcf50633_client_dev_register(struct pcf50633 *pcf, const char *name, + struct platform_device **pdev) +{ + struct pcf50633_subdev_pdata *subdev_pdata; + int ret; + + *pdev = platform_device_alloc(name, -1); + if (!*pdev) { + dev_err(pcf->dev, "Falied to allocate %s\n", name); + return; + } + + subdev_pdata = kmalloc(sizeof(*subdev_pdata), GFP_KERNEL); + if (!subdev_pdata) { + dev_err(pcf->dev, "Error allocating subdev pdata\n"); + platform_device_put(*pdev); + } + + subdev_pdata->pcf = pcf; + platform_device_add_data(*pdev, subdev_pdata, sizeof(*subdev_pdata)); + + (*pdev)->dev.parent = pcf->dev; + + ret = platform_device_add(*pdev); + if (ret) { + dev_err(pcf->dev, "Failed to register %s: %d\n", name, ret); + platform_device_put(*pdev); + *pdev = NULL; + } +} + +#ifdef CONFIG_PM +static int pcf50633_suspend(struct device *dev, pm_message_t state) +{ + struct pcf50633 *pcf; + int ret = 0, i; + u8 res[5]; + + pcf = dev_get_drvdata(dev); + + /* Make sure our interrupt handlers are not called + * henceforth */ + disable_irq(pcf->irq); + + /* Make sure that any running IRQ worker has quit */ + cancel_work_sync(&pcf->irq_work); + + /* Save the masks */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) { + dev_err(pcf->dev, "error saving irq masks\n"); + goto out; + } + + /* Write wakeup irq masks */ + for (i = 0; i < ARRAY_SIZE(res); i++) + res[i] = ~pcf->pdata->resumers[i]; + + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(res), &res[0]); + if (ret < 0) { + dev_err(pcf->dev, "error writing wakeup irq masks\n"); + goto out; + } + + pcf->is_suspended = 1; + +out: + return ret; +} + +static int pcf50633_resume(struct device *dev) +{ + struct pcf50633 *pcf; + int ret; + + pcf = dev_get_drvdata(dev); + + /* Write the saved mask registers */ + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) + dev_err(pcf->dev, "Error restoring saved suspend masks\n"); + + /* Restore regulators' state */ + + + get_device(pcf->dev); + + /* + * Clear any pending interrupts and set resume reason if any. + * This will leave with enable_irq() + */ + pcf50633_irq_worker(&pcf->irq_work); + + return 0; +} +#else +#define pcf50633_suspend NULL +#define pcf50633_resume NULL +#endif + +static int __devinit pcf50633_probe(struct i2c_client *client, + const struct i2c_device_id *ids) +{ + struct pcf50633 *pcf; + struct pcf50633_platform_data *pdata = client->dev.platform_data; + int i, ret = 0; + int version, variant; + + pcf = kzalloc(sizeof(*pcf), GFP_KERNEL); + if (!pcf) + return -ENOMEM; + + pcf->pdata = pdata; + + mutex_init(&pcf->lock); + + i2c_set_clientdata(client, pcf); + pcf->dev = &client->dev; + pcf->i2c_client = client; + pcf->irq = client->irq; + + INIT_WORK(&pcf->irq_work, pcf50633_irq_worker); + + version = pcf50633_reg_read(pcf, 0); + variant = pcf50633_reg_read(pcf, 1); + if (version < 0 || variant < 0) { + dev_err(pcf->dev, "Unable to probe pcf50633\n"); + ret = -ENODEV; + goto err; + } + + dev_info(pcf->dev, "Probed device version %d variant %d\n", + version, variant); + + /* Enable all interrupts except RTC SECOND */ + pcf->mask_regs[0] = 0x80; + pcf50633_reg_write(pcf, PCF50633_REG_INT1M, pcf->mask_regs[0]); + pcf50633_reg_write(pcf, PCF50633_REG_INT2M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT3M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT4M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT5M, 0x00); + + /* Create sub devices */ + pcf50633_client_dev_register(pcf, "pcf50633-input", + &pcf->input_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-rtc", + &pcf->rtc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-mbc", + &pcf->mbc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-adc", + &pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) { + struct platform_device *pdev; + + pdev = platform_device_alloc("pcf50633-regltr", i); + if (!pdev) { + dev_err(pcf->dev, "Cannot create regulator\n"); + continue; + } + + pdev->dev.parent = pcf->dev; + pdev->dev.platform_data = &pdata->reg_init_data[i]; + pdev->dev.driver_data = pcf; + pcf->regulator_pdev[i] = pdev; + + platform_device_add(pdev); + } + + if (client->irq) { + set_irq_handler(client->irq, handle_level_irq); + ret = request_irq(client->irq, pcf50633_irq, + IRQF_TRIGGER_LOW, "pcf50633", pcf); + + if (ret) { + dev_err(pcf->dev, "Failed to request IRQ %d\n", ret); + goto err; + } + } else { + dev_err(pcf->dev, "No IRQ configured\n"); + goto err; + } + + if (enable_irq_wake(client->irq) < 0) + dev_err(pcf->dev, "IRQ %u cannot be enabled as wake-up source" + "in this hardware revision", client->irq); + + ret = sysfs_create_group(&client->dev.kobj, &pcf_attr_group); + if (ret) + dev_err(pcf->dev, "error creating sysfs entries\n"); + + if (pdata->probe_done) + pdata->probe_done(pcf); + + return 0; + +err: + kfree(pcf); + return ret; +} + +static int __devexit pcf50633_remove(struct i2c_client *client) +{ + struct pcf50633 *pcf = i2c_get_clientdata(client); + int i; + + free_irq(pcf->irq, pcf); + + platform_device_unregister(pcf->input_pdev); + platform_device_unregister(pcf->rtc_pdev); + platform_device_unregister(pcf->mbc_pdev); + platform_device_unregister(pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) + platform_device_unregister(pcf->regulator_pdev[i]); + + kfree(pcf); + + return 0; +} + +static struct i2c_device_id pcf50633_id_table[] = { + {"pcf50633", 0x73}, +}; + +static struct i2c_driver pcf50633_driver = { + .driver = { + .name = "pcf50633", + .suspend = pcf50633_suspend, + .resume = pcf50633_resume, + }, + .id_table = pcf50633_id_table, + .probe = pcf50633_probe, + .remove = __devexit_p(pcf50633_remove), +}; + +static int __init pcf50633_init(void) +{ + return i2c_add_driver(&pcf50633_driver); +} + +static void __exit pcf50633_exit(void) +{ + i2c_del_driver(&pcf50633_driver); +} + +MODULE_DESCRIPTION("I2C chip driver for NXP PCF50633 PMU"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); + +module_init(pcf50633_init); +module_exit(pcf50633_exit); diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h new file mode 100644 index 000000000000..4455b212d75a --- /dev/null +++ b/include/linux/mfd/pcf50633/core.h @@ -0,0 +1,218 @@ +/* + * core.h -- Core driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_CORE_H +#define __LINUX_MFD_PCF50633_CORE_H + +#include +#include +#include +#include +#include + +struct pcf50633; + +#define PCF50633_NUM_REGULATORS 11 + +struct pcf50633_platform_data { + struct regulator_init_data reg_init_data[PCF50633_NUM_REGULATORS]; + + char **batteries; + int num_batteries; + + /* Callbacks */ + void (*probe_done)(struct pcf50633 *); + void (*mbc_event_callback)(struct pcf50633 *, int); + void (*regulator_registered)(struct pcf50633 *, int); + void (*force_shutdown)(struct pcf50633 *); + + u8 resumers[5]; +}; + +struct pcf50633_subdev_pdata { + struct pcf50633 *pcf; +}; + +struct pcf50633_irq { + void (*handler) (int, void *); + void *data; +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data); +int pcf50633_free_irq(struct pcf50633 *pcf, int irq); + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq); + +int pcf50633_read_block(struct pcf50633 *, u8 reg, + int nr_regs, u8 *data); +int pcf50633_write_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data); +u8 pcf50633_reg_read(struct pcf50633 *, u8 reg); +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val); +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 bits); + +/* Interrupt registers */ + +#define PCF50633_REG_INT1 0x02 +#define PCF50633_REG_INT2 0x03 +#define PCF50633_REG_INT3 0x04 +#define PCF50633_REG_INT4 0x05 +#define PCF50633_REG_INT5 0x06 + +#define PCF50633_REG_INT1M 0x07 +#define PCF50633_REG_INT2M 0x08 +#define PCF50633_REG_INT3M 0x09 +#define PCF50633_REG_INT4M 0x0a +#define PCF50633_REG_INT5M 0x0b + +enum { + /* Chip IRQs */ + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_RESERVED1, + PCF50633_IRQ_RESERVED2, + PCF50633_IRQ_ALARM, + PCF50633_IRQ_SECOND, + PCF50633_IRQ_ONKEYR, + PCF50633_IRQ_ONKEYF, + PCF50633_IRQ_EXTON1R, + PCF50633_IRQ_EXTON1F, + PCF50633_IRQ_EXTON2R, + PCF50633_IRQ_EXTON2F, + PCF50633_IRQ_EXTON3R, + PCF50633_IRQ_EXTON3F, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_ADCRDY, + PCF50633_IRQ_ONKEY1S, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, + PCF50633_IRQ_HIGHTMP, + PCF50633_IRQ_AUTOPWRFAIL, + PCF50633_IRQ_DWN1PWRFAIL, + PCF50633_IRQ_DWN2PWRFAIL, + PCF50633_IRQ_LEDPWRFAIL, + PCF50633_IRQ_LEDOVP, + PCF50633_IRQ_LDO1PWRFAIL, + PCF50633_IRQ_LDO2PWRFAIL, + PCF50633_IRQ_LDO3PWRFAIL, + PCF50633_IRQ_LDO4PWRFAIL, + PCF50633_IRQ_LDO5PWRFAIL, + PCF50633_IRQ_LDO6PWRFAIL, + PCF50633_IRQ_HCLDOPWRFAIL, + PCF50633_IRQ_HCLDOOVL, + + /* Always last */ + PCF50633_NUM_IRQ, +}; + +struct pcf50633 { + struct device *dev; + struct i2c_client *i2c_client; + + struct pcf50633_platform_data *pdata; + int irq; + struct pcf50633_irq irq_handler[PCF50633_NUM_IRQ]; + struct work_struct irq_work; + struct mutex lock; + + u8 mask_regs[5]; + + u8 suspend_irq_masks[5]; + u8 resume_reason[5]; + int is_suspended; + + int onkey1s_held; + + struct platform_device *rtc_pdev; + struct platform_device *mbc_pdev; + struct platform_device *adc_pdev; + struct platform_device *input_pdev; + struct platform_device *regulator_pdev[PCF50633_NUM_REGULATORS]; +}; + +enum pcf50633_reg_int1 { + PCF50633_INT1_ADPINS = 0x01, /* Adapter inserted */ + PCF50633_INT1_ADPREM = 0x02, /* Adapter removed */ + PCF50633_INT1_USBINS = 0x04, /* USB inserted */ + PCF50633_INT1_USBREM = 0x08, /* USB removed */ + /* reserved */ + PCF50633_INT1_ALARM = 0x40, /* RTC alarm time is reached */ + PCF50633_INT1_SECOND = 0x80, /* RTC periodic second interrupt */ +}; + +enum pcf50633_reg_int2 { + PCF50633_INT2_ONKEYR = 0x01, /* ONKEY rising edge */ + PCF50633_INT2_ONKEYF = 0x02, /* ONKEY falling edge */ + PCF50633_INT2_EXTON1R = 0x04, /* EXTON1 rising edge */ + PCF50633_INT2_EXTON1F = 0x08, /* EXTON1 falling edge */ + PCF50633_INT2_EXTON2R = 0x10, /* EXTON2 rising edge */ + PCF50633_INT2_EXTON2F = 0x20, /* EXTON2 falling edge */ + PCF50633_INT2_EXTON3R = 0x40, /* EXTON3 rising edge */ + PCF50633_INT2_EXTON3F = 0x80, /* EXTON3 falling edge */ +}; + +enum pcf50633_reg_int3 { + PCF50633_INT3_BATFULL = 0x01, /* Battery full */ + PCF50633_INT3_CHGHALT = 0x02, /* Charger halt */ + PCF50633_INT3_THLIMON = 0x04, + PCF50633_INT3_THLIMOFF = 0x08, + PCF50633_INT3_USBLIMON = 0x10, + PCF50633_INT3_USBLIMOFF = 0x20, + PCF50633_INT3_ADCRDY = 0x40, /* ADC result ready */ + PCF50633_INT3_ONKEY1S = 0x80, /* ONKEY pressed 1 second */ +}; + +enum pcf50633_reg_int4 { + PCF50633_INT4_LOWSYS = 0x01, + PCF50633_INT4_LOWBAT = 0x02, + PCF50633_INT4_HIGHTMP = 0x04, + PCF50633_INT4_AUTOPWRFAIL = 0x08, + PCF50633_INT4_DWN1PWRFAIL = 0x10, + PCF50633_INT4_DWN2PWRFAIL = 0x20, + PCF50633_INT4_LEDPWRFAIL = 0x40, + PCF50633_INT4_LEDOVP = 0x80, +}; + +enum pcf50633_reg_int5 { + PCF50633_INT5_LDO1PWRFAIL = 0x01, + PCF50633_INT5_LDO2PWRFAIL = 0x02, + PCF50633_INT5_LDO3PWRFAIL = 0x04, + PCF50633_INT5_LDO4PWRFAIL = 0x08, + PCF50633_INT5_LDO5PWRFAIL = 0x10, + PCF50633_INT5_LDO6PWRFAIL = 0x20, + PCF50633_INT5_HCLDOPWRFAIL = 0x40, + PCF50633_INT5_HCLDOOVL = 0x80, +}; + +/* misc. registers */ +#define PCF50633_REG_OOCSHDWN 0x0c + +/* LED registers */ +#define PCF50633_REG_LEDOUT 0x28 +#define PCF50633_REG_LEDENA 0x29 +#define PCF50633_REG_LEDCTL 0x2a +#define PCF50633_REG_LEDDIM 0x2b + +#endif + -- cgit v1.2.3 From 08c3e06a5eb27d43b712adef18379f8464425e71 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:26 +0100 Subject: mfd: PCF50633 adc driver This patch adds basic support for the PCF50633 ADC. The subtractive mode is not supported yet. Since we don't have adc subsystem, it currently lives in drivers/mfd. Signed-off-by: Balaji Rao Cc: Andy Green Acked-by: Jonathan Cameron Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 + drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-adc.c | 277 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/adc.h | 72 ++++++++++ 4 files changed, 358 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-adc.c create mode 100644 include/linux/mfd/pcf50633/adc.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 3ac32e45b03b..2fa3504e165a 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -226,6 +226,13 @@ config MFD_PCF50633 facilities, and registers devices for the various functions so that function-specific drivers can bind to them. +config PCF50633_ADC + tristate "Support for NXP PCF50633 ADC" + depends on MFD_PCF50633 + help + Say yes here if you want to include support for ADC in the + NXP PCF50633 chip. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 23d4d10981b3..138f9c4d3d7b 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -38,4 +38,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o -obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-adc.c b/drivers/mfd/pcf50633-adc.c new file mode 100644 index 000000000000..c2d05becfa97 --- /dev/null +++ b/drivers/mfd/pcf50633-adc.c @@ -0,0 +1,277 @@ +/* NXP PCF50633 ADC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * NOTE: This driver does not yet support subtractive ADC mode, which means + * you can do only one measurement per read request. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_adc_request { + int mux; + int avg; + int result; + void (*callback)(struct pcf50633 *, void *, int); + void *callback_param; + + /* Used in case of sync requests */ + struct completion completion; + +}; + +#define PCF50633_MAX_ADC_FIFO_DEPTH 8 + +struct pcf50633_adc { + struct pcf50633 *pcf; + + /* Private stuff */ + struct pcf50633_adc_request *queue[PCF50633_MAX_ADC_FIFO_DEPTH]; + int queue_head; + int queue_tail; + struct mutex queue_mutex; +}; + +static inline struct pcf50633_adc *__to_adc(struct pcf50633 *pcf) +{ + return platform_get_drvdata(pcf->adc_pdev); +} + +static void adc_setup(struct pcf50633 *pcf, int channel, int avg) +{ + channel &= PCF50633_ADCC1_ADCMUX_MASK; + + /* kill ratiometric, but enable ACCSW biasing */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC2, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_ADCC3, 0x01); + + /* start ADC conversion on selected channel */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC1, channel | avg | + PCF50633_ADCC1_ADCSTART | PCF50633_ADCC1_RES_10BIT); +} + +static void trigger_next_adc_job_if_any(struct pcf50633 *pcf) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + + if (!adc->queue[head]) { + mutex_unlock(&adc->queue_mutex); + return; + } + mutex_unlock(&adc->queue_mutex); + + adc_setup(pcf, adc->queue[head]->mux, adc->queue[head]->avg); +} + +static int +adc_enqueue_request(struct pcf50633 *pcf, struct pcf50633_adc_request *req) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head, tail; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + tail = adc->queue_tail; + + if (adc->queue[tail]) { + mutex_unlock(&adc->queue_mutex); + return -EBUSY; + } + + adc->queue[tail] = req; + adc->queue_tail = (tail + 1) & (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + trigger_next_adc_job_if_any(pcf); + + return 0; +} + +static void +pcf50633_adc_sync_read_callback(struct pcf50633 *pcf, void *param, int result) +{ + struct pcf50633_adc_request *req = param; + + req->result = result; + complete(&req->completion); +} + +int pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = pcf50633_adc_sync_read_callback; + req->callback_param = req; + + init_completion(&req->completion); + adc_enqueue_request(pcf, req); + wait_for_completion(&req->completion); + + return req->result; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_sync_read); + +int pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = callback; + req->callback_param = callback_param; + + adc_enqueue_request(pcf, req); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_async_read); + +static int adc_result(struct pcf50633 *pcf) +{ + u8 adcs1, adcs3; + u16 result; + + adcs1 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS1); + adcs3 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS3); + result = (adcs1 << 2) | (adcs3 & PCF50633_ADCS3_ADCDAT1L_MASK); + + dev_dbg(pcf->dev, "adc result = %d\n", result); + + return result; +} + +static void pcf50633_adc_irq(int irq, void *data) +{ + struct pcf50633_adc *adc = data; + struct pcf50633 *pcf = adc->pcf; + struct pcf50633_adc_request *req; + int head; + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + req = adc->queue[head]; + if (WARN_ON(!req)) { + dev_err(pcf->dev, "pcf50633-adc irq: ADC queue empty!\n"); + mutex_unlock(&adc->queue_mutex); + return; + } + adc->queue[head] = NULL; + adc->queue_head = (head + 1) & + (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + req->callback(pcf, req->callback_param, adc_result(pcf)); + kfree(req); + + trigger_next_adc_job_if_any(pcf); +} + +static int __devinit pcf50633_adc_probe(struct platform_device *pdev) +{ + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + struct pcf50633_adc *adc; + + adc = kzalloc(sizeof(*adc), GFP_KERNEL); + if (!adc) + return -ENOMEM; + + adc->pcf = pdata->pcf; + platform_set_drvdata(pdev, adc); + + pcf50633_register_irq(pdata->pcf, PCF50633_IRQ_ADCRDY, + pcf50633_adc_irq, adc); + + mutex_init(&adc->queue_mutex); + + return 0; +} + +static int __devexit pcf50633_adc_remove(struct platform_device *pdev) +{ + struct pcf50633_adc *adc = platform_get_drvdata(pdev); + int i, head; + + pcf50633_free_irq(adc->pcf, PCF50633_IRQ_ADCRDY); + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + if (WARN_ON(adc->queue[head])) + dev_err(adc->pcf->dev, + "adc driver removed with request pending\n"); + + for (i = 0; i < PCF50633_MAX_ADC_FIFO_DEPTH; i++) + kfree(adc->queue[i]); + + mutex_unlock(&adc->queue_mutex); + kfree(adc); + + return 0; +} + +static struct platform_driver pcf50633_adc_driver = { + .driver = { + .name = "pcf50633-adc", + }, + .probe = pcf50633_adc_probe, + .remove = __devexit_p(pcf50633_adc_remove), +}; + +static int __init pcf50633_adc_init(void) +{ + return platform_driver_register(&pcf50633_adc_driver); +} +module_init(pcf50633_adc_init); + +static void __exit pcf50633_adc_exit(void) +{ + platform_driver_unregister(&pcf50633_adc_driver); +} +module_exit(pcf50633_adc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 adc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-adc"); + diff --git a/include/linux/mfd/pcf50633/adc.h b/include/linux/mfd/pcf50633/adc.h new file mode 100644 index 000000000000..56669b4183ad --- /dev/null +++ b/include/linux/mfd/pcf50633/adc.h @@ -0,0 +1,72 @@ +/* + * adc.h -- Driver for NXP PCF50633 ADC + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_ADC_H +#define __LINUX_MFD_PCF50633_ADC_H + +#include +#include + +/* ADC Registers */ +#define PCF50633_REG_ADCC3 0x52 +#define PCF50633_REG_ADCC2 0x53 +#define PCF50633_REG_ADCC1 0x54 +#define PCF50633_REG_ADCS1 0x55 +#define PCF50633_REG_ADCS2 0x56 +#define PCF50633_REG_ADCS3 0x57 + +#define PCF50633_ADCC1_ADCSTART 0x01 +#define PCF50633_ADCC1_RES_10BIT 0x02 +#define PCF50633_ADCC1_AVERAGE_NO 0x00 +#define PCF50633_ADCC1_AVERAGE_4 0x04 +#define PCF50633_ADCC1_AVERAGE_8 0x08 +#define PCF50633_ADCC1_AVERAGE_16 0x0c +#define PCF50633_ADCC1_MUX_BATSNS_RES 0x00 +#define PCF50633_ADCC1_MUX_BATSNS_SUBTR 0x10 +#define PCF50633_ADCC1_MUX_ADCIN2_RES 0x20 +#define PCF50633_ADCC1_MUX_ADCIN2_SUBTR 0x30 +#define PCF50633_ADCC1_MUX_BATTEMP 0x60 +#define PCF50633_ADCC1_MUX_ADCIN1 0x70 +#define PCF50633_ADCC1_AVERAGE_MASK 0x0c +#define PCF50633_ADCC1_ADCMUX_MASK 0xf0 + +#define PCF50633_ADCC2_RATIO_NONE 0x00 +#define PCF50633_ADCC2_RATIO_BATTEMP 0x01 +#define PCF50633_ADCC2_RATIO_ADCIN1 0x02 +#define PCF50633_ADCC2_RATIO_BOTH 0x03 +#define PCF50633_ADCC2_RATIOSETTL_100US 0x04 + +#define PCF50633_ADCC3_ACCSW_EN 0x01 +#define PCF50633_ADCC3_NTCSW_EN 0x04 +#define PCF50633_ADCC3_RES_DIV_TWO 0x10 +#define PCF50633_ADCC3_RES_DIV_THREE 0x00 + +#define PCF50633_ADCS3_REF_NTCSW 0x00 +#define PCF50633_ADCS3_REF_ACCSW 0x10 +#define PCF50633_ADCS3_REF_2V0 0x20 +#define PCF50633_ADCS3_REF_VISA 0x30 +#define PCF50633_ADCS3_REF_2V0_2 0x70 +#define PCF50633_ADCS3_ADCRDY 0x80 + +#define PCF50633_ADCS3_ADCDAT1L_MASK 0x03 +#define PCF50633_ADCS3_ADCDAT2L_MASK 0x0c +#define PCF50633_ADCS3_ADCDAT2L_SHIFT 2 +#define PCF50633_ASCS3_REF_MASK 0x70 + +extern int +pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param); +extern int +pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg); + +#endif /* __LINUX_PCF50633_ADC_H */ -- cgit v1.2.3 From 6a3d119b4ce29cf32bfe91eb61d46e9dbd8ce38a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:37 +0100 Subject: mfd: PCF50633 gpio support What the PCF05633 calls as a 'GPIO' is much more than the GPIO in the linux sense and there are only 4 of them - which means, the gpiolib is not used here. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 +++ drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-gpio.c | 118 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/gpio.h | 52 +++++++++++++++++ 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-gpio.c create mode 100644 include/linux/mfd/pcf50633/gpio.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 2fa3504e165a..06a2b0f7737c 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -233,6 +233,13 @@ config PCF50633_ADC Say yes here if you want to include support for ADC in the NXP PCF50633 chip. +config PCF50633_GPIO + tristate "Support for NXP PCF50633 GPIO" + depends on MFD_PCF50633 + help + Say yes here if you want to include support GPIO for pins on + the PCF50633 chip. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 138f9c4d3d7b..3afb5192e4da 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -39,4 +39,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o -obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o +obj-$(CONFIG_PCF50633_GPIO) += pcf50633-gpio.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-gpio.c b/drivers/mfd/pcf50633-gpio.c new file mode 100644 index 000000000000..2fa2eca5c9cc --- /dev/null +++ b/drivers/mfd/pcf50633-gpio.c @@ -0,0 +1,118 @@ +/* NXP PCF50633 GPIO Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include + +#include +#include + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_HCLDOOUT 0x39 + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val) +{ + u8 reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + + return pcf50633_reg_set_bit_mask(pcf, reg, 0x07, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_set); + +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg) & 0x07; + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_get); + +int pcf50633_gpio_invert_set(struct pcf50633 *pcf, int gpio, int invert) +{ + u8 val, reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = !!invert << 3; + + return pcf50633_reg_set_bit_mask(pcf, reg, 1 << 3, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_set); + +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg); + + return val & (1 << 3); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_get); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *pcf, + int gpio, int regulator, int on) +{ + u8 reg, val, mask; + + /* the *ENA register is always one after the *OUT register */ + reg = pcf50633_regulator_registers[regulator] + 1; + + val = !!on << (gpio - PCF50633_GPIO1); + mask = 1 << (gpio - PCF50633_GPIO1); + + return pcf50633_reg_set_bit_mask(pcf, reg, mask, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_power_supply_set); diff --git a/include/linux/mfd/pcf50633/gpio.h b/include/linux/mfd/pcf50633/gpio.h new file mode 100644 index 000000000000..a42b845efc54 --- /dev/null +++ b/include/linux/mfd/pcf50633/gpio.h @@ -0,0 +1,52 @@ +/* + * gpio.h -- GPIO driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_GPIO_H +#define __LINUX_MFD_PCF50633_GPIO_H + +#include + +#define PCF50633_GPIO1 1 +#define PCF50633_GPIO2 2 +#define PCF50633_GPIO3 3 +#define PCF50633_GPO 4 + +#define PCF50633_REG_GPIO1CFG 0x14 +#define PCF50633_REG_GPIO2CFG 0x15 +#define PCF50633_REG_GPIO3CFG 0x16 +#define PCF50633_REG_GPOCFG 0x17 + +#define PCF50633_GPOCFG_GPOSEL_MASK 0x07 + +enum pcf50633_reg_gpocfg { + PCF50633_GPOCFG_GPOSEL_0 = 0x00, + PCF50633_GPOCFG_GPOSEL_LED_NFET = 0x01, + PCF50633_GPOCFG_GPOSEL_SYSxOK = 0x02, + PCF50633_GPOCFG_GPOSEL_CLK32K = 0x03, + PCF50633_GPOCFG_GPOSEL_ADAPUSB = 0x04, + PCF50633_GPOCFG_GPOSEL_USBxOK = 0x05, + PCF50633_GPOCFG_GPOSEL_ACTPH4 = 0x06, + PCF50633_GPOCFG_GPOSEL_1 = 0x07, + PCF50633_GPOCFG_GPOSEL_INVERSE = 0x08, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val); +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_invert_set(struct pcf50633 *, int gpio, int invert); +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *, + int gpio, int regulator, int on); +#endif /* __LINUX_MFD_PCF50633_GPIO_H */ + + -- cgit v1.2.3 From f5714dc97d63cc0dd1219bd0eb2e1f8df1e4347a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:50:55 +0100 Subject: power_supply: PCF50633 battery charger driver Signed-off-by: Balaji Rao Cc: Andy Green Cc: David Woodhouse Acked-by: Anton Vorontsov Signed-off-by: Samuel Ortiz --- drivers/power/Kconfig | 6 + drivers/power/Makefile | 1 + drivers/power/pcf50633-charger.c | 358 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/mbc.h | 134 +++++++++++++++ 4 files changed, 499 insertions(+) create mode 100644 drivers/power/pcf50633-charger.c create mode 100644 include/linux/mfd/pcf50633/mbc.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 668472405a57..33da1127992a 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -82,4 +82,10 @@ config BATTERY_DA9030 Say Y here to enable support for batteries charger integrated into DA9030 PMIC. +config CHARGER_PCF50633 + tristate "NXP PCF50633 MBC" + depends on MFD_PCF50633 + help + Say Y to include support for NXP PCF50633 Main Battery Charger. + endif # POWER_SUPPLY diff --git a/drivers/power/Makefile b/drivers/power/Makefile index eebb15505a40..2fcf41d13e5c 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o +obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o \ No newline at end of file diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c new file mode 100644 index 000000000000..e988ec130fcd --- /dev/null +++ b/drivers/power/pcf50633-charger.c @@ -0,0 +1,358 @@ +/* NXP PCF50633 Main Battery Charger Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_mbc { + struct pcf50633 *pcf; + + int adapter_active; + int adapter_online; + int usb_active; + int usb_online; + + struct power_supply usb; + struct power_supply adapter; +}; + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int ret = 0; + u8 bits; + + if (ma >= 1000) + bits = PCF50633_MBCC7_USB_1000mA; + else if (ma >= 500) + bits = PCF50633_MBCC7_USB_500mA; + else if (ma >= 100) + bits = PCF50633_MBCC7_USB_100mA; + else + bits = PCF50633_MBCC7_USB_SUSPEND; + + ret = pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC7, + PCF50633_MBCC7_USB_MASK, bits); + if (ret) + dev_err(pcf->dev, "error setting usb curlim to %d mA\n", ma); + else + dev_info(pcf->dev, "usb curlim to %d mA\n", ma); + + power_supply_changed(&mbc->usb); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_usb_curlim_set); + +int pcf50633_mbc_get_status(struct pcf50633 *pcf) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int status = 0; + + if (mbc->usb_online) + status |= PCF50633_MBC_USB_ONLINE; + if (mbc->usb_active) + status |= PCF50633_MBC_USB_ACTIVE; + if (mbc->adapter_online) + status |= PCF50633_MBC_ADAPTER_ONLINE; + if (mbc->adapter_active) + status |= PCF50633_MBC_ADAPTER_ACTIVE; + + return status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_get_status); + +void pcf50633_mbc_set_status(struct pcf50633 *pcf, int what, int status) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + + if (what & PCF50633_MBC_USB_ONLINE) + mbc->usb_online = !!status; + if (what & PCF50633_MBC_USB_ACTIVE) + mbc->usb_active = !!status; + if (what & PCF50633_MBC_ADAPTER_ONLINE) + mbc->adapter_online = !!status; + if (what & PCF50633_MBC_ADAPTER_ACTIVE) + mbc->adapter_active = !!status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_set_status); + +static ssize_t +show_chgmode(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + + u8 mbcs2 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS2); + u8 chgmod = (mbcs2 & PCF50633_MBCS2_MBC_MASK); + + return sprintf(buf, "%d\n", chgmod); +} +static DEVICE_ATTR(chgmode, S_IRUGO, show_chgmode, NULL); + +static ssize_t +show_usblim(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + u8 usblim = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCC7) & + PCF50633_MBCC7_USB_MASK; + unsigned int ma; + + if (usblim == PCF50633_MBCC7_USB_1000mA) + ma = 1000; + else if (usblim == PCF50633_MBCC7_USB_500mA) + ma = 500; + else if (usblim == PCF50633_MBCC7_USB_100mA) + ma = 100; + else + ma = 0; + + return sprintf(buf, "%u\n", ma); +} + +static ssize_t set_usblim(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + unsigned long ma; + int ret; + + ret = strict_strtoul(buf, 10, &ma); + if (ret) + return -EINVAL; + + pcf50633_mbc_usb_curlim_set(mbc->pcf, ma); + + return count; +} + +static DEVICE_ATTR(usb_curlim, S_IRUGO | S_IWUSR, show_usblim, set_usblim); + +static struct attribute *pcf50633_mbc_sysfs_entries[] = { + &dev_attr_chgmode.attr, + &dev_attr_usb_curlim.attr, + NULL, +}; + +static struct attribute_group mbc_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf50633_mbc_sysfs_entries, +}; + +static void +pcf50633_mbc_irq_handler(int irq, void *data) +{ + struct pcf50633_mbc *mbc = data; + + /* USB */ + if (irq == PCF50633_IRQ_USBINS) { + mbc->usb_online = 1; + } else if (irq == PCF50633_IRQ_USBREM) { + mbc->usb_online = 0; + mbc->usb_active = 0; + pcf50633_mbc_usb_curlim_set(mbc->pcf, 0); + } + + /* Adapter */ + if (irq == PCF50633_IRQ_ADPINS) { + mbc->adapter_online = 1; + mbc->adapter_active = 1; + } else if (irq == PCF50633_IRQ_ADPREM) { + mbc->adapter_online = 0; + mbc->adapter_active = 0; + } + + if (irq == PCF50633_IRQ_BATFULL) { + mbc->usb_active = 0; + mbc->adapter_active = 0; + } + + power_supply_changed(&mbc->usb); + power_supply_changed(&mbc->adapter); + + if (mbc->pcf->pdata->mbc_event_callback) + mbc->pcf->pdata->mbc_event_callback(mbc->pcf, irq); +} + +static int adapter_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->adapter_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int usb_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->usb_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static enum power_supply_property power_props[] = { + POWER_SUPPLY_PROP_ONLINE, +}; + +static const u8 mbc_irq_handlers[] = { + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, +}; + +static int __devinit pcf50633_mbc_probe(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc; + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + int ret; + int i; + u8 mbcs1; + + mbc = kzalloc(sizeof(*mbc), GFP_KERNEL); + if (!mbc) + return -ENOMEM; + + platform_set_drvdata(pdev, mbc); + mbc->pcf = pdata->pcf; + + /* Set up IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_register_irq(mbc->pcf, mbc_irq_handlers[i], + pcf50633_mbc_irq_handler, mbc); + + /* Create power supplies */ + mbc->adapter.name = "adapter"; + mbc->adapter.type = POWER_SUPPLY_TYPE_MAINS; + mbc->adapter.properties = power_props; + mbc->adapter.num_properties = ARRAY_SIZE(power_props); + mbc->adapter.get_property = &adapter_get_property; + mbc->adapter.supplied_to = mbc->pcf->pdata->batteries; + mbc->adapter.num_supplicants = mbc->pcf->pdata->num_batteries; + + mbc->usb.name = "usb"; + mbc->usb.type = POWER_SUPPLY_TYPE_USB; + mbc->usb.properties = power_props; + mbc->usb.num_properties = ARRAY_SIZE(power_props); + mbc->usb.get_property = usb_get_property; + mbc->usb.supplied_to = mbc->pcf->pdata->batteries; + mbc->usb.num_supplicants = mbc->pcf->pdata->num_batteries; + + ret = power_supply_register(&pdev->dev, &mbc->adapter); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register adapter\n"); + kfree(mbc); + return ret; + } + + ret = power_supply_register(&pdev->dev, &mbc->usb); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register usb\n"); + power_supply_unregister(&mbc->adapter); + kfree(mbc); + return ret; + } + + ret = sysfs_create_group(&pdev->dev.kobj, &mbc_attr_group); + if (ret) + dev_err(mbc->pcf->dev, "failed to create sysfs entries\n"); + + mbcs1 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS1); + if (mbcs1 & PCF50633_MBCS1_USBPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_USBINS, mbc); + if (mbcs1 & PCF50633_MBCS1_ADAPTPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_ADPINS, mbc); + + return 0; +} + +static int __devexit pcf50633_mbc_remove(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pdev); + int i; + + /* Remove IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_free_irq(mbc->pcf, mbc_irq_handlers[i]); + + power_supply_unregister(&mbc->usb); + power_supply_unregister(&mbc->adapter); + + kfree(mbc); + + return 0; +} + +static struct platform_driver pcf50633_mbc_driver = { + .driver = { + .name = "pcf50633-mbc", + }, + .probe = pcf50633_mbc_probe, + .remove = __devexit_p(pcf50633_mbc_remove), +}; + +static int __init pcf50633_mbc_init(void) +{ + return platform_driver_register(&pcf50633_mbc_driver); +} +module_init(pcf50633_mbc_init); + +static void __exit pcf50633_mbc_exit(void) +{ + platform_driver_unregister(&pcf50633_mbc_driver); +} +module_exit(pcf50633_mbc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 mbc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-mbc"); diff --git a/include/linux/mfd/pcf50633/mbc.h b/include/linux/mfd/pcf50633/mbc.h new file mode 100644 index 000000000000..6e17619b773a --- /dev/null +++ b/include/linux/mfd/pcf50633/mbc.h @@ -0,0 +1,134 @@ +/* + * mbc.h -- Driver for NXP PCF50633 Main Battery Charger + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_MBC_H +#define __LINUX_MFD_PCF50633_MBC_H + +#include +#include + +#define PCF50633_REG_MBCC1 0x43 +#define PCF50633_REG_MBCC2 0x44 +#define PCF50633_REG_MBCC3 0x45 +#define PCF50633_REG_MBCC4 0x46 +#define PCF50633_REG_MBCC5 0x47 +#define PCF50633_REG_MBCC6 0x48 +#define PCF50633_REG_MBCC7 0x49 +#define PCF50633_REG_MBCC8 0x4a +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_REG_MBCS3 0x4d + +enum pcf50633_reg_mbcc1 { + PCF50633_MBCC1_CHGENA = 0x01, /* Charger enable */ + PCF50633_MBCC1_AUTOSTOP = 0x02, + PCF50633_MBCC1_AUTORES = 0x04, /* automatic resume */ + PCF50633_MBCC1_RESUME = 0x08, /* explicit resume cmd */ + PCF50633_MBCC1_RESTART = 0x10, /* restart charging */ + PCF50633_MBCC1_PREWDTIME_60M = 0x20, /* max. precharging time */ + PCF50633_MBCC1_WDTIME_1H = 0x00, + PCF50633_MBCC1_WDTIME_2H = 0x40, + PCF50633_MBCC1_WDTIME_4H = 0x80, + PCF50633_MBCC1_WDTIME_6H = 0xc0, +}; +#define PCF50633_MBCC1_WDTIME_MASK 0xc0 + +enum pcf50633_reg_mbcc2 { + PCF50633_MBCC2_VBATCOND_2V7 = 0x00, + PCF50633_MBCC2_VBATCOND_2V85 = 0x01, + PCF50633_MBCC2_VBATCOND_3V0 = 0x02, + PCF50633_MBCC2_VBATCOND_3V15 = 0x03, + PCF50633_MBCC2_VMAX_4V = 0x00, + PCF50633_MBCC2_VMAX_4V20 = 0x28, + PCF50633_MBCC2_VRESDEBTIME_64S = 0x80, /* debounce time (32/64sec) */ +}; + +enum pcf50633_reg_mbcc7 { + PCF50633_MBCC7_USB_100mA = 0x00, + PCF50633_MBCC7_USB_500mA = 0x01, + PCF50633_MBCC7_USB_1000mA = 0x02, + PCF50633_MBCC7_USB_SUSPEND = 0x03, + PCF50633_MBCC7_BATTEMP_EN = 0x04, + PCF50633_MBCC7_BATSYSIMAX_1A6 = 0x00, + PCF50633_MBCC7_BATSYSIMAX_1A8 = 0x40, + PCF50633_MBCC7_BATSYSIMAX_2A0 = 0x80, + PCF50633_MBCC7_BATSYSIMAX_2A2 = 0xc0, +}; +#define PCF50633_MBCC7_USB_MASK 0x03 + +enum pcf50633_reg_mbcc8 { + PCF50633_MBCC8_USBENASUS = 0x10, +}; + +enum pcf50633_reg_mbcs1 { + PCF50633_MBCS1_USBPRES = 0x01, + PCF50633_MBCS1_USBOK = 0x02, + PCF50633_MBCS1_ADAPTPRES = 0x04, + PCF50633_MBCS1_ADAPTOK = 0x08, + PCF50633_MBCS1_TBAT_OK = 0x00, + PCF50633_MBCS1_TBAT_ABOVE = 0x10, + PCF50633_MBCS1_TBAT_BELOW = 0x20, + PCF50633_MBCS1_TBAT_UNDEF = 0x30, + PCF50633_MBCS1_PREWDTEXP = 0x40, + PCF50633_MBCS1_WDTEXP = 0x80, +}; + +enum pcf50633_reg_mbcs2_mbcmod { + PCF50633_MBCS2_MBC_PLAY = 0x00, + PCF50633_MBCS2_MBC_USB_PRE = 0x01, + PCF50633_MBCS2_MBC_USB_PRE_WAIT = 0x02, + PCF50633_MBCS2_MBC_USB_FAST = 0x03, + PCF50633_MBCS2_MBC_USB_FAST_WAIT = 0x04, + PCF50633_MBCS2_MBC_USB_SUSPEND = 0x05, + PCF50633_MBCS2_MBC_ADP_PRE = 0x06, + PCF50633_MBCS2_MBC_ADP_PRE_WAIT = 0x07, + PCF50633_MBCS2_MBC_ADP_FAST = 0x08, + PCF50633_MBCS2_MBC_ADP_FAST_WAIT = 0x09, + PCF50633_MBCS2_MBC_BAT_FULL = 0x0a, + PCF50633_MBCS2_MBC_HALT = 0x0b, +}; +#define PCF50633_MBCS2_MBC_MASK 0x0f +enum pcf50633_reg_mbcs2_chgstat { + PCF50633_MBCS2_CHGS_NONE = 0x00, + PCF50633_MBCS2_CHGS_ADAPTER = 0x10, + PCF50633_MBCS2_CHGS_USB = 0x20, + PCF50633_MBCS2_CHGS_BOTH = 0x30, +}; +#define PCF50633_MBCS2_RESSTAT_AUTO 0x40 + +enum pcf50633_reg_mbcs3 { + PCF50633_MBCS3_USBLIM_PLAY = 0x01, + PCF50633_MBCS3_USBLIM_CGH = 0x02, + PCF50633_MBCS3_TLIM_PLAY = 0x04, + PCF50633_MBCS3_TLIM_CHG = 0x08, + PCF50633_MBCS3_ILIM = 0x10, /* 1: Ibat > Icutoff */ + PCF50633_MBCS3_VLIM = 0x20, /* 1: Vbat == Vmax */ + PCF50633_MBCS3_VBATSTAT = 0x40, /* 1: Vbat > Vbatcond */ + PCF50633_MBCS3_VRES = 0x80, /* 1: Vbat > Vth(RES) */ +}; + +#define PCF50633_MBCC2_VBATCOND_MASK 0x03 +#define PCF50633_MBCC2_VMAX_MASK 0x3c + +/* Charger status */ +#define PCF50633_MBC_USB_ONLINE 0x01 +#define PCF50633_MBC_USB_ACTIVE 0x02 +#define PCF50633_MBC_ADAPTER_ONLINE 0x04 +#define PCF50633_MBC_ADAPTER_ACTIVE 0x08 + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma); + +int pcf50633_mbc_get_status(struct pcf50633 *); +void pcf50633_mbc_set_status(struct pcf50633 *, int what, int status); + +#endif + -- cgit v1.2.3 From 5ec271e745350c7df6a6ebca24b43cb7a10bfa4a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:51:01 +0100 Subject: regulator: PCF50633 pmic driver Changes from V1: - Removed support for suspend_enable & suspend_disable functions. Signed-off-by: Balaji Rao Cc: Andy Green Cc: Liam Girdwood Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/pcf50633-regulator.c | 329 +++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/pmic.h | 67 +++++++ 4 files changed, 404 insertions(+) create mode 100644 drivers/regulator/pcf50633-regulator.c create mode 100644 include/linux/mfd/pcf50633/pmic.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 39360e2a4540..e7e0cf102d6d 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -73,4 +73,11 @@ config REGULATOR_DA903X Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. +config REGULATOR_PCF50633 + tristate "PCF50633 regulator driver" + depends on MFD_PCF50633 + help + Say Y here to support the voltage regulators and convertors + on PCF50633 + endif diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 254d40c02ee8..61b30c6ddecc 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o +obj-$(CONFIG_REGULATOR_PCF50633) += pcf50633-regulator.o ccflags-$(CONFIG_REGULATOR_DEBUG) += -DDEBUG diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c new file mode 100644 index 000000000000..4cc85ec6e120 --- /dev/null +++ b/drivers/regulator/pcf50633-regulator.c @@ -0,0 +1,329 @@ +/* NXP PCF50633 PMIC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte and Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCF50633_REGULATOR(_name, _id) \ + { \ + .name = _name, \ + .id = _id, \ + .ops = &pcf50633_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + } + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +/* Bits from voltage value */ +static u8 auto_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 1800) + return 0; + if (millivolts > 3800) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 down_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 625) + return 0; + else if (millivolts > 3000) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 ldo_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 900) + return 0; + else if (millivolts > 3600) + return 0x1f; + + millivolts -= 900; + return millivolts / 100; +} + +/* Obtain voltage value from bits */ +static unsigned int auto_voltage_value(u8 bits) +{ + if (bits < 0x2f) + return 0; + + return 625 + (bits * 25); +} + + +static unsigned int down_voltage_value(u8 bits) +{ + return 625 + (bits * 25); +} + + +static unsigned int ldo_voltage_value(u8 bits) +{ + bits &= 0x1f; + + return 900 + (bits * 100); +} + +static int pcf50633_regulator_set_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts; + u8 volt_bits, regnr; + + pcf = rdev_get_drvdata(rdev); + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + millivolts = min_uV / 1000; + + regnr = pcf50633_regulator_registers[regulator_id]; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + volt_bits = auto_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN1: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN2: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + volt_bits = ldo_voltage_bits(millivolts); + break; + default: + return -EINVAL; + } + + return pcf50633_reg_write(pcf, regnr, volt_bits); +} + +static int pcf50633_regulator_get_voltage(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts, volt_bits; + u8 regnr; + + pcf = rdev_get_drvdata(rdev);; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + regnr = pcf50633_regulator_registers[regulator_id]; + + volt_bits = pcf50633_reg_read(pcf, regnr); + if (volt_bits < 0) + return -1; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + millivolts = auto_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN1: + millivolts = down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN2: + millivolts = down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + millivolts = ldo_voltage_value(volt_bits); + break; + default: + return -EINVAL; + } + + return millivolts * 1000; +} + +static int pcf50633_regulator_enable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* The *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, PCF50633_REGULATOR_ON, + PCF50633_REGULATOR_ON); +} + +static int pcf50633_regulator_disable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, + PCF50633_REGULATOR_ON, 0); +} + +static int pcf50633_regulator_is_enabled(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id = rdev_get_id(rdev); + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_read(pcf, regnr) & PCF50633_REGULATOR_ON; +} + +static struct regulator_ops pcf50633_regulator_ops = { + .set_voltage = pcf50633_regulator_set_voltage, + .get_voltage = pcf50633_regulator_get_voltage, + .enable = pcf50633_regulator_enable, + .disable = pcf50633_regulator_disable, + .is_enabled = pcf50633_regulator_is_enabled, +}; + +static struct regulator_desc regulators[] = { + [PCF50633_REGULATOR_AUTO] = + PCF50633_REGULATOR("auto", PCF50633_REGULATOR_AUTO), + [PCF50633_REGULATOR_DOWN1] = + PCF50633_REGULATOR("down1", PCF50633_REGULATOR_DOWN1), + [PCF50633_REGULATOR_DOWN2] = + PCF50633_REGULATOR("down2", PCF50633_REGULATOR_DOWN2), + [PCF50633_REGULATOR_LDO1] = + PCF50633_REGULATOR("ldo1", PCF50633_REGULATOR_LDO1), + [PCF50633_REGULATOR_LDO2] = + PCF50633_REGULATOR("ldo2", PCF50633_REGULATOR_LDO2), + [PCF50633_REGULATOR_LDO3] = + PCF50633_REGULATOR("ldo3", PCF50633_REGULATOR_LDO3), + [PCF50633_REGULATOR_LDO4] = + PCF50633_REGULATOR("ldo4", PCF50633_REGULATOR_LDO4), + [PCF50633_REGULATOR_LDO5] = + PCF50633_REGULATOR("ldo5", PCF50633_REGULATOR_LDO5), + [PCF50633_REGULATOR_LDO6] = + PCF50633_REGULATOR("ldo6", PCF50633_REGULATOR_LDO6), + [PCF50633_REGULATOR_HCLDO] = + PCF50633_REGULATOR("hcldo", PCF50633_REGULATOR_HCLDO), + [PCF50633_REGULATOR_MEMLDO] = + PCF50633_REGULATOR("memldo", PCF50633_REGULATOR_MEMLDO), +}; + +static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) +{ + struct regulator_dev *rdev; + struct pcf50633 *pcf; + + /* Already set by core driver */ + pcf = platform_get_drvdata(pdev); + + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (pcf->pdata->regulator_registered) + pcf->pdata->regulator_registered(pcf, pdev->id); + + return 0; +} + +static int __devexit pcf50633_regulator_remove(struct platform_device *pdev) +{ + struct regulator_dev *rdev = platform_get_drvdata(pdev); + + regulator_unregister(rdev); + + return 0; +} + +static struct platform_driver pcf50633_regulator_driver = { + .driver = { + .name = "pcf50633-regltr", + }, + .probe = pcf50633_regulator_probe, + .remove = __devexit_p(pcf50633_regulator_remove), +}; + +static int __init pcf50633_regulator_init(void) +{ + return platform_driver_register(&pcf50633_regulator_driver); +} +module_init(pcf50633_regulator_init); + +static void __exit pcf50633_regulator_exit(void) +{ + platform_driver_unregister(&pcf50633_regulator_driver); +} +module_exit(pcf50633_regulator_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 regulator driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-regulator"); diff --git a/include/linux/mfd/pcf50633/pmic.h b/include/linux/mfd/pcf50633/pmic.h new file mode 100644 index 000000000000..2d3dbe53b235 --- /dev/null +++ b/include/linux/mfd/pcf50633/pmic.h @@ -0,0 +1,67 @@ +#ifndef __LINUX_MFD_PCF50633_PMIC_H +#define __LINUX_MFD_PCF50633_PMIC_H + +#include +#include + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_AUTOENA 0x1b +#define PCF50633_REG_AUTOCTL 0x1c +#define PCF50633_REG_AUTOMXC 0x1d +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN1ENA 0x1f +#define PCF50633_REG_DOWN1CTL 0x20 +#define PCF50633_REG_DOWN1MXC 0x21 +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_DOWN2ENA 0x23 +#define PCF50633_REG_DOWN2CTL 0x24 +#define PCF50633_REG_DOWN2MXC 0x25 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_MEMLDOENA 0x27 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO1ENA 0x2e +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO2ENA 0x30 +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO3ENA 0x32 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO4ENA 0x34 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO5ENA 0x36 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_LDO6ENA 0x38 +#define PCF50633_REG_HCLDOOUT 0x39 +#define PCF50633_REG_HCLDOENA 0x3a +#define PCF50633_REG_HCLDOOVL 0x40 + +enum pcf50633_regulator_enable { + PCF50633_REGULATOR_ON = 0x01, + PCF50633_REGULATOR_ON_GPIO1 = 0x02, + PCF50633_REGULATOR_ON_GPIO2 = 0x04, + PCF50633_REGULATOR_ON_GPIO3 = 0x08, +}; +#define PCF50633_REGULATOR_ON_MASK 0x0f + +enum pcf50633_regulator_phase { + PCF50633_REGULATOR_ACTPH1 = 0x00, + PCF50633_REGULATOR_ACTPH2 = 0x10, + PCF50633_REGULATOR_ACTPH3 = 0x20, + PCF50633_REGULATOR_ACTPH4 = 0x30, +}; +#define PCF50633_REGULATOR_ACTPH_MASK 0x30 + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; +#endif + -- cgit v1.2.3 From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:08 -0800 Subject: smp_call_function_single(): be slightly less stupid If you do smp_call_function_single(expression-with-side-effects, ...) then expression-with-side-effects never gets evaluated on UP builds. As always, implementing it in C is the correct thing to do. While we're there, uninline it for size and possible header dependency reasons. And create a new kernel/up.c, as a place in which to put uniprocessor-specific code and storage. It should mirror kernel/smp.c. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/smp.h | 13 +++---------- kernel/Makefile | 6 +++++- kernel/up.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 kernel/up.c (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index b82466968101..715196b09d67 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -24,6 +24,9 @@ struct call_single_data { /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; +int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, + int wait); + #ifdef CONFIG_SMP #include @@ -79,8 +82,6 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -140,14 +141,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, wait) \ -({ \ - WARN_ON(cpuid != 0); \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - 0; \ -}) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32f..2aebc4cd7878 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) +obj-y += smp.o +else +obj-y += up.o +endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 000000000000..ce62cc9e9f71 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,18 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include +#include +#include + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpuid != 0); + local_irq_disable(); + (func)(info); + local_irq_enable(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 649274d993212e7c23c0cb734572c2311c200872 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 11 Jan 2009 00:20:39 -0800 Subject: net_dma: acquire/release dma channels on ifup/ifdown The recent dmaengine rework removed the capability to remove dma device driver modules while net_dma is active. Rather than notify dmaengine-clients that channels are trying to be removed, we now rely on clients to notify dmaengine when they no longer have a need for channels. Teach net_dma to release channels by taking dmaengine references at netdevice open and dropping references at netdevice close. Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- include/linux/dmaengine.h | 10 ++++++++++ net/core/dev.c | 13 ++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 64dea2ab326c..c73f1e2b59b7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -270,8 +270,18 @@ struct dma_device { /* --- public DMA engine API --- */ +#ifdef CONFIG_DMA_ENGINE void dmaengine_get(void); void dmaengine_put(void); +#else +static inline void dmaengine_get(void) +{ +} +static inline void dmaengine_put(void) +{ +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 5f736f1ceeae..b715a55cccc4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1087,6 +1087,11 @@ int dev_open(struct net_device *dev) */ dev->flags |= IFF_UP; + /* + * Enable NET_DMA + */ + dmaengine_get(); + /* * Initialize multicasting status */ @@ -1164,6 +1169,11 @@ int dev_close(struct net_device *dev) */ call_netdevice_notifiers(NETDEV_DOWN, dev); + /* + * Shutdown NET_DMA + */ + dmaengine_put(); + return 0; } @@ -5151,9 +5161,6 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); - #ifdef CONFIG_NET_DMA - dmaengine_get(); - #endif rc = 0; out: return rc; -- cgit v1.2.3 From 57de16e612d63138bd2c618449af9d8312466e25 Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Sun, 26 Oct 2008 13:30:09 +0100 Subject: BUGFIX: used NULL pointer at ioctl(sk,IMGETDEVINFO,&devinfo) when devinfo.id not registered daxtar example # modprobe hfcsusb daxtar example # modprobe mISDN_l1loop daxtar example # ./misdnportinfo Found 3 devices id: 0 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: HFC-S_USB.1 id: 1 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.1 id: 2 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.2 daxtar example # rmmod hfcsusb daxtar example # ./misdnportinfo Found 2 devices *Segmentation* *fault* dmesg: [ 9914.939718] BUG: unable to handle kernel NULL pointer dereference at 000000d4 [ 9914.939721] IP: [] :mISDN_core:get_mdevice+0x19/0x22 [ 9914.939729] *pde = 00000000 [ 9914.939732] Oops: 0000 [#14] PREEMPT SMP [ 9914.939734] Modules linked in: mISDN_l1loop mISDN_core vmnet vmblock vmci vmmon coretemp w83627ehf hwmon_vid rfcomm l2cap blue tooth usbhid snd_usb_audio snd_usb_lib snd_rawmidi snd_hwdep fuse nvidia(P) uhci_hcd i2c_i801 ehci_hcd snd_hda_intel atl1 usbcore i2c_core parport_seria l [last unloaded: hfcsusb] [ 9914.939751] Pid: 29618, comm: misdnportinfo Tainted: P D (2.6.27.3 #5) [ 9914.939753] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 9914.939758] EIP is at get_mdevice+0x19/0x22 [mISDN_core] [ 9914.939760] EAX: 00000000 EBX: f8fa791c ECX: f6afaa58 EDX: f7960cf4 [ 9914.939762] ESI: 80044944 EDI: bfc2e62c EBP: bfc2e62c ESP: f5adbef4 [ 9914.939763] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 9914.939765] Process misdnportinfo (pid: 29618, ti=f5ada000 task=f6bec430 task.ti=f5ada000) [ 9914.939767] Stack: f8f9f4e0 00000000 f8f9f867 bfc2e62c 0000000a c02461e8 00200246 c042dde8 [ 9914.939771] 00000003 c042dde4 00000000 00000001 00200082 c0114775 00000000 00000000 [ 9914.939775] 00000003 f7088010 00200282 f8fa791c 80044944 bfc2e62c bfc2e62c c02f6615 [ 9914.939780] Call Trace: [ 9914.939782] [] _get_mdevice+0x0/0x18 [mISDN_core] [ 9914.939789] [] base_sock_ioctl+0x7a/0x129 [mISDN_core] [ 9914.939789] [] opost+0x171/0x182 [ 9914.939789] [] __wake_up+0x29/0x39 [ 9914.939789] [] sock_ioctl+0x1b5/0x1d9 [ 9914.939789] [] sock_ioctl+0x0/0x1d9 [ 9914.939789] [] vfs_ioctl+0x1c/0x5d [ 9914.939789] [] do_vfs_ioctl+0x23e/0x24e [ 9914.939789] [] sys_ioctl+0x2c/0x45 [ 9914.939789] [] sysenter_do_call+0x12/0x21 [ 9914.939789] [] pci_fixup_i450gx+0x4e/0x56 [ 9914.939789] ======================= [ 9914.939789] Code: 00 68 02 f0 f9 f8 e8 ae b4 2c c7 8b 44 24 04 5a 59 c3 83 ec 04 31 d2 89 04 24 89 e1 b8 ac df fa f8 68 e0 f4 f9 f8 e8 4a b5 2c c7 <8b> 80 d4 00 00 00 5a 59 c3 53 89 cb 8d 90 9c 00 00 00 89 c8 e8 [ 9914.939789] EIP: [] get_mdevice+0x19/0x22 [mISDN_core] SS:ESP 0068:f5adbef4 [ 9914.939858] ---[ end trace 50e18a715b019424 ]--- Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil --- include/linux/mISDNif.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 557477ac3d5b..5da3d95b27f1 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -559,7 +559,10 @@ extern void mISDN_unregister_clock(struct mISDNclock *); static inline struct mISDNdevice *dev_to_mISDN(struct device *dev) { - return dev_get_drvdata(dev); + if (dev) + return dev_get_drvdata(dev); + else + return NULL; } extern void set_channel_address(struct mISDNchannel *, u_int, u_int); -- cgit v1.2.3 From 2e4c77bea3d8b17d94f8ee382411f359b708560f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 30 Dec 2008 14:16:41 +0100 Subject: m68k: dio - Kill warn_unused_result warnings warning: ignoring return value of 'device_register', declared with attribute warn_unused_result warning: ignoring return value of 'device_create_file', declared with attribute warn_unused_result Signed-off-by: Geert Uytterhoeven --- drivers/dio/dio-sysfs.c | 16 ++++++++++------ drivers/dio/dio.c | 18 +++++++++++++++--- include/linux/dio.h | 2 +- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/dio/dio-sysfs.c b/drivers/dio/dio-sysfs.c index f46463038847..91d5f4da8769 100644 --- a/drivers/dio/dio-sysfs.c +++ b/drivers/dio/dio-sysfs.c @@ -63,15 +63,19 @@ static ssize_t dio_show_resource(struct device *dev, struct device_attribute *at } static DEVICE_ATTR(resource, S_IRUGO, dio_show_resource, NULL); -void dio_create_sysfs_dev_files(struct dio_dev *d) +int dio_create_sysfs_dev_files(struct dio_dev *d) { struct device *dev = &d->dev; + int error; /* current configuration's attributes */ - device_create_file(dev, &dev_attr_id); - device_create_file(dev, &dev_attr_ipl); - device_create_file(dev, &dev_attr_secid); - device_create_file(dev, &dev_attr_name); - device_create_file(dev, &dev_attr_resource); + if ((error = device_create_file(dev, &dev_attr_id)) || + (error = device_create_file(dev, &dev_attr_ipl)) || + (error = device_create_file(dev, &dev_attr_secid)) || + (error = device_create_file(dev, &dev_attr_name)) || + (error = device_create_file(dev, &dev_attr_resource))) + return error; + + return 0; } diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c index 07f274f853d9..10c3c498358c 100644 --- a/drivers/dio/dio.c +++ b/drivers/dio/dio.c @@ -173,6 +173,7 @@ static int __init dio_init(void) mm_segment_t fs; int i; struct dio_dev *dev; + int error; if (!MACH_IS_HP300) return 0; @@ -182,7 +183,11 @@ static int __init dio_init(void) /* Initialize the DIO bus */ INIT_LIST_HEAD(&dio_bus.devices); strcpy(dio_bus.dev.bus_id, "dio"); - device_register(&dio_bus.dev); + error = device_register(&dio_bus.dev); + if (error) { + pr_err("DIO: Error registering dio_bus\n"); + return error; + } /* Request all resources */ dio_bus.num_resources = (hp300_model == HP_320 ? 1 : 2); @@ -252,8 +257,15 @@ static int __init dio_init(void) if (scode >= DIOII_SCBASE) iounmap(va); - device_register(&dev->dev); - dio_create_sysfs_dev_files(dev); + error = device_register(&dev->dev); + if (error) { + pr_err("DIO: Error registering device %s\n", + dev->name); + continue; + } + error = dio_create_sysfs_dev_files(dev); + if (error) + dev_err(&dev->dev, "Error creating sysfs files\n"); } return 0; } diff --git a/include/linux/dio.h b/include/linux/dio.h index 1e65ebc2a3db..b2dd31ca1710 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -241,7 +241,7 @@ struct dio_driver { extern int dio_find(int deviceid); extern unsigned long dio_scodetophysaddr(int scode); -extern void dio_create_sysfs_dev_files(struct dio_dev *); +extern int dio_create_sysfs_dev_files(struct dio_dev *); /* New-style probing */ extern int dio_register_driver(struct dio_driver *); -- cgit v1.2.3 From 985ebdb5ed54151eba734aa1b307460e8e4267ba Mon Sep 17 00:00:00 2001 From: Krzysztof HaÅ‚asa Date: Mon, 12 Jan 2009 16:32:13 -0800 Subject: net: Fix a comment in include/linux/netdevice.h. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a comment in include/linux/netdevice.h. Signed-off-by: Krzysztof HaÅ‚asa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f24556813375..4647604c7ca9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -467,7 +467,7 @@ struct netdev_queue { * This function is called when network device transistions to the down * state. * - * int (*ndo_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + * int (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); * Called when a packet needs to be transmitted. * Must return NETDEV_TX_OK , NETDEV_TX_BUSY, or NETDEV_TX_LOCKED, * Required can not be NULL. -- cgit v1.2.3 From daaf83d2b9277928739f3eb7ea64f49c1254fd62 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 12 Jan 2009 00:06:11 +0000 Subject: netfilter 09/09: remove padding from struct xt_match on 64bit builds reorder struct xt_match to remove 8 bytes of padding and make its size 128 bytes. This saves a small amount of data space in each of the xt netfilter modules and fits xt_match in one 128 byte cache line. Signed-off-by: Richard Kennedy Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index e52ce475d19f..c7ee8744d26b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -270,6 +270,7 @@ struct xt_match struct list_head list; const char name[XT_FUNCTION_MAXNAMELEN-1]; + u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ @@ -302,7 +303,6 @@ struct xt_match unsigned short proto; unsigned short family; - u_int8_t revision; }; /* Registration hooks for targets. */ -- cgit v1.2.3 From 4c696ba7982501d43dea11dbbaabd2aa8a19cc42 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:53 +0100 Subject: [CVE-2009-0029] Move compat system call declarations to compat header file Move declarations to correct header file. Signed-off-by: Heiko Carstens --- include/linux/compat.h | 13 +++++++++++++ include/linux/syscalls.h | 12 ------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index e88f3ecf38b4..3fd2194ff573 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -280,5 +280,18 @@ asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, asmlinkage long compat_sys_timerfd_gettime(int ufd, struct compat_itimerspec __user *otmr); +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, + __u32 __user *pages, + const int __user *nodes, + int __user *status, + int flags); +asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, + struct compat_timeval __user *t); +asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, + struct compat_stat __user *statbuf, + int flag); +asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, + int flags, int mode); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 18d0a243a7b3..a7593f670ca6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -530,11 +530,6 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags); -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, @@ -583,13 +578,6 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu int bufsiz); asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags); -asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, - struct compat_timeval __user *t); -asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, - struct compat_stat __user *statbuf, - int flag); -asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, - int flags, int mode); asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, -- cgit v1.2.3 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- fs/read_write.c | 18 +++++------ fs/xattr.c | 12 ++++---- include/linux/syscalls.h | 79 ++++++++++++++++++++++++------------------------ ipc/mqueue.c | 2 +- kernel/exit.c | 4 ++- kernel/signal.c | 2 +- kernel/timer.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 64 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 5cc6924eb158..940367f51f2a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(vfs_llseek); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) +asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin) { off_t retval; struct file * file; @@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) +asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) return ret; } -asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) +asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co return ret; } -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, return ret; } -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); -asmlinkage ssize_t +asmlinkage long sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) return ret; } -asmlinkage ssize_t +asmlinkage long sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -812,7 +812,7 @@ out: return retval; } -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) +asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) { loff_t pos; off_t off; @@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz return do_sendfile(out_fd, in_fd, NULL, count, 0); } -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) +asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) { loff_t pos; ssize_t ret; diff --git a/fs/xattr.c b/fs/xattr.c index 237804cd6b56..d049ae27aae7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return error; } -asmlinkage ssize_t +asmlinkage long sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name, return error; } -asmlinkage ssize_t +asmlinkage long sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user return error; } -asmlinkage ssize_t +asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; @@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_listxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size) { struct file *f; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a7593f670ca6..22290eeaf553 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,7 +77,7 @@ asmlinkage long sys_times(struct tms __user *tbuf); asmlinkage long sys_gettid(void); asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); -asmlinkage unsigned long sys_alarm(unsigned int seconds); +asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); asmlinkage long sys_getuid(void); @@ -166,7 +166,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long flags); asmlinkage long sys_exit(int error_code); -asmlinkage void sys_exit_group(int error_code); +asmlinkage long sys_exit_group(int error_code); asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru); asmlinkage long sys_waitid(int which, pid_t pid, @@ -196,7 +196,7 @@ asmlinkage long sys_tkill(int pid, int sig); asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); -asmlinkage unsigned long sys_signal(int sig, __sighandler_t handler); +asmlinkage long sys_signal(int sig, __sighandler_t handler); asmlinkage long sys_pause(void); asmlinkage long sys_sync(void); @@ -246,29 +246,29 @@ asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); -asmlinkage ssize_t sys_getxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_lgetxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_fgetxattr(int fd, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_listxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_llistxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_flistxattr(int fd, char __user *list, size_t size); +asmlinkage long sys_getxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_fgetxattr(int fd, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_listxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_llistxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); -asmlinkage unsigned long sys_brk(unsigned long brk); +asmlinkage long sys_brk(unsigned long brk); asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); +asmlinkage long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); @@ -321,10 +321,10 @@ asmlinkage long sys_io_submit(aio_context_t, long, struct iocb __user * __user *); asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, struct io_event __user *result); -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, - off_t __user *offset, size_t count); -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, - loff_t __user *offset, size_t count); +asmlinkage long sys_sendfile(int out_fd, int in_fd, + off_t __user *offset, size_t count); +asmlinkage long sys_sendfile64(int out_fd, int in_fd, + loff_t __user *offset, size_t count); asmlinkage long sys_readlink(const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_creat(const char __user *pathname, int mode); @@ -368,26 +368,25 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); +asmlinkage long sys_lseek(unsigned int fd, off_t offset, + unsigned int origin); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, unsigned int origin); -asmlinkage ssize_t sys_read(unsigned int fd, char __user *buf, - size_t count); -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count); -asmlinkage ssize_t sys_readv(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_write(unsigned int fd, const char __user *buf, - size_t count); -asmlinkage ssize_t sys_writev(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, - size_t count, loff_t pos); -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, - size_t count, loff_t pos); +asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); +asmlinkage long sys_readv(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_write(unsigned int fd, const char __user *buf, + size_t count); +asmlinkage long sys_writev(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, + size_t count, loff_t pos); +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); @@ -476,7 +475,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 23fdb8492b8e..6df028b70543 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -907,7 +907,7 @@ out: return ret; } -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252c..fac9b040af2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1182,9 +1182,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +asmlinkage long sys_exit_group(int error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) diff --git a/kernel/signal.c b/kernel/signal.c index 3152ac3b62e2..856a5479d49d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage unsigned long +asmlinkage long sys_signal(int sig, __sighandler_t handler) { struct k_sigaction new_sa, old_sa; diff --git a/kernel/timer.c b/kernel/timer.c index dee3f641a7a7..7b8697d7f04d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) +asmlinkage long sys_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..538b75ed6236 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 749623196cb9..a970d890cb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..5572e0825d80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed8375c986..ee3e78927739 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.2.3 From e55380edf68796d75bf41391a781c68ee678587d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:55 +0100 Subject: [CVE-2009-0029] Rename old_readdir to sys_old_readdir This way it matches the generic system call name convention. Signed-off-by: Heiko Carstens --- arch/arm/kernel/calls.S | 2 +- arch/cris/arch-v10/kernel/entry.S | 2 +- arch/cris/arch-v32/kernel/entry.S | 2 +- arch/h8300/kernel/syscalls.S | 2 +- arch/m68k/kernel/entry.S | 2 +- arch/m68knommu/kernel/syscalltable.S | 2 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/powerpc/include/asm/systbl.h | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sparc/kernel/systbls_32.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- fs/readdir.c | 2 +- include/linux/syscalls.h | 2 ++ 15 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 09a061cb7838..9ca8d13f05f7 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -98,7 +98,7 @@ CALL(sys_uselib) CALL(sys_swapon) CALL(sys_reboot) - CALL(OBSOLETE(old_readdir)) /* used by libc4 */ + CALL(OBSOLETE(sys_old_readdir)) /* used by libc4 */ /* 90 */ CALL(OBSOLETE(old_mmap)) /* used by libc4 */ CALL(sys_munmap) CALL(sys_truncate) diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index ed171d389e65..72f5cd319b97 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -691,7 +691,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S index 7f6f93e6b70e..5e674c8f7c51 100644 --- a/arch/cris/arch-v32/kernel/entry.S +++ b/arch/cris/arch-v32/kernel/entry.S @@ -614,7 +614,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/h8300/kernel/syscalls.S b/arch/h8300/kernel/syscalls.S index 54e21c3f2057..4eb67faac633 100644 --- a/arch/h8300/kernel/syscalls.S +++ b/arch/h8300/kernel/syscalls.S @@ -103,7 +103,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_uselib) .long SYMBOL_NAME(sys_swapon) .long SYMBOL_NAME(sys_reboot) - .long SYMBOL_NAME(old_readdir) + .long SYMBOL_NAME(sys_old_readdir) .long SYMBOL_NAME(old_mmap) /* 90 */ .long SYMBOL_NAME(sys_munmap) .long SYMBOL_NAME(sys_truncate) diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 5b780826647c..5c332f2b9b83 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -513,7 +513,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 812f8d8b7a85..5c3e3f62194a 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -107,7 +107,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_ni_syscall /* sys_swapon */ .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index d0916a55cd77..51d1ba415b90 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -398,7 +398,7 @@ einval: li v0, -ENOSYS sys sys_uselib 1 sys sys_swapon 2 sys sys_reboot 3 - sys old_readdir 3 + sys sys_old_readdir 3 sys old_mmap 6 /* 4090 */ sys sys_munmap 2 sys sys_truncate 2 diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index 62fba8aa9b6e..ceeaaaa359e2 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -478,7 +478,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 803def236654..72353f6070a4 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -92,7 +92,7 @@ COMPAT_SYS_SPU(readlink) SYSCALL(uselib) SYSCALL(swapon) SYSCALL(reboot) -SYSX(sys_ni_syscall,compat_sys_old_readdir,old_readdir) +SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir) SYSCALL_SPU(mmap) SYSCALL_SPU(munmap) SYSCALL_SPU(truncate) diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 0af693e65764..a87ce076cfa2 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -105,7 +105,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index 0b436aa3cad7..557cb91f5caf 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -109,7 +109,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 7d0807586442..8a434f51ba0f 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -56,7 +56,7 @@ sys_call_table: /*185*/ .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_exit_group, sys_newuname /*190*/ .long sys_init_module, sys_personality, sparc_remap_file_pages, sys_epoll_create, sys_epoll_ctl /*195*/ .long sys_epoll_wait, sys_ioprio_set, sys_getppid, sparc_sigaction, sys_sgetmask -/*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir +/*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, sys_old_readdir /*205*/ .long sys_readahead, sys_socketcall, sys_syslog, sys_lookup_dcookie, sys_fadvise64 /*210*/ .long sys_fadvise64_64, sys_tgkill, sys_waitpid, sys_swapoff, sys_sysinfo /*215*/ .long sys_ipc, sys_sigreturn, sys_clone, sys_ioprio_get, sys_adjtimex diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c3..e2e86a08f31d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -88,7 +88,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/fs/readdir.c b/fs/readdir.c index b318d9b5af2e..8b4c2a0051a6 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,7 +102,7 @@ efault: return -EFAULT; } -asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) +asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) { int error; struct file * file; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 22290eeaf553..ca079c3d09e3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct old_linux_dirent; #include #include @@ -608,6 +609,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 1a94bc34768e463a93cb3751819709ab0ea80a01 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:59 +0100 Subject: [CVE-2009-0029] System call wrapper infrastructure From: Martin Schwidefsky By selecting HAVE_SYSCALL_WRAPPERS architectures can activate system call wrappers in order to sign extend system call arguments. All architectures where the ABI defines that the caller of a function has to perform sign extension probably need this. Reported-by: Christian Borntraeger Acked-by: Ralf Baechle Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/Kconfig | 3 +++ include/linux/syscalls.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 2e13aa261929..550dab22daa1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -62,6 +62,9 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS See Documentation/unaligned-memory-access.txt for more information on the topic of unaligned memory accesses. +config HAVE_SYSCALL_WRAPPERS + bool + config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ca079c3d09e3..0bb537d7ba2e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -66,6 +66,68 @@ struct old_linux_dirent; #include #include +#define __SC_DECL1(t1, a1) t1 a1 +#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) +#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) +#define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) +#define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) +#define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__) + +#define __SC_LONG1(t1, a1) long a1 +#define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__) +#define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__) +#define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__) +#define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__) +#define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__) + +#define __SC_CAST1(t1, a1) (t1) a1 +#define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__) +#define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__) +#define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__) +#define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__) +#define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__) + +#define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long)) +#define __SC_TEST1(t1, a1) __SC_TEST(t1) +#define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__) +#define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__) +#define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__) +#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) +#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) + +#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__) +#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__) +#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__) +#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__) +#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) +#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) + +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS + +#define SYSCALL_DEFINE(name) static inline long SYSC_##name +#define SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \ + { \ + __SC_TEST##x(__VA_ARGS__); \ + return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \ + } \ + SYSCALL_ALIAS(sys_##name, SyS_##name); \ + static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)) + +#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ + +#define SYSCALL_DEFINE(name) asmlinkage long sys_##name +#define SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)) + +#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ + asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, -- cgit v1.2.3 From ee6a093222549ac0c72cfd296c69fa5e7d6daa34 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 14 Jan 2009 14:14:00 +0100 Subject: [CVE-2009-0029] powerpc: Enable syscall wrappers for 64-bit This enables the use of syscall wrappers to do proper sign extension for 64-bit programs. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Heiko Carstens --- arch/powerpc/Kconfig | 1 + include/linux/syscalls.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 84b861316ce7..e39b73bc0ff8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -123,6 +123,7 @@ config PPC select HAVE_DMA_ATTRS if PPC64 select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE + select HAVE_SYSCALL_WRAPPERS if PPC64 config EARLY_PRINTK bool diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0bb537d7ba2e..90aa5eba87a2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -103,8 +103,14 @@ struct old_linux_dirent; #define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) #define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) +#ifdef CONFIG_PPC64 +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ + "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) +#else #define SYSCALL_ALIAS(alias, name) \ asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) +#endif #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -- cgit v1.2.3 From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:34 +0100 Subject: [CVE-2009-0029] System call wrappers part 32 Signed-off-by: Heiko Carstens --- fs/eventfd.c | 5 ++--- fs/pipe.c | 2 +- fs/readdir.c | 3 ++- fs/select.c | 11 ++++++----- fs/timerfd.c | 2 +- include/linux/syscalls.h | 7 +++++++ kernel/signal.c | 11 +++++------ 7 files changed, 24 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index 08bf558d0408..5de2c2db3aa2 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd2(unsigned int count, int flags) +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd; struct eventfd_ctx *ctx; @@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) return fd; } -asmlinkage long sys_eventfd(unsigned int count) +SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } - diff --git a/fs/pipe.c b/fs/pipe.c index 0c64db86c919..b89c878588a9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1043,7 +1043,7 @@ int do_pipe(int *fd) * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage long sys_pipe2(int __user *fildes, int flags) +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { int fd[2]; int error; diff --git a/fs/readdir.c b/fs/readdir.c index cf6a0e39819a..7723401f8d8b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,7 +102,8 @@ efault: return -EFAULT; } -asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct file * file; diff --git a/fs/select.c b/fs/select.c index 338f703403af..0fe0e1469df3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -636,8 +636,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ -asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, void __user *sig) +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) { size_t sigsetsize = 0; sigset_t __user *up = NULL; @@ -889,9 +890,9 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } #ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec __user *tsp, const sigset_t __user *sigmask, - size_t sigsetsize) +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts, end_time, *to = NULL; diff --git a/fs/timerfd.c b/fs/timerfd.c index c8c14f58b96f..6a123b8ff3f5 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -265,7 +265,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return 0; } -asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { struct file *file; struct timerfd_ctx *ctx; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 90aa5eba87a2..56c400138b05 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -678,6 +678,13 @@ asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); +asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, + fd_set __user *, struct timespec __user *, + void __user *); +asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, + struct timespec __user *, const sigset_t __user *, + size_t); +asmlinkage long sys_pipe2(int __user *, int); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); diff --git a/kernel/signal.c b/kernel/signal.c index e2333929611a..e73759783dc8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,11 +2491,10 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION -asmlinkage long -sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret = -EINVAL; @@ -2578,7 +2577,7 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.3 From 2b66421995d2e93c9d1a0111acf2581f8529c6e5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:35 +0100 Subject: [CVE-2009-0029] System call wrappers part 33 Signed-off-by: Heiko Carstens --- fs/pipe.c | 2 +- include/linux/syscalls.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index b89c878588a9..3a48ba5179d5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1059,7 +1059,7 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) return error; } -asmlinkage long sys_pipe(int __user *fildes) +SYSCALL_DEFINE1(pipe, int __user *, fildes) { return sys_pipe2(fildes, 0); } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 56c400138b05..16875f89e6a7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -685,6 +685,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); asmlinkage long sys_pipe2(int __user *, int); +asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 18e6959c385f3edf3991fa6662a53dac4eb10d5b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 14 Jan 2009 07:28:16 +0100 Subject: mm: fix assertion This assertion is incorrect for lockless pagecache. By definition if we have an unpinned page that we are trying to take a speculative reference to, it may become the tail of a compound page at any time (if it is freed, then reallocated as a compound page). It was still a valid assertion for the vmscan.c LRU isolation case, but it doesn't seem incredibly helpful... if somebody wants it, they can put it back directly where it applies in the vmscan code. Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b91a73fd1bcc..e8ddc98b8405 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -260,7 +260,6 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - VM_BUG_ON(PageTail(page)); return atomic_inc_not_zero(&page->_count); } -- cgit v1.2.3 From b94b898f3107046b5c97c556e23529283ea5eadd Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Wed, 14 Jan 2009 19:19:02 +0100 Subject: it821x: Add ultra_mask quirk for Vortex86SX On Vortex86SX with IDE controller revision 0x11 ultra DMA must be disabled. This patch was tested by DMP and seems to work. It is a cleaned up version of their older Kernel patch: http://www.dmp.com.tw/tech/vortex86sx/patch-2.6.24-DMP.gz Tested-by: Shawn Lin Signed-off-by: Brandon Philips Cc: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/it821x.c | 12 ++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 0be27ac1f077..983440a9a5f7 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -68,6 +68,8 @@ #define DRV_NAME "it821x" +#define QUIRK_VORTEX86 1 + struct it821x_dev { unsigned int smart:1, /* Are we in smart raid mode */ @@ -79,6 +81,7 @@ struct it821x_dev u16 pio[2]; /* Cached PIO values */ u16 mwdma[2]; /* Cached MWDMA values */ u16 udma[2]; /* Cached UDMA values (per drive) */ + u16 quirks; }; #define ATA_66 0 @@ -577,6 +580,12 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif) hwif->ultra_mask = ATA_UDMA6; hwif->mwdma_mask = ATA_MWDMA2; + + /* Vortex86SX quirk: prevent Ultra-DMA mode to fix BadCRC issue */ + if (idev->quirks & QUIRK_VORTEX86) { + if (dev->revision == 0x11) + hwif->ultra_mask = 0; + } } static void it8212_disable_raid(struct pci_dev *dev) @@ -649,6 +658,8 @@ static int __devinit it821x_init_one(struct pci_dev *dev, const struct pci_devic return -ENOMEM; } + itdevs->quirks = id->driver_data; + rc = ide_pci_init_one(dev, &it821x_chipset, itdevs); if (rc) kfree(itdevs); @@ -668,6 +679,7 @@ static void __devexit it821x_remove(struct pci_dev *dev) static const struct pci_device_id it821x_pci_tbl[] = { { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8211), 0 }, { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8212), 0 }, + { PCI_VDEVICE(RDC, PCI_DEVICE_ID_RDC_D1010), QUIRK_VORTEX86 }, { 0, }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d543365518ab..d56ad9c21c09 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2174,6 +2174,7 @@ #define PCI_DEVICE_ID_RDC_R6040 0x6040 #define PCI_DEVICE_ID_RDC_R6060 0x6060 #define PCI_DEVICE_ID_RDC_R6061 0x6061 +#define PCI_DEVICE_ID_RDC_D1010 0x1010 #define PCI_VENDOR_ID_LENOVO 0x17aa -- cgit v1.2.3 From e720b9e498b6bbb1b4f3b3d2f8e9a78578aafef7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 14 Jan 2009 19:19:04 +0100 Subject: IDE: fix sparse signed-ness errors with host->host_busy The host_busy field in struct ide_host defaults to a signed-long, where most arch's test_and_set_bit_* macros use an unsigned long. Change to using an unsigned long, which on ARM removes the following sparse errors: drivers/ide/ide-io.c:681:8: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:681:8: expected unsigned long volatile *p drivers/ide/ide-io.c:681:8: got long volatile * drivers/ide/ide-io.c:681:8: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:681:8: expected unsigned long volatile *p drivers/ide/ide-io.c:681:8: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * Signed-off-by: Ben Dooks Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 3644f6323384..194da5a4b0d6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -871,7 +871,7 @@ struct ide_host { ide_hwif_t *cur_port; /* for hosts requiring serialization */ /* used for hosts requiring serialization */ - volatile long host_busy; + volatile unsigned long host_busy; }; #define IDE_HOST_BUSY 0 -- cgit v1.2.3 From 74d96f018673759d04d032c137d132f6447bfb1e Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 13 Jan 2009 19:27:09 -0800 Subject: byteorder: make swab.h include asm/swab.h like a regular header Add swab.h to kbuild.asm and remove the individual entries from each arch, mark as unifdef as some arches have some kernel-only bits inside. Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/Kbuild | 1 - arch/alpha/include/asm/byteorder.h | 1 - arch/arm/include/asm/Kbuild | 1 - arch/arm/include/asm/byteorder.h | 2 -- arch/avr32/include/asm/Kbuild | 1 - arch/avr32/include/asm/byteorder.h | 1 - arch/blackfin/include/asm/Kbuild | 1 - arch/blackfin/include/asm/byteorder.h | 1 - arch/cris/include/asm/Kbuild | 1 - arch/cris/include/asm/byteorder.h | 1 - arch/h8300/include/asm/Kbuild | 1 - arch/h8300/include/asm/byteorder.h | 1 - arch/ia64/include/asm/Kbuild | 1 - arch/ia64/include/asm/byteorder.h | 1 - arch/m68knommu/include/asm/Kbuild | 2 -- arch/m68knommu/include/asm/byteorder.h | 1 - arch/mips/include/asm/Kbuild | 1 - arch/mips/include/asm/byteorder.h | 2 -- arch/parisc/include/asm/Kbuild | 1 - arch/parisc/include/asm/byteorder.h | 1 - arch/powerpc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/byteorder.h | 2 -- arch/s390/include/asm/Kbuild | 1 - arch/s390/include/asm/byteorder.h | 1 - arch/sh/include/asm/Kbuild | 1 - arch/sh/include/asm/byteorder.h | 2 -- arch/sparc/include/asm/Kbuild | 1 - arch/sparc/include/asm/byteorder.h | 1 - arch/x86/include/asm/Kbuild | 1 - arch/x86/include/asm/byteorder.h | 1 - arch/xtensa/include/asm/Kbuild | 2 -- arch/xtensa/include/asm/byteorder.h | 2 -- include/asm-frv/Kbuild | 1 - include/asm-frv/byteorder.h | 1 - include/asm-generic/Kbuild.asm | 1 + include/asm-m32r/Kbuild | 1 - include/asm-m32r/byteorder.h | 2 -- include/asm-m68k/Kbuild | 1 - include/asm-m68k/byteorder.h | 1 - include/asm-mn10300/Kbuild | 1 - include/asm-mn10300/byteorder.h | 1 - include/linux/swab.h | 2 +- 42 files changed, 2 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 4dad27360576..b7c8f188b313 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -9,4 +9,3 @@ unifdef-y += console.h unifdef-y += fpu.h unifdef-y += sysinfo.h unifdef-y += compiler.h -unifdef-y += swab.h diff --git a/arch/alpha/include/asm/byteorder.h b/arch/alpha/include/asm/byteorder.h index 6772f3168701..73683093202d 100644 --- a/arch/alpha/include/asm/byteorder.h +++ b/arch/alpha/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ALPHA_BYTEORDER_H #define _ALPHA_BYTEORDER_H -#include #include #endif /* _ALPHA_BYTEORDER_H */ diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 43b0b2ba392f..73237bd130a2 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += hwcap.h -unifdef-y += swab.h diff --git a/arch/arm/include/asm/byteorder.h b/arch/arm/include/asm/byteorder.h index c02b6fc28e1a..77379748b171 100644 --- a/arch/arm/include/asm/byteorder.h +++ b/arch/arm/include/asm/byteorder.h @@ -15,8 +15,6 @@ #ifndef __ASM_ARM_BYTEORDER_H #define __ASM_ARM_BYTEORDER_H -#include - #ifdef __ARMEB__ #include #else diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 219822c8ad18..3136628ba8d2 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm -header-y += swab.h header-y += cachectl.h diff --git a/arch/avr32/include/asm/byteorder.h b/arch/avr32/include/asm/byteorder.h index 2aba64b4e122..50abc21619a8 100644 --- a/arch/avr32/include/asm/byteorder.h +++ b/arch/avr32/include/asm/byteorder.h @@ -4,7 +4,6 @@ #ifndef __ASM_AVR32_BYTEORDER_H #define __ASM_AVR32_BYTEORDER_H -#include #include #endif /* __ASM_AVR32_BYTEORDER_H */ diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index d0d1ac435544..606ecfdcc962 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += fixed_code.h -unifdef-y += swab.h diff --git a/arch/blackfin/include/asm/byteorder.h b/arch/blackfin/include/asm/byteorder.h index b9e797a497b4..3e69106a4d37 100644 --- a/arch/blackfin/include/asm/byteorder.h +++ b/arch/blackfin/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _BLACKFIN_BYTEORDER_H #define _BLACKFIN_BYTEORDER_H -#include #include #endif /* _BLACKFIN_BYTEORDER_H */ diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index b79b7c6543a6..d5b631935ec8 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -9,4 +9,3 @@ header-y += sync_serial.h unifdef-y += etraxgpio.h unifdef-y += rs485.h -unifdef-y += swab.h diff --git a/arch/cris/include/asm/byteorder.h b/arch/cris/include/asm/byteorder.h index 7678d86317ae..bcd189798e26 100644 --- a/arch/cris/include/asm/byteorder.h +++ b/arch/cris/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _CRIS_BYTEORDER_H #define _CRIS_BYTEORDER_H -#include #include #endif diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/arch/h8300/include/asm/byteorder.h b/arch/h8300/include/asm/byteorder.h index c36b80a3dd84..13539da99efd 100644 --- a/arch/h8300/include/asm/byteorder.h +++ b/arch/h8300/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _H8300_BYTEORDER_H #define _H8300_BYTEORDER_H -#include #include #endif /* _H8300_BYTEORDER_H */ diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 3b25bd9dca91..ccbe8ae47a61 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -14,4 +14,3 @@ unifdef-y += gcc_intrin.h unifdef-y += intrinsics.h unifdef-y += perfmon.h unifdef-y += ustack.h -unifdef-y += swab.h diff --git a/arch/ia64/include/asm/byteorder.h b/arch/ia64/include/asm/byteorder.h index 0f84c5cb703d..a8dd73558150 100644 --- a/arch/ia64/include/asm/byteorder.h +++ b/arch/ia64/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_BYTEORDER_H #define _ASM_IA64_BYTEORDER_H -#include #include #endif /* _ASM_IA64_BYTEORDER_H */ diff --git a/arch/m68knommu/include/asm/Kbuild b/arch/m68knommu/include/asm/Kbuild index 58c02a454130..c68e1680da01 100644 --- a/arch/m68knommu/include/asm/Kbuild +++ b/arch/m68knommu/include/asm/Kbuild @@ -1,3 +1 @@ include include/asm-generic/Kbuild.asm - -unifdef-y += swab.h diff --git a/arch/m68knommu/include/asm/byteorder.h b/arch/m68knommu/include/asm/byteorder.h index a6f0b8f7f622..9c6c76a15041 100644 --- a/arch/m68knommu/include/asm/byteorder.h +++ b/arch/m68knommu/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _M68KNOMMU_BYTEORDER_H #define _M68KNOMMU_BYTEORDER_H -#include #include #endif /* _M68KNOMMU_BYTEORDER_H */ diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 023866c0c102..7897f05e3165 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm header-y += cachectl.h sgidefs.h sysmips.h -header-y += swab.h diff --git a/arch/mips/include/asm/byteorder.h b/arch/mips/include/asm/byteorder.h index 607b71830707..9579051ff1c7 100644 --- a/arch/mips/include/asm/byteorder.h +++ b/arch/mips/include/asm/byteorder.h @@ -8,8 +8,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include - #if defined(__MIPSEB__) #include #elif defined(__MIPSEL__) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 2121d99f8364..f88b252e419c 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += pdc.h -unifdef-y += swab.h diff --git a/arch/parisc/include/asm/byteorder.h b/arch/parisc/include/asm/byteorder.h index da66029c4cb2..58af2c5f5d61 100644 --- a/arch/parisc/include/asm/byteorder.h +++ b/arch/parisc/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _PARISC_BYTEORDER_H #define _PARISC_BYTEORDER_H -#include #include #endif /* _PARISC_BYTEORDER_H */ diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 9268602de5d0..5ab7d7fe198c 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -35,4 +35,3 @@ unifdef-y += spu_info.h unifdef-y += termios.h unifdef-y += types.h unifdef-y += unistd.h -unifdef-y += swab.h diff --git a/arch/powerpc/include/asm/byteorder.h b/arch/powerpc/include/asm/byteorder.h index 5cca27a41532..aa6cc4fac965 100644 --- a/arch/powerpc/include/asm/byteorder.h +++ b/arch/powerpc/include/asm/byteorder.h @@ -7,8 +7,6 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - -#include #include #endif /* _ASM_POWERPC_BYTEORDER_H */ diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index f2af4167bd5f..63a23415fba6 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -13,4 +13,3 @@ unifdef-y += cmb.h unifdef-y += debug.h unifdef-y += chpid.h unifdef-y += schid.h -unifdef-y += swab.h diff --git a/arch/s390/include/asm/byteorder.h b/arch/s390/include/asm/byteorder.h index b95a2b2933fb..a332e59e26fc 100644 --- a/arch/s390/include/asm/byteorder.h +++ b/arch/s390/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _S390_BYTEORDER_H #define _S390_BYTEORDER_H -#include #include #endif /* _S390_BYTEORDER_H */ diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index f1a2a0d1c79c..43910cdf78a5 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -6,4 +6,3 @@ unifdef-y += unistd_32.h unifdef-y += unistd_64.h unifdef-y += posix_types_32.h unifdef-y += posix_types_64.h -unifdef-y += swab.h diff --git a/arch/sh/include/asm/byteorder.h b/arch/sh/include/asm/byteorder.h index e95c41a5c8cc..db2f5d7cb17d 100644 --- a/arch/sh/include/asm/byteorder.h +++ b/arch/sh/include/asm/byteorder.h @@ -1,8 +1,6 @@ #ifndef __ASM_SH_BYTEORDER_H #define __ASM_SH_BYTEORDER_H -#include - #ifdef __LITTLE_ENDIAN__ #include #else diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 95e38a43dff0..deeb0fba8029 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -17,4 +17,3 @@ header-y += traps.h header-y += uctx.h header-y += utrap.h header-y += watchdog.h -header-y += swab.h diff --git a/arch/sparc/include/asm/byteorder.h b/arch/sparc/include/asm/byteorder.h index 48a047cd6fa9..ccc1b6b7de6c 100644 --- a/arch/sparc/include/asm/byteorder.h +++ b/arch/sparc/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _SPARC_BYTEORDER_H #define _SPARC_BYTEORDER_H -#include #include #endif /* _SPARC_BYTEORDER_H */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a9f8a814a1f7..4a8e80cdcfa5 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -22,4 +22,3 @@ unifdef-y += unistd_32.h unifdef-y += unistd_64.h unifdef-y += vm86.h unifdef-y += vsyscall.h -unifdef-y += swab.h diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h index 7c49917e3d9d..b13a7a88f3eb 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H -#include #include #endif /* _ASM_X86_BYTEORDER_H */ diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 58c02a454130..c68e1680da01 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -1,3 +1 @@ include include/asm-generic/Kbuild.asm - -unifdef-y += swab.h diff --git a/arch/xtensa/include/asm/byteorder.h b/arch/xtensa/include/asm/byteorder.h index 329b94591ca4..54eb6315349c 100644 --- a/arch/xtensa/include/asm/byteorder.h +++ b/arch/xtensa/include/asm/byteorder.h @@ -1,8 +1,6 @@ #ifndef _XTENSA_BYTEORDER_H #define _XTENSA_BYTEORDER_H -#include - #ifdef __XTENSA_EL__ #include #elif defined(__XTENSA_EB__) diff --git a/include/asm-frv/Kbuild b/include/asm-frv/Kbuild index 1f44e7c76995..0f8956def738 100644 --- a/include/asm-frv/Kbuild +++ b/include/asm-frv/Kbuild @@ -3,4 +3,3 @@ include include/asm-generic/Kbuild.asm header-y += registers.h unifdef-y += termios.h -unifdef-y += swab.h diff --git a/include/asm-frv/byteorder.h b/include/asm-frv/byteorder.h index 1187e51ecd13..f29b7593e088 100644 --- a/include/asm-frv/byteorder.h +++ b/include/asm-frv/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include #include #endif /* _ASM_BYTEORDER_H */ diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 1870d5e05f1c..70d185534b9d 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -31,6 +31,7 @@ unifdef-y += socket.h unifdef-y += sockios.h unifdef-y += stat.h unifdef-y += statfs.h +unifdef-y += swab.h unifdef-y += termbits.h unifdef-y += termios.h unifdef-y += types.h diff --git a/include/asm-m32r/Kbuild b/include/asm-m32r/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/include/asm-m32r/Kbuild +++ b/include/asm-m32r/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/include/asm-m32r/byteorder.h b/include/asm-m32r/byteorder.h index 61ff9cfd8451..21855d8b028b 100644 --- a/include/asm-m32r/byteorder.h +++ b/include/asm-m32r/byteorder.h @@ -1,8 +1,6 @@ #ifndef _ASM_M32R_BYTEORDER_H #define _ASM_M32R_BYTEORDER_H -#include - #if defined(__LITTLE_ENDIAN__) # include #else diff --git a/include/asm-m68k/Kbuild b/include/asm-m68k/Kbuild index 52fd96b4142a..1a922fad76f7 100644 --- a/include/asm-m68k/Kbuild +++ b/include/asm-m68k/Kbuild @@ -1,3 +1,2 @@ include include/asm-generic/Kbuild.asm header-y += cachectl.h -unifdef-y += swab.h diff --git a/include/asm-m68k/byteorder.h b/include/asm-m68k/byteorder.h index 300866523b86..31b260a88803 100644 --- a/include/asm-m68k/byteorder.h +++ b/include/asm-m68k/byteorder.h @@ -1,7 +1,6 @@ #ifndef _M68K_BYTEORDER_H #define _M68K_BYTEORDER_H -#include #include #endif /* _M68K_BYTEORDER_H */ diff --git a/include/asm-mn10300/Kbuild b/include/asm-mn10300/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/include/asm-mn10300/Kbuild +++ b/include/asm-mn10300/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/include/asm-mn10300/byteorder.h b/include/asm-mn10300/byteorder.h index 45b18ded19e6..5dd0bdd9feee 100644 --- a/include/asm-mn10300/byteorder.h +++ b/include/asm-mn10300/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include #include #endif /* _ASM_BYTEORDER_H */ diff --git a/include/linux/swab.h b/include/linux/swab.h index be5284d4a053..ea0c02fd5163 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -3,7 +3,7 @@ #include #include -#include +#include /* * casts are necessary for constants, because we never know how for sure -- cgit v1.2.3 From 937f1ba56b4be37d9e2ad77412f95048662058d2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 14 Jan 2009 21:05:05 -0800 Subject: net: Add init_dummy_netdev() and fix EMAC driver using it This adds an init_dummy_netdev() function that gets a network device structure (allocation and lifetime entirely under caller's control) and initialize the minimum amount of fields so it can be used to schedule NAPI polls without registering a full blown interface. This is to be used by drivers that need to tie several hardware interfaces to a single NAPI poll scheduler due to HW limitations. It also updates the ibm_newemac driver to use that, this fixing the oops on 2.6.29 due to passing NULL as "dev" to netif_napi_add() Symbol is exported GPL only a I don't think we want binary drivers doing that sort of acrobatics (if we want them at all). Signed-off-by: Benjamin Herrenschmidt Tested-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- drivers/net/ibm_newemac/mal.c | 4 +++- drivers/net/ibm_newemac/mal.h | 2 ++ include/linux/netdevice.h | 3 +++ net/core/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ibm_newemac/mal.c b/drivers/net/ibm_newemac/mal.c index ecf9798987fa..2a2fc17b2878 100644 --- a/drivers/net/ibm_newemac/mal.c +++ b/drivers/net/ibm_newemac/mal.c @@ -613,7 +613,9 @@ static int __devinit mal_probe(struct of_device *ofdev, INIT_LIST_HEAD(&mal->list); spin_lock_init(&mal->lock); - netif_napi_add(NULL, &mal->napi, mal_poll, + init_dummy_netdev(&mal->dummy_dev); + + netif_napi_add(&mal->dummy_dev, &mal->napi, mal_poll, CONFIG_IBM_NEW_EMAC_POLL_WEIGHT); /* Load power-on reset defaults */ diff --git a/drivers/net/ibm_newemac/mal.h b/drivers/net/ibm_newemac/mal.h index 2f0a87360844..9ededfbf0726 100644 --- a/drivers/net/ibm_newemac/mal.h +++ b/drivers/net/ibm_newemac/mal.h @@ -214,6 +214,8 @@ struct mal_instance { int index; spinlock_t lock; + struct net_device dummy_dev; + unsigned int features; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4647604c7ca9..ec54785d34f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -795,6 +795,7 @@ struct net_device NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ } reg_state; /* Called from unregister, can be used to call free_netdev */ @@ -1077,6 +1078,8 @@ extern void free_netdev(struct net_device *dev); extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); +extern int init_dummy_netdev(struct net_device *dev); + extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); extern struct net_device *__dev_get_by_index(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 60377b6c0a80..8d675975d85b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4430,6 +4430,45 @@ err_uninit: goto out; } +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* initialize the ref count */ + atomic_set(&dev->refcnt, 1); + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + /** * register_netdev - register a network device * @dev: device to register -- cgit v1.2.3 From 45ce80fb6b6f9594d1396d44dd7e7c02d596fef8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:50:59 -0800 Subject: cgroups: consolidate cgroup documents Move Documentation/cpusets.txt and Documentation/controllers/* to Documentation/cgroups/ Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 5 +- Documentation/cgroups/cpuacct.txt | 32 + Documentation/cgroups/cpusets.txt | 808 +++++++++++++++++++++++++ Documentation/cgroups/devices.txt | 52 ++ Documentation/cgroups/memcg_test.txt | 342 +++++++++++ Documentation/cgroups/memory.txt | 399 ++++++++++++ Documentation/cgroups/resource_counter.txt | 181 ++++++ Documentation/controllers/cpuacct.txt | 32 - Documentation/controllers/devices.txt | 52 -- Documentation/controllers/memcg_test.txt | 342 ----------- Documentation/controllers/memory.txt | 399 ------------ Documentation/controllers/resource_counter.txt | 181 ------ Documentation/cpusets.txt | 808 ------------------------- Documentation/scheduler/sched-design-CFS.txt | 2 +- include/linux/res_counter.h | 2 +- init/Kconfig | 9 +- kernel/cpuset.c | 2 +- 17 files changed, 1824 insertions(+), 1824 deletions(-) create mode 100644 Documentation/cgroups/cpuacct.txt create mode 100644 Documentation/cgroups/cpusets.txt create mode 100644 Documentation/cgroups/devices.txt create mode 100644 Documentation/cgroups/memcg_test.txt create mode 100644 Documentation/cgroups/memory.txt create mode 100644 Documentation/cgroups/resource_counter.txt delete mode 100644 Documentation/controllers/cpuacct.txt delete mode 100644 Documentation/controllers/devices.txt delete mode 100644 Documentation/controllers/memcg_test.txt delete mode 100644 Documentation/controllers/memory.txt delete mode 100644 Documentation/controllers/resource_counter.txt delete mode 100644 Documentation/cpusets.txt (limited to 'include/linux') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index e33ee74eee77..d9e5d6f41b92 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -1,7 +1,8 @@ CGROUPS ------- -Written by Paul Menage based on Documentation/cpusets.txt +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. @@ -68,7 +69,7 @@ On their own, the only use for cgroups is for simple job tracking. The intention is that other subsystems hook into the generic cgroup support to provide new attributes for cgroups, such as accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cpusets.txt) allows +access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allows you to associate a set of CPUs and a set of memory nodes with the tasks in each cgroup. diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt new file mode 100644 index 000000000000..bb775fbe43d7 --- /dev/null +++ b/Documentation/cgroups/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt new file mode 100644 index 000000000000..5c86c258c791 --- /dev/null +++ b/Documentation/cgroups/cpusets.txt @@ -0,0 +1,808 @@ + CPUSETS + ------- + +Copyright (C) 2004 BULL SA. +Written by Simon.Derr@bull.net + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a tasks current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that tasks cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that tasks cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendents) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that tasks cpuset. + - in sched.c migrate_all_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that tasks cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the tasks cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpus: list of CPUs in that cpuset + - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpusets nodes + - cpu_exclusive flag: is cpu placement exclusive? + - mem_exclusive flag: is memory placement exclusive? + - mem_hardwall flag: is memory allocation hardwalled + - memory_pressure: measure of how much paging pressure in cpuset + +In addition, the root cpuset only has the following file: + - memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_map using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendent, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'memory_spread_page' and +'memory_spread_slab'. + +If the per-cpuset boolean flag file 'memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the tasks NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the tasks NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing tasks memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'memory_spread_page' turns on a per-process flag +PF_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PF_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'memory_spread_slab' turns on the flag +PF_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current tasks mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +everytime. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searchs all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + + -1 : no request. use system default or follow request of others. + 0 : no search. + 1 : search siblings (hyperthreads in a core). + 2 : search cores in a package. + 3 : search cpus in a node [= system wide on non-NUMA system] + ( 4 : search nodes in a chunk of node [on NUMA system] ) + ( 5 : search system wide [on NUMA system] ) + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'sched_load_balance' of a cpuset +is disabled, then 'sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not will be depend on your situation. +Don't modify this file if you are not sure. + +If your situation is: + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. +then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the tasks cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its numa placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the tasks +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +but the processor placement is not updated, until that tasks pid is +rewritten to the 'tasks' file of its cpuset. This is done to avoid +impacting the scheduler code in the kernel with a check for changes +in a tasks processor placement. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the tasks new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'memory_migrate' is set true, then if that cpusets +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the tasks prior cpuset, or in the cpusets +prior 'mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current tasks cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /dev/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /dev/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /dev/cpuset + cd /dev/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +In the future, a C library interface to cpusets will likely be +available. For now, the only way to query or modify cpusets is +via the cpuset file system, using the various cd, mkdir, echo, cat, +rmdir commands from the shell, or their equivalent from C. + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /dev/cpuset + +Then under /dev/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /dev/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /dev/cpuset: +# cd /dev/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. +# cd my_cpuset + +In this directory you can find several files: +# ls +cpu_exclusive memory_migrate mems tasks +cpus memory_pressure notify_on_release +mem_exclusive memory_spread_page sched_load_balance +mem_hardwall memory_spread_slab sched_relax_domain_level + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpus + +Add some mems: +# /bin/echo 0-7 > mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' +# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt new file mode 100644 index 000000000000..7cc6e6a60672 --- /dev/null +++ b/Documentation/cgroups/devices.txt @@ -0,0 +1,52 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /cgroups/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt new file mode 100644 index 000000000000..19533f93b7a2 --- /dev/null +++ b/Documentation/cgroups/memcg_test.txt @@ -0,0 +1,342 @@ +Memory Resource Controller(Memcg) Implementation Memo. +Last Updated: 2008/12/15 +Base Kernel Version: based on 2.6.28-rc8-mm. + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroups/memory.txt) + +0. How to record usage ? + 2 objects are used. + + page_cgroup ....an object per page. + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_newpage_charge() + Called at new page fault and Copy-On-Write. + + mem_cgroup_try_charge_swapin() + Called at do_swap_page() (page fault on swap entry) and swapoff. + Followed by charge-commit-cancel protocol. (With swap accounting) + At commit, a charge recorded in swap_cgroup is removed. + + mem_cgroup_cache_charge() + Called at add_to_page_cache() + + mem_cgroup_cache_charge_swapin() + Called at shmem's swapin. + + mem_cgroup_prepare_migration() + Called before migration. "extra" charge is done and followed by + charge-commit-cancel protocol. + At commit, charge against oldpage or newpage will be committed. + +2. Uncharge + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge_page() + Called when an anonymous page is fully unmapped. I.e., mapcount goes + to 0. If the page is SwapCache, uncharge is delayed until + mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_cache_page() + Called when a page-cache is deleted from radix-tree. If the page is + SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_swapcache() + Called when SwapCache is removed from radix-tree. The charge itself + is moved to swap_cgroup. (If mem+swap controller is disabled, no + charge to swap occurs.) + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + + mem_cgroup_end_migration(old, new) + At success of migration old is uncharged (if necessary), a charge + to new page is committed. At failure, charge to old page is committed. + +3. charge-commit-cancel + In some case, we can't know this "charge" is valid or not at charging + (because of races). + To handle such case, there are charge-commit-cancel functions. + mem_cgroup_try_charge_XXX + mem_cgroup_commit_charge_XXX + mem_cgroup_cancel_charge_XXX + these are used in swap-in and migration. + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the function checks the page should be charged or not + and set flags or avoid charging.(usage -= PAGE_SIZE) + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + It is charged right after it's allocated before doing any page table + related operations. Of course, it's uncharged when another page is used + for the fault address. + + At freeing anonymous page (by exit() or munmap()), zap_pte() is called + and pages for ptes are freed one by one.(see mm/memory.c). Uncharges + are done at page_remove_rmap() when page_mapcount() goes down to 0. + + Another page freeing is by page-reclaim (vmscan.c) and anonymous + pages are swapped out. In this case, the page is marked as + PageSwapCache(). uncharge() routine doesn't uncharge the page marked + as SwapCache(). It's delayed until __delete_from_swap_cache(). + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + This swap-in is one of the most complicated work. In do_swap_page(), + following events occur when pte is unchanged. + + (1) the page (SwapCache) is looked up. + (2) lock_page() + (3) try_charge_swapin() + (4) reuse_swap_page() (may call delete_swap_cache()) + (5) commit_charge_swapin() + (6) swap_free(). + + Considering following situation for example. + + (A) The page has not been charged before (2) and reuse_swap_page() + doesn't call delete_from_swap_cache(). + (B) The page has not been charged before (2) and reuse_swap_page() + calls delete_from_swap_cache(). + (C) The page has been charged before (2) and reuse_swap_page() doesn't + call delete_from_swap_cache(). + (D) The page has been charged before (2) and reuse_swap_page() calls + delete_from_swap_cache(). + + memory.usage/memsw.usage changes to this page/swp_entry will be + Case (A) (B) (C) (D) + Event + Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 + =========================================== + (3) +1/+1 +1/+1 +1/+1 +1/+1 + (4) - 0/ 0 - -1/ 0 + (5) 0/-1 0/ 0 -1/-1 0/ 0 + (6) - 0/-1 - 0/-1 + =========================================== + Result 1/ 1 1/ 1 1/ 1 1/ 1 + + In any cases, charges to this page should be 1/ 1. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + At (b), the page is marked as SwapCache and not uncharged. + At (d), the page is removed from SwapCache and a charge in page_cgroup + is moved to swap_cgroup. + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + Here, a charge in swap_cgroup disappears. + +5. Page Cache + Page Cache is charged at + - add_to_page_cache_locked(). + + uncharged at + - __remove_from_page_cache(). + + The logic is very clear. (About migration, see below) + Note: __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache + Memcg's charge/uncharge have special handlers of shmem. The best way + to understand shmem's page state transition is to read mm/shmem.c. + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + It's uncharged when + - A page is removed from radix-tree and not SwapCache. + - When SwapCache is removed, a charge is moved to swap_cgroup. + - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup + disappears. + +7. Page Migration + One of the most complicated functions is page-migration-handler. + Memcg has 2 routines. Assume that we are migrating a page's contents + from OLDPAGE to NEWPAGE. + + Usual migration logic is.. + (a) remove the page from LRU. + (b) allocate NEWPAGE (migration target) + (c) lock by lock_page(). + (d) unmap all mappings. + (e-1) If necessary, replace entry in radix-tree. + (e-2) move contents of a page. + (f) map all mappings again. + (g) pushback the page to LRU. + (-) OLDPAGE will be freed. + + Before (g), memcg should complete all necessary charge/uncharge to + NEWPAGE/OLDPAGE. + + The point is.... + - If OLDPAGE is anonymous, all charges will be dropped at (d) because + try_to_unmap() drops all mapcount and the page will not be + SwapCache. + + - If OLDPAGE is SwapCache, charges will be kept at (g) because + __delete_from_swap_cache() isn't called at (e-1) + + - If OLDPAGE is page-cache, charges will be kept at (g) because + remove_from_swap_cache() isn't called at (e-1) + + memcg provides following hooks. + + - mem_cgroup_prepare_migration(OLDPAGE) + Called after (b) to account a charge (usage += PAGE_SIZE) against + memcg which OLDPAGE belongs to. + + - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) + Called after (f) before (g). + If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already + charged, a charge by prepare_migration() is automatically canceled. + If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. + + But zap_pte() (by exit or munmap) can be called while migration, + we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). + +8. LRU + Each memcg has its own private LRU. Now, it's handling is under global + VM's control (means that it's handled under global zone->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under zone->lru_lock(). + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. + + Tests for racy cases. + + 9.1 Small limit to memcg. + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + + 9.2 Shmem + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + + 9.3 Migration + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration. + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset. + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + 9.4 Memory hotplug. + memory hotplug test is one of good test. + to offline memory, do following. + # echo offline > /sys/devices/system/memory/memoryXXX/state + (XXX is the place of memory) + This is an easy way to test page migration, too. + + 9.5 mkdir/rmdir + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following. + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running. + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + + 9.6 Mount with other subsystems. + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example) + # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt new file mode 100644 index 000000000000..e1501964df1e --- /dev/null +++ b/Documentation/cgroups/memory.txt @@ -0,0 +1,399 @@ +Memory Resource Controller + +NOTE: The Memory Resource Controller has been generically been referred +to as the memory controller in this document. Do not confuse memory controller +used here with the memory controller that is used in hardware. + +Salient features + +a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages +b. The infrastructure allows easy addition of other types of memory to control +c. Provides *zero overhead* for non memory controller users +d. Provides a double LRU: global memory pressure causes reclaim from the + global LRU; a cgroup on hitting a limit, reclaims from the per + cgroup LRU + +NOTE: Swap Cache (unmapped) is not accounted now. + +Benefits and Purpose of the memory controller + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with limited amount of memory, this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases, find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the res_counter. The res_counter +tracks the current memory usage and limit of the group of processes associated +with the controller. Each cgroup has a memory controller specific data +structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (res_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge() is invoked to setup +the necessary data structures and check if the cgroup that is being charged +is over its limit. If it is then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +allocated and associated with the page. This routine also adds the page to +the per cgroup LRU. + +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +(some pages which never be reclaimable and will not be on global LRU + are not accounted. we just accounts pages under usual vm management.) + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +A RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. + +At page migration, accounting information is kept. + +Note: we just account pages-on-lru because our purpose is to control amount +of used pages. not-on-lru pages are tend to be out-of-control from vm view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + + +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +global LRU, mem+swap limit is better than just limiting swap from OS point +of view. + +2.5 Reclaim + +Each cgroup maintains a per cgroup LRU that consists of an active +and inactive list. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per cgroup LRU +list. + +2. Locking + +The memory controller uses the following hierarchy + +1. zone->lru_lock is used for selecting pages to be isolated +2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) +3. lock_page_cgroup() is used to protect page->page_cgroup + +3. User Interface + +0. Configuration + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_RESOURCE_COUNTERS +c. Enable CONFIG_CGROUP_MEM_RES_CTLR + +1. Prepare the cgroups +# mkdir -p /cgroups +# mount -t cgroup none /cgroups -o memory + +2. Make the new group and move bash into it +# mkdir /cgroups/0 +# echo $$ > /cgroups/0/tasks + +Since now we're in the 0 cgroup, +We can alter the memory limit: +# echo 4M > /cgroups/0/memory.limit_in_bytes + +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, +mega or gigabytes. + +# cat /cgroups/0/memory.limit_in_bytes +4194304 + +NOTE: The interface has now changed to display the usage in bytes +instead of pages + +We can check the usage: +# cat /cgroups/0/memory.usage_in_bytes +1216512 + +A successful write to this file does not guarantee a successful set of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel. + +# echo 1 > memory.limit_in_bytes +# cat memory.limit_in_bytes +4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing + +Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. +Apart from that v6 has been tested with several applications and regular +daily use. The controller has also been tested on the PPC64, x86_64 and +UML platforms. + +4.1 Troubleshooting + +Sometimes a user might find that the application under a cgroup is +terminated. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +4.2 Task migration + +When a task migrates from one cgroup to another, it's charge is not +carried forward. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +4.3 Removing a cgroup + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. +Such charges are freed(at default) or moved to its parent. When moved, +both of RSS and CACHES are moved to parent. +If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + + +5. Misc. interfaces. + +5.1 force_empty + memory.force_empty interface is provided to make cgroup's memory usage empty. + You can use this interface only when the cgroup has no tasks. + When writing anything to this + + # echo 0 > memory.force_empty + + Almost all pages tracked by this memcg will be unmapped and freed. Some of + pages cannot be freed because it's locked or in-use. Such pages are moved + to parent and this cgroup will be empty. But this may return -EBUSY in + some too busy case. + + Typical use case of this interface is that calling this before rmdir(). + Because rmdir() moves all pages to parent, some out-of-use page caches can be + moved to the parent. If you want to avoid that, force_empty will be useful. + +5.2 stat file + memory.stat file includes following statistics (now) + cache - # of pages from page-cache and shmem. + rss - # of pages from anonymous memory. + pgpgin - # of event of charging + pgpgout - # of event of uncharging + active_anon - # of pages on active lru of anon, shmem. + inactive_anon - # of pages on active lru of anon, shmem + active_file - # of pages on active lru of file-cache + inactive_file - # of pages on inactive lru of file cache + unevictable - # of pages cannot be reclaimed.(mlocked etc) + + Below is depend on CONFIG_DEBUG_VM. + inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) + recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + + Memo: + recent_rotated means recent frequency of lru rotation. + recent_scanned means recent # of scans to lru. + showing for better debug please see the code for meanings. + + +5.3 swappiness + Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. + + Following cgroup's swapiness can't be changed. + - root cgroup (uses /proc/sys/vm/swappiness). + - a cgroup which uses hierarchy and it has child cgroup. + - a cgroup which uses hierarchy and not the root of hierarchy. + + +6. Hierarchy support + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim + +The memory controller by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup + +# echo 1 > memory.use_hierarchy + +The feature can be disabled by + +# echo 0 > memory.use_hierarchy + +NOTE1: Enabling/disabling will fail if the cgroup already has other +cgroups created below it. + +NOTE2: This feature can be enabled/disabled per subtree. + +7. TODO + +1. Add support for accounting huge pages (as a separate controller) +2. Make per-cgroup scanner reclaim not-shared pages first +3. Teach controller to account for shared-pages +4. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt new file mode 100644 index 000000000000..f196ac1d7d25 --- /dev/null +++ b/Documentation/cgroups/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + c. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + This is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_ + max_usage max_usage_in_ + limit limit_in_ + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + c. Compile and run :) diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt deleted file mode 100644 index bb775fbe43d7..000000000000 --- a/Documentation/controllers/cpuacct.txt +++ /dev/null @@ -1,32 +0,0 @@ -CPU Accounting Controller -------------------------- - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem. - -# mkdir /cgroups -# mount -t cgroup -ocpuacct none /cgroups - -With the above step, the initial or the parent accounting group -becomes visible at /cgroups. At bootup, this group includes all the -tasks in the system. /cgroups/tasks lists the tasks in this cgroup. -/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by -this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /cgroups. - -# cd /cgroups -# mkdir g1 -# echo $$ > g1 - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/cgroups/cpuacct.usage also. diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt deleted file mode 100644 index 7cc6e6a60672..000000000000 --- a/Documentation/controllers/devices.txt +++ /dev/null @@ -1,52 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. However -when a device access is removed from a parent it will not also be -removed from the child(ren). - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /cgroups/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /cgroups/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /cgroups/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendent of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. diff --git a/Documentation/controllers/memcg_test.txt b/Documentation/controllers/memcg_test.txt deleted file mode 100644 index 08d4d3ea0d79..000000000000 --- a/Documentation/controllers/memcg_test.txt +++ /dev/null @@ -1,342 +0,0 @@ -Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2008/12/15 -Base Kernel Version: based on 2.6.28-rc8-mm. - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/controllers/memory.txt) - -0. How to record usage ? - 2 objects are used. - - page_cgroup ....an object per page. - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_newpage_charge() - Called at new page fault and Copy-On-Write. - - mem_cgroup_try_charge_swapin() - Called at do_swap_page() (page fault on swap entry) and swapoff. - Followed by charge-commit-cancel protocol. (With swap accounting) - At commit, a charge recorded in swap_cgroup is removed. - - mem_cgroup_cache_charge() - Called at add_to_page_cache() - - mem_cgroup_cache_charge_swapin() - Called at shmem's swapin. - - mem_cgroup_prepare_migration() - Called before migration. "extra" charge is done and followed by - charge-commit-cancel protocol. - At commit, charge against oldpage or newpage will be committed. - -2. Uncharge - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge_page() - Called when an anonymous page is fully unmapped. I.e., mapcount goes - to 0. If the page is SwapCache, uncharge is delayed until - mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_cache_page() - Called when a page-cache is deleted from radix-tree. If the page is - SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_swapcache() - Called when SwapCache is removed from radix-tree. The charge itself - is moved to swap_cgroup. (If mem+swap controller is disabled, no - charge to swap occurs.) - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - - mem_cgroup_end_migration(old, new) - At success of migration old is uncharged (if necessary), a charge - to new page is committed. At failure, charge to old page is committed. - -3. charge-commit-cancel - In some case, we can't know this "charge" is valid or not at charging - (because of races). - To handle such case, there are charge-commit-cancel functions. - mem_cgroup_try_charge_XXX - mem_cgroup_commit_charge_XXX - mem_cgroup_cancel_charge_XXX - these are used in swap-in and migration. - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the function checks the page should be charged or not - and set flags or avoid charging.(usage -= PAGE_SIZE) - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - It is charged right after it's allocated before doing any page table - related operations. Of course, it's uncharged when another page is used - for the fault address. - - At freeing anonymous page (by exit() or munmap()), zap_pte() is called - and pages for ptes are freed one by one.(see mm/memory.c). Uncharges - are done at page_remove_rmap() when page_mapcount() goes down to 0. - - Another page freeing is by page-reclaim (vmscan.c) and anonymous - pages are swapped out. In this case, the page is marked as - PageSwapCache(). uncharge() routine doesn't uncharge the page marked - as SwapCache(). It's delayed until __delete_from_swap_cache(). - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - This swap-in is one of the most complicated work. In do_swap_page(), - following events occur when pte is unchanged. - - (1) the page (SwapCache) is looked up. - (2) lock_page() - (3) try_charge_swapin() - (4) reuse_swap_page() (may call delete_swap_cache()) - (5) commit_charge_swapin() - (6) swap_free(). - - Considering following situation for example. - - (A) The page has not been charged before (2) and reuse_swap_page() - doesn't call delete_from_swap_cache(). - (B) The page has not been charged before (2) and reuse_swap_page() - calls delete_from_swap_cache(). - (C) The page has been charged before (2) and reuse_swap_page() doesn't - call delete_from_swap_cache(). - (D) The page has been charged before (2) and reuse_swap_page() calls - delete_from_swap_cache(). - - memory.usage/memsw.usage changes to this page/swp_entry will be - Case (A) (B) (C) (D) - Event - Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 - =========================================== - (3) +1/+1 +1/+1 +1/+1 +1/+1 - (4) - 0/ 0 - -1/ 0 - (5) 0/-1 0/ 0 -1/-1 0/ 0 - (6) - 0/-1 - 0/-1 - =========================================== - Result 1/ 1 1/ 1 1/ 1 1/ 1 - - In any cases, charges to this page should be 1/ 1. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - At (b), the page is marked as SwapCache and not uncharged. - At (d), the page is removed from SwapCache and a charge in page_cgroup - is moved to swap_cgroup. - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - Here, a charge in swap_cgroup disappears. - -5. Page Cache - Page Cache is charged at - - add_to_page_cache_locked(). - - uncharged at - - __remove_from_page_cache(). - - The logic is very clear. (About migration, see below) - Note: __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache - Memcg's charge/uncharge have special handlers of shmem. The best way - to understand shmem's page state transition is to read mm/shmem.c. - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - It's uncharged when - - A page is removed from radix-tree and not SwapCache. - - When SwapCache is removed, a charge is moved to swap_cgroup. - - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup - disappears. - -7. Page Migration - One of the most complicated functions is page-migration-handler. - Memcg has 2 routines. Assume that we are migrating a page's contents - from OLDPAGE to NEWPAGE. - - Usual migration logic is.. - (a) remove the page from LRU. - (b) allocate NEWPAGE (migration target) - (c) lock by lock_page(). - (d) unmap all mappings. - (e-1) If necessary, replace entry in radix-tree. - (e-2) move contents of a page. - (f) map all mappings again. - (g) pushback the page to LRU. - (-) OLDPAGE will be freed. - - Before (g), memcg should complete all necessary charge/uncharge to - NEWPAGE/OLDPAGE. - - The point is.... - - If OLDPAGE is anonymous, all charges will be dropped at (d) because - try_to_unmap() drops all mapcount and the page will not be - SwapCache. - - - If OLDPAGE is SwapCache, charges will be kept at (g) because - __delete_from_swap_cache() isn't called at (e-1) - - - If OLDPAGE is page-cache, charges will be kept at (g) because - remove_from_swap_cache() isn't called at (e-1) - - memcg provides following hooks. - - - mem_cgroup_prepare_migration(OLDPAGE) - Called after (b) to account a charge (usage += PAGE_SIZE) against - memcg which OLDPAGE belongs to. - - - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) - Called after (f) before (g). - If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already - charged, a charge by prepare_migration() is automatically canceled. - If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. - - But zap_pte() (by exit or munmap) can be called while migration, - we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). - -8. LRU - Each memcg has its own private LRU. Now, it's handling is under global - VM's control (means that it's handled under global zone->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone->lru_lock(). - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. - - Tests for racy cases. - - 9.1 Small limit to memcg. - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - - 9.2 Shmem - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - - 9.3 Migration - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration. - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset. - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - 9.4 Memory hotplug. - memory hotplug test is one of good test. - to offline memory, do following. - # echo offline > /sys/devices/system/memory/memoryXXX/state - (XXX is the place of memory) - This is an easy way to test page migration, too. - - 9.5 mkdir/rmdir - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following. - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt deleted file mode 100644 index e1501964df1e..000000000000 --- a/Documentation/controllers/memory.txt +++ /dev/null @@ -1,399 +0,0 @@ -Memory Resource Controller - -NOTE: The Memory Resource Controller has been generically been referred -to as the memory controller in this document. Do not confuse memory controller -used here with the memory controller that is used in hardware. - -Salient features - -a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages -b. The infrastructure allows easy addition of other types of memory to control -c. Provides *zero overhead* for non memory controller users -d. Provides a double LRU: global memory pressure causes reclaim from the - global LRU; a cgroup on hitting a limit, reclaims from the per - cgroup LRU - -NOTE: Swap Cache (unmapped) is not accounted now. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with limited amount of memory, this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases, find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the res_counter. The res_counter -tracks the current memory usage and limit of the group of processes associated -with the controller. Each cgroup has a memory controller specific data -structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (res_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge() is invoked to setup -the necessary data structures and check if the cgroup that is being charged -is over its limit. If it is then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -allocated and associated with the page. This routine also adds the page to -the per cgroup LRU. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -(some pages which never be reclaimable and will not be on global LRU - are not accounted. we just accounts pages under usual vm management.) - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -A RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. - -At page migration, accounting information is kept. - -Note: we just account pages-on-lru because our purpose is to control amount -of used pages. not-on-lru pages are tend to be out-of-control from vm view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - - -2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -usage of mem+swap is limited by memsw.limit_in_bytes. - -Note: why 'mem+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -mem+swap. - -In other words, when we want to limit the usage of swap without affecting -global LRU, mem+swap limit is better than just limiting swap from OS point -of view. - -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU that consists of an active -and inactive list. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per cgroup LRU -list. - -2. Locking - -The memory controller uses the following hierarchy - -1. zone->lru_lock is used for selecting pages to be isolated -2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) -3. lock_page_cgroup() is used to protect page->page_cgroup - -3. User Interface - -0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_CGROUP_MEM_RES_CTLR - -1. Prepare the cgroups -# mkdir -p /cgroups -# mount -t cgroup none /cgroups -o memory - -2. Make the new group and move bash into it -# mkdir /cgroups/0 -# echo $$ > /cgroups/0/tasks - -Since now we're in the 0 cgroup, -We can alter the memory limit: -# echo 4M > /cgroups/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. - -# cat /cgroups/0/memory.limit_in_bytes -4194304 - -NOTE: The interface has now changed to display the usage in bytes -instead of pages - -We can check the usage: -# cat /cgroups/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful set of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. -Apart from that v6 has been tested with several applications and regular -daily use. The controller has also been tested on the PPC64, x86_64 and -UML platforms. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -4.2 Task migration - -When a task migrates from one cgroup to another, it's charge is not -carried forward. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. -Such charges are freed(at default) or moved to its parent. When moved, -both of RSS and CACHES are moved to parent. -If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. - You can use this interface only when the cgroup has no tasks. - When writing anything to this - - # echo 0 > memory.force_empty - - Almost all pages tracked by this memcg will be unmapped and freed. Some of - pages cannot be freed because it's locked or in-use. Such pages are moved - to parent and this cgroup will be empty. But this may return -EBUSY in - some too busy case. - - Typical use case of this interface is that calling this before rmdir(). - Because rmdir() moves all pages to parent, some out-of-use page caches can be - moved to the parent. If you want to avoid that, force_empty will be useful. - -5.2 stat file - memory.stat file includes following statistics (now) - cache - # of pages from page-cache and shmem. - rss - # of pages from anonymous memory. - pgpgin - # of event of charging - pgpgout - # of event of uncharging - active_anon - # of pages on active lru of anon, shmem. - inactive_anon - # of pages on active lru of anon, shmem - active_file - # of pages on active lru of file-cache - inactive_file - # of pages on inactive lru of file cache - unevictable - # of pages cannot be reclaimed.(mlocked etc) - - Below is depend on CONFIG_DEBUG_VM. - inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) - recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) - recent_rotated_file - VM internal parameter. (see mm/vmscan.c) - recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) - recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - - Memo: - recent_rotated means recent frequency of lru rotation. - recent_scanned means recent # of scans to lru. - showing for better debug please see the code for meanings. - - -5.3 swappiness - Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. - - Following cgroup's swapiness can't be changed. - - root cgroup (uses /proc/sys/vm/swappiness). - - a cgroup which uses hierarchy and it has child cgroup. - - a cgroup which uses hierarchy and not the root of hierarchy. - - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -The memory controller by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if the cgroup already has other -cgroups created below it. - -NOTE2: This feature can be enabled/disabled per subtree. - -7. TODO - -1. Add support for accounting huge pages (as a separate controller) -2. Make per-cgroup scanner reclaim not-shared pages first -3. Teach controller to account for shared-pages -4. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt deleted file mode 100644 index f196ac1d7d25..000000000000 --- a/Documentation/controllers/resource_counter.txt +++ /dev/null @@ -1,181 +0,0 @@ - - The Resource Counter - -The resource counter, declared at include/linux/res_counter.h, -is supposed to facilitate the resource management by controllers -by providing common stuff for accounting. - -This "stuff" includes the res_counter structure and routines -to work with it. - - - -1. Crucial parts of the res_counter structure - - a. unsigned long long usage - - The usage value shows the amount of a resource that is consumed - by a group at a given time. The units of measurement should be - determined by the controller that uses this counter. E.g. it can - be bytes, items or any other unit the controller operates on. - - b. unsigned long long max_usage - - The maximal value of the usage over time. - - This value is useful when gathering statistical information about - the particular group, as it shows the actual resource requirements - for a particular group, not just some usage snapshot. - - c. unsigned long long limit - - The maximal allowed amount of resource to consume by the group. In - case the group requests for more resources, so that the usage value - would exceed the limit, the resource allocation is rejected (see - the next section). - - d. unsigned long long failcnt - - The failcnt stands for "failures counter". This is the number of - resource allocation attempts that failed. - - c. spinlock_t lock - - Protects changes of the above values. - - - -2. Basic accounting routines - - a. void res_counter_init(struct res_counter *rc) - - Initializes the resource counter. As usual, should be the first - routine called for a new counter. - - b. int res_counter_charge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is about to be allocated it has to be accounted - with the appropriate resource counter (controller should determine - which one to use on its own). This operation is called "charging". - - This is not very important which operation - resource allocation - or charging - is performed first, but - * if the allocation is performed first, this may create a - temporary resource over-usage by the time resource counter is - charged; - * if the charging is performed first, then it should be uncharged - on error path (if the one is called). - - c. void res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is released (freed) it should be de-accounted - from the resource counter it was accounted to. This is called - "uncharging". - - The _locked routines imply that the res_counter->lock is taken. - - - 2.1 Other accounting routines - - There are more routines that may help you with common needs, like - checking whether the limit is reached or resetting the max_usage - value. They are all declared in include/linux/res_counter.h. - - - -3. Analyzing the resource counter registrations - - a. If the failcnt value constantly grows, this means that the counter's - limit is too tight. Either the group is misbehaving and consumes too - many resources, or the configuration is not suitable for the group - and the limit should be increased. - - b. The max_usage value can be used to quickly tune the group. One may - set the limits to maximal values and either load the container with - a common pattern or leave one for a while. After this the max_usage - value shows the amount of memory the container would require during - its common activity. - - Setting the limit a bit above this value gives a pretty good - configuration that works in most of the cases. - - c. If the max_usage is much less than the limit, but the failcnt value - is growing, then the group tries to allocate a big chunk of resource - at once. - - d. If the max_usage is much less than the limit, but the failcnt value - is 0, then this group is given too high limit, that it does not - require. It is better to lower the limit a bit leaving more resource - for other groups. - - - -4. Communication with the control groups subsystem (cgroups) - -All the resource controllers that are using cgroups and resource counters -should provide files (in the cgroup filesystem) to work with the resource -counter fields. They are recommended to adhere to the following rules: - - a. File names - - Field name File name - --------------------------------------------------- - usage usage_in_ - max_usage max_usage_in_ - limit limit_in_ - failcnt failcnt - lock no file :) - - b. Reading from file should show the corresponding field value in the - appropriate format. - - c. Writing to file - - Field Expected behavior - ---------------------------------- - usage prohibited - max_usage reset to usage - limit set the limit - failcnt reset to zero - - - -5. Usage example - - a. Declare a task group (take a look at cgroups subsystem for this) and - fold a res_counter into it - - struct my_group { - struct res_counter res; - - - } - - b. Put hooks in resource allocation/release paths - - int alloc_something(...) - { - if (res_counter_charge(res_counter_ptr, amount) < 0) - return -ENOMEM; - - - } - - void release_something(...) - { - res_counter_uncharge(res_counter_ptr, amount); - - - } - - In order to keep the usage value self-consistent, both the - "res_counter_ptr" and the "amount" in release_something() should be - the same as they were in the alloc_something() when the releasing - resource was allocated. - - c. Provide the way to read res_counter values and set them (the cgroups - still can help with it). - - c. Compile and run :) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt deleted file mode 100644 index 5c86c258c791..000000000000 --- a/Documentation/cpusets.txt +++ /dev/null @@ -1,808 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a tasks current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that tasks cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting tasks mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that tasks cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that tasks cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendents) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that tasks cpuset. - - in sched.c migrate_all_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that tasks cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the tasks cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpus: list of CPUs in that cpuset - - mems: list of Memory Nodes in that cpuset - - memory_migrate flag: if set, move pages to cpusets nodes - - cpu_exclusive flag: is cpu placement exclusive? - - mem_exclusive flag: is memory placement exclusive? - - mem_hardwall flag: is memory allocation hardwalled - - memory_pressure: measure of how much paging pressure in cpuset - -In addition, the root cpuset only has the following file: - - memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_map using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendent, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'memory_spread_page' and -'memory_spread_slab'. - -If the per-cpuset boolean flag file 'memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the tasks NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the tasks NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing tasks memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'memory_spread_page' turns on a per-process flag -PF_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PF_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'memory_spread_slab' turns on the flag -PF_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current tasks mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendent cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside it cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all -the CPUs that must be load balanced. - -Whenever the 'sched_load_balance' flag changes, or CPUs come or go -from a cpuset with this flag enabled, or a cpuset with this flag -enabled is removed, the cpuset code builds a new such partition and -passes it to the scheduler sched domain setup code, to have the sched -domains rebuilt as necessary. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (cpumask_t) in the partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -everytime. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searchs all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - ( 4 : search nodes in a chunk of node [on NUMA system] ) - ( 5 : search system wide [on NUMA system] ) - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'sched_load_balance' of a cpuset -is disabled, then 'sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not will be depend on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the tasks cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its numa placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the tasks -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a tasks pid is written to a cpusets 'tasks' file, in either its -current cpuset or another cpuset, then its allowed CPU placement is -changed immediately. If such a task had been bound to some subset -of its cpuset using the sched_setaffinity() call, the task will be -allowed to run on any CPU allowed in its new cpuset, negating the -affect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -but the processor placement is not updated, until that tasks pid is -rewritten to the 'tasks' file of its cpuset. This is done to avoid -impacting the scheduler code in the kernel with a check for changes -in a tasks processor placement. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'mems' subsequently changes. -If the cpuset flag file 'memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the tasks new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'memory_migrate' is set true, then if that cpusets -'mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the tasks prior cpuset, or in the cpusets -prior 'mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current tasks cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /dev/cpuset - 2) mount -t cgroup -ocpuset cpuset /dev/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /dev/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /dev/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /dev/cpuset - cd /dev/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpus - /bin/echo 1 > mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -In the future, a C library interface to cpusets will likely be -available. For now, the only way to query or modify cpusets is -via the cpuset file system, using the various cd, mkdir, echo, cat, -rmdir commands from the shell, or their equivalent from C. - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /dev/cpuset - -Then under /dev/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /dev/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /dev/cpuset: -# cd /dev/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. -# cd my_cpuset - -In this directory you can find several files: -# ls -cpu_exclusive memory_migrate mems tasks -cpus memory_pressure notify_on_release -mem_exclusive memory_spread_page sched_load_balance -mem_hardwall memory_spread_slab sched_relax_domain_level - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpus - -Add some mems: -# /bin/echo 0-7 > mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /dev/cpuset - -is equivalent to - -mount -t cgroup -ocpuset X /dev/cpuset -echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' -# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 8398ca4ff4ed..6f33593e59e2 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -231,7 +231,7 @@ CPU bandwidth control purposes: This options needs CONFIG_CGROUPS to be defined, and lets the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this filesystem. + Documentation/cgroups/cgroups.txt for more information about this filesystem. Only one of these options to group tasks can be chosen and not both. diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index dede0a2cfc45..4c5bcf6ca7e8 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -9,7 +9,7 @@ * * Author: Pavel Emelianov * - * See Documentation/controllers/resource_counter.txt for more + * See Documentation/cgroups/resource_counter.txt for more * info about what this counter is. */ diff --git a/init/Kconfig b/init/Kconfig index 56fd93c63c77..2af83825634e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -323,8 +323,8 @@ config CGROUP_SCHED This option allows you to create arbitrary task groups using the "cgroup" pseudo filesystem and control the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups.txt for more information - on "cgroup" pseudo filesystem. + Refer to Documentation/cgroups/cgroups.txt for more + information on "cgroup" pseudo filesystem. endchoice @@ -335,10 +335,9 @@ menuconfig CGROUPS use with process control subsystems such as Cpusets, CFS, memory controls or device isolation. See - - Documentation/cpusets.txt (Cpusets) - Documentation/scheduler/sched-design-CFS.txt (CFS) - - Documentation/cgroups/ (features for grouping, isolation) - - Documentation/controllers/ (features for resource control) + - Documentation/cgroups/ (features for grouping, isolation + and resource control) Say N if unsure. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 647c77a88fcb..a85678865c5e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -568,7 +568,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From 3eabdb76a03bbe8f556162738c264dbfb24cff6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: jbd: fix missing kernel-doc Fix jbd header file kernel-doc notation: Warning(linux-2.6.28-git13//include/linux/jbd.h:823): No description found for parameter 'j_average_commit_time' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/jbd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 6384b19efe64..64246dce5663 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -614,6 +614,8 @@ struct transaction_s * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write + * @j_average_commit_time: the average amount of time in nanoseconds it + * takes to commit a transaction to the disk. * @j_private: An opaque pointer to fs-private information. */ -- cgit v1.2.3 From 6ae301e85c9c58d2f430a8a7057ce488b7ff76df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: resources: fix parameter name and kernel-doc Fix __request_region() parameter kernel-doc notation and parameter name: Warning(linux-2.6.28-git10//kernel/resource.c:627): No description found for parameter 'flags' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 ++- kernel/resource.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index f6bb2ca8e3ba..32e4b2f72294 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -143,7 +143,8 @@ static inline unsigned long resource_type(struct resource *res) extern struct resource * __request_region(struct resource *, resource_size_t start, - resource_size_t n, const char *name, int relaxed); + resource_size_t n, + const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) diff --git a/kernel/resource.c b/kernel/resource.c index ca6a1536b205..fd5d7d574bb9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res) * @start: resource start address * @n: resource region size * @name: reserving caller's ID string + * @flags: IO resource flags */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, -- cgit v1.2.3 From 1bcbf31337391a2f54ef6c1e8871c2de5944a7dc Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 15 Jan 2009 13:51:03 -0800 Subject: btrfs & squashfs: Move btrfs and squashfsto's magic number to Use the standard magic.h for btrfs and squashfs. Signed-off-by: Qinghuang Feng Cc: Phillip Lougher Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/super.c | 2 +- fs/squashfs/squashfs_fs.h | 1 - fs/squashfs/super.c | 1 + include/linux/magic.h | 2 ++ 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a14b495532f..7256cf242eb0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -51,7 +52,6 @@ #include "export.h" #include "compression.h" -#define BTRFS_SUPER_MAGIC 0x9123683E static struct super_operations btrfs_super_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 6840da1bf21e..283daafc568e 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -26,7 +26,6 @@ #define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE #define SQUASHFS_MAJOR 4 #define SQUASHFS_MINOR 0 -#define SQUASHFS_MAGIC 0x73717368 #define SQUASHFS_START 0 /* size of metadata (inode and directory) blocks */ diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index a0466d7467b2..071df5b5b491 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/include/linux/magic.h b/include/linux/magic.h index 439f6f3cb0c4..0b4df7eba852 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -10,11 +10,13 @@ #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 #define TMPFS_MAGIC 0x01021994 +#define SQUASHFS_MAGIC 0x73717368 #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 #define XENFS_SUPER_MAGIC 0xabba1974 #define EXT4_SUPER_MAGIC 0xEF53 +#define BTRFS_SUPER_MAGIC 0x9123683E #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 #define JFFS2_SUPER_MAGIC 0x72b6 -- cgit v1.2.3 From 00bfddaf7f68a6551319b536f052040c370756b0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 15 Jan 2009 13:51:26 -0800 Subject: include of is preferred over Impact: fix 15 make headers_check warnings: include of is preferred over Signed-off-by: Jaswinder Singh Rajput Cc: Ingo Molnar Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/agpgart.h | 1 - include/linux/atm_idt77105.h | 2 +- include/linux/capi.h | 2 +- include/linux/connector.h | 2 +- include/linux/cyclades.h | 2 -- include/linux/fb.h | 2 +- include/linux/if_pppol2tp.h | 2 +- include/linux/if_pppox.h | 2 +- include/linux/input.h | 2 +- include/linux/joystick.h | 2 +- include/linux/kvm.h | 2 +- include/linux/loop.h | 2 +- include/linux/matroxfb.h | 2 +- include/linux/phantom.h | 2 +- include/linux/radeonfb.h | 2 +- 15 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/agpgart.h b/include/linux/agpgart.h index c8fdb6e658e1..110c600c885f 100644 --- a/include/linux/agpgart.h +++ b/include/linux/agpgart.h @@ -52,7 +52,6 @@ #ifndef __KERNEL__ #include -#include struct agp_version { __u16 major; diff --git a/include/linux/atm_idt77105.h b/include/linux/atm_idt77105.h index 05621cf20709..8b724000aa50 100644 --- a/include/linux/atm_idt77105.h +++ b/include/linux/atm_idt77105.h @@ -7,7 +7,7 @@ #ifndef LINUX_ATM_IDT77105_H #define LINUX_ATM_IDT77105_H -#include +#include #include #include diff --git a/include/linux/capi.h b/include/linux/capi.h index fdebaaa9f66e..65100d6cb89b 100644 --- a/include/linux/capi.h +++ b/include/linux/capi.h @@ -12,7 +12,7 @@ #ifndef __LINUX_CAPI_H__ #define __LINUX_CAPI_H__ -#include +#include #include #ifndef __KERNEL__ #include diff --git a/include/linux/connector.h b/include/linux/connector.h index 5c7f9468f753..34f2789d9b9b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -22,7 +22,7 @@ #ifndef __CONNECTOR_H #define __CONNECTOR_H -#include +#include #define CN_IDX_CONNECTOR 0xffffffff #define CN_VAL_CONNECTOR 0xffffffff diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 2d3d1e04ba92..d06fbf286346 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -150,8 +150,6 @@ struct CYZ_BOOT_CTRL { * architectures and compilers. */ -#include - typedef __u64 ucdouble; /* 64 bits, unsigned */ typedef __u32 uclong; /* 32 bits, unsigned */ typedef __u16 ucshort; /* 16 bits, unsigned */ diff --git a/include/linux/fb.h b/include/linux/fb.h index 1ee63df5be92..818fe21257e8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -1,7 +1,7 @@ #ifndef _LINUX_FB_H #define _LINUX_FB_H -#include +#include #include struct dentry; diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index a7d6a2234b31..c7a66882b6d0 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -15,7 +15,7 @@ #ifndef __LINUX_IF_PPPOL2TP_H #define __LINUX_IF_PPPOL2TP_H -#include +#include #ifdef __KERNEL__ #include diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 6fb7f1788570..30c88b2245ff 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -17,7 +17,7 @@ #define __LINUX_IF_PPPOX_H -#include +#include #include #ifdef __KERNEL__ diff --git a/include/linux/input.h b/include/linux/input.h index 9a6355f74db2..1249a0c20a38 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #endif /* diff --git a/include/linux/joystick.h b/include/linux/joystick.h index b5e051295a67..9e20c29c1e14 100644 --- a/include/linux/joystick.h +++ b/include/linux/joystick.h @@ -27,7 +27,7 @@ * Vojtech Pavlik, Ucitelska 1576, Prague 8, 182 00 Czech Republic */ -#include +#include #include /* diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 35525ac63337..5715f1907601 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -7,7 +7,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ -#include +#include #include #include #include diff --git a/include/linux/loop.h b/include/linux/loop.h index 46169a7b559b..6ffd6db5bb0d 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -80,7 +80,7 @@ enum { }; #include /* for __kernel_old_dev_t */ -#include /* for __u64 */ +#include /* for __u64 */ /* Backwards compatibility version */ struct loop_info { diff --git a/include/linux/matroxfb.h b/include/linux/matroxfb.h index ae5b09493062..404f678e734b 100644 --- a/include/linux/matroxfb.h +++ b/include/linux/matroxfb.h @@ -2,7 +2,7 @@ #define __LINUX_MATROXFB_H__ #include -#include +#include #include struct matroxioc_output_mode { diff --git a/include/linux/phantom.h b/include/linux/phantom.h index 02268c54c250..94dd6645c60a 100644 --- a/include/linux/phantom.h +++ b/include/linux/phantom.h @@ -10,7 +10,7 @@ #ifndef __PHANTOM_H #define __PHANTOM_H -#include +#include /* PHN_(G/S)ET_REG param */ struct phm_reg { diff --git a/include/linux/radeonfb.h b/include/linux/radeonfb.h index 5bd8975ed78e..8c4bbdecc44f 100644 --- a/include/linux/radeonfb.h +++ b/include/linux/radeonfb.h @@ -2,7 +2,7 @@ #define __LINUX_RADEONFB_H__ #include -#include +#include #define ATY_RADEON_LCD_ON 0x00000001 #define ATY_RADEON_CRT_ON 0x00000002 -- cgit v1.2.3 From 94be9a58d7e683ac3c1df1858a17f09ebade8da0 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 16 Jan 2009 10:17:09 -0500 Subject: [libata] get-identity ioctl: Fix use of invalid memory pointer for SAS drivers. Caught by Ke Wei (and team?) at Marvell. Also, move the ata_scsi_ioctl export to libata-scsi.c, as that seems to be the general trend. Acked-by: James Bottomley Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 1 - drivers/ata/libata-scsi.c | 17 +++++++++++++---- drivers/scsi/ipr.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- include/linux/libata.h | 2 ++ 5 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 71218d76d75e..552ecae13434 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6638,7 +6638,6 @@ EXPORT_SYMBOL_GPL(ata_dev_pair); EXPORT_SYMBOL_GPL(ata_port_disable); EXPORT_SYMBOL_GPL(ata_ratelimit); EXPORT_SYMBOL_GPL(ata_wait_register); -EXPORT_SYMBOL_GPL(ata_scsi_ioctl); EXPORT_SYMBOL_GPL(ata_scsi_queuecmd); EXPORT_SYMBOL_GPL(ata_scsi_slave_config); EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 9e92107691f2..a1a6e6298c33 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -423,9 +423,9 @@ int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, * RETURNS: * Zero on success, negative errno on error. */ -static int ata_get_identity(struct scsi_device *sdev, void __user *arg) +static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev, + void __user *arg) { - struct ata_port *ap = ata_shost_to_port(sdev->host); struct ata_device *dev = ata_scsi_find_dev(ap, sdev); u16 __user *dst = arg; char buf[40]; @@ -645,7 +645,8 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) return rc; } -int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) +int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev, + int cmd, void __user *arg) { int val = -EINVAL, rc = -EINVAL; @@ -663,7 +664,7 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) return 0; case HDIO_GET_IDENTITY: - return ata_get_identity(scsidev, arg); + return ata_get_identity(ap, scsidev, arg); case HDIO_DRIVE_CMD: if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) @@ -682,6 +683,14 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) return rc; } +EXPORT_SYMBOL_GPL(ata_sas_scsi_ioctl); + +int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) +{ + return ata_sas_scsi_ioctl(ata_shost_to_port(scsidev->host), + scsidev, cmd, arg); +} +EXPORT_SYMBOL_GPL(ata_scsi_ioctl); /** * ata_scsi_qc_new - acquire new ata_queued_cmd reference diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 841f460edbc4..07829009a8be 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4912,7 +4912,7 @@ static int ipr_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) if (res && ipr_is_gata(res)) { if (cmd == HDIO_GET_IDENTITY) return -ENOTTY; - return ata_scsi_ioctl(sdev, cmd, arg); + return ata_sas_scsi_ioctl(res->sata_port->ap, sdev, cmd, arg); } return -EINVAL; diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 744838780ada..1c558d3bce18 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -717,7 +717,7 @@ int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) struct domain_device *dev = sdev_to_domain_dev(sdev); if (dev_is_sata(dev)) - return ata_scsi_ioctl(sdev, cmd, arg); + return ata_sas_scsi_ioctl(dev->sata_dev.ap, sdev, cmd, arg); return -EINVAL; } diff --git a/include/linux/libata.h b/include/linux/libata.h index b6b8a7f3ec66..73b69c7071c5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -927,6 +927,8 @@ extern void ata_host_init(struct ata_host *, struct device *, extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg); extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); +extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, + int cmd, void __user *arg); extern void ata_sas_port_destroy(struct ata_port *); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); -- cgit v1.2.3 From 3ada9c126499dd4700dcdbd5b9fe8110ad17f578 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 15 Jan 2009 17:45:31 -0800 Subject: libata: Add another column to the ata_timing table. The forthcoming OCTEON SOC Compact Flash driver needs an additional timing value that was not available in the ata_timing table. I add a new column for dmack_hold time. The values were obtained from the Compact Flash specification Rev 4.1. Signed-off-by: David Daney Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 72 ++++++++++++++++++++++++----------------------- include/linux/libata.h | 9 ++++-- 2 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 552ecae13434..88c242856dae 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3029,33 +3029,33 @@ int sata_set_spd(struct ata_link *link) */ static const struct ata_timing ata_timing[] = { -/* { XFER_PIO_SLOW, 120, 290, 240, 960, 290, 240, 960, 0 }, */ - { XFER_PIO_0, 70, 290, 240, 600, 165, 150, 600, 0 }, - { XFER_PIO_1, 50, 290, 93, 383, 125, 100, 383, 0 }, - { XFER_PIO_2, 30, 290, 40, 330, 100, 90, 240, 0 }, - { XFER_PIO_3, 30, 80, 70, 180, 80, 70, 180, 0 }, - { XFER_PIO_4, 25, 70, 25, 120, 70, 25, 120, 0 }, - { XFER_PIO_5, 15, 65, 25, 100, 65, 25, 100, 0 }, - { XFER_PIO_6, 10, 55, 20, 80, 55, 20, 80, 0 }, - - { XFER_SW_DMA_0, 120, 0, 0, 0, 480, 480, 960, 0 }, - { XFER_SW_DMA_1, 90, 0, 0, 0, 240, 240, 480, 0 }, - { XFER_SW_DMA_2, 60, 0, 0, 0, 120, 120, 240, 0 }, - - { XFER_MW_DMA_0, 60, 0, 0, 0, 215, 215, 480, 0 }, - { XFER_MW_DMA_1, 45, 0, 0, 0, 80, 50, 150, 0 }, - { XFER_MW_DMA_2, 25, 0, 0, 0, 70, 25, 120, 0 }, - { XFER_MW_DMA_3, 25, 0, 0, 0, 65, 25, 100, 0 }, - { XFER_MW_DMA_4, 25, 0, 0, 0, 55, 20, 80, 0 }, - -/* { XFER_UDMA_SLOW, 0, 0, 0, 0, 0, 0, 0, 150 }, */ - { XFER_UDMA_0, 0, 0, 0, 0, 0, 0, 0, 120 }, - { XFER_UDMA_1, 0, 0, 0, 0, 0, 0, 0, 80 }, - { XFER_UDMA_2, 0, 0, 0, 0, 0, 0, 0, 60 }, - { XFER_UDMA_3, 0, 0, 0, 0, 0, 0, 0, 45 }, - { XFER_UDMA_4, 0, 0, 0, 0, 0, 0, 0, 30 }, - { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 20 }, - { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 15 }, +/* { XFER_PIO_SLOW, 120, 290, 240, 960, 290, 240, 0, 960, 0 }, */ + { XFER_PIO_0, 70, 290, 240, 600, 165, 150, 0, 600, 0 }, + { XFER_PIO_1, 50, 290, 93, 383, 125, 100, 0, 383, 0 }, + { XFER_PIO_2, 30, 290, 40, 330, 100, 90, 0, 240, 0 }, + { XFER_PIO_3, 30, 80, 70, 180, 80, 70, 0, 180, 0 }, + { XFER_PIO_4, 25, 70, 25, 120, 70, 25, 0, 120, 0 }, + { XFER_PIO_5, 15, 65, 25, 100, 65, 25, 0, 100, 0 }, + { XFER_PIO_6, 10, 55, 20, 80, 55, 20, 0, 80, 0 }, + + { XFER_SW_DMA_0, 120, 0, 0, 0, 480, 480, 50, 960, 0 }, + { XFER_SW_DMA_1, 90, 0, 0, 0, 240, 240, 30, 480, 0 }, + { XFER_SW_DMA_2, 60, 0, 0, 0, 120, 120, 20, 240, 0 }, + + { XFER_MW_DMA_0, 60, 0, 0, 0, 215, 215, 20, 480, 0 }, + { XFER_MW_DMA_1, 45, 0, 0, 0, 80, 50, 5, 150, 0 }, + { XFER_MW_DMA_2, 25, 0, 0, 0, 70, 25, 5, 120, 0 }, + { XFER_MW_DMA_3, 25, 0, 0, 0, 65, 25, 5, 100, 0 }, + { XFER_MW_DMA_4, 25, 0, 0, 0, 55, 20, 5, 80, 0 }, + +/* { XFER_UDMA_SLOW, 0, 0, 0, 0, 0, 0, 0, 0, 150 }, */ + { XFER_UDMA_0, 0, 0, 0, 0, 0, 0, 0, 0, 120 }, + { XFER_UDMA_1, 0, 0, 0, 0, 0, 0, 0, 0, 80 }, + { XFER_UDMA_2, 0, 0, 0, 0, 0, 0, 0, 0, 60 }, + { XFER_UDMA_3, 0, 0, 0, 0, 0, 0, 0, 0, 45 }, + { XFER_UDMA_4, 0, 0, 0, 0, 0, 0, 0, 0, 30 }, + { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 0, 20 }, + { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 0, 15 }, { 0xFF } }; @@ -3065,14 +3065,15 @@ static const struct ata_timing ata_timing[] = { static void ata_timing_quantize(const struct ata_timing *t, struct ata_timing *q, int T, int UT) { - q->setup = EZ(t->setup * 1000, T); - q->act8b = EZ(t->act8b * 1000, T); - q->rec8b = EZ(t->rec8b * 1000, T); - q->cyc8b = EZ(t->cyc8b * 1000, T); - q->active = EZ(t->active * 1000, T); - q->recover = EZ(t->recover * 1000, T); - q->cycle = EZ(t->cycle * 1000, T); - q->udma = EZ(t->udma * 1000, UT); + q->setup = EZ(t->setup * 1000, T); + q->act8b = EZ(t->act8b * 1000, T); + q->rec8b = EZ(t->rec8b * 1000, T); + q->cyc8b = EZ(t->cyc8b * 1000, T); + q->active = EZ(t->active * 1000, T); + q->recover = EZ(t->recover * 1000, T); + q->dmack_hold = EZ(t->dmack_hold * 1000, T); + q->cycle = EZ(t->cycle * 1000, T); + q->udma = EZ(t->udma * 1000, UT); } void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, @@ -3084,6 +3085,7 @@ void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, if (what & ATA_TIMING_CYC8B ) m->cyc8b = max(a->cyc8b, b->cyc8b); if (what & ATA_TIMING_ACTIVE ) m->active = max(a->active, b->active); if (what & ATA_TIMING_RECOVER) m->recover = max(a->recover, b->recover); + if (what & ATA_TIMING_DMACK_HOLD) m->dmack_hold = max(a->dmack_hold, b->dmack_hold); if (what & ATA_TIMING_CYCLE ) m->cycle = max(a->cycle, b->cycle); if (what & ATA_TIMING_UDMA ) m->udma = max(a->udma, b->udma); } diff --git a/include/linux/libata.h b/include/linux/libata.h index 73b69c7071c5..2c6bd66209ff 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -401,12 +401,14 @@ enum { ATA_TIMING_CYC8B, ATA_TIMING_ACTIVE = (1 << 4), ATA_TIMING_RECOVER = (1 << 5), - ATA_TIMING_CYCLE = (1 << 6), - ATA_TIMING_UDMA = (1 << 7), + ATA_TIMING_DMACK_HOLD = (1 << 6), + ATA_TIMING_CYCLE = (1 << 7), + ATA_TIMING_UDMA = (1 << 8), ATA_TIMING_ALL = ATA_TIMING_SETUP | ATA_TIMING_ACT8B | ATA_TIMING_REC8B | ATA_TIMING_CYC8B | ATA_TIMING_ACTIVE | ATA_TIMING_RECOVER | - ATA_TIMING_CYCLE | ATA_TIMING_UDMA, + ATA_TIMING_DMACK_HOLD | ATA_TIMING_CYCLE | + ATA_TIMING_UDMA, }; enum ata_xfer_mask { @@ -866,6 +868,7 @@ struct ata_timing { unsigned short cyc8b; /* t0 for 8-bit I/O */ unsigned short active; /* t2 or tD */ unsigned short recover; /* t2i or tK */ + unsigned short dmack_hold; /* tj */ unsigned short cycle; /* t0 */ unsigned short udma; /* t2CYCTYP/2 */ }; -- cgit v1.2.3