From 835481d9bcd65720b473db6b38746a74a3964218 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 4 Jan 2009 05:18:06 -0800 Subject: cpumask: convert struct cpufreq_policy to cpumask_var_t Impact: use new cpumask API to reduce memory usage This is part of an effort to reduce structure sizes for machines configured with large NR_CPUS. cpumask_t gets replaced by cpumask_var_t, which is either struct cpumask[1] (small NR_CPUS) or struct cpumask * (large NR_CPUS). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Dave Jones Signed-off-by: Ingo Molnar --- include/linux/cpufreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 484b3abf61bb..384b38d3e8e2 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -80,8 +80,8 @@ struct cpufreq_real_policy { }; struct cpufreq_policy { - cpumask_t cpus; /* CPUs requiring sw coordination */ - cpumask_t related_cpus; /* CPUs with any coordination */ + cpumask_var_t cpus; /* CPUs requiring sw coordination */ + cpumask_var_t related_cpus; /* CPUs with any coordination */ unsigned int shared_type; /* ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu nr of registered CPU */ -- cgit v1.2.3 From 490dea45d00f01847ebebd007685d564aaf2cd98 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Nov 2008 17:06:57 +0100 Subject: itimers: remove the per-cpu-ish-ness Either we bounce once cacheline per cpu per tick, yielding n^2 bounces or we just bounce a single.. Also, using per-cpu allocations for the thread-groups complicates the per-cpu allocator in that its currently aimed to be a fixed sized allocator and the only possible extention to that would be vmap based, which is seriously constrained on 32 bit archs. So making the per-cpu memory requirement depend on the number of processes is an issue. Lastly, it didn't deal with cpu-hotplug, although admittedly that might be fixable. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++++ include/linux/sched.h | 29 ++++++++++++-------- kernel/fork.c | 15 +++++----- kernel/posix-cpu-timers.c | 70 ----------------------------------------------- kernel/sched_stats.h | 33 ++++++++++------------ 5 files changed, 46 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2f3c2d4ef73b..ea0ea1a4c36f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -48,6 +48,12 @@ extern struct fs_struct init_fs; .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ + .cputime = { .totals = { \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ + .sum_exec_runtime = 0, \ + .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ + }, }, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..c20943eabb4c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -450,6 +450,7 @@ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; + spinlock_t lock; }; /* Alternate field names when used to cache expirations. */ #define prof_exp stime @@ -465,7 +466,7 @@ struct task_cputime { * used for thread group CPU clock calculations. */ struct thread_group_cputime { - struct task_cputime *totals; + struct task_cputime totals; }; /* @@ -2180,24 +2181,30 @@ static inline int spin_needbreak(spinlock_t *lock) * Thread group CPU time accounting. */ -extern int thread_group_cputime_alloc(struct task_struct *); -extern void thread_group_cputime(struct task_struct *, struct task_cputime *); - -static inline void thread_group_cputime_init(struct signal_struct *sig) +static inline +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - sig->cputime.totals = NULL; + struct task_cputime *totals = &tsk->signal->cputime.totals; + unsigned long flags; + + spin_lock_irqsave(&totals->lock, flags); + *times = *totals; + spin_unlock_irqrestore(&totals->lock, flags); } -static inline int thread_group_cputime_clone_thread(struct task_struct *curr) +static inline void thread_group_cputime_init(struct signal_struct *sig) { - if (curr->signal->cputime.totals) - return 0; - return thread_group_cputime_alloc(curr); + sig->cputime.totals = (struct task_cputime){ + .utime = cputime_zero, + .stime = cputime_zero, + .sum_exec_runtime = 0, + }; + + spin_lock_init(&sig->cputime.totals.lock); } static inline void thread_group_cputime_free(struct signal_struct *sig) { - free_percpu(sig->cputime.totals); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..7087d8c0e5e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,14 +820,15 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current); - if (likely(!ret)) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - } - return ret; + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + + if (sig) + posix_cpu_timers_init_group(sig); + tsk->signal = sig; if (!sig) return -ENOMEM; @@ -864,8 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); tty_audit_fork(sig); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 157de3a47832..fa07da94d7be 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,76 +9,6 @@ #include #include -/* - * Allocate the thread_group_cputime structure appropriately and fill in the - * current values of the fields. Called from copy_signal() via - * thread_group_cputime_clone_thread() when adding a second or subsequent - * thread to a thread group. Assumes interrupts are enabled when called. - */ -int thread_group_cputime_alloc(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct task_cputime *cputime; - - /* - * If we have multiple threads and we don't already have a - * per-CPU task_cputime struct (checked in the caller), allocate - * one and fill it in with the times accumulated so far. We may - * race with another thread so recheck after we pick up the sighand - * lock. - */ - cputime = alloc_percpu(struct task_cputime); - if (cputime == NULL) - return -ENOMEM; - spin_lock_irq(&tsk->sighand->siglock); - if (sig->cputime.totals) { - spin_unlock_irq(&tsk->sighand->siglock); - free_percpu(cputime); - return 0; - } - sig->cputime.totals = cputime; - cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); - cputime->utime = tsk->utime; - cputime->stime = tsk->stime; - cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; - spin_unlock_irq(&tsk->sighand->siglock); - return 0; -} - -/** - * thread_group_cputime - Sum the thread group time fields across all CPUs. - * - * @tsk: The task we use to identify the thread group. - * @times: task_cputime structure in which we return the summed fields. - * - * Walk the list of CPUs to sum the per-CPU time fields in the thread group - * time structure. - */ -void thread_group_cputime( - struct task_struct *tsk, - struct task_cputime *times) -{ - struct task_cputime *totals, *tot; - int i; - - totals = tsk->signal->cputime.totals; - if (!totals) { - times->utime = tsk->utime; - times->stime = tsk->stime; - times->sum_exec_runtime = tsk->se.sum_exec_runtime; - return; - } - - times->stime = times->utime = cputime_zero; - times->sum_exec_runtime = 0; - for_each_possible_cpu(i) { - tot = per_cpu_ptr(totals, i); - times->utime = cputime_add(times->utime, tot->utime); - times->stime = cputime_add(times->stime, tot->stime); - times->sum_exec_runtime += tot->sum_exec_runtime; - } -} - /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index f2773b5d1226..8ab0cef8ecab 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->utime = cputime_add(times->utime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->utime = cputime_add(times->utime, cputime); + spin_unlock(×->lock); } /** @@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->stime = cputime_add(times->stime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->stime = cputime_add(times->stime, cputime); + spin_unlock(×->lock); } /** @@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { + struct task_cputime *times; struct signal_struct *sig; sig = tsk->signal; @@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->sum_exec_runtime += ns; - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->sum_exec_runtime += ns; + spin_unlock(×->lock); } -- cgit v1.2.3 From 2526c151c31358aec66b63921dd712bbec5ee0cb Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Fri, 9 Jan 2009 15:49:06 -0700 Subject: drivers/of: Add the of_find_i2c_device_by_node function. The of_find_i2c_device_by_node function allows you to follow a reference in the device tree to an i2c device node and then locate the linux device instantiated by the device tree. Example use: an I2S bus driver finding the i2c_device instance for a codec described by a device tree node. This was waiting for Anton's i2c patches that were just added. Signed-off-by: Jon Smirl Signed-off-by: Grant Likely --- drivers/of/of_i2c.c | 19 +++++++++++++++++++ include/linux/of_i2c.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/of_i2c.c b/drivers/of/of_i2c.c index e1b0ad6e918f..fa65a2b2ae2e 100644 --- a/drivers/of/of_i2c.c +++ b/drivers/of/of_i2c.c @@ -66,4 +66,23 @@ void of_register_i2c_devices(struct i2c_adapter *adap, } EXPORT_SYMBOL(of_register_i2c_devices); +static int of_dev_node_match(struct device *dev, void *data) +{ + return dev_archdata_get_node(&dev->archdata) == data; +} + +/* must call put_device() when done with returned i2c_client device */ +struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) +{ + struct device *dev; + + dev = bus_find_device(&i2c_bus_type, NULL, node, + of_dev_node_match); + if (!dev) + return NULL; + + return to_i2c_client(dev); +} +EXPORT_SYMBOL(of_find_i2c_device_by_node); + MODULE_LICENSE("GPL"); diff --git a/include/linux/of_i2c.h b/include/linux/of_i2c.h index bd2a870ec296..34974b5a76f7 100644 --- a/include/linux/of_i2c.h +++ b/include/linux/of_i2c.h @@ -17,4 +17,7 @@ void of_register_i2c_devices(struct i2c_adapter *adap, struct device_node *adap_node); +/* must call put_device() when done with returned i2c_client device */ +struct i2c_client *of_find_i2c_device_by_node(struct device_node *node); + #endif /* __LINUX_OF_I2C_H */ -- cgit v1.2.3 From 85c210edc46d602a1562aeea0fc74919349c8cf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 9 Jan 2009 16:40:53 -0800 Subject: compiler-gcc.h: add more comments to RELOC_HIDE Requested by C. Lameter Signed-off-by: Andi Kleen Cc: Christoph Lameter Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Mike Travis Cc: Ingo Molnar Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index af40f8eb86f0..1514d534deeb 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -11,9 +11,19 @@ /* The "volatile" is due to gcc bugs */ #define barrier() __asm__ __volatile__("": : :"memory") -/* This macro obfuscates arithmetic on a variable address so that gcc - shouldn't recognize the original var, and make assumptions about it */ /* + * This macro obfuscates arithmetic on a variable address so that gcc + * shouldn't recognize the original var, and make assumptions about it. + * + * This is needed because the C standard makes it undefined to do + * pointer arithmetic on "objects" outside their boundaries and the + * gcc optimizers assume this is the case. In particular they + * assume such arithmetic does not wrap. + * + * A miscompilation has been observed because of this on PPC. + * To work around it we hide the relationship of the pointer and the object + * using this macro. + * * Versions of the ppc64 compiler before 4.1 had a bug where use of * RELOC_HIDE could trash r30. The bug can be worked around by changing * the inline assembly constraint from =g to =r, in this particular -- cgit v1.2.3 From 69347a236b22c3962ea812511495e502dedfd50c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 9 Jan 2009 16:40:56 -0800 Subject: memstick: annotate endianness of attribute structs The code was shifting the endianness appropriately everywhere, annotate the structs to avoid the sparse warnings when assigning the endian types to the struct members, or passing them to be[16|32]_to_cpu: drivers/memstick/core/mspro_block.c:331:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:333:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:335:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:337:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:341:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:347:4: warning: cast to restricted __be32 drivers/memstick/core/mspro_block.c:356:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:358:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:364:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:367:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:369:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:371:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:377:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:478:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:480:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:482:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:484:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:486:4: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:689:22: expected unsigned int [unsigned] [assigned] data_address drivers/memstick/core/mspro_block.c:689:22: got restricted __be32 [usertype] drivers/memstick/core/mspro_block.c:697:3: warning: cast to restricted __be32 drivers/memstick/core/mspro_block.c:960:17: warning: incorrect type in initializer (different base types) drivers/memstick/core/mspro_block.c:960:17: expected unsigned short [unsigned] data_count drivers/memstick/core/mspro_block.c:960:17: got restricted __be16 [usertype] drivers/memstick/core/mspro_block.c:993:6: warning: cast to restricted __be16 drivers/memstick/core/mspro_block.c:995:28: warning: cast to restricted __be16 Signed-off-by: Harvey Harrison Cc: Alex Dubov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/memstick/core/mspro_block.c | 43 ++++++++++++++++++------------------- include/linux/memstick.h | 4 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 1f1e3982b1aa..de143deb06f0 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -52,14 +52,14 @@ struct mspro_sys_attr { }; struct mspro_attr_entry { - unsigned int address; - unsigned int size; + __be32 address; + __be32 size; unsigned char id; unsigned char reserved[3]; } __attribute__((packed)); struct mspro_attribute { - unsigned short signature; + __be16 signature; unsigned short version; unsigned char count; unsigned char reserved[11]; @@ -69,28 +69,28 @@ struct mspro_attribute { struct mspro_sys_info { unsigned char class; unsigned char reserved0; - unsigned short block_size; - unsigned short block_count; - unsigned short user_block_count; - unsigned short page_size; + __be16 block_size; + __be16 block_count; + __be16 user_block_count; + __be16 page_size; unsigned char reserved1[2]; unsigned char assembly_date[8]; - unsigned int serial_number; + __be32 serial_number; unsigned char assembly_maker_code; unsigned char assembly_model_code[3]; - unsigned short memory_maker_code; - unsigned short memory_model_code; + __be16 memory_maker_code; + __be16 memory_model_code; unsigned char reserved2[4]; unsigned char vcc; unsigned char vpp; - unsigned short controller_number; - unsigned short controller_function; - unsigned short start_sector; - unsigned short unit_size; + __be16 controller_number; + __be16 controller_function; + __be16 start_sector; + __be16 unit_size; unsigned char ms_sub_class; unsigned char reserved3[4]; unsigned char interface_type; - unsigned short controller_code; + __be16 controller_code; unsigned char format_type; unsigned char reserved4; unsigned char device_type; @@ -124,11 +124,11 @@ struct mspro_specfile { } __attribute__((packed)); struct mspro_devinfo { - unsigned short cylinders; - unsigned short heads; - unsigned short bytes_per_track; - unsigned short bytes_per_sector; - unsigned short sectors_per_track; + __be16 cylinders; + __be16 heads; + __be16 bytes_per_track; + __be16 bytes_per_sector; + __be16 sectors_per_track; unsigned char reserved[6]; } __attribute__((packed)); @@ -338,8 +338,7 @@ static ssize_t mspro_block_attr_show_sysinfo(struct device *dev, rc += scnprintf(buffer + rc, PAGE_SIZE - rc, "assembly date: " "GMT%+d:%d %04u-%02u-%02u %02u:%02u:%02u\n", date_tz, date_tz_f, - be16_to_cpu(*(unsigned short *) - &x_sys->assembly_date[1]), + be16_to_cpup((__be16 *)&x_sys->assembly_date[1]), x_sys->assembly_date[3], x_sys->assembly_date[4], x_sys->assembly_date[5], x_sys->assembly_date[6], x_sys->assembly_date[7]); diff --git a/include/linux/memstick.h b/include/linux/memstick.h index d0c37e682234..690c35a9d4cc 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -100,8 +100,8 @@ struct mspro_param_register { #define MEMSTICK_SYS_PAR8 0x40 #define MEMSTICK_SYS_SERIAL 0x80 - unsigned short data_count; - unsigned int data_address; + __be16 data_count; + __be32 data_address; unsigned char tpc_param; } __attribute__((packed)); -- cgit v1.2.3 From c4be0c1dc4cdc37b175579be1460f15ac6495e9a Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:58 -0800 Subject: filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 8 +++---- Documentation/filesystems/vfs.txt | 8 +++---- fs/buffer.c | 8 +++---- fs/ext3/super.c | 45 +++++++++++++++++++++++++-------------- fs/ext4/super.c | 45 +++++++++++++++++++++++++++------------ fs/gfs2/ops_super.c | 16 ++++++++------ fs/jfs/super.c | 10 +++++---- fs/reiserfs/super.c | 10 +++++---- fs/xfs/linux-2.6/xfs_super.c | 8 +++---- fs/xfs/xfs_fsops.c | 11 ++++++---- fs/xfs/xfs_fsops.h | 2 +- include/linux/fs.h | 4 ++-- 12 files changed, 107 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index cfbfa15a46ba..ec6a9392a173 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -97,8 +97,8 @@ prototypes: void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -119,8 +119,8 @@ delete_inode: no put_super: yes yes no write_super: no yes read sync_fs: no no read -write_super_lockfs: ? -unlockfs: ? +freeze_fs: ? +unfreeze_fs: ? statfs: no no no remount_fs: yes yes maybe (see below) clear_inode: no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ef19afa186a9..deeeed0faa8f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -210,8 +210,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -270,11 +270,11 @@ or bottom half). a superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. - write_super_lockfs: called when VFS is locking a filesystem and + freeze_fs: called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently used by the Logical Volume Manager (LVM). - unlockfs: called when VFS is unlocking a filesystem and making it writable + unfreeze_fs: called when VFS is unlocking a filesystem and making it writable again. statfs: called when the VFS needs to get filesystem statistics. This diff --git a/fs/buffer.c b/fs/buffer.c index c26da785938a..87f9e537b8c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -221,8 +221,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->write_super_lockfs) - sb->s_op->write_super_lockfs(sb); + if (sb->s_op->freeze_fs) + sb->s_op->freeze_fs(sb); } sync_blockdev(bdev); @@ -242,8 +242,8 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb) if (sb) { BUG_ON(sb->s_bdev != bdev); - if (sb->s_op->unlockfs) - sb->s_op->unlockfs(sb); + if (sb->s_op->unfreeze_fs) + sb->s_op->unfreeze_fs(sb); sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5d047a030a73..b70d90e08a3c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, unsigned int); -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync); static void ext3_mark_recovery_complete(struct super_block * sb, struct ext3_super_block * es); @@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext3_unlockfs(struct super_block *sb); +static int ext3_unfreeze(struct super_block *sb); static void ext3_write_super (struct super_block * sb); -static void ext3_write_super_lockfs(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. @@ -759,8 +759,8 @@ static const struct super_operations ext3_sops = { .put_super = ext3_put_super, .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, - .write_super_lockfs = ext3_write_super_lockfs, - .unlockfs = ext3_unlockfs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, .statfs = ext3_statfs, .remount_fs = ext3_remount, .clear_inode = ext3_clear_inode, @@ -2311,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb, return 0; } -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync) { struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) - sync_dirty_buffer(sbh); + error = sync_dirty_buffer(sbh); + return error; } @@ -2439,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext3_write_super_lockfs(struct super_block *sb) +static int ext3_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT3_SB(sb)->s_journal; + journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ journal_lock_updates(journal); @@ -2453,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we failed * to flush the journal. */ - if (journal_flush(journal) < 0) - return; + error = journal_flush(journal); + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; + +out: + journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext3_unlockfs(struct super_block *sb) +static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -2476,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb) unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } + return 0; } static int ext3_remount (struct super_block * sb, int * flags, char * data) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f7e0be8ab1b..e5f06a5f045e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,7 +51,7 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -62,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); -static void ext4_unlockfs(struct super_block *sb); +static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); -static void ext4_write_super_lockfs(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -978,8 +978,8 @@ static const struct super_operations ext4_sops = { .put_super = ext4_put_super, .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, - .write_super_lockfs = ext4_write_super_lockfs, - .unlockfs = ext4_unlockfs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .remount_fs = ext4_remount, .clear_inode = ext4_clear_inode, @@ -2888,13 +2888,14 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -2918,14 +2919,19 @@ static void ext4_commit_super(struct super_block *sb, BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { - sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } } + return error; } @@ -3058,12 +3064,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext4_write_super_lockfs(struct super_block *sb) +static int ext4_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT4_SB(sb)->s_journal; + journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -3073,21 +3081,29 @@ static void ext4_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we * failed to flush the journal. */ - if (jbd2_journal_flush(journal) < 0) - return; + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; +out: + jbd2_journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext4_unlockfs(struct super_block *sb) +static int ext4_unfreeze(struct super_block *sb) { if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -3097,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb) unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } + return 0; } static int ext4_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 777783deddcb..320323d03479 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) } /** - * gfs2_write_super_lockfs - prevent further writes to the filesystem + * gfs2_freeze - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_write_super_lockfs(struct super_block *sb) +static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return; + return -EINVAL; for (;;) { error = gfs2_freeze_fs(sdp); @@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } + return 0; } /** - * gfs2_unlockfs - reallow writes to the filesystem + * gfs2_unfreeze - reallow writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_unlockfs(struct super_block *sb) +static int gfs2_unfreeze(struct super_block *sb) { gfs2_unfreeze_fs(sb->s_fs_info); + return 0; } /** @@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = { .put_super = gfs2_put_super, .write_super = gfs2_write_super, .sync_fs = gfs2_sync_fs, - .write_super_lockfs = gfs2_write_super_lockfs, - .unlockfs = gfs2_unlockfs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, .statfs = gfs2_statfs, .remount_fs = gfs2_remount_fs, .clear_inode = gfs2_clear_inode, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0dae345e481b..b37d1f78b854 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -543,7 +543,7 @@ out_kfree: return ret; } -static void jfs_write_super_lockfs(struct super_block *sb) +static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb) lmLogShutdown(log); updateSuper(sb, FM_CLEAN); } + return 0; } -static void jfs_unlockfs(struct super_block *sb) +static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb) else txResume(sb); } + return 0; } static int jfs_get_sb(struct file_system_type *fs_type, @@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = { .delete_inode = jfs_delete_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, - .write_super_lockfs = jfs_write_super_lockfs, - .unlockfs = jfs_unlockfs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .remount_fs = jfs_remount, .show_options = jfs_show_options, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c55651f1407c..f3c820b75829 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s) reiserfs_sync_fs(s, 1); } -static void reiserfs_write_super_lockfs(struct super_block *s) +static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; reiserfs_write_lock(s); @@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s) } s->s_dirt = 0; reiserfs_write_unlock(s); + return 0; } -static void reiserfs_unlockfs(struct super_block *s) +static int reiserfs_unfreeze(struct super_block *s) { reiserfs_allow_writes(s); + return 0; } extern const struct in_core_key MAX_IN_CORE_KEY; @@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = { .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, - .write_super_lockfs = reiserfs_write_super_lockfs, - .unlockfs = reiserfs_unlockfs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, .show_options = generic_show_options, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index be846d606ae8..95a971080368 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1269,14 +1269,14 @@ xfs_fs_remount( * need to take care of the metadata. Once that's done write a dummy * record to dirty the log in case of a crash while frozen. */ -STATIC void -xfs_fs_lockfs( +STATIC int +xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); xfs_quiesce_attr(mp); - xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp); } STATIC int @@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, - .write_super_lockfs = xfs_fs_lockfs, + .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 852b6d32e8d0..680d0e0ec932 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -595,17 +595,19 @@ out: return 0; } -void +int xfs_fs_log_dummy( xfs_mount_t *mp) { xfs_trans_t *tp; xfs_inode_t *ip; + int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } ip = mp->m_rootip; @@ -615,9 +617,10 @@ xfs_fs_log_dummy( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61ad..88435e0a77c9 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern void xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b87b29f4797..3e59182de9df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1377,8 +1377,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); -- cgit v1.2.3 From fcccf502540e3d752d33b2d8e976034dee81f9f7 Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:59 -0800 Subject: filesystem freeze: implement generic freeze feature The ioctls for the generic freeze feature are below. o Freeze the filesystem int ioctl(int fd, int FIFREEZE, arg) fd: The file descriptor of the mountpoint FIFREEZE: request code for the freeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 o Unfreeze the filesystem int ioctl(int fd, int FITHAW, arg) fd: The file descriptor of the mountpoint FITHAW: request code for unfreeze arg: Ignored Return value: 0 if the operation succeeds. Otherwise, -1 Error number: If the filesystem has already been unfrozen, errno is set to EINVAL. [akpm@linux-foundation.org: fix CONFIG_BLOCK=n] Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 ++ fs/buffer.c | 74 +++++++++++++++++++++++++++++++++++++++------ fs/ioctl.c | 46 ++++++++++++++++++++++++++++ include/linux/buffer_head.h | 11 ++++++- include/linux/fs.h | 7 +++++ 5 files changed, 130 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index ac7031f12ea5..b3c1efff5e1d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -285,6 +285,8 @@ static void init_once(void *foo) INIT_LIST_HEAD(&bdev->bd_holder_list); #endif inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); } static inline void __bd_forget(struct inode *inode) diff --git a/fs/buffer.c b/fs/buffer.c index 87f9e537b8c3..b6e8b8632e2f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev) * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; down(&bdev->bd_mount_sem); sb = get_super(bdev); @@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->freeze_fs) - sb->s_op->freeze_fs(sb); + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } } sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ } EXPORT_SYMBOL(freeze_bdev); @@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev); * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -void thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev, struct super_block *sb) { + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + if (sb) { BUG_ON(sb->s_bdev != bdev); - - if (sb->s_op->unfreeze_fs) - sb->s_op->unfreeze_fs(sb); - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } drop_super(sb); } up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } EXPORT_SYMBOL(thaw_bdev); diff --git a/fs/ioctl.c b/fs/ioctl.c index cc3f1aa1cf7b..20b0a8a24c6b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -439,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, return error; } +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL) + return -EOPNOTSUPP; + + /* If a blockdevice-backed filesystem isn't specified, return. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Freeze */ + sb = freeze_bdev(sb->s_bdev); + if (IS_ERR(sb)) + return PTR_ERR(sb); + return 0; +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Thaw */ + return thaw_bdev(sb->s_bdev, sb); +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -486,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, } else error = -ENOTTY; break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 8605f8a74df9..bd7ac793be19 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -171,7 +171,7 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); int fsync_bdev(struct block_device *); struct super_block *freeze_bdev(struct block_device *); -void thaw_bdev(struct block_device *, struct super_block *); +int thaw_bdev(struct block_device *, struct super_block *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, @@ -346,6 +346,15 @@ static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bdev(struct block_device *bdev) {} +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e59182de9df..6022f44043f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -234,6 +234,8 @@ struct inodes_stat_t { #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ +#define FIFREEZE _IOWR('X', 119, int) /* Freeze */ +#define FITHAW _IOWR('X', 120, int) /* Thaw */ #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) @@ -591,6 +593,11 @@ struct block_device { * care to not mess up bd_private for that case. */ unsigned long bd_private; + + /* The counter of freeze processes */ + int bd_fsfreeze_count; + /* Mutex for freeze */ + struct mutex bd_fsfreeze_mutex; }; /* -- cgit v1.2.3 From f4b477c47332367d35686bd2b808c2156b96d7c7 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 10 Jan 2009 11:12:09 +0000 Subject: rbtree: add const qualifier to some functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'rb_first()', 'rb_last()', 'rb_next()' and 'rb_prev()' calls take a pointer to an RB node or RB root. They do not change the pointed objects, so add a 'const' qualifier in order to make life of the users of these functions easier. Indeed, if I have my own constant pointer &const struct my_type *p, and I call 'rb_next(&p->rb)', I get a GCC warning: warning: passing argument 1 of ‘rb_next’ discards qualifiers from pointer target type Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 8 ++++---- lib/rbtree.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 344bc3495ddb..9c295411d01f 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -140,10 +140,10 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ -extern struct rb_node *rb_next(struct rb_node *); -extern struct rb_node *rb_prev(struct rb_node *); -extern struct rb_node *rb_first(struct rb_root *); -extern struct rb_node *rb_last(struct rb_root *); +extern struct rb_node *rb_next(const struct rb_node *); +extern struct rb_node *rb_prev(const struct rb_node *); +extern struct rb_node *rb_first(const struct rb_root *); +extern struct rb_node *rb_last(const struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, diff --git a/lib/rbtree.c b/lib/rbtree.c index 48499c2d88cc..9956b99649f0 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL(rb_erase); /* * This function returns the first node (in sort order) of the tree. */ -struct rb_node *rb_first(struct rb_root *root) +struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; @@ -305,7 +305,7 @@ struct rb_node *rb_first(struct rb_root *root) } EXPORT_SYMBOL(rb_first); -struct rb_node *rb_last(struct rb_root *root) +struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; @@ -318,7 +318,7 @@ struct rb_node *rb_last(struct rb_root *root) } EXPORT_SYMBOL(rb_last); -struct rb_node *rb_next(struct rb_node *node) +struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; @@ -331,7 +331,7 @@ struct rb_node *rb_next(struct rb_node *node) node = node->rb_right; while (node->rb_left) node=node->rb_left; - return node; + return (struct rb_node *)node; } /* No right-hand children. Everything down and left is @@ -347,7 +347,7 @@ struct rb_node *rb_next(struct rb_node *node) } EXPORT_SYMBOL(rb_next); -struct rb_node *rb_prev(struct rb_node *node) +struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; @@ -360,7 +360,7 @@ struct rb_node *rb_prev(struct rb_node *node) node = node->rb_left; while (node->rb_right) node=node->rb_right; - return node; + return (struct rb_node *)node; } /* No left-hand children. Go up till we find an ancestor which -- cgit v1.2.3 From 886ad09fc83342aa1c5a02a0b6d3298b78a8067f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 9 Jan 2009 15:54:07 -0800 Subject: libata: Add a per-host flag to opt-in into parallel port probes This patch adds a per host flag that allows drivers to opt in into having its busses scanned in parallel. Drivers that do not set this flag get their ports scanned in the "original" sequence. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- drivers/ata/ahci.c | 3 +++ drivers/ata/libata-core.c | 9 +++++++++ include/linux/libata.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 7f701cbe14ab..96039671e3b9 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -2660,6 +2660,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) host->iomap = pcim_iomap_table(pdev); host->private_data = hpriv; + if (!(hpriv->cap & HOST_CAP_SSS)) + host->flags |= ATA_HOST_PARALLEL_SCAN; + if (pi.flags & ATA_FLAG_EM) ahci_reset_em(host); diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c507a9ac78f4..f810078fafcc 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5920,6 +5920,15 @@ static void async_port_probe(void *data, async_cookie_t cookie) { int rc; struct ata_port *ap = data; + + /* + * If we're not allowed to scan this host in parallel, + * we need to wait until all previous scans have completed + * before going further. + */ + if (!(ap->host->flags & ATA_HOST_PARALLEL_SCAN)) + async_synchronize_cookie(cookie); + /* probe */ if (ap->ops->error_handler) { struct ata_eh_info *ehi = &ap->link.eh_info; diff --git a/include/linux/libata.h b/include/linux/libata.h index 4f7c8fb4d3fe..b6b8a7f3ec66 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -239,6 +239,7 @@ enum { /* host set flags */ ATA_HOST_SIMPLEX = (1 << 0), /* Host is simplex, one DMA channel per host only */ ATA_HOST_STARTED = (1 << 1), /* Host started */ + ATA_HOST_PARALLEL_SCAN = (1 << 2), /* Ports on this host can be scanned in parallel */ /* bits 24:31 of host->flags are reserved for LLD specific flags */ -- cgit v1.2.3 From f52046b14b1e1a8a02ae48d0c69d39c5e204644f Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:01 +0100 Subject: mfd: PCF50633 core driver This patch implements the core of the PCF50633 driver. This core driver has generic register read/write functions and does interrupt management for its sub devices. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 9 + drivers/mfd/Makefile | 2 + drivers/mfd/pcf50633-core.c | 710 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/core.h | 218 ++++++++++++ 4 files changed, 939 insertions(+) create mode 100644 drivers/mfd/pcf50633-core.c create mode 100644 include/linux/mfd/pcf50633/core.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 416f9e7286ba..3ac32e45b03b 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -217,6 +217,15 @@ config MFD_WM8350_I2C I2C as the control interface. Additional options must be selected to enable support for the functionality of the chip. +config MFD_PCF50633 + tristate "Support for NXP PCF50633" + depends on I2C + help + Say yes here if you have NXP PCF50633 chip on your board. + This core driver provides register access and IRQ handling + facilities, and registers devices for the various functions + so that function-specific drivers can bind to them. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 0c9418b36c26..23d4d10981b3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -37,3 +37,5 @@ endif obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o + +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c new file mode 100644 index 000000000000..24508e28e3fb --- /dev/null +++ b/drivers/mfd/pcf50633-core.c @@ -0,0 +1,710 @@ +/* NXP PCF50633 Power Management Unit (PMU) driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Harald Welte + * Balaji Rao + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Two MBCS registers used during cold start */ +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_MBCS1_USBPRES 0x01 +#define PCF50633_MBCS1_ADAPTPRES 0x01 + +static int __pcf50633_read(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_read_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error reading %d regs at %d\n", num, reg); + + return ret; +} + +static int __pcf50633_write(struct pcf50633 *pcf, u8 reg, int num, u8 *data) +{ + int ret; + + ret = i2c_smbus_write_i2c_block_data(pcf->i2c_client, reg, + num, data); + if (ret < 0) + dev_err(pcf->dev, "Error writing %d regs at %d\n", num, reg); + + return ret; + +} + +/* Read a block of upto 32 regs */ +int pcf50633_read_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_read_block); + +/* Write a block of upto 32 regs */ +int pcf50633_write_block(struct pcf50633 *pcf , u8 reg, + int nr_regs, u8 *data) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, nr_regs, data); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_write_block); + +u8 pcf50633_reg_read(struct pcf50633 *pcf, u8 reg) +{ + u8 val; + + mutex_lock(&pcf->lock); + __pcf50633_read(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_read); + +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + + mutex_lock(&pcf->lock); + ret = __pcf50633_write(pcf, reg, 1, &val); + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_write); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val) +{ + int ret; + u8 tmp; + + val &= mask; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~mask; + tmp |= val; + ret = __pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_set_bit_mask); + +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 val) +{ + int ret; + u8 tmp; + + mutex_lock(&pcf->lock); + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~val; + ret = __pcf50633_write(pcf, reg, 1, &tmp); + +out: + mutex_unlock(&pcf->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_reg_clear_bits); + +/* sysfs attributes */ +static ssize_t show_dump_regs(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + u8 dump[16]; + int n, n1, idx = 0; + char *buf1 = buf; + static u8 address_no_read[] = { /* must be ascending */ + PCF50633_REG_INT1, + PCF50633_REG_INT2, + PCF50633_REG_INT3, + PCF50633_REG_INT4, + PCF50633_REG_INT5, + 0 /* terminator */ + }; + + for (n = 0; n < 256; n += sizeof(dump)) { + for (n1 = 0; n1 < sizeof(dump); n1++) + if (n == address_no_read[idx]) { + idx++; + dump[n1] = 0x00; + } else + dump[n1] = pcf50633_reg_read(pcf, n + n1); + + hex_dump_to_buffer(dump, sizeof(dump), 16, 1, buf1, 128, 0); + buf1 += strlen(buf1); + *buf1++ = '\n'; + *buf1 = '\0'; + } + + return buf1 - buf; +} +static DEVICE_ATTR(dump_regs, 0400, show_dump_regs, NULL); + +static ssize_t show_resume_reason(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pcf50633 *pcf = dev_get_drvdata(dev); + int n; + + n = sprintf(buf, "%02x%02x%02x%02x%02x\n", + pcf->resume_reason[0], + pcf->resume_reason[1], + pcf->resume_reason[2], + pcf->resume_reason[3], + pcf->resume_reason[4]); + + return n; +} +static DEVICE_ATTR(resume_reason, 0400, show_resume_reason, NULL); + +static struct attribute *pcf_sysfs_entries[] = { + &dev_attr_dump_regs.attr, + &dev_attr_resume_reason.attr, + NULL, +}; + +static struct attribute_group pcf_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf_sysfs_entries, +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ || !handler) + return -EINVAL; + + if (WARN_ON(pcf->irq_handler[irq].handler)) + return -EBUSY; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = handler; + pcf->irq_handler[irq].data = data; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_register_irq); + +int pcf50633_free_irq(struct pcf50633 *pcf, int irq) +{ + if (irq < 0 || irq > PCF50633_NUM_IRQ) + return -EINVAL; + + mutex_lock(&pcf->lock); + pcf->irq_handler[irq].handler = NULL; + mutex_unlock(&pcf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_free_irq); + +static int __pcf50633_irq_mask_set(struct pcf50633 *pcf, int irq, u8 mask) +{ + u8 reg, bits, tmp; + int ret = 0, idx; + + idx = irq >> 3; + reg = PCF50633_REG_INT1M + idx; + bits = 1 << (irq & 0x07); + + mutex_lock(&pcf->lock); + + if (mask) { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp |= bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + pcf->mask_regs[idx] |= bits; + } else { + ret = __pcf50633_read(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + tmp &= ~bits; + + ret = __pcf50633_write(pcf, reg, 1, &tmp); + if (ret < 0) + goto out; + + pcf->mask_regs[idx] &= ~bits; + } +out: + mutex_unlock(&pcf->lock); + + return ret; +} + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Masking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 1); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask); + +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq) +{ + dev_info(pcf->dev, "Unmasking IRQ %d\n", irq); + + return __pcf50633_irq_mask_set(pcf, irq, 0); +} +EXPORT_SYMBOL_GPL(pcf50633_irq_unmask); + +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq) +{ + u8 reg, bits; + + reg = irq >> 3; + bits = 1 << (irq & 0x07); + + return pcf->mask_regs[reg] & bits; +} +EXPORT_SYMBOL_GPL(pcf50633_irq_mask_get); + +static void pcf50633_irq_call_handler(struct pcf50633 *pcf, int irq) +{ + if (pcf->irq_handler[irq].handler) + pcf->irq_handler[irq].handler(irq, pcf->irq_handler[irq].data); +} + +/* Maximum amount of time ONKEY is held before emergency action is taken */ +#define PCF50633_ONKEY1S_TIMEOUT 8 + +static void pcf50633_irq_worker(struct work_struct *work) +{ + struct pcf50633 *pcf; + int ret, i, j; + u8 pcf_int[5], chgstat; + + pcf = container_of(work, struct pcf50633, irq_work); + + /* Read the 5 INT regs in one transaction */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1, + ARRAY_SIZE(pcf_int), pcf_int); + if (ret != ARRAY_SIZE(pcf_int)) { + dev_err(pcf->dev, "Error reading INT registers\n"); + + /* + * If this doesn't ACK the interrupt to the chip, we'll be + * called once again as we're level triggered. + */ + goto out; + } + + /* We immediately read the usb and adapter status. We thus make sure + * only of USBINS/USBREM IRQ handlers are called */ + if (pcf_int[0] & (PCF50633_INT1_USBINS | PCF50633_INT1_USBREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_USBREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_USBINS); + } + + /* Make sure only one of ADPINS or ADPREM is set */ + if (pcf_int[0] & (PCF50633_INT1_ADPINS | PCF50633_INT1_ADPREM)) { + chgstat = pcf50633_reg_read(pcf, PCF50633_REG_MBCS2); + if (chgstat & (0x3 << 4)) + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPREM); + else + pcf_int[0] &= ~(1 << PCF50633_INT1_ADPINS); + } + + dev_dbg(pcf->dev, "INT1=0x%02x INT2=0x%02x INT3=0x%02x " + "INT4=0x%02x INT5=0x%02x\n", pcf_int[0], + pcf_int[1], pcf_int[2], pcf_int[3], pcf_int[4]); + + /* Some revisions of the chip don't have a 8s standby mode on + * ONKEY1S press. We try to manually do it in such cases. */ + if ((pcf_int[0] & PCF50633_INT1_SECOND) && pcf->onkey1s_held) { + dev_info(pcf->dev, "ONKEY1S held for %d secs\n", + pcf->onkey1s_held); + if (pcf->onkey1s_held++ == PCF50633_ONKEY1S_TIMEOUT) + if (pcf->pdata->force_shutdown) + pcf->pdata->force_shutdown(pcf); + } + + if (pcf_int[2] & PCF50633_INT3_ONKEY1S) { + dev_info(pcf->dev, "ONKEY1S held\n"); + pcf->onkey1s_held = 1 ; + + /* Unmask IRQ_SECOND */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT1M, + PCF50633_INT1_SECOND); + + /* Unmask IRQ_ONKEYR */ + pcf50633_reg_clear_bits(pcf, PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR); + } + + if ((pcf_int[1] & PCF50633_INT2_ONKEYR) && pcf->onkey1s_held) { + pcf->onkey1s_held = 0; + + /* Mask SECOND and ONKEYR interrupts */ + if (pcf->mask_regs[0] & PCF50633_INT1_SECOND) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT1M, + PCF50633_INT1_SECOND, + PCF50633_INT1_SECOND); + + if (pcf->mask_regs[1] & PCF50633_INT2_ONKEYR) + pcf50633_reg_set_bit_mask(pcf, + PCF50633_REG_INT2M, + PCF50633_INT2_ONKEYR, + PCF50633_INT2_ONKEYR); + } + + /* Have we just resumed ? */ + if (pcf->is_suspended) { + pcf->is_suspended = 0; + + /* Set the resume reason filtering out non resumers */ + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) + pcf->resume_reason[i] = pcf_int[i] & + pcf->pdata->resumers[i]; + + /* Make sure we don't pass on any ONKEY events to + * userspace now */ + pcf_int[1] &= ~(PCF50633_INT2_ONKEYR | PCF50633_INT2_ONKEYF); + } + + for (i = 0; i < ARRAY_SIZE(pcf_int); i++) { + /* Unset masked interrupts */ + pcf_int[i] &= ~pcf->mask_regs[i]; + + for (j = 0; j < 8 ; j++) + if (pcf_int[i] & (1 << j)) + pcf50633_irq_call_handler(pcf, (i * 8) + j); + } + +out: + put_device(pcf->dev); + enable_irq(pcf->irq); +} + +static irqreturn_t pcf50633_irq(int irq, void *data) +{ + struct pcf50633 *pcf = data; + + dev_dbg(pcf->dev, "pcf50633_irq\n"); + + get_device(pcf->dev); + disable_irq(pcf->irq); + schedule_work(&pcf->irq_work); + + return IRQ_HANDLED; +} + +static void +pcf50633_client_dev_register(struct pcf50633 *pcf, const char *name, + struct platform_device **pdev) +{ + struct pcf50633_subdev_pdata *subdev_pdata; + int ret; + + *pdev = platform_device_alloc(name, -1); + if (!*pdev) { + dev_err(pcf->dev, "Falied to allocate %s\n", name); + return; + } + + subdev_pdata = kmalloc(sizeof(*subdev_pdata), GFP_KERNEL); + if (!subdev_pdata) { + dev_err(pcf->dev, "Error allocating subdev pdata\n"); + platform_device_put(*pdev); + } + + subdev_pdata->pcf = pcf; + platform_device_add_data(*pdev, subdev_pdata, sizeof(*subdev_pdata)); + + (*pdev)->dev.parent = pcf->dev; + + ret = platform_device_add(*pdev); + if (ret) { + dev_err(pcf->dev, "Failed to register %s: %d\n", name, ret); + platform_device_put(*pdev); + *pdev = NULL; + } +} + +#ifdef CONFIG_PM +static int pcf50633_suspend(struct device *dev, pm_message_t state) +{ + struct pcf50633 *pcf; + int ret = 0, i; + u8 res[5]; + + pcf = dev_get_drvdata(dev); + + /* Make sure our interrupt handlers are not called + * henceforth */ + disable_irq(pcf->irq); + + /* Make sure that any running IRQ worker has quit */ + cancel_work_sync(&pcf->irq_work); + + /* Save the masks */ + ret = pcf50633_read_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) { + dev_err(pcf->dev, "error saving irq masks\n"); + goto out; + } + + /* Write wakeup irq masks */ + for (i = 0; i < ARRAY_SIZE(res); i++) + res[i] = ~pcf->pdata->resumers[i]; + + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(res), &res[0]); + if (ret < 0) { + dev_err(pcf->dev, "error writing wakeup irq masks\n"); + goto out; + } + + pcf->is_suspended = 1; + +out: + return ret; +} + +static int pcf50633_resume(struct device *dev) +{ + struct pcf50633 *pcf; + int ret; + + pcf = dev_get_drvdata(dev); + + /* Write the saved mask registers */ + ret = pcf50633_write_block(pcf, PCF50633_REG_INT1M, + ARRAY_SIZE(pcf->suspend_irq_masks), + pcf->suspend_irq_masks); + if (ret < 0) + dev_err(pcf->dev, "Error restoring saved suspend masks\n"); + + /* Restore regulators' state */ + + + get_device(pcf->dev); + + /* + * Clear any pending interrupts and set resume reason if any. + * This will leave with enable_irq() + */ + pcf50633_irq_worker(&pcf->irq_work); + + return 0; +} +#else +#define pcf50633_suspend NULL +#define pcf50633_resume NULL +#endif + +static int __devinit pcf50633_probe(struct i2c_client *client, + const struct i2c_device_id *ids) +{ + struct pcf50633 *pcf; + struct pcf50633_platform_data *pdata = client->dev.platform_data; + int i, ret = 0; + int version, variant; + + pcf = kzalloc(sizeof(*pcf), GFP_KERNEL); + if (!pcf) + return -ENOMEM; + + pcf->pdata = pdata; + + mutex_init(&pcf->lock); + + i2c_set_clientdata(client, pcf); + pcf->dev = &client->dev; + pcf->i2c_client = client; + pcf->irq = client->irq; + + INIT_WORK(&pcf->irq_work, pcf50633_irq_worker); + + version = pcf50633_reg_read(pcf, 0); + variant = pcf50633_reg_read(pcf, 1); + if (version < 0 || variant < 0) { + dev_err(pcf->dev, "Unable to probe pcf50633\n"); + ret = -ENODEV; + goto err; + } + + dev_info(pcf->dev, "Probed device version %d variant %d\n", + version, variant); + + /* Enable all interrupts except RTC SECOND */ + pcf->mask_regs[0] = 0x80; + pcf50633_reg_write(pcf, PCF50633_REG_INT1M, pcf->mask_regs[0]); + pcf50633_reg_write(pcf, PCF50633_REG_INT2M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT3M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT4M, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_INT5M, 0x00); + + /* Create sub devices */ + pcf50633_client_dev_register(pcf, "pcf50633-input", + &pcf->input_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-rtc", + &pcf->rtc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-mbc", + &pcf->mbc_pdev); + pcf50633_client_dev_register(pcf, "pcf50633-adc", + &pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) { + struct platform_device *pdev; + + pdev = platform_device_alloc("pcf50633-regltr", i); + if (!pdev) { + dev_err(pcf->dev, "Cannot create regulator\n"); + continue; + } + + pdev->dev.parent = pcf->dev; + pdev->dev.platform_data = &pdata->reg_init_data[i]; + pdev->dev.driver_data = pcf; + pcf->regulator_pdev[i] = pdev; + + platform_device_add(pdev); + } + + if (client->irq) { + set_irq_handler(client->irq, handle_level_irq); + ret = request_irq(client->irq, pcf50633_irq, + IRQF_TRIGGER_LOW, "pcf50633", pcf); + + if (ret) { + dev_err(pcf->dev, "Failed to request IRQ %d\n", ret); + goto err; + } + } else { + dev_err(pcf->dev, "No IRQ configured\n"); + goto err; + } + + if (enable_irq_wake(client->irq) < 0) + dev_err(pcf->dev, "IRQ %u cannot be enabled as wake-up source" + "in this hardware revision", client->irq); + + ret = sysfs_create_group(&client->dev.kobj, &pcf_attr_group); + if (ret) + dev_err(pcf->dev, "error creating sysfs entries\n"); + + if (pdata->probe_done) + pdata->probe_done(pcf); + + return 0; + +err: + kfree(pcf); + return ret; +} + +static int __devexit pcf50633_remove(struct i2c_client *client) +{ + struct pcf50633 *pcf = i2c_get_clientdata(client); + int i; + + free_irq(pcf->irq, pcf); + + platform_device_unregister(pcf->input_pdev); + platform_device_unregister(pcf->rtc_pdev); + platform_device_unregister(pcf->mbc_pdev); + platform_device_unregister(pcf->adc_pdev); + + for (i = 0; i < PCF50633_NUM_REGULATORS; i++) + platform_device_unregister(pcf->regulator_pdev[i]); + + kfree(pcf); + + return 0; +} + +static struct i2c_device_id pcf50633_id_table[] = { + {"pcf50633", 0x73}, +}; + +static struct i2c_driver pcf50633_driver = { + .driver = { + .name = "pcf50633", + .suspend = pcf50633_suspend, + .resume = pcf50633_resume, + }, + .id_table = pcf50633_id_table, + .probe = pcf50633_probe, + .remove = __devexit_p(pcf50633_remove), +}; + +static int __init pcf50633_init(void) +{ + return i2c_add_driver(&pcf50633_driver); +} + +static void __exit pcf50633_exit(void) +{ + i2c_del_driver(&pcf50633_driver); +} + +MODULE_DESCRIPTION("I2C chip driver for NXP PCF50633 PMU"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); + +module_init(pcf50633_init); +module_exit(pcf50633_exit); diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h new file mode 100644 index 000000000000..4455b212d75a --- /dev/null +++ b/include/linux/mfd/pcf50633/core.h @@ -0,0 +1,218 @@ +/* + * core.h -- Core driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_CORE_H +#define __LINUX_MFD_PCF50633_CORE_H + +#include +#include +#include +#include +#include + +struct pcf50633; + +#define PCF50633_NUM_REGULATORS 11 + +struct pcf50633_platform_data { + struct regulator_init_data reg_init_data[PCF50633_NUM_REGULATORS]; + + char **batteries; + int num_batteries; + + /* Callbacks */ + void (*probe_done)(struct pcf50633 *); + void (*mbc_event_callback)(struct pcf50633 *, int); + void (*regulator_registered)(struct pcf50633 *, int); + void (*force_shutdown)(struct pcf50633 *); + + u8 resumers[5]; +}; + +struct pcf50633_subdev_pdata { + struct pcf50633 *pcf; +}; + +struct pcf50633_irq { + void (*handler) (int, void *); + void *data; +}; + +int pcf50633_register_irq(struct pcf50633 *pcf, int irq, + void (*handler) (int, void *), void *data); +int pcf50633_free_irq(struct pcf50633 *pcf, int irq); + +int pcf50633_irq_mask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq); +int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq); + +int pcf50633_read_block(struct pcf50633 *, u8 reg, + int nr_regs, u8 *data); +int pcf50633_write_block(struct pcf50633 *pcf, u8 reg, + int nr_regs, u8 *data); +u8 pcf50633_reg_read(struct pcf50633 *, u8 reg); +int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val); + +int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val); +int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 bits); + +/* Interrupt registers */ + +#define PCF50633_REG_INT1 0x02 +#define PCF50633_REG_INT2 0x03 +#define PCF50633_REG_INT3 0x04 +#define PCF50633_REG_INT4 0x05 +#define PCF50633_REG_INT5 0x06 + +#define PCF50633_REG_INT1M 0x07 +#define PCF50633_REG_INT2M 0x08 +#define PCF50633_REG_INT3M 0x09 +#define PCF50633_REG_INT4M 0x0a +#define PCF50633_REG_INT5M 0x0b + +enum { + /* Chip IRQs */ + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_RESERVED1, + PCF50633_IRQ_RESERVED2, + PCF50633_IRQ_ALARM, + PCF50633_IRQ_SECOND, + PCF50633_IRQ_ONKEYR, + PCF50633_IRQ_ONKEYF, + PCF50633_IRQ_EXTON1R, + PCF50633_IRQ_EXTON1F, + PCF50633_IRQ_EXTON2R, + PCF50633_IRQ_EXTON2F, + PCF50633_IRQ_EXTON3R, + PCF50633_IRQ_EXTON3F, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_ADCRDY, + PCF50633_IRQ_ONKEY1S, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, + PCF50633_IRQ_HIGHTMP, + PCF50633_IRQ_AUTOPWRFAIL, + PCF50633_IRQ_DWN1PWRFAIL, + PCF50633_IRQ_DWN2PWRFAIL, + PCF50633_IRQ_LEDPWRFAIL, + PCF50633_IRQ_LEDOVP, + PCF50633_IRQ_LDO1PWRFAIL, + PCF50633_IRQ_LDO2PWRFAIL, + PCF50633_IRQ_LDO3PWRFAIL, + PCF50633_IRQ_LDO4PWRFAIL, + PCF50633_IRQ_LDO5PWRFAIL, + PCF50633_IRQ_LDO6PWRFAIL, + PCF50633_IRQ_HCLDOPWRFAIL, + PCF50633_IRQ_HCLDOOVL, + + /* Always last */ + PCF50633_NUM_IRQ, +}; + +struct pcf50633 { + struct device *dev; + struct i2c_client *i2c_client; + + struct pcf50633_platform_data *pdata; + int irq; + struct pcf50633_irq irq_handler[PCF50633_NUM_IRQ]; + struct work_struct irq_work; + struct mutex lock; + + u8 mask_regs[5]; + + u8 suspend_irq_masks[5]; + u8 resume_reason[5]; + int is_suspended; + + int onkey1s_held; + + struct platform_device *rtc_pdev; + struct platform_device *mbc_pdev; + struct platform_device *adc_pdev; + struct platform_device *input_pdev; + struct platform_device *regulator_pdev[PCF50633_NUM_REGULATORS]; +}; + +enum pcf50633_reg_int1 { + PCF50633_INT1_ADPINS = 0x01, /* Adapter inserted */ + PCF50633_INT1_ADPREM = 0x02, /* Adapter removed */ + PCF50633_INT1_USBINS = 0x04, /* USB inserted */ + PCF50633_INT1_USBREM = 0x08, /* USB removed */ + /* reserved */ + PCF50633_INT1_ALARM = 0x40, /* RTC alarm time is reached */ + PCF50633_INT1_SECOND = 0x80, /* RTC periodic second interrupt */ +}; + +enum pcf50633_reg_int2 { + PCF50633_INT2_ONKEYR = 0x01, /* ONKEY rising edge */ + PCF50633_INT2_ONKEYF = 0x02, /* ONKEY falling edge */ + PCF50633_INT2_EXTON1R = 0x04, /* EXTON1 rising edge */ + PCF50633_INT2_EXTON1F = 0x08, /* EXTON1 falling edge */ + PCF50633_INT2_EXTON2R = 0x10, /* EXTON2 rising edge */ + PCF50633_INT2_EXTON2F = 0x20, /* EXTON2 falling edge */ + PCF50633_INT2_EXTON3R = 0x40, /* EXTON3 rising edge */ + PCF50633_INT2_EXTON3F = 0x80, /* EXTON3 falling edge */ +}; + +enum pcf50633_reg_int3 { + PCF50633_INT3_BATFULL = 0x01, /* Battery full */ + PCF50633_INT3_CHGHALT = 0x02, /* Charger halt */ + PCF50633_INT3_THLIMON = 0x04, + PCF50633_INT3_THLIMOFF = 0x08, + PCF50633_INT3_USBLIMON = 0x10, + PCF50633_INT3_USBLIMOFF = 0x20, + PCF50633_INT3_ADCRDY = 0x40, /* ADC result ready */ + PCF50633_INT3_ONKEY1S = 0x80, /* ONKEY pressed 1 second */ +}; + +enum pcf50633_reg_int4 { + PCF50633_INT4_LOWSYS = 0x01, + PCF50633_INT4_LOWBAT = 0x02, + PCF50633_INT4_HIGHTMP = 0x04, + PCF50633_INT4_AUTOPWRFAIL = 0x08, + PCF50633_INT4_DWN1PWRFAIL = 0x10, + PCF50633_INT4_DWN2PWRFAIL = 0x20, + PCF50633_INT4_LEDPWRFAIL = 0x40, + PCF50633_INT4_LEDOVP = 0x80, +}; + +enum pcf50633_reg_int5 { + PCF50633_INT5_LDO1PWRFAIL = 0x01, + PCF50633_INT5_LDO2PWRFAIL = 0x02, + PCF50633_INT5_LDO3PWRFAIL = 0x04, + PCF50633_INT5_LDO4PWRFAIL = 0x08, + PCF50633_INT5_LDO5PWRFAIL = 0x10, + PCF50633_INT5_LDO6PWRFAIL = 0x20, + PCF50633_INT5_HCLDOPWRFAIL = 0x40, + PCF50633_INT5_HCLDOOVL = 0x80, +}; + +/* misc. registers */ +#define PCF50633_REG_OOCSHDWN 0x0c + +/* LED registers */ +#define PCF50633_REG_LEDOUT 0x28 +#define PCF50633_REG_LEDENA 0x29 +#define PCF50633_REG_LEDCTL 0x2a +#define PCF50633_REG_LEDDIM 0x2b + +#endif + -- cgit v1.2.3 From 08c3e06a5eb27d43b712adef18379f8464425e71 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:26 +0100 Subject: mfd: PCF50633 adc driver This patch adds basic support for the PCF50633 ADC. The subtractive mode is not supported yet. Since we don't have adc subsystem, it currently lives in drivers/mfd. Signed-off-by: Balaji Rao Cc: Andy Green Acked-by: Jonathan Cameron Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 + drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-adc.c | 277 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/adc.h | 72 ++++++++++ 4 files changed, 358 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-adc.c create mode 100644 include/linux/mfd/pcf50633/adc.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 3ac32e45b03b..2fa3504e165a 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -226,6 +226,13 @@ config MFD_PCF50633 facilities, and registers devices for the various functions so that function-specific drivers can bind to them. +config PCF50633_ADC + tristate "Support for NXP PCF50633 ADC" + depends on MFD_PCF50633 + help + Say yes here if you want to include support for ADC in the + NXP PCF50633 chip. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 23d4d10981b3..138f9c4d3d7b 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -38,4 +38,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o -obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o \ No newline at end of file +obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-adc.c b/drivers/mfd/pcf50633-adc.c new file mode 100644 index 000000000000..c2d05becfa97 --- /dev/null +++ b/drivers/mfd/pcf50633-adc.c @@ -0,0 +1,277 @@ +/* NXP PCF50633 ADC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * NOTE: This driver does not yet support subtractive ADC mode, which means + * you can do only one measurement per read request. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_adc_request { + int mux; + int avg; + int result; + void (*callback)(struct pcf50633 *, void *, int); + void *callback_param; + + /* Used in case of sync requests */ + struct completion completion; + +}; + +#define PCF50633_MAX_ADC_FIFO_DEPTH 8 + +struct pcf50633_adc { + struct pcf50633 *pcf; + + /* Private stuff */ + struct pcf50633_adc_request *queue[PCF50633_MAX_ADC_FIFO_DEPTH]; + int queue_head; + int queue_tail; + struct mutex queue_mutex; +}; + +static inline struct pcf50633_adc *__to_adc(struct pcf50633 *pcf) +{ + return platform_get_drvdata(pcf->adc_pdev); +} + +static void adc_setup(struct pcf50633 *pcf, int channel, int avg) +{ + channel &= PCF50633_ADCC1_ADCMUX_MASK; + + /* kill ratiometric, but enable ACCSW biasing */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC2, 0x00); + pcf50633_reg_write(pcf, PCF50633_REG_ADCC3, 0x01); + + /* start ADC conversion on selected channel */ + pcf50633_reg_write(pcf, PCF50633_REG_ADCC1, channel | avg | + PCF50633_ADCC1_ADCSTART | PCF50633_ADCC1_RES_10BIT); +} + +static void trigger_next_adc_job_if_any(struct pcf50633 *pcf) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + + if (!adc->queue[head]) { + mutex_unlock(&adc->queue_mutex); + return; + } + mutex_unlock(&adc->queue_mutex); + + adc_setup(pcf, adc->queue[head]->mux, adc->queue[head]->avg); +} + +static int +adc_enqueue_request(struct pcf50633 *pcf, struct pcf50633_adc_request *req) +{ + struct pcf50633_adc *adc = __to_adc(pcf); + int head, tail; + + mutex_lock(&adc->queue_mutex); + + head = adc->queue_head; + tail = adc->queue_tail; + + if (adc->queue[tail]) { + mutex_unlock(&adc->queue_mutex); + return -EBUSY; + } + + adc->queue[tail] = req; + adc->queue_tail = (tail + 1) & (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + trigger_next_adc_job_if_any(pcf); + + return 0; +} + +static void +pcf50633_adc_sync_read_callback(struct pcf50633 *pcf, void *param, int result) +{ + struct pcf50633_adc_request *req = param; + + req->result = result; + complete(&req->completion); +} + +int pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = pcf50633_adc_sync_read_callback; + req->callback_param = req; + + init_completion(&req->completion); + adc_enqueue_request(pcf, req); + wait_for_completion(&req->completion); + + return req->result; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_sync_read); + +int pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param) +{ + struct pcf50633_adc_request *req; + + /* req is freed when the result is ready, in interrupt handler */ + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->mux = mux; + req->avg = avg; + req->callback = callback; + req->callback_param = callback_param; + + adc_enqueue_request(pcf, req); + + return 0; +} +EXPORT_SYMBOL_GPL(pcf50633_adc_async_read); + +static int adc_result(struct pcf50633 *pcf) +{ + u8 adcs1, adcs3; + u16 result; + + adcs1 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS1); + adcs3 = pcf50633_reg_read(pcf, PCF50633_REG_ADCS3); + result = (adcs1 << 2) | (adcs3 & PCF50633_ADCS3_ADCDAT1L_MASK); + + dev_dbg(pcf->dev, "adc result = %d\n", result); + + return result; +} + +static void pcf50633_adc_irq(int irq, void *data) +{ + struct pcf50633_adc *adc = data; + struct pcf50633 *pcf = adc->pcf; + struct pcf50633_adc_request *req; + int head; + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + req = adc->queue[head]; + if (WARN_ON(!req)) { + dev_err(pcf->dev, "pcf50633-adc irq: ADC queue empty!\n"); + mutex_unlock(&adc->queue_mutex); + return; + } + adc->queue[head] = NULL; + adc->queue_head = (head + 1) & + (PCF50633_MAX_ADC_FIFO_DEPTH - 1); + + mutex_unlock(&adc->queue_mutex); + + req->callback(pcf, req->callback_param, adc_result(pcf)); + kfree(req); + + trigger_next_adc_job_if_any(pcf); +} + +static int __devinit pcf50633_adc_probe(struct platform_device *pdev) +{ + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + struct pcf50633_adc *adc; + + adc = kzalloc(sizeof(*adc), GFP_KERNEL); + if (!adc) + return -ENOMEM; + + adc->pcf = pdata->pcf; + platform_set_drvdata(pdev, adc); + + pcf50633_register_irq(pdata->pcf, PCF50633_IRQ_ADCRDY, + pcf50633_adc_irq, adc); + + mutex_init(&adc->queue_mutex); + + return 0; +} + +static int __devexit pcf50633_adc_remove(struct platform_device *pdev) +{ + struct pcf50633_adc *adc = platform_get_drvdata(pdev); + int i, head; + + pcf50633_free_irq(adc->pcf, PCF50633_IRQ_ADCRDY); + + mutex_lock(&adc->queue_mutex); + head = adc->queue_head; + + if (WARN_ON(adc->queue[head])) + dev_err(adc->pcf->dev, + "adc driver removed with request pending\n"); + + for (i = 0; i < PCF50633_MAX_ADC_FIFO_DEPTH; i++) + kfree(adc->queue[i]); + + mutex_unlock(&adc->queue_mutex); + kfree(adc); + + return 0; +} + +static struct platform_driver pcf50633_adc_driver = { + .driver = { + .name = "pcf50633-adc", + }, + .probe = pcf50633_adc_probe, + .remove = __devexit_p(pcf50633_adc_remove), +}; + +static int __init pcf50633_adc_init(void) +{ + return platform_driver_register(&pcf50633_adc_driver); +} +module_init(pcf50633_adc_init); + +static void __exit pcf50633_adc_exit(void) +{ + platform_driver_unregister(&pcf50633_adc_driver); +} +module_exit(pcf50633_adc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 adc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-adc"); + diff --git a/include/linux/mfd/pcf50633/adc.h b/include/linux/mfd/pcf50633/adc.h new file mode 100644 index 000000000000..56669b4183ad --- /dev/null +++ b/include/linux/mfd/pcf50633/adc.h @@ -0,0 +1,72 @@ +/* + * adc.h -- Driver for NXP PCF50633 ADC + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_ADC_H +#define __LINUX_MFD_PCF50633_ADC_H + +#include +#include + +/* ADC Registers */ +#define PCF50633_REG_ADCC3 0x52 +#define PCF50633_REG_ADCC2 0x53 +#define PCF50633_REG_ADCC1 0x54 +#define PCF50633_REG_ADCS1 0x55 +#define PCF50633_REG_ADCS2 0x56 +#define PCF50633_REG_ADCS3 0x57 + +#define PCF50633_ADCC1_ADCSTART 0x01 +#define PCF50633_ADCC1_RES_10BIT 0x02 +#define PCF50633_ADCC1_AVERAGE_NO 0x00 +#define PCF50633_ADCC1_AVERAGE_4 0x04 +#define PCF50633_ADCC1_AVERAGE_8 0x08 +#define PCF50633_ADCC1_AVERAGE_16 0x0c +#define PCF50633_ADCC1_MUX_BATSNS_RES 0x00 +#define PCF50633_ADCC1_MUX_BATSNS_SUBTR 0x10 +#define PCF50633_ADCC1_MUX_ADCIN2_RES 0x20 +#define PCF50633_ADCC1_MUX_ADCIN2_SUBTR 0x30 +#define PCF50633_ADCC1_MUX_BATTEMP 0x60 +#define PCF50633_ADCC1_MUX_ADCIN1 0x70 +#define PCF50633_ADCC1_AVERAGE_MASK 0x0c +#define PCF50633_ADCC1_ADCMUX_MASK 0xf0 + +#define PCF50633_ADCC2_RATIO_NONE 0x00 +#define PCF50633_ADCC2_RATIO_BATTEMP 0x01 +#define PCF50633_ADCC2_RATIO_ADCIN1 0x02 +#define PCF50633_ADCC2_RATIO_BOTH 0x03 +#define PCF50633_ADCC2_RATIOSETTL_100US 0x04 + +#define PCF50633_ADCC3_ACCSW_EN 0x01 +#define PCF50633_ADCC3_NTCSW_EN 0x04 +#define PCF50633_ADCC3_RES_DIV_TWO 0x10 +#define PCF50633_ADCC3_RES_DIV_THREE 0x00 + +#define PCF50633_ADCS3_REF_NTCSW 0x00 +#define PCF50633_ADCS3_REF_ACCSW 0x10 +#define PCF50633_ADCS3_REF_2V0 0x20 +#define PCF50633_ADCS3_REF_VISA 0x30 +#define PCF50633_ADCS3_REF_2V0_2 0x70 +#define PCF50633_ADCS3_ADCRDY 0x80 + +#define PCF50633_ADCS3_ADCDAT1L_MASK 0x03 +#define PCF50633_ADCS3_ADCDAT2L_MASK 0x0c +#define PCF50633_ADCS3_ADCDAT2L_SHIFT 2 +#define PCF50633_ASCS3_REF_MASK 0x70 + +extern int +pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, + void (*callback)(struct pcf50633 *, void *, int), + void *callback_param); +extern int +pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg); + +#endif /* __LINUX_PCF50633_ADC_H */ -- cgit v1.2.3 From 6a3d119b4ce29cf32bfe91eb61d46e9dbd8ce38a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:49:37 +0100 Subject: mfd: PCF50633 gpio support What the PCF05633 calls as a 'GPIO' is much more than the GPIO in the linux sense and there are only 4 of them - which means, the gpiolib is not used here. Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Samuel Ortiz --- drivers/mfd/Kconfig | 7 +++ drivers/mfd/Makefile | 3 +- drivers/mfd/pcf50633-gpio.c | 118 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/gpio.h | 52 +++++++++++++++++ 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 drivers/mfd/pcf50633-gpio.c create mode 100644 include/linux/mfd/pcf50633/gpio.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 2fa3504e165a..06a2b0f7737c 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -233,6 +233,13 @@ config PCF50633_ADC Say yes here if you want to include support for ADC in the NXP PCF50633 chip. +config PCF50633_GPIO + tristate "Support for NXP PCF50633 GPIO" + depends on MFD_PCF50633 + help + Say yes here if you want to include support GPIO for pins on + the PCF50633 chip. + endmenu menu "Multimedia Capabilities Port drivers" diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 138f9c4d3d7b..3afb5192e4da 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -39,4 +39,5 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o obj-$(CONFIG_MFD_PCF50633) += pcf50633-core.o -obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o \ No newline at end of file +obj-$(CONFIG_PCF50633_ADC) += pcf50633-adc.o +obj-$(CONFIG_PCF50633_GPIO) += pcf50633-gpio.o \ No newline at end of file diff --git a/drivers/mfd/pcf50633-gpio.c b/drivers/mfd/pcf50633-gpio.c new file mode 100644 index 000000000000..2fa2eca5c9cc --- /dev/null +++ b/drivers/mfd/pcf50633-gpio.c @@ -0,0 +1,118 @@ +/* NXP PCF50633 GPIO Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include + +#include +#include + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_HCLDOOUT 0x39 + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val) +{ + u8 reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + + return pcf50633_reg_set_bit_mask(pcf, reg, 0x07, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_set); + +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg) & 0x07; + + return val; +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_get); + +int pcf50633_gpio_invert_set(struct pcf50633 *pcf, int gpio, int invert) +{ + u8 val, reg; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = !!invert << 3; + + return pcf50633_reg_set_bit_mask(pcf, reg, 1 << 3, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_set); + +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio) +{ + u8 reg, val; + + reg = gpio - PCF50633_GPIO1 + PCF50633_REG_GPIO1CFG; + val = pcf50633_reg_read(pcf, reg); + + return val & (1 << 3); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_invert_get); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *pcf, + int gpio, int regulator, int on) +{ + u8 reg, val, mask; + + /* the *ENA register is always one after the *OUT register */ + reg = pcf50633_regulator_registers[regulator] + 1; + + val = !!on << (gpio - PCF50633_GPIO1); + mask = 1 << (gpio - PCF50633_GPIO1); + + return pcf50633_reg_set_bit_mask(pcf, reg, mask, val); +} +EXPORT_SYMBOL_GPL(pcf50633_gpio_power_supply_set); diff --git a/include/linux/mfd/pcf50633/gpio.h b/include/linux/mfd/pcf50633/gpio.h new file mode 100644 index 000000000000..a42b845efc54 --- /dev/null +++ b/include/linux/mfd/pcf50633/gpio.h @@ -0,0 +1,52 @@ +/* + * gpio.h -- GPIO driver for NXP PCF50633 + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_GPIO_H +#define __LINUX_MFD_PCF50633_GPIO_H + +#include + +#define PCF50633_GPIO1 1 +#define PCF50633_GPIO2 2 +#define PCF50633_GPIO3 3 +#define PCF50633_GPO 4 + +#define PCF50633_REG_GPIO1CFG 0x14 +#define PCF50633_REG_GPIO2CFG 0x15 +#define PCF50633_REG_GPIO3CFG 0x16 +#define PCF50633_REG_GPOCFG 0x17 + +#define PCF50633_GPOCFG_GPOSEL_MASK 0x07 + +enum pcf50633_reg_gpocfg { + PCF50633_GPOCFG_GPOSEL_0 = 0x00, + PCF50633_GPOCFG_GPOSEL_LED_NFET = 0x01, + PCF50633_GPOCFG_GPOSEL_SYSxOK = 0x02, + PCF50633_GPOCFG_GPOSEL_CLK32K = 0x03, + PCF50633_GPOCFG_GPOSEL_ADAPUSB = 0x04, + PCF50633_GPOCFG_GPOSEL_USBxOK = 0x05, + PCF50633_GPOCFG_GPOSEL_ACTPH4 = 0x06, + PCF50633_GPOCFG_GPOSEL_1 = 0x07, + PCF50633_GPOCFG_GPOSEL_INVERSE = 0x08, +}; + +int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val); +u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_invert_set(struct pcf50633 *, int gpio, int invert); +int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio); + +int pcf50633_gpio_power_supply_set(struct pcf50633 *, + int gpio, int regulator, int on); +#endif /* __LINUX_MFD_PCF50633_GPIO_H */ + + -- cgit v1.2.3 From f5714dc97d63cc0dd1219bd0eb2e1f8df1e4347a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:50:55 +0100 Subject: power_supply: PCF50633 battery charger driver Signed-off-by: Balaji Rao Cc: Andy Green Cc: David Woodhouse Acked-by: Anton Vorontsov Signed-off-by: Samuel Ortiz --- drivers/power/Kconfig | 6 + drivers/power/Makefile | 1 + drivers/power/pcf50633-charger.c | 358 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/mbc.h | 134 +++++++++++++++ 4 files changed, 499 insertions(+) create mode 100644 drivers/power/pcf50633-charger.c create mode 100644 include/linux/mfd/pcf50633/mbc.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 668472405a57..33da1127992a 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -82,4 +82,10 @@ config BATTERY_DA9030 Say Y here to enable support for batteries charger integrated into DA9030 PMIC. +config CHARGER_PCF50633 + tristate "NXP PCF50633 MBC" + depends on MFD_PCF50633 + help + Say Y to include support for NXP PCF50633 Main Battery Charger. + endif # POWER_SUPPLY diff --git a/drivers/power/Makefile b/drivers/power/Makefile index eebb15505a40..2fcf41d13e5c 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_BQ27x00) += bq27x00_battery.o obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o +obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o \ No newline at end of file diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c new file mode 100644 index 000000000000..e988ec130fcd --- /dev/null +++ b/drivers/power/pcf50633-charger.c @@ -0,0 +1,358 @@ +/* NXP PCF50633 Main Battery Charger Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte, Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct pcf50633_mbc { + struct pcf50633 *pcf; + + int adapter_active; + int adapter_online; + int usb_active; + int usb_online; + + struct power_supply usb; + struct power_supply adapter; +}; + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int ret = 0; + u8 bits; + + if (ma >= 1000) + bits = PCF50633_MBCC7_USB_1000mA; + else if (ma >= 500) + bits = PCF50633_MBCC7_USB_500mA; + else if (ma >= 100) + bits = PCF50633_MBCC7_USB_100mA; + else + bits = PCF50633_MBCC7_USB_SUSPEND; + + ret = pcf50633_reg_set_bit_mask(pcf, PCF50633_REG_MBCC7, + PCF50633_MBCC7_USB_MASK, bits); + if (ret) + dev_err(pcf->dev, "error setting usb curlim to %d mA\n", ma); + else + dev_info(pcf->dev, "usb curlim to %d mA\n", ma); + + power_supply_changed(&mbc->usb); + + return ret; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_usb_curlim_set); + +int pcf50633_mbc_get_status(struct pcf50633 *pcf) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + int status = 0; + + if (mbc->usb_online) + status |= PCF50633_MBC_USB_ONLINE; + if (mbc->usb_active) + status |= PCF50633_MBC_USB_ACTIVE; + if (mbc->adapter_online) + status |= PCF50633_MBC_ADAPTER_ONLINE; + if (mbc->adapter_active) + status |= PCF50633_MBC_ADAPTER_ACTIVE; + + return status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_get_status); + +void pcf50633_mbc_set_status(struct pcf50633 *pcf, int what, int status) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pcf->mbc_pdev); + + if (what & PCF50633_MBC_USB_ONLINE) + mbc->usb_online = !!status; + if (what & PCF50633_MBC_USB_ACTIVE) + mbc->usb_active = !!status; + if (what & PCF50633_MBC_ADAPTER_ONLINE) + mbc->adapter_online = !!status; + if (what & PCF50633_MBC_ADAPTER_ACTIVE) + mbc->adapter_active = !!status; +} +EXPORT_SYMBOL_GPL(pcf50633_mbc_set_status); + +static ssize_t +show_chgmode(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + + u8 mbcs2 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS2); + u8 chgmod = (mbcs2 & PCF50633_MBCS2_MBC_MASK); + + return sprintf(buf, "%d\n", chgmod); +} +static DEVICE_ATTR(chgmode, S_IRUGO, show_chgmode, NULL); + +static ssize_t +show_usblim(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + u8 usblim = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCC7) & + PCF50633_MBCC7_USB_MASK; + unsigned int ma; + + if (usblim == PCF50633_MBCC7_USB_1000mA) + ma = 1000; + else if (usblim == PCF50633_MBCC7_USB_500mA) + ma = 500; + else if (usblim == PCF50633_MBCC7_USB_100mA) + ma = 100; + else + ma = 0; + + return sprintf(buf, "%u\n", ma); +} + +static ssize_t set_usblim(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct pcf50633_mbc *mbc = dev_get_drvdata(dev); + unsigned long ma; + int ret; + + ret = strict_strtoul(buf, 10, &ma); + if (ret) + return -EINVAL; + + pcf50633_mbc_usb_curlim_set(mbc->pcf, ma); + + return count; +} + +static DEVICE_ATTR(usb_curlim, S_IRUGO | S_IWUSR, show_usblim, set_usblim); + +static struct attribute *pcf50633_mbc_sysfs_entries[] = { + &dev_attr_chgmode.attr, + &dev_attr_usb_curlim.attr, + NULL, +}; + +static struct attribute_group mbc_attr_group = { + .name = NULL, /* put in device directory */ + .attrs = pcf50633_mbc_sysfs_entries, +}; + +static void +pcf50633_mbc_irq_handler(int irq, void *data) +{ + struct pcf50633_mbc *mbc = data; + + /* USB */ + if (irq == PCF50633_IRQ_USBINS) { + mbc->usb_online = 1; + } else if (irq == PCF50633_IRQ_USBREM) { + mbc->usb_online = 0; + mbc->usb_active = 0; + pcf50633_mbc_usb_curlim_set(mbc->pcf, 0); + } + + /* Adapter */ + if (irq == PCF50633_IRQ_ADPINS) { + mbc->adapter_online = 1; + mbc->adapter_active = 1; + } else if (irq == PCF50633_IRQ_ADPREM) { + mbc->adapter_online = 0; + mbc->adapter_active = 0; + } + + if (irq == PCF50633_IRQ_BATFULL) { + mbc->usb_active = 0; + mbc->adapter_active = 0; + } + + power_supply_changed(&mbc->usb); + power_supply_changed(&mbc->adapter); + + if (mbc->pcf->pdata->mbc_event_callback) + mbc->pcf->pdata->mbc_event_callback(mbc->pcf, irq); +} + +static int adapter_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->adapter_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int usb_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + val->intval = mbc->usb_online; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static enum power_supply_property power_props[] = { + POWER_SUPPLY_PROP_ONLINE, +}; + +static const u8 mbc_irq_handlers[] = { + PCF50633_IRQ_ADPINS, + PCF50633_IRQ_ADPREM, + PCF50633_IRQ_USBINS, + PCF50633_IRQ_USBREM, + PCF50633_IRQ_BATFULL, + PCF50633_IRQ_CHGHALT, + PCF50633_IRQ_THLIMON, + PCF50633_IRQ_THLIMOFF, + PCF50633_IRQ_USBLIMON, + PCF50633_IRQ_USBLIMOFF, + PCF50633_IRQ_LOWSYS, + PCF50633_IRQ_LOWBAT, +}; + +static int __devinit pcf50633_mbc_probe(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc; + struct pcf50633_subdev_pdata *pdata = pdev->dev.platform_data; + int ret; + int i; + u8 mbcs1; + + mbc = kzalloc(sizeof(*mbc), GFP_KERNEL); + if (!mbc) + return -ENOMEM; + + platform_set_drvdata(pdev, mbc); + mbc->pcf = pdata->pcf; + + /* Set up IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_register_irq(mbc->pcf, mbc_irq_handlers[i], + pcf50633_mbc_irq_handler, mbc); + + /* Create power supplies */ + mbc->adapter.name = "adapter"; + mbc->adapter.type = POWER_SUPPLY_TYPE_MAINS; + mbc->adapter.properties = power_props; + mbc->adapter.num_properties = ARRAY_SIZE(power_props); + mbc->adapter.get_property = &adapter_get_property; + mbc->adapter.supplied_to = mbc->pcf->pdata->batteries; + mbc->adapter.num_supplicants = mbc->pcf->pdata->num_batteries; + + mbc->usb.name = "usb"; + mbc->usb.type = POWER_SUPPLY_TYPE_USB; + mbc->usb.properties = power_props; + mbc->usb.num_properties = ARRAY_SIZE(power_props); + mbc->usb.get_property = usb_get_property; + mbc->usb.supplied_to = mbc->pcf->pdata->batteries; + mbc->usb.num_supplicants = mbc->pcf->pdata->num_batteries; + + ret = power_supply_register(&pdev->dev, &mbc->adapter); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register adapter\n"); + kfree(mbc); + return ret; + } + + ret = power_supply_register(&pdev->dev, &mbc->usb); + if (ret) { + dev_err(mbc->pcf->dev, "failed to register usb\n"); + power_supply_unregister(&mbc->adapter); + kfree(mbc); + return ret; + } + + ret = sysfs_create_group(&pdev->dev.kobj, &mbc_attr_group); + if (ret) + dev_err(mbc->pcf->dev, "failed to create sysfs entries\n"); + + mbcs1 = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCS1); + if (mbcs1 & PCF50633_MBCS1_USBPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_USBINS, mbc); + if (mbcs1 & PCF50633_MBCS1_ADAPTPRES) + pcf50633_mbc_irq_handler(PCF50633_IRQ_ADPINS, mbc); + + return 0; +} + +static int __devexit pcf50633_mbc_remove(struct platform_device *pdev) +{ + struct pcf50633_mbc *mbc = platform_get_drvdata(pdev); + int i; + + /* Remove IRQ handlers */ + for (i = 0; i < ARRAY_SIZE(mbc_irq_handlers); i++) + pcf50633_free_irq(mbc->pcf, mbc_irq_handlers[i]); + + power_supply_unregister(&mbc->usb); + power_supply_unregister(&mbc->adapter); + + kfree(mbc); + + return 0; +} + +static struct platform_driver pcf50633_mbc_driver = { + .driver = { + .name = "pcf50633-mbc", + }, + .probe = pcf50633_mbc_probe, + .remove = __devexit_p(pcf50633_mbc_remove), +}; + +static int __init pcf50633_mbc_init(void) +{ + return platform_driver_register(&pcf50633_mbc_driver); +} +module_init(pcf50633_mbc_init); + +static void __exit pcf50633_mbc_exit(void) +{ + platform_driver_unregister(&pcf50633_mbc_driver); +} +module_exit(pcf50633_mbc_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 mbc driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-mbc"); diff --git a/include/linux/mfd/pcf50633/mbc.h b/include/linux/mfd/pcf50633/mbc.h new file mode 100644 index 000000000000..6e17619b773a --- /dev/null +++ b/include/linux/mfd/pcf50633/mbc.h @@ -0,0 +1,134 @@ +/* + * mbc.h -- Driver for NXP PCF50633 Main Battery Charger + * + * (C) 2006-2008 by Openmoko, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __LINUX_MFD_PCF50633_MBC_H +#define __LINUX_MFD_PCF50633_MBC_H + +#include +#include + +#define PCF50633_REG_MBCC1 0x43 +#define PCF50633_REG_MBCC2 0x44 +#define PCF50633_REG_MBCC3 0x45 +#define PCF50633_REG_MBCC4 0x46 +#define PCF50633_REG_MBCC5 0x47 +#define PCF50633_REG_MBCC6 0x48 +#define PCF50633_REG_MBCC7 0x49 +#define PCF50633_REG_MBCC8 0x4a +#define PCF50633_REG_MBCS1 0x4b +#define PCF50633_REG_MBCS2 0x4c +#define PCF50633_REG_MBCS3 0x4d + +enum pcf50633_reg_mbcc1 { + PCF50633_MBCC1_CHGENA = 0x01, /* Charger enable */ + PCF50633_MBCC1_AUTOSTOP = 0x02, + PCF50633_MBCC1_AUTORES = 0x04, /* automatic resume */ + PCF50633_MBCC1_RESUME = 0x08, /* explicit resume cmd */ + PCF50633_MBCC1_RESTART = 0x10, /* restart charging */ + PCF50633_MBCC1_PREWDTIME_60M = 0x20, /* max. precharging time */ + PCF50633_MBCC1_WDTIME_1H = 0x00, + PCF50633_MBCC1_WDTIME_2H = 0x40, + PCF50633_MBCC1_WDTIME_4H = 0x80, + PCF50633_MBCC1_WDTIME_6H = 0xc0, +}; +#define PCF50633_MBCC1_WDTIME_MASK 0xc0 + +enum pcf50633_reg_mbcc2 { + PCF50633_MBCC2_VBATCOND_2V7 = 0x00, + PCF50633_MBCC2_VBATCOND_2V85 = 0x01, + PCF50633_MBCC2_VBATCOND_3V0 = 0x02, + PCF50633_MBCC2_VBATCOND_3V15 = 0x03, + PCF50633_MBCC2_VMAX_4V = 0x00, + PCF50633_MBCC2_VMAX_4V20 = 0x28, + PCF50633_MBCC2_VRESDEBTIME_64S = 0x80, /* debounce time (32/64sec) */ +}; + +enum pcf50633_reg_mbcc7 { + PCF50633_MBCC7_USB_100mA = 0x00, + PCF50633_MBCC7_USB_500mA = 0x01, + PCF50633_MBCC7_USB_1000mA = 0x02, + PCF50633_MBCC7_USB_SUSPEND = 0x03, + PCF50633_MBCC7_BATTEMP_EN = 0x04, + PCF50633_MBCC7_BATSYSIMAX_1A6 = 0x00, + PCF50633_MBCC7_BATSYSIMAX_1A8 = 0x40, + PCF50633_MBCC7_BATSYSIMAX_2A0 = 0x80, + PCF50633_MBCC7_BATSYSIMAX_2A2 = 0xc0, +}; +#define PCF50633_MBCC7_USB_MASK 0x03 + +enum pcf50633_reg_mbcc8 { + PCF50633_MBCC8_USBENASUS = 0x10, +}; + +enum pcf50633_reg_mbcs1 { + PCF50633_MBCS1_USBPRES = 0x01, + PCF50633_MBCS1_USBOK = 0x02, + PCF50633_MBCS1_ADAPTPRES = 0x04, + PCF50633_MBCS1_ADAPTOK = 0x08, + PCF50633_MBCS1_TBAT_OK = 0x00, + PCF50633_MBCS1_TBAT_ABOVE = 0x10, + PCF50633_MBCS1_TBAT_BELOW = 0x20, + PCF50633_MBCS1_TBAT_UNDEF = 0x30, + PCF50633_MBCS1_PREWDTEXP = 0x40, + PCF50633_MBCS1_WDTEXP = 0x80, +}; + +enum pcf50633_reg_mbcs2_mbcmod { + PCF50633_MBCS2_MBC_PLAY = 0x00, + PCF50633_MBCS2_MBC_USB_PRE = 0x01, + PCF50633_MBCS2_MBC_USB_PRE_WAIT = 0x02, + PCF50633_MBCS2_MBC_USB_FAST = 0x03, + PCF50633_MBCS2_MBC_USB_FAST_WAIT = 0x04, + PCF50633_MBCS2_MBC_USB_SUSPEND = 0x05, + PCF50633_MBCS2_MBC_ADP_PRE = 0x06, + PCF50633_MBCS2_MBC_ADP_PRE_WAIT = 0x07, + PCF50633_MBCS2_MBC_ADP_FAST = 0x08, + PCF50633_MBCS2_MBC_ADP_FAST_WAIT = 0x09, + PCF50633_MBCS2_MBC_BAT_FULL = 0x0a, + PCF50633_MBCS2_MBC_HALT = 0x0b, +}; +#define PCF50633_MBCS2_MBC_MASK 0x0f +enum pcf50633_reg_mbcs2_chgstat { + PCF50633_MBCS2_CHGS_NONE = 0x00, + PCF50633_MBCS2_CHGS_ADAPTER = 0x10, + PCF50633_MBCS2_CHGS_USB = 0x20, + PCF50633_MBCS2_CHGS_BOTH = 0x30, +}; +#define PCF50633_MBCS2_RESSTAT_AUTO 0x40 + +enum pcf50633_reg_mbcs3 { + PCF50633_MBCS3_USBLIM_PLAY = 0x01, + PCF50633_MBCS3_USBLIM_CGH = 0x02, + PCF50633_MBCS3_TLIM_PLAY = 0x04, + PCF50633_MBCS3_TLIM_CHG = 0x08, + PCF50633_MBCS3_ILIM = 0x10, /* 1: Ibat > Icutoff */ + PCF50633_MBCS3_VLIM = 0x20, /* 1: Vbat == Vmax */ + PCF50633_MBCS3_VBATSTAT = 0x40, /* 1: Vbat > Vbatcond */ + PCF50633_MBCS3_VRES = 0x80, /* 1: Vbat > Vth(RES) */ +}; + +#define PCF50633_MBCC2_VBATCOND_MASK 0x03 +#define PCF50633_MBCC2_VMAX_MASK 0x3c + +/* Charger status */ +#define PCF50633_MBC_USB_ONLINE 0x01 +#define PCF50633_MBC_USB_ACTIVE 0x02 +#define PCF50633_MBC_ADAPTER_ONLINE 0x04 +#define PCF50633_MBC_ADAPTER_ACTIVE 0x08 + +int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma); + +int pcf50633_mbc_get_status(struct pcf50633 *); +void pcf50633_mbc_set_status(struct pcf50633 *, int what, int status); + +#endif + -- cgit v1.2.3 From 5ec271e745350c7df6a6ebca24b43cb7a10bfa4a Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Fri, 9 Jan 2009 01:51:01 +0100 Subject: regulator: PCF50633 pmic driver Changes from V1: - Removed support for suspend_enable & suspend_disable functions. Signed-off-by: Balaji Rao Cc: Andy Green Cc: Liam Girdwood Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/pcf50633-regulator.c | 329 +++++++++++++++++++++++++++++++++ include/linux/mfd/pcf50633/pmic.h | 67 +++++++ 4 files changed, 404 insertions(+) create mode 100644 drivers/regulator/pcf50633-regulator.c create mode 100644 include/linux/mfd/pcf50633/pmic.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 39360e2a4540..e7e0cf102d6d 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -73,4 +73,11 @@ config REGULATOR_DA903X Say y here to support the BUCKs and LDOs regulators found on Dialog Semiconductor DA9030/DA9034 PMIC. +config REGULATOR_PCF50633 + tristate "PCF50633 regulator driver" + depends on MFD_PCF50633 + help + Say Y here to support the voltage regulators and convertors + on PCF50633 + endif diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 254d40c02ee8..61b30c6ddecc 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x.o +obj-$(CONFIG_REGULATOR_PCF50633) += pcf50633-regulator.o ccflags-$(CONFIG_REGULATOR_DEBUG) += -DDEBUG diff --git a/drivers/regulator/pcf50633-regulator.c b/drivers/regulator/pcf50633-regulator.c new file mode 100644 index 000000000000..4cc85ec6e120 --- /dev/null +++ b/drivers/regulator/pcf50633-regulator.c @@ -0,0 +1,329 @@ +/* NXP PCF50633 PMIC Driver + * + * (C) 2006-2008 by Openmoko, Inc. + * Author: Balaji Rao + * All rights reserved. + * + * Broken down from monstrous PCF50633 driver mainly by + * Harald Welte and Andy Green and Werner Almesberger + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCF50633_REGULATOR(_name, _id) \ + { \ + .name = _name, \ + .id = _id, \ + .ops = &pcf50633_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + } + +static const u8 pcf50633_regulator_registers[PCF50633_NUM_REGULATORS] = { + [PCF50633_REGULATOR_AUTO] = PCF50633_REG_AUTOOUT, + [PCF50633_REGULATOR_DOWN1] = PCF50633_REG_DOWN1OUT, + [PCF50633_REGULATOR_DOWN2] = PCF50633_REG_DOWN2OUT, + [PCF50633_REGULATOR_MEMLDO] = PCF50633_REG_MEMLDOOUT, + [PCF50633_REGULATOR_LDO1] = PCF50633_REG_LDO1OUT, + [PCF50633_REGULATOR_LDO2] = PCF50633_REG_LDO2OUT, + [PCF50633_REGULATOR_LDO3] = PCF50633_REG_LDO3OUT, + [PCF50633_REGULATOR_LDO4] = PCF50633_REG_LDO4OUT, + [PCF50633_REGULATOR_LDO5] = PCF50633_REG_LDO5OUT, + [PCF50633_REGULATOR_LDO6] = PCF50633_REG_LDO6OUT, + [PCF50633_REGULATOR_HCLDO] = PCF50633_REG_HCLDOOUT, +}; + +/* Bits from voltage value */ +static u8 auto_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 1800) + return 0; + if (millivolts > 3800) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 down_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 625) + return 0; + else if (millivolts > 3000) + return 0xff; + + millivolts -= 625; + + return millivolts / 25; +} + +static u8 ldo_voltage_bits(unsigned int millivolts) +{ + if (millivolts < 900) + return 0; + else if (millivolts > 3600) + return 0x1f; + + millivolts -= 900; + return millivolts / 100; +} + +/* Obtain voltage value from bits */ +static unsigned int auto_voltage_value(u8 bits) +{ + if (bits < 0x2f) + return 0; + + return 625 + (bits * 25); +} + + +static unsigned int down_voltage_value(u8 bits) +{ + return 625 + (bits * 25); +} + + +static unsigned int ldo_voltage_value(u8 bits) +{ + bits &= 0x1f; + + return 900 + (bits * 100); +} + +static int pcf50633_regulator_set_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts; + u8 volt_bits, regnr; + + pcf = rdev_get_drvdata(rdev); + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + millivolts = min_uV / 1000; + + regnr = pcf50633_regulator_registers[regulator_id]; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + volt_bits = auto_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN1: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_DOWN2: + volt_bits = down_voltage_bits(millivolts); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + volt_bits = ldo_voltage_bits(millivolts); + break; + default: + return -EINVAL; + } + + return pcf50633_reg_write(pcf, regnr, volt_bits); +} + +static int pcf50633_regulator_get_voltage(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf; + int regulator_id, millivolts, volt_bits; + u8 regnr; + + pcf = rdev_get_drvdata(rdev);; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + regnr = pcf50633_regulator_registers[regulator_id]; + + volt_bits = pcf50633_reg_read(pcf, regnr); + if (volt_bits < 0) + return -1; + + switch (regulator_id) { + case PCF50633_REGULATOR_AUTO: + millivolts = auto_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN1: + millivolts = down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_DOWN2: + millivolts = down_voltage_value(volt_bits); + break; + case PCF50633_REGULATOR_LDO1: + case PCF50633_REGULATOR_LDO2: + case PCF50633_REGULATOR_LDO3: + case PCF50633_REGULATOR_LDO4: + case PCF50633_REGULATOR_LDO5: + case PCF50633_REGULATOR_LDO6: + case PCF50633_REGULATOR_HCLDO: + millivolts = ldo_voltage_value(volt_bits); + break; + default: + return -EINVAL; + } + + return millivolts * 1000; +} + +static int pcf50633_regulator_enable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* The *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, PCF50633_REGULATOR_ON, + PCF50633_REGULATOR_ON); +} + +static int pcf50633_regulator_disable(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id; + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_set_bit_mask(pcf, regnr, + PCF50633_REGULATOR_ON, 0); +} + +static int pcf50633_regulator_is_enabled(struct regulator_dev *rdev) +{ + struct pcf50633 *pcf = rdev_get_drvdata(rdev); + int regulator_id = rdev_get_id(rdev); + u8 regnr; + + regulator_id = rdev_get_id(rdev); + if (regulator_id >= PCF50633_NUM_REGULATORS) + return -EINVAL; + + /* the *ENA register is always one after the *OUT register */ + regnr = pcf50633_regulator_registers[regulator_id] + 1; + + return pcf50633_reg_read(pcf, regnr) & PCF50633_REGULATOR_ON; +} + +static struct regulator_ops pcf50633_regulator_ops = { + .set_voltage = pcf50633_regulator_set_voltage, + .get_voltage = pcf50633_regulator_get_voltage, + .enable = pcf50633_regulator_enable, + .disable = pcf50633_regulator_disable, + .is_enabled = pcf50633_regulator_is_enabled, +}; + +static struct regulator_desc regulators[] = { + [PCF50633_REGULATOR_AUTO] = + PCF50633_REGULATOR("auto", PCF50633_REGULATOR_AUTO), + [PCF50633_REGULATOR_DOWN1] = + PCF50633_REGULATOR("down1", PCF50633_REGULATOR_DOWN1), + [PCF50633_REGULATOR_DOWN2] = + PCF50633_REGULATOR("down2", PCF50633_REGULATOR_DOWN2), + [PCF50633_REGULATOR_LDO1] = + PCF50633_REGULATOR("ldo1", PCF50633_REGULATOR_LDO1), + [PCF50633_REGULATOR_LDO2] = + PCF50633_REGULATOR("ldo2", PCF50633_REGULATOR_LDO2), + [PCF50633_REGULATOR_LDO3] = + PCF50633_REGULATOR("ldo3", PCF50633_REGULATOR_LDO3), + [PCF50633_REGULATOR_LDO4] = + PCF50633_REGULATOR("ldo4", PCF50633_REGULATOR_LDO4), + [PCF50633_REGULATOR_LDO5] = + PCF50633_REGULATOR("ldo5", PCF50633_REGULATOR_LDO5), + [PCF50633_REGULATOR_LDO6] = + PCF50633_REGULATOR("ldo6", PCF50633_REGULATOR_LDO6), + [PCF50633_REGULATOR_HCLDO] = + PCF50633_REGULATOR("hcldo", PCF50633_REGULATOR_HCLDO), + [PCF50633_REGULATOR_MEMLDO] = + PCF50633_REGULATOR("memldo", PCF50633_REGULATOR_MEMLDO), +}; + +static int __devinit pcf50633_regulator_probe(struct platform_device *pdev) +{ + struct regulator_dev *rdev; + struct pcf50633 *pcf; + + /* Already set by core driver */ + pcf = platform_get_drvdata(pdev); + + rdev = regulator_register(®ulators[pdev->id], &pdev->dev, pcf); + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (pcf->pdata->regulator_registered) + pcf->pdata->regulator_registered(pcf, pdev->id); + + return 0; +} + +static int __devexit pcf50633_regulator_remove(struct platform_device *pdev) +{ + struct regulator_dev *rdev = platform_get_drvdata(pdev); + + regulator_unregister(rdev); + + return 0; +} + +static struct platform_driver pcf50633_regulator_driver = { + .driver = { + .name = "pcf50633-regltr", + }, + .probe = pcf50633_regulator_probe, + .remove = __devexit_p(pcf50633_regulator_remove), +}; + +static int __init pcf50633_regulator_init(void) +{ + return platform_driver_register(&pcf50633_regulator_driver); +} +module_init(pcf50633_regulator_init); + +static void __exit pcf50633_regulator_exit(void) +{ + platform_driver_unregister(&pcf50633_regulator_driver); +} +module_exit(pcf50633_regulator_exit); + +MODULE_AUTHOR("Balaji Rao "); +MODULE_DESCRIPTION("PCF50633 regulator driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pcf50633-regulator"); diff --git a/include/linux/mfd/pcf50633/pmic.h b/include/linux/mfd/pcf50633/pmic.h new file mode 100644 index 000000000000..2d3dbe53b235 --- /dev/null +++ b/include/linux/mfd/pcf50633/pmic.h @@ -0,0 +1,67 @@ +#ifndef __LINUX_MFD_PCF50633_PMIC_H +#define __LINUX_MFD_PCF50633_PMIC_H + +#include +#include + +#define PCF50633_REG_AUTOOUT 0x1a +#define PCF50633_REG_AUTOENA 0x1b +#define PCF50633_REG_AUTOCTL 0x1c +#define PCF50633_REG_AUTOMXC 0x1d +#define PCF50633_REG_DOWN1OUT 0x1e +#define PCF50633_REG_DOWN1ENA 0x1f +#define PCF50633_REG_DOWN1CTL 0x20 +#define PCF50633_REG_DOWN1MXC 0x21 +#define PCF50633_REG_DOWN2OUT 0x22 +#define PCF50633_REG_DOWN2ENA 0x23 +#define PCF50633_REG_DOWN2CTL 0x24 +#define PCF50633_REG_DOWN2MXC 0x25 +#define PCF50633_REG_MEMLDOOUT 0x26 +#define PCF50633_REG_MEMLDOENA 0x27 +#define PCF50633_REG_LDO1OUT 0x2d +#define PCF50633_REG_LDO1ENA 0x2e +#define PCF50633_REG_LDO2OUT 0x2f +#define PCF50633_REG_LDO2ENA 0x30 +#define PCF50633_REG_LDO3OUT 0x31 +#define PCF50633_REG_LDO3ENA 0x32 +#define PCF50633_REG_LDO4OUT 0x33 +#define PCF50633_REG_LDO4ENA 0x34 +#define PCF50633_REG_LDO5OUT 0x35 +#define PCF50633_REG_LDO5ENA 0x36 +#define PCF50633_REG_LDO6OUT 0x37 +#define PCF50633_REG_LDO6ENA 0x38 +#define PCF50633_REG_HCLDOOUT 0x39 +#define PCF50633_REG_HCLDOENA 0x3a +#define PCF50633_REG_HCLDOOVL 0x40 + +enum pcf50633_regulator_enable { + PCF50633_REGULATOR_ON = 0x01, + PCF50633_REGULATOR_ON_GPIO1 = 0x02, + PCF50633_REGULATOR_ON_GPIO2 = 0x04, + PCF50633_REGULATOR_ON_GPIO3 = 0x08, +}; +#define PCF50633_REGULATOR_ON_MASK 0x0f + +enum pcf50633_regulator_phase { + PCF50633_REGULATOR_ACTPH1 = 0x00, + PCF50633_REGULATOR_ACTPH2 = 0x10, + PCF50633_REGULATOR_ACTPH3 = 0x20, + PCF50633_REGULATOR_ACTPH4 = 0x30, +}; +#define PCF50633_REGULATOR_ACTPH_MASK 0x30 + +enum pcf50633_regulator_id { + PCF50633_REGULATOR_AUTO, + PCF50633_REGULATOR_DOWN1, + PCF50633_REGULATOR_DOWN2, + PCF50633_REGULATOR_LDO1, + PCF50633_REGULATOR_LDO2, + PCF50633_REGULATOR_LDO3, + PCF50633_REGULATOR_LDO4, + PCF50633_REGULATOR_LDO5, + PCF50633_REGULATOR_LDO6, + PCF50633_REGULATOR_HCLDO, + PCF50633_REGULATOR_MEMLDO, +}; +#endif + -- cgit v1.2.3 From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:08 -0800 Subject: smp_call_function_single(): be slightly less stupid If you do smp_call_function_single(expression-with-side-effects, ...) then expression-with-side-effects never gets evaluated on UP builds. As always, implementing it in C is the correct thing to do. While we're there, uninline it for size and possible header dependency reasons. And create a new kernel/up.c, as a place in which to put uniprocessor-specific code and storage. It should mirror kernel/smp.c. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/smp.h | 13 +++---------- kernel/Makefile | 6 +++++- kernel/up.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 kernel/up.c (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index b82466968101..715196b09d67 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -24,6 +24,9 @@ struct call_single_data { /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; +int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, + int wait); + #ifdef CONFIG_SMP #include @@ -79,8 +82,6 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -140,14 +141,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, wait) \ -({ \ - WARN_ON(cpuid != 0); \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - 0; \ -}) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32f..2aebc4cd7878 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) +obj-y += smp.o +else +obj-y += up.o +endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 000000000000..ce62cc9e9f71 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,18 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include +#include +#include + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpuid != 0); + local_irq_disable(); + (func)(info); + local_irq_enable(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 649274d993212e7c23c0cb734572c2311c200872 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 11 Jan 2009 00:20:39 -0800 Subject: net_dma: acquire/release dma channels on ifup/ifdown The recent dmaengine rework removed the capability to remove dma device driver modules while net_dma is active. Rather than notify dmaengine-clients that channels are trying to be removed, we now rely on clients to notify dmaengine when they no longer have a need for channels. Teach net_dma to release channels by taking dmaengine references at netdevice open and dropping references at netdevice close. Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- include/linux/dmaengine.h | 10 ++++++++++ net/core/dev.c | 13 ++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 64dea2ab326c..c73f1e2b59b7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -270,8 +270,18 @@ struct dma_device { /* --- public DMA engine API --- */ +#ifdef CONFIG_DMA_ENGINE void dmaengine_get(void); void dmaengine_put(void); +#else +static inline void dmaengine_get(void) +{ +} +static inline void dmaengine_put(void) +{ +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 5f736f1ceeae..b715a55cccc4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1087,6 +1087,11 @@ int dev_open(struct net_device *dev) */ dev->flags |= IFF_UP; + /* + * Enable NET_DMA + */ + dmaengine_get(); + /* * Initialize multicasting status */ @@ -1164,6 +1169,11 @@ int dev_close(struct net_device *dev) */ call_netdevice_notifiers(NETDEV_DOWN, dev); + /* + * Shutdown NET_DMA + */ + dmaengine_put(); + return 0; } @@ -5151,9 +5161,6 @@ static int __init net_dev_init(void) hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); - #ifdef CONFIG_NET_DMA - dmaengine_get(); - #endif rc = 0; out: return rc; -- cgit v1.2.3 From 57de16e612d63138bd2c618449af9d8312466e25 Mon Sep 17 00:00:00 2001 From: Martin Bachem Date: Sun, 26 Oct 2008 13:30:09 +0100 Subject: BUGFIX: used NULL pointer at ioctl(sk,IMGETDEVINFO,&devinfo) when devinfo.id not registered daxtar example # modprobe hfcsusb daxtar example # modprobe mISDN_l1loop daxtar example # ./misdnportinfo Found 3 devices id: 0 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: HFC-S_USB.1 id: 1 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.1 id: 2 Dprotocols: 00000006 Bprotocols: 0000000e protocol: 0 nrbchan: 2 name: mISDN_l1loop.2 daxtar example # rmmod hfcsusb daxtar example # ./misdnportinfo Found 2 devices *Segmentation* *fault* dmesg: [ 9914.939718] BUG: unable to handle kernel NULL pointer dereference at 000000d4 [ 9914.939721] IP: [] :mISDN_core:get_mdevice+0x19/0x22 [ 9914.939729] *pde = 00000000 [ 9914.939732] Oops: 0000 [#14] PREEMPT SMP [ 9914.939734] Modules linked in: mISDN_l1loop mISDN_core vmnet vmblock vmci vmmon coretemp w83627ehf hwmon_vid rfcomm l2cap blue tooth usbhid snd_usb_audio snd_usb_lib snd_rawmidi snd_hwdep fuse nvidia(P) uhci_hcd i2c_i801 ehci_hcd snd_hda_intel atl1 usbcore i2c_core parport_seria l [last unloaded: hfcsusb] [ 9914.939751] Pid: 29618, comm: misdnportinfo Tainted: P D (2.6.27.3 #5) [ 9914.939753] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 9914.939758] EIP is at get_mdevice+0x19/0x22 [mISDN_core] [ 9914.939760] EAX: 00000000 EBX: f8fa791c ECX: f6afaa58 EDX: f7960cf4 [ 9914.939762] ESI: 80044944 EDI: bfc2e62c EBP: bfc2e62c ESP: f5adbef4 [ 9914.939763] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 9914.939765] Process misdnportinfo (pid: 29618, ti=f5ada000 task=f6bec430 task.ti=f5ada000) [ 9914.939767] Stack: f8f9f4e0 00000000 f8f9f867 bfc2e62c 0000000a c02461e8 00200246 c042dde8 [ 9914.939771] 00000003 c042dde4 00000000 00000001 00200082 c0114775 00000000 00000000 [ 9914.939775] 00000003 f7088010 00200282 f8fa791c 80044944 bfc2e62c bfc2e62c c02f6615 [ 9914.939780] Call Trace: [ 9914.939782] [] _get_mdevice+0x0/0x18 [mISDN_core] [ 9914.939789] [] base_sock_ioctl+0x7a/0x129 [mISDN_core] [ 9914.939789] [] opost+0x171/0x182 [ 9914.939789] [] __wake_up+0x29/0x39 [ 9914.939789] [] sock_ioctl+0x1b5/0x1d9 [ 9914.939789] [] sock_ioctl+0x0/0x1d9 [ 9914.939789] [] vfs_ioctl+0x1c/0x5d [ 9914.939789] [] do_vfs_ioctl+0x23e/0x24e [ 9914.939789] [] sys_ioctl+0x2c/0x45 [ 9914.939789] [] sysenter_do_call+0x12/0x21 [ 9914.939789] [] pci_fixup_i450gx+0x4e/0x56 [ 9914.939789] ======================= [ 9914.939789] Code: 00 68 02 f0 f9 f8 e8 ae b4 2c c7 8b 44 24 04 5a 59 c3 83 ec 04 31 d2 89 04 24 89 e1 b8 ac df fa f8 68 e0 f4 f9 f8 e8 4a b5 2c c7 <8b> 80 d4 00 00 00 5a 59 c3 53 89 cb 8d 90 9c 00 00 00 89 c8 e8 [ 9914.939789] EIP: [] get_mdevice+0x19/0x22 [mISDN_core] SS:ESP 0068:f5adbef4 [ 9914.939858] ---[ end trace 50e18a715b019424 ]--- Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil --- include/linux/mISDNif.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 557477ac3d5b..5da3d95b27f1 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -559,7 +559,10 @@ extern void mISDN_unregister_clock(struct mISDNclock *); static inline struct mISDNdevice *dev_to_mISDN(struct device *dev) { - return dev_get_drvdata(dev); + if (dev) + return dev_get_drvdata(dev); + else + return NULL; } extern void set_channel_address(struct mISDNchannel *, u_int, u_int); -- cgit v1.2.3 From 6d612b0f943289856c6e8186c564cda922cd040e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 12:52:23 +0100 Subject: locking, hpet: annotate false positive warning Alexander Beregalov reported that this warning is caused by the HPET code: > hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 > hpet0: 3 comparators, 64-bit 14.318180 MHz counter > ODEBUG: object is on stack, but not annotated > ------------[ cut here ]------------ > WARNING: at lib/debugobjects.c:251 __debug_object_init+0x2a4/0x352() > Bisected down to 26afe5f2fbf06ea0765aaa316640c4dd472310c0 > (x86: HPET_MSI Initialise per-cpu HPET timers) The commit is fine - but the on-stack workqueue entry needs annotation. Reported-and-bisected-by: Alexander Beregalov Signed-off-by: Peter Zijlstra Tested-by: Alexander Beregalov Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- include/linux/workqueue.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index cd759ad90690..bb2e0f0975ae 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -628,7 +628,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK(&work.work, hpet_work); + INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b36291130f22..47151c8495aa 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -118,6 +118,12 @@ struct execute_work { init_timer(&(_work)->timer); \ } while (0) +#define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ + do { \ + INIT_WORK(&(_work)->work, (_func)); \ + init_timer_on_stack(&(_work)->timer); \ + } while (0) + #define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ -- cgit v1.2.3 From 2e4c77bea3d8b17d94f8ee382411f359b708560f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 30 Dec 2008 14:16:41 +0100 Subject: m68k: dio - Kill warn_unused_result warnings warning: ignoring return value of 'device_register', declared with attribute warn_unused_result warning: ignoring return value of 'device_create_file', declared with attribute warn_unused_result Signed-off-by: Geert Uytterhoeven --- drivers/dio/dio-sysfs.c | 16 ++++++++++------ drivers/dio/dio.c | 18 +++++++++++++++--- include/linux/dio.h | 2 +- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/dio/dio-sysfs.c b/drivers/dio/dio-sysfs.c index f46463038847..91d5f4da8769 100644 --- a/drivers/dio/dio-sysfs.c +++ b/drivers/dio/dio-sysfs.c @@ -63,15 +63,19 @@ static ssize_t dio_show_resource(struct device *dev, struct device_attribute *at } static DEVICE_ATTR(resource, S_IRUGO, dio_show_resource, NULL); -void dio_create_sysfs_dev_files(struct dio_dev *d) +int dio_create_sysfs_dev_files(struct dio_dev *d) { struct device *dev = &d->dev; + int error; /* current configuration's attributes */ - device_create_file(dev, &dev_attr_id); - device_create_file(dev, &dev_attr_ipl); - device_create_file(dev, &dev_attr_secid); - device_create_file(dev, &dev_attr_name); - device_create_file(dev, &dev_attr_resource); + if ((error = device_create_file(dev, &dev_attr_id)) || + (error = device_create_file(dev, &dev_attr_ipl)) || + (error = device_create_file(dev, &dev_attr_secid)) || + (error = device_create_file(dev, &dev_attr_name)) || + (error = device_create_file(dev, &dev_attr_resource))) + return error; + + return 0; } diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c index 07f274f853d9..10c3c498358c 100644 --- a/drivers/dio/dio.c +++ b/drivers/dio/dio.c @@ -173,6 +173,7 @@ static int __init dio_init(void) mm_segment_t fs; int i; struct dio_dev *dev; + int error; if (!MACH_IS_HP300) return 0; @@ -182,7 +183,11 @@ static int __init dio_init(void) /* Initialize the DIO bus */ INIT_LIST_HEAD(&dio_bus.devices); strcpy(dio_bus.dev.bus_id, "dio"); - device_register(&dio_bus.dev); + error = device_register(&dio_bus.dev); + if (error) { + pr_err("DIO: Error registering dio_bus\n"); + return error; + } /* Request all resources */ dio_bus.num_resources = (hp300_model == HP_320 ? 1 : 2); @@ -252,8 +257,15 @@ static int __init dio_init(void) if (scode >= DIOII_SCBASE) iounmap(va); - device_register(&dev->dev); - dio_create_sysfs_dev_files(dev); + error = device_register(&dev->dev); + if (error) { + pr_err("DIO: Error registering device %s\n", + dev->name); + continue; + } + error = dio_create_sysfs_dev_files(dev); + if (error) + dev_err(&dev->dev, "Error creating sysfs files\n"); } return 0; } diff --git a/include/linux/dio.h b/include/linux/dio.h index 1e65ebc2a3db..b2dd31ca1710 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -241,7 +241,7 @@ struct dio_driver { extern int dio_find(int deviceid); extern unsigned long dio_scodetophysaddr(int scode); -extern void dio_create_sysfs_dev_files(struct dio_dev *); +extern int dio_create_sysfs_dev_files(struct dio_dev *); /* New-style probing */ extern int dio_register_driver(struct dio_driver *); -- cgit v1.2.3 From 985ebdb5ed54151eba734aa1b307460e8e4267ba Mon Sep 17 00:00:00 2001 From: Krzysztof Hałasa Date: Mon, 12 Jan 2009 16:32:13 -0800 Subject: net: Fix a comment in include/linux/netdevice.h. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a comment in include/linux/netdevice.h. Signed-off-by: Krzysztof Hałasa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f24556813375..4647604c7ca9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -467,7 +467,7 @@ struct netdev_queue { * This function is called when network device transistions to the down * state. * - * int (*ndo_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + * int (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); * Called when a packet needs to be transmitted. * Must return NETDEV_TX_OK , NETDEV_TX_BUSY, or NETDEV_TX_LOCKED, * Required can not be NULL. -- cgit v1.2.3 From daaf83d2b9277928739f3eb7ea64f49c1254fd62 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 12 Jan 2009 00:06:11 +0000 Subject: netfilter 09/09: remove padding from struct xt_match on 64bit builds reorder struct xt_match to remove 8 bytes of padding and make its size 128 bytes. This saves a small amount of data space in each of the xt netfilter modules and fits xt_match in one 128 byte cache line. Signed-off-by: Richard Kennedy Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index e52ce475d19f..c7ee8744d26b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -270,6 +270,7 @@ struct xt_match struct list_head list; const char name[XT_FUNCTION_MAXNAMELEN-1]; + u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ @@ -302,7 +303,6 @@ struct xt_match unsigned short proto; unsigned short family; - u_int8_t revision; }; /* Registration hooks for targets. */ -- cgit v1.2.3 From baf48f6577e581a9adb8fe849dc80e24b21d171d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Mon, 12 Jan 2009 21:15:17 -0800 Subject: softlock: fix false panic which can occur if softlockup_thresh is reduced At run-time, if softlockup_thresh is changed to a much lower value, touch_timestamp is likely to be much older than the new softlock_thresh. This will cause a false softlockup to be detected. If softlockup_panic is enabled, the system will panic. The fix is to touch all watchdogs before changing softlockup_thresh. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/softlockup.c | 9 +++++++++ kernel/sysctl.c | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..54cbabf3b871 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -293,6 +293,9 @@ extern void sched_show_task(struct task_struct *p); extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); +extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d9188c66278a..85d5a2455103 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void) } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + touch_all_softlockup_watchdogs(); + return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); +} + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89d74436318c..596dc31a7116 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -800,7 +800,7 @@ static struct ctl_table kern_table[] = { .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_dosoftlockup_thresh, .strategy = &sysctl_intvec, .extra1 = &neg_one, .extra2 = &sixty, -- cgit v1.2.3 From 4c696ba7982501d43dea11dbbaabd2aa8a19cc42 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:53 +0100 Subject: [CVE-2009-0029] Move compat system call declarations to compat header file Move declarations to correct header file. Signed-off-by: Heiko Carstens --- include/linux/compat.h | 13 +++++++++++++ include/linux/syscalls.h | 12 ------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index e88f3ecf38b4..3fd2194ff573 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -280,5 +280,18 @@ asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, asmlinkage long compat_sys_timerfd_gettime(int ufd, struct compat_itimerspec __user *otmr); +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, + __u32 __user *pages, + const int __user *nodes, + int __user *status, + int flags); +asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, + struct compat_timeval __user *t); +asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, + struct compat_stat __user *statbuf, + int flag); +asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, + int flags, int mode); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 18d0a243a7b3..a7593f670ca6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -530,11 +530,6 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags); -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, @@ -583,13 +578,6 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu int bufsiz); asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags); -asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, - struct compat_timeval __user *t); -asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, - struct compat_stat __user *statbuf, - int flag); -asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, - int flags, int mode); asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, -- cgit v1.2.3 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- fs/read_write.c | 18 +++++------ fs/xattr.c | 12 ++++---- include/linux/syscalls.h | 79 ++++++++++++++++++++++++------------------------ ipc/mqueue.c | 2 +- kernel/exit.c | 4 ++- kernel/signal.c | 2 +- kernel/timer.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 64 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 5cc6924eb158..940367f51f2a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(vfs_llseek); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) +asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin) { off_t retval; struct file * file; @@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) +asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) return ret; } -asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) +asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co return ret; } -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, return ret; } -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); -asmlinkage ssize_t +asmlinkage long sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) return ret; } -asmlinkage ssize_t +asmlinkage long sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -812,7 +812,7 @@ out: return retval; } -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) +asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) { loff_t pos; off_t off; @@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz return do_sendfile(out_fd, in_fd, NULL, count, 0); } -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) +asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) { loff_t pos; ssize_t ret; diff --git a/fs/xattr.c b/fs/xattr.c index 237804cd6b56..d049ae27aae7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return error; } -asmlinkage ssize_t +asmlinkage long sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name, return error; } -asmlinkage ssize_t +asmlinkage long sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user return error; } -asmlinkage ssize_t +asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; @@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_listxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size) { struct file *f; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a7593f670ca6..22290eeaf553 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,7 +77,7 @@ asmlinkage long sys_times(struct tms __user *tbuf); asmlinkage long sys_gettid(void); asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); -asmlinkage unsigned long sys_alarm(unsigned int seconds); +asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); asmlinkage long sys_getuid(void); @@ -166,7 +166,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long flags); asmlinkage long sys_exit(int error_code); -asmlinkage void sys_exit_group(int error_code); +asmlinkage long sys_exit_group(int error_code); asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru); asmlinkage long sys_waitid(int which, pid_t pid, @@ -196,7 +196,7 @@ asmlinkage long sys_tkill(int pid, int sig); asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); -asmlinkage unsigned long sys_signal(int sig, __sighandler_t handler); +asmlinkage long sys_signal(int sig, __sighandler_t handler); asmlinkage long sys_pause(void); asmlinkage long sys_sync(void); @@ -246,29 +246,29 @@ asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); -asmlinkage ssize_t sys_getxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_lgetxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_fgetxattr(int fd, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_listxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_llistxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_flistxattr(int fd, char __user *list, size_t size); +asmlinkage long sys_getxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_fgetxattr(int fd, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_listxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_llistxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); -asmlinkage unsigned long sys_brk(unsigned long brk); +asmlinkage long sys_brk(unsigned long brk); asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); +asmlinkage long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); @@ -321,10 +321,10 @@ asmlinkage long sys_io_submit(aio_context_t, long, struct iocb __user * __user *); asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, struct io_event __user *result); -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, - off_t __user *offset, size_t count); -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, - loff_t __user *offset, size_t count); +asmlinkage long sys_sendfile(int out_fd, int in_fd, + off_t __user *offset, size_t count); +asmlinkage long sys_sendfile64(int out_fd, int in_fd, + loff_t __user *offset, size_t count); asmlinkage long sys_readlink(const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_creat(const char __user *pathname, int mode); @@ -368,26 +368,25 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); +asmlinkage long sys_lseek(unsigned int fd, off_t offset, + unsigned int origin); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, unsigned int origin); -asmlinkage ssize_t sys_read(unsigned int fd, char __user *buf, - size_t count); -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count); -asmlinkage ssize_t sys_readv(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_write(unsigned int fd, const char __user *buf, - size_t count); -asmlinkage ssize_t sys_writev(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, - size_t count, loff_t pos); -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, - size_t count, loff_t pos); +asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); +asmlinkage long sys_readv(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_write(unsigned int fd, const char __user *buf, + size_t count); +asmlinkage long sys_writev(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, + size_t count, loff_t pos); +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); @@ -476,7 +475,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 23fdb8492b8e..6df028b70543 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -907,7 +907,7 @@ out: return ret; } -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252c..fac9b040af2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1182,9 +1182,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +asmlinkage long sys_exit_group(int error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) diff --git a/kernel/signal.c b/kernel/signal.c index 3152ac3b62e2..856a5479d49d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage unsigned long +asmlinkage long sys_signal(int sig, __sighandler_t handler) { struct k_sigaction new_sa, old_sa; diff --git a/kernel/timer.c b/kernel/timer.c index dee3f641a7a7..7b8697d7f04d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) +asmlinkage long sys_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..538b75ed6236 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 749623196cb9..a970d890cb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..5572e0825d80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed8375c986..ee3e78927739 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.2.3 From e55380edf68796d75bf41391a781c68ee678587d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:55 +0100 Subject: [CVE-2009-0029] Rename old_readdir to sys_old_readdir This way it matches the generic system call name convention. Signed-off-by: Heiko Carstens --- arch/arm/kernel/calls.S | 2 +- arch/cris/arch-v10/kernel/entry.S | 2 +- arch/cris/arch-v32/kernel/entry.S | 2 +- arch/h8300/kernel/syscalls.S | 2 +- arch/m68k/kernel/entry.S | 2 +- arch/m68knommu/kernel/syscalltable.S | 2 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/powerpc/include/asm/systbl.h | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sparc/kernel/systbls_32.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- fs/readdir.c | 2 +- include/linux/syscalls.h | 2 ++ 15 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 09a061cb7838..9ca8d13f05f7 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -98,7 +98,7 @@ CALL(sys_uselib) CALL(sys_swapon) CALL(sys_reboot) - CALL(OBSOLETE(old_readdir)) /* used by libc4 */ + CALL(OBSOLETE(sys_old_readdir)) /* used by libc4 */ /* 90 */ CALL(OBSOLETE(old_mmap)) /* used by libc4 */ CALL(sys_munmap) CALL(sys_truncate) diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index ed171d389e65..72f5cd319b97 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -691,7 +691,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S index 7f6f93e6b70e..5e674c8f7c51 100644 --- a/arch/cris/arch-v32/kernel/entry.S +++ b/arch/cris/arch-v32/kernel/entry.S @@ -614,7 +614,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/h8300/kernel/syscalls.S b/arch/h8300/kernel/syscalls.S index 54e21c3f2057..4eb67faac633 100644 --- a/arch/h8300/kernel/syscalls.S +++ b/arch/h8300/kernel/syscalls.S @@ -103,7 +103,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_uselib) .long SYMBOL_NAME(sys_swapon) .long SYMBOL_NAME(sys_reboot) - .long SYMBOL_NAME(old_readdir) + .long SYMBOL_NAME(sys_old_readdir) .long SYMBOL_NAME(old_mmap) /* 90 */ .long SYMBOL_NAME(sys_munmap) .long SYMBOL_NAME(sys_truncate) diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 5b780826647c..5c332f2b9b83 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -513,7 +513,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 812f8d8b7a85..5c3e3f62194a 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -107,7 +107,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_ni_syscall /* sys_swapon */ .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index d0916a55cd77..51d1ba415b90 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -398,7 +398,7 @@ einval: li v0, -ENOSYS sys sys_uselib 1 sys sys_swapon 2 sys sys_reboot 3 - sys old_readdir 3 + sys sys_old_readdir 3 sys old_mmap 6 /* 4090 */ sys sys_munmap 2 sys sys_truncate 2 diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index 62fba8aa9b6e..ceeaaaa359e2 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -478,7 +478,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 803def236654..72353f6070a4 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -92,7 +92,7 @@ COMPAT_SYS_SPU(readlink) SYSCALL(uselib) SYSCALL(swapon) SYSCALL(reboot) -SYSX(sys_ni_syscall,compat_sys_old_readdir,old_readdir) +SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir) SYSCALL_SPU(mmap) SYSCALL_SPU(munmap) SYSCALL_SPU(truncate) diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 0af693e65764..a87ce076cfa2 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -105,7 +105,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index 0b436aa3cad7..557cb91f5caf 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -109,7 +109,7 @@ sys_call_table: .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 7d0807586442..8a434f51ba0f 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -56,7 +56,7 @@ sys_call_table: /*185*/ .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_exit_group, sys_newuname /*190*/ .long sys_init_module, sys_personality, sparc_remap_file_pages, sys_epoll_create, sys_epoll_ctl /*195*/ .long sys_epoll_wait, sys_ioprio_set, sys_getppid, sparc_sigaction, sys_sgetmask -/*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir +/*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, sys_old_readdir /*205*/ .long sys_readahead, sys_socketcall, sys_syslog, sys_lookup_dcookie, sys_fadvise64 /*210*/ .long sys_fadvise64_64, sys_tgkill, sys_waitpid, sys_swapoff, sys_sysinfo /*215*/ .long sys_ipc, sys_sigreturn, sys_clone, sys_ioprio_get, sys_adjtimex diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c3..e2e86a08f31d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -88,7 +88,7 @@ ENTRY(sys_call_table) .long sys_uselib .long sys_swapon .long sys_reboot - .long old_readdir + .long sys_old_readdir .long old_mmap /* 90 */ .long sys_munmap .long sys_truncate diff --git a/fs/readdir.c b/fs/readdir.c index b318d9b5af2e..8b4c2a0051a6 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,7 +102,7 @@ efault: return -EFAULT; } -asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) +asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) { int error; struct file * file; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 22290eeaf553..ca079c3d09e3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct old_linux_dirent; #include #include @@ -608,6 +609,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 1a94bc34768e463a93cb3751819709ab0ea80a01 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:59 +0100 Subject: [CVE-2009-0029] System call wrapper infrastructure From: Martin Schwidefsky By selecting HAVE_SYSCALL_WRAPPERS architectures can activate system call wrappers in order to sign extend system call arguments. All architectures where the ABI defines that the caller of a function has to perform sign extension probably need this. Reported-by: Christian Borntraeger Acked-by: Ralf Baechle Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/Kconfig | 3 +++ include/linux/syscalls.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 2e13aa261929..550dab22daa1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -62,6 +62,9 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS See Documentation/unaligned-memory-access.txt for more information on the topic of unaligned memory accesses. +config HAVE_SYSCALL_WRAPPERS + bool + config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ca079c3d09e3..0bb537d7ba2e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -66,6 +66,68 @@ struct old_linux_dirent; #include #include +#define __SC_DECL1(t1, a1) t1 a1 +#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) +#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) +#define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) +#define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) +#define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__) + +#define __SC_LONG1(t1, a1) long a1 +#define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__) +#define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__) +#define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__) +#define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__) +#define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__) + +#define __SC_CAST1(t1, a1) (t1) a1 +#define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__) +#define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__) +#define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__) +#define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__) +#define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__) + +#define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long)) +#define __SC_TEST1(t1, a1) __SC_TEST(t1) +#define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__) +#define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__) +#define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__) +#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) +#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) + +#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__) +#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__) +#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__) +#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__) +#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) +#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) + +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) + +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS + +#define SYSCALL_DEFINE(name) static inline long SYSC_##name +#define SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \ + { \ + __SC_TEST##x(__VA_ARGS__); \ + return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \ + } \ + SYSCALL_ALIAS(sys_##name, SyS_##name); \ + static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)) + +#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ + +#define SYSCALL_DEFINE(name) asmlinkage long sys_##name +#define SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)) + +#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ + asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, -- cgit v1.2.3 From ee6a093222549ac0c72cfd296c69fa5e7d6daa34 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 14 Jan 2009 14:14:00 +0100 Subject: [CVE-2009-0029] powerpc: Enable syscall wrappers for 64-bit This enables the use of syscall wrappers to do proper sign extension for 64-bit programs. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Heiko Carstens --- arch/powerpc/Kconfig | 1 + include/linux/syscalls.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 84b861316ce7..e39b73bc0ff8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -123,6 +123,7 @@ config PPC select HAVE_DMA_ATTRS if PPC64 select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE + select HAVE_SYSCALL_WRAPPERS if PPC64 config EARLY_PRINTK bool diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0bb537d7ba2e..90aa5eba87a2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -103,8 +103,14 @@ struct old_linux_dirent; #define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) #define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) +#ifdef CONFIG_PPC64 +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ + "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) +#else #define SYSCALL_ALIAS(alias, name) \ asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) +#endif #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -- cgit v1.2.3 From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:34 +0100 Subject: [CVE-2009-0029] System call wrappers part 32 Signed-off-by: Heiko Carstens --- fs/eventfd.c | 5 ++--- fs/pipe.c | 2 +- fs/readdir.c | 3 ++- fs/select.c | 11 ++++++----- fs/timerfd.c | 2 +- include/linux/syscalls.h | 7 +++++++ kernel/signal.c | 11 +++++------ 7 files changed, 24 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index 08bf558d0408..5de2c2db3aa2 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd2(unsigned int count, int flags) +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd; struct eventfd_ctx *ctx; @@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) return fd; } -asmlinkage long sys_eventfd(unsigned int count) +SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } - diff --git a/fs/pipe.c b/fs/pipe.c index 0c64db86c919..b89c878588a9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1043,7 +1043,7 @@ int do_pipe(int *fd) * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage long sys_pipe2(int __user *fildes, int flags) +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { int fd[2]; int error; diff --git a/fs/readdir.c b/fs/readdir.c index cf6a0e39819a..7723401f8d8b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,7 +102,8 @@ efault: return -EFAULT; } -asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct file * file; diff --git a/fs/select.c b/fs/select.c index 338f703403af..0fe0e1469df3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -636,8 +636,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ -asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, void __user *sig) +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) { size_t sigsetsize = 0; sigset_t __user *up = NULL; @@ -889,9 +890,9 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } #ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec __user *tsp, const sigset_t __user *sigmask, - size_t sigsetsize) +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts, end_time, *to = NULL; diff --git a/fs/timerfd.c b/fs/timerfd.c index c8c14f58b96f..6a123b8ff3f5 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -265,7 +265,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return 0; } -asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { struct file *file; struct timerfd_ctx *ctx; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 90aa5eba87a2..56c400138b05 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -678,6 +678,13 @@ asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); +asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, + fd_set __user *, struct timespec __user *, + void __user *); +asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, + struct timespec __user *, const sigset_t __user *, + size_t); +asmlinkage long sys_pipe2(int __user *, int); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); diff --git a/kernel/signal.c b/kernel/signal.c index e2333929611a..e73759783dc8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,11 +2491,10 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION -asmlinkage long -sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret = -EINVAL; @@ -2578,7 +2577,7 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.3 From 2b66421995d2e93c9d1a0111acf2581f8529c6e5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:35 +0100 Subject: [CVE-2009-0029] System call wrappers part 33 Signed-off-by: Heiko Carstens --- fs/pipe.c | 2 +- include/linux/syscalls.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index b89c878588a9..3a48ba5179d5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1059,7 +1059,7 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) return error; } -asmlinkage long sys_pipe(int __user *fildes) +SYSCALL_DEFINE1(pipe, int __user *, fildes) { return sys_pipe2(fildes, 0); } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 56c400138b05..16875f89e6a7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -685,6 +685,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); asmlinkage long sys_pipe2(int __user *, int); +asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 18e6959c385f3edf3991fa6662a53dac4eb10d5b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 14 Jan 2009 07:28:16 +0100 Subject: mm: fix assertion This assertion is incorrect for lockless pagecache. By definition if we have an unpinned page that we are trying to take a speculative reference to, it may become the tail of a compound page at any time (if it is freed, then reallocated as a compound page). It was still a valid assertion for the vmscan.c LRU isolation case, but it doesn't seem incredibly helpful... if somebody wants it, they can put it back directly where it applies in the vmscan code. Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b91a73fd1bcc..e8ddc98b8405 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -260,7 +260,6 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - VM_BUG_ON(PageTail(page)); return atomic_inc_not_zero(&page->_count); } -- cgit v1.2.3 From b94b898f3107046b5c97c556e23529283ea5eadd Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Wed, 14 Jan 2009 19:19:02 +0100 Subject: it821x: Add ultra_mask quirk for Vortex86SX On Vortex86SX with IDE controller revision 0x11 ultra DMA must be disabled. This patch was tested by DMP and seems to work. It is a cleaned up version of their older Kernel patch: http://www.dmp.com.tw/tech/vortex86sx/patch-2.6.24-DMP.gz Tested-by: Shawn Lin Signed-off-by: Brandon Philips Cc: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/it821x.c | 12 ++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 0be27ac1f077..983440a9a5f7 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -68,6 +68,8 @@ #define DRV_NAME "it821x" +#define QUIRK_VORTEX86 1 + struct it821x_dev { unsigned int smart:1, /* Are we in smart raid mode */ @@ -79,6 +81,7 @@ struct it821x_dev u16 pio[2]; /* Cached PIO values */ u16 mwdma[2]; /* Cached MWDMA values */ u16 udma[2]; /* Cached UDMA values (per drive) */ + u16 quirks; }; #define ATA_66 0 @@ -577,6 +580,12 @@ static void __devinit init_hwif_it821x(ide_hwif_t *hwif) hwif->ultra_mask = ATA_UDMA6; hwif->mwdma_mask = ATA_MWDMA2; + + /* Vortex86SX quirk: prevent Ultra-DMA mode to fix BadCRC issue */ + if (idev->quirks & QUIRK_VORTEX86) { + if (dev->revision == 0x11) + hwif->ultra_mask = 0; + } } static void it8212_disable_raid(struct pci_dev *dev) @@ -649,6 +658,8 @@ static int __devinit it821x_init_one(struct pci_dev *dev, const struct pci_devic return -ENOMEM; } + itdevs->quirks = id->driver_data; + rc = ide_pci_init_one(dev, &it821x_chipset, itdevs); if (rc) kfree(itdevs); @@ -668,6 +679,7 @@ static void __devexit it821x_remove(struct pci_dev *dev) static const struct pci_device_id it821x_pci_tbl[] = { { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8211), 0 }, { PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8212), 0 }, + { PCI_VDEVICE(RDC, PCI_DEVICE_ID_RDC_D1010), QUIRK_VORTEX86 }, { 0, }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d543365518ab..d56ad9c21c09 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2174,6 +2174,7 @@ #define PCI_DEVICE_ID_RDC_R6040 0x6040 #define PCI_DEVICE_ID_RDC_R6060 0x6060 #define PCI_DEVICE_ID_RDC_R6061 0x6061 +#define PCI_DEVICE_ID_RDC_D1010 0x1010 #define PCI_VENDOR_ID_LENOVO 0x17aa -- cgit v1.2.3 From e720b9e498b6bbb1b4f3b3d2f8e9a78578aafef7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 14 Jan 2009 19:19:04 +0100 Subject: IDE: fix sparse signed-ness errors with host->host_busy The host_busy field in struct ide_host defaults to a signed-long, where most arch's test_and_set_bit_* macros use an unsigned long. Change to using an unsigned long, which on ARM removes the following sparse errors: drivers/ide/ide-io.c:681:8: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:681:8: expected unsigned long volatile *p drivers/ide/ide-io.c:681:8: got long volatile * drivers/ide/ide-io.c:681:8: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:681:8: expected unsigned long volatile *p drivers/ide/ide-io.c:681:8: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * drivers/ide/ide-io.c:695:3: warning: incorrect type in argument 2 (different signedness) drivers/ide/ide-io.c:695:3: expected unsigned long volatile *p drivers/ide/ide-io.c:695:3: got long volatile * Signed-off-by: Ben Dooks Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 3644f6323384..194da5a4b0d6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -871,7 +871,7 @@ struct ide_host { ide_hwif_t *cur_port; /* for hosts requiring serialization */ /* used for hosts requiring serialization */ - volatile long host_busy; + volatile unsigned long host_busy; }; #define IDE_HOST_BUSY 0 -- cgit v1.2.3 From 74d96f018673759d04d032c137d132f6447bfb1e Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 13 Jan 2009 19:27:09 -0800 Subject: byteorder: make swab.h include asm/swab.h like a regular header Add swab.h to kbuild.asm and remove the individual entries from each arch, mark as unifdef as some arches have some kernel-only bits inside. Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/Kbuild | 1 - arch/alpha/include/asm/byteorder.h | 1 - arch/arm/include/asm/Kbuild | 1 - arch/arm/include/asm/byteorder.h | 2 -- arch/avr32/include/asm/Kbuild | 1 - arch/avr32/include/asm/byteorder.h | 1 - arch/blackfin/include/asm/Kbuild | 1 - arch/blackfin/include/asm/byteorder.h | 1 - arch/cris/include/asm/Kbuild | 1 - arch/cris/include/asm/byteorder.h | 1 - arch/h8300/include/asm/Kbuild | 1 - arch/h8300/include/asm/byteorder.h | 1 - arch/ia64/include/asm/Kbuild | 1 - arch/ia64/include/asm/byteorder.h | 1 - arch/m68knommu/include/asm/Kbuild | 2 -- arch/m68knommu/include/asm/byteorder.h | 1 - arch/mips/include/asm/Kbuild | 1 - arch/mips/include/asm/byteorder.h | 2 -- arch/parisc/include/asm/Kbuild | 1 - arch/parisc/include/asm/byteorder.h | 1 - arch/powerpc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/byteorder.h | 2 -- arch/s390/include/asm/Kbuild | 1 - arch/s390/include/asm/byteorder.h | 1 - arch/sh/include/asm/Kbuild | 1 - arch/sh/include/asm/byteorder.h | 2 -- arch/sparc/include/asm/Kbuild | 1 - arch/sparc/include/asm/byteorder.h | 1 - arch/x86/include/asm/Kbuild | 1 - arch/x86/include/asm/byteorder.h | 1 - arch/xtensa/include/asm/Kbuild | 2 -- arch/xtensa/include/asm/byteorder.h | 2 -- include/asm-frv/Kbuild | 1 - include/asm-frv/byteorder.h | 1 - include/asm-generic/Kbuild.asm | 1 + include/asm-m32r/Kbuild | 1 - include/asm-m32r/byteorder.h | 2 -- include/asm-m68k/Kbuild | 1 - include/asm-m68k/byteorder.h | 1 - include/asm-mn10300/Kbuild | 1 - include/asm-mn10300/byteorder.h | 1 - include/linux/swab.h | 2 +- 42 files changed, 2 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 4dad27360576..b7c8f188b313 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -9,4 +9,3 @@ unifdef-y += console.h unifdef-y += fpu.h unifdef-y += sysinfo.h unifdef-y += compiler.h -unifdef-y += swab.h diff --git a/arch/alpha/include/asm/byteorder.h b/arch/alpha/include/asm/byteorder.h index 6772f3168701..73683093202d 100644 --- a/arch/alpha/include/asm/byteorder.h +++ b/arch/alpha/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ALPHA_BYTEORDER_H #define _ALPHA_BYTEORDER_H -#include #include #endif /* _ALPHA_BYTEORDER_H */ diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 43b0b2ba392f..73237bd130a2 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += hwcap.h -unifdef-y += swab.h diff --git a/arch/arm/include/asm/byteorder.h b/arch/arm/include/asm/byteorder.h index c02b6fc28e1a..77379748b171 100644 --- a/arch/arm/include/asm/byteorder.h +++ b/arch/arm/include/asm/byteorder.h @@ -15,8 +15,6 @@ #ifndef __ASM_ARM_BYTEORDER_H #define __ASM_ARM_BYTEORDER_H -#include - #ifdef __ARMEB__ #include #else diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 219822c8ad18..3136628ba8d2 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm -header-y += swab.h header-y += cachectl.h diff --git a/arch/avr32/include/asm/byteorder.h b/arch/avr32/include/asm/byteorder.h index 2aba64b4e122..50abc21619a8 100644 --- a/arch/avr32/include/asm/byteorder.h +++ b/arch/avr32/include/asm/byteorder.h @@ -4,7 +4,6 @@ #ifndef __ASM_AVR32_BYTEORDER_H #define __ASM_AVR32_BYTEORDER_H -#include #include #endif /* __ASM_AVR32_BYTEORDER_H */ diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index d0d1ac435544..606ecfdcc962 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += fixed_code.h -unifdef-y += swab.h diff --git a/arch/blackfin/include/asm/byteorder.h b/arch/blackfin/include/asm/byteorder.h index b9e797a497b4..3e69106a4d37 100644 --- a/arch/blackfin/include/asm/byteorder.h +++ b/arch/blackfin/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _BLACKFIN_BYTEORDER_H #define _BLACKFIN_BYTEORDER_H -#include #include #endif /* _BLACKFIN_BYTEORDER_H */ diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index b79b7c6543a6..d5b631935ec8 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -9,4 +9,3 @@ header-y += sync_serial.h unifdef-y += etraxgpio.h unifdef-y += rs485.h -unifdef-y += swab.h diff --git a/arch/cris/include/asm/byteorder.h b/arch/cris/include/asm/byteorder.h index 7678d86317ae..bcd189798e26 100644 --- a/arch/cris/include/asm/byteorder.h +++ b/arch/cris/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _CRIS_BYTEORDER_H #define _CRIS_BYTEORDER_H -#include #include #endif diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/arch/h8300/include/asm/byteorder.h b/arch/h8300/include/asm/byteorder.h index c36b80a3dd84..13539da99efd 100644 --- a/arch/h8300/include/asm/byteorder.h +++ b/arch/h8300/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _H8300_BYTEORDER_H #define _H8300_BYTEORDER_H -#include #include #endif /* _H8300_BYTEORDER_H */ diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 3b25bd9dca91..ccbe8ae47a61 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -14,4 +14,3 @@ unifdef-y += gcc_intrin.h unifdef-y += intrinsics.h unifdef-y += perfmon.h unifdef-y += ustack.h -unifdef-y += swab.h diff --git a/arch/ia64/include/asm/byteorder.h b/arch/ia64/include/asm/byteorder.h index 0f84c5cb703d..a8dd73558150 100644 --- a/arch/ia64/include/asm/byteorder.h +++ b/arch/ia64/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_BYTEORDER_H #define _ASM_IA64_BYTEORDER_H -#include #include #endif /* _ASM_IA64_BYTEORDER_H */ diff --git a/arch/m68knommu/include/asm/Kbuild b/arch/m68knommu/include/asm/Kbuild index 58c02a454130..c68e1680da01 100644 --- a/arch/m68knommu/include/asm/Kbuild +++ b/arch/m68knommu/include/asm/Kbuild @@ -1,3 +1 @@ include include/asm-generic/Kbuild.asm - -unifdef-y += swab.h diff --git a/arch/m68knommu/include/asm/byteorder.h b/arch/m68knommu/include/asm/byteorder.h index a6f0b8f7f622..9c6c76a15041 100644 --- a/arch/m68knommu/include/asm/byteorder.h +++ b/arch/m68knommu/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _M68KNOMMU_BYTEORDER_H #define _M68KNOMMU_BYTEORDER_H -#include #include #endif /* _M68KNOMMU_BYTEORDER_H */ diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 023866c0c102..7897f05e3165 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm header-y += cachectl.h sgidefs.h sysmips.h -header-y += swab.h diff --git a/arch/mips/include/asm/byteorder.h b/arch/mips/include/asm/byteorder.h index 607b71830707..9579051ff1c7 100644 --- a/arch/mips/include/asm/byteorder.h +++ b/arch/mips/include/asm/byteorder.h @@ -8,8 +8,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include - #if defined(__MIPSEB__) #include #elif defined(__MIPSEL__) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 2121d99f8364..f88b252e419c 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,4 +1,3 @@ include include/asm-generic/Kbuild.asm unifdef-y += pdc.h -unifdef-y += swab.h diff --git a/arch/parisc/include/asm/byteorder.h b/arch/parisc/include/asm/byteorder.h index da66029c4cb2..58af2c5f5d61 100644 --- a/arch/parisc/include/asm/byteorder.h +++ b/arch/parisc/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _PARISC_BYTEORDER_H #define _PARISC_BYTEORDER_H -#include #include #endif /* _PARISC_BYTEORDER_H */ diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 9268602de5d0..5ab7d7fe198c 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -35,4 +35,3 @@ unifdef-y += spu_info.h unifdef-y += termios.h unifdef-y += types.h unifdef-y += unistd.h -unifdef-y += swab.h diff --git a/arch/powerpc/include/asm/byteorder.h b/arch/powerpc/include/asm/byteorder.h index 5cca27a41532..aa6cc4fac965 100644 --- a/arch/powerpc/include/asm/byteorder.h +++ b/arch/powerpc/include/asm/byteorder.h @@ -7,8 +7,6 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - -#include #include #endif /* _ASM_POWERPC_BYTEORDER_H */ diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index f2af4167bd5f..63a23415fba6 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -13,4 +13,3 @@ unifdef-y += cmb.h unifdef-y += debug.h unifdef-y += chpid.h unifdef-y += schid.h -unifdef-y += swab.h diff --git a/arch/s390/include/asm/byteorder.h b/arch/s390/include/asm/byteorder.h index b95a2b2933fb..a332e59e26fc 100644 --- a/arch/s390/include/asm/byteorder.h +++ b/arch/s390/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _S390_BYTEORDER_H #define _S390_BYTEORDER_H -#include #include #endif /* _S390_BYTEORDER_H */ diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index f1a2a0d1c79c..43910cdf78a5 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -6,4 +6,3 @@ unifdef-y += unistd_32.h unifdef-y += unistd_64.h unifdef-y += posix_types_32.h unifdef-y += posix_types_64.h -unifdef-y += swab.h diff --git a/arch/sh/include/asm/byteorder.h b/arch/sh/include/asm/byteorder.h index e95c41a5c8cc..db2f5d7cb17d 100644 --- a/arch/sh/include/asm/byteorder.h +++ b/arch/sh/include/asm/byteorder.h @@ -1,8 +1,6 @@ #ifndef __ASM_SH_BYTEORDER_H #define __ASM_SH_BYTEORDER_H -#include - #ifdef __LITTLE_ENDIAN__ #include #else diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 95e38a43dff0..deeb0fba8029 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -17,4 +17,3 @@ header-y += traps.h header-y += uctx.h header-y += utrap.h header-y += watchdog.h -header-y += swab.h diff --git a/arch/sparc/include/asm/byteorder.h b/arch/sparc/include/asm/byteorder.h index 48a047cd6fa9..ccc1b6b7de6c 100644 --- a/arch/sparc/include/asm/byteorder.h +++ b/arch/sparc/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _SPARC_BYTEORDER_H #define _SPARC_BYTEORDER_H -#include #include #endif /* _SPARC_BYTEORDER_H */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a9f8a814a1f7..4a8e80cdcfa5 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -22,4 +22,3 @@ unifdef-y += unistd_32.h unifdef-y += unistd_64.h unifdef-y += vm86.h unifdef-y += vsyscall.h -unifdef-y += swab.h diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h index 7c49917e3d9d..b13a7a88f3eb 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/asm/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H -#include #include #endif /* _ASM_X86_BYTEORDER_H */ diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 58c02a454130..c68e1680da01 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -1,3 +1 @@ include include/asm-generic/Kbuild.asm - -unifdef-y += swab.h diff --git a/arch/xtensa/include/asm/byteorder.h b/arch/xtensa/include/asm/byteorder.h index 329b94591ca4..54eb6315349c 100644 --- a/arch/xtensa/include/asm/byteorder.h +++ b/arch/xtensa/include/asm/byteorder.h @@ -1,8 +1,6 @@ #ifndef _XTENSA_BYTEORDER_H #define _XTENSA_BYTEORDER_H -#include - #ifdef __XTENSA_EL__ #include #elif defined(__XTENSA_EB__) diff --git a/include/asm-frv/Kbuild b/include/asm-frv/Kbuild index 1f44e7c76995..0f8956def738 100644 --- a/include/asm-frv/Kbuild +++ b/include/asm-frv/Kbuild @@ -3,4 +3,3 @@ include include/asm-generic/Kbuild.asm header-y += registers.h unifdef-y += termios.h -unifdef-y += swab.h diff --git a/include/asm-frv/byteorder.h b/include/asm-frv/byteorder.h index 1187e51ecd13..f29b7593e088 100644 --- a/include/asm-frv/byteorder.h +++ b/include/asm-frv/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include #include #endif /* _ASM_BYTEORDER_H */ diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 1870d5e05f1c..70d185534b9d 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -31,6 +31,7 @@ unifdef-y += socket.h unifdef-y += sockios.h unifdef-y += stat.h unifdef-y += statfs.h +unifdef-y += swab.h unifdef-y += termbits.h unifdef-y += termios.h unifdef-y += types.h diff --git a/include/asm-m32r/Kbuild b/include/asm-m32r/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/include/asm-m32r/Kbuild +++ b/include/asm-m32r/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/include/asm-m32r/byteorder.h b/include/asm-m32r/byteorder.h index 61ff9cfd8451..21855d8b028b 100644 --- a/include/asm-m32r/byteorder.h +++ b/include/asm-m32r/byteorder.h @@ -1,8 +1,6 @@ #ifndef _ASM_M32R_BYTEORDER_H #define _ASM_M32R_BYTEORDER_H -#include - #if defined(__LITTLE_ENDIAN__) # include #else diff --git a/include/asm-m68k/Kbuild b/include/asm-m68k/Kbuild index 52fd96b4142a..1a922fad76f7 100644 --- a/include/asm-m68k/Kbuild +++ b/include/asm-m68k/Kbuild @@ -1,3 +1,2 @@ include include/asm-generic/Kbuild.asm header-y += cachectl.h -unifdef-y += swab.h diff --git a/include/asm-m68k/byteorder.h b/include/asm-m68k/byteorder.h index 300866523b86..31b260a88803 100644 --- a/include/asm-m68k/byteorder.h +++ b/include/asm-m68k/byteorder.h @@ -1,7 +1,6 @@ #ifndef _M68K_BYTEORDER_H #define _M68K_BYTEORDER_H -#include #include #endif /* _M68K_BYTEORDER_H */ diff --git a/include/asm-mn10300/Kbuild b/include/asm-mn10300/Kbuild index 27b108a86b39..c68e1680da01 100644 --- a/include/asm-mn10300/Kbuild +++ b/include/asm-mn10300/Kbuild @@ -1,2 +1 @@ include include/asm-generic/Kbuild.asm -unifdef-y += swab.h diff --git a/include/asm-mn10300/byteorder.h b/include/asm-mn10300/byteorder.h index 45b18ded19e6..5dd0bdd9feee 100644 --- a/include/asm-mn10300/byteorder.h +++ b/include/asm-mn10300/byteorder.h @@ -1,7 +1,6 @@ #ifndef _ASM_BYTEORDER_H #define _ASM_BYTEORDER_H -#include #include #endif /* _ASM_BYTEORDER_H */ diff --git a/include/linux/swab.h b/include/linux/swab.h index be5284d4a053..ea0c02fd5163 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -3,7 +3,7 @@ #include #include -#include +#include /* * casts are necessary for constants, because we never know how for sure -- cgit v1.2.3 From 937f1ba56b4be37d9e2ad77412f95048662058d2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 14 Jan 2009 21:05:05 -0800 Subject: net: Add init_dummy_netdev() and fix EMAC driver using it This adds an init_dummy_netdev() function that gets a network device structure (allocation and lifetime entirely under caller's control) and initialize the minimum amount of fields so it can be used to schedule NAPI polls without registering a full blown interface. This is to be used by drivers that need to tie several hardware interfaces to a single NAPI poll scheduler due to HW limitations. It also updates the ibm_newemac driver to use that, this fixing the oops on 2.6.29 due to passing NULL as "dev" to netif_napi_add() Symbol is exported GPL only a I don't think we want binary drivers doing that sort of acrobatics (if we want them at all). Signed-off-by: Benjamin Herrenschmidt Tested-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- drivers/net/ibm_newemac/mal.c | 4 +++- drivers/net/ibm_newemac/mal.h | 2 ++ include/linux/netdevice.h | 3 +++ net/core/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ibm_newemac/mal.c b/drivers/net/ibm_newemac/mal.c index ecf9798987fa..2a2fc17b2878 100644 --- a/drivers/net/ibm_newemac/mal.c +++ b/drivers/net/ibm_newemac/mal.c @@ -613,7 +613,9 @@ static int __devinit mal_probe(struct of_device *ofdev, INIT_LIST_HEAD(&mal->list); spin_lock_init(&mal->lock); - netif_napi_add(NULL, &mal->napi, mal_poll, + init_dummy_netdev(&mal->dummy_dev); + + netif_napi_add(&mal->dummy_dev, &mal->napi, mal_poll, CONFIG_IBM_NEW_EMAC_POLL_WEIGHT); /* Load power-on reset defaults */ diff --git a/drivers/net/ibm_newemac/mal.h b/drivers/net/ibm_newemac/mal.h index 2f0a87360844..9ededfbf0726 100644 --- a/drivers/net/ibm_newemac/mal.h +++ b/drivers/net/ibm_newemac/mal.h @@ -214,6 +214,8 @@ struct mal_instance { int index; spinlock_t lock; + struct net_device dummy_dev; + unsigned int features; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4647604c7ca9..ec54785d34f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -795,6 +795,7 @@ struct net_device NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ } reg_state; /* Called from unregister, can be used to call free_netdev */ @@ -1077,6 +1078,8 @@ extern void free_netdev(struct net_device *dev); extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); +extern int init_dummy_netdev(struct net_device *dev); + extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); extern struct net_device *__dev_get_by_index(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 60377b6c0a80..8d675975d85b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4430,6 +4430,45 @@ err_uninit: goto out; } +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* initialize the ref count */ + atomic_set(&dev->refcnt, 1); + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + /** * register_netdev - register a network device * @dev: device to register -- cgit v1.2.3 From 45ce80fb6b6f9594d1396d44dd7e7c02d596fef8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:50:59 -0800 Subject: cgroups: consolidate cgroup documents Move Documentation/cpusets.txt and Documentation/controllers/* to Documentation/cgroups/ Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 5 +- Documentation/cgroups/cpuacct.txt | 32 + Documentation/cgroups/cpusets.txt | 808 +++++++++++++++++++++++++ Documentation/cgroups/devices.txt | 52 ++ Documentation/cgroups/memcg_test.txt | 342 +++++++++++ Documentation/cgroups/memory.txt | 399 ++++++++++++ Documentation/cgroups/resource_counter.txt | 181 ++++++ Documentation/controllers/cpuacct.txt | 32 - Documentation/controllers/devices.txt | 52 -- Documentation/controllers/memcg_test.txt | 342 ----------- Documentation/controllers/memory.txt | 399 ------------ Documentation/controllers/resource_counter.txt | 181 ------ Documentation/cpusets.txt | 808 ------------------------- Documentation/scheduler/sched-design-CFS.txt | 2 +- include/linux/res_counter.h | 2 +- init/Kconfig | 9 +- kernel/cpuset.c | 2 +- 17 files changed, 1824 insertions(+), 1824 deletions(-) create mode 100644 Documentation/cgroups/cpuacct.txt create mode 100644 Documentation/cgroups/cpusets.txt create mode 100644 Documentation/cgroups/devices.txt create mode 100644 Documentation/cgroups/memcg_test.txt create mode 100644 Documentation/cgroups/memory.txt create mode 100644 Documentation/cgroups/resource_counter.txt delete mode 100644 Documentation/controllers/cpuacct.txt delete mode 100644 Documentation/controllers/devices.txt delete mode 100644 Documentation/controllers/memcg_test.txt delete mode 100644 Documentation/controllers/memory.txt delete mode 100644 Documentation/controllers/resource_counter.txt delete mode 100644 Documentation/cpusets.txt (limited to 'include/linux') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index e33ee74eee77..d9e5d6f41b92 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -1,7 +1,8 @@ CGROUPS ------- -Written by Paul Menage based on Documentation/cpusets.txt +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. @@ -68,7 +69,7 @@ On their own, the only use for cgroups is for simple job tracking. The intention is that other subsystems hook into the generic cgroup support to provide new attributes for cgroups, such as accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cpusets.txt) allows +access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allows you to associate a set of CPUs and a set of memory nodes with the tasks in each cgroup. diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt new file mode 100644 index 000000000000..bb775fbe43d7 --- /dev/null +++ b/Documentation/cgroups/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt new file mode 100644 index 000000000000..5c86c258c791 --- /dev/null +++ b/Documentation/cgroups/cpusets.txt @@ -0,0 +1,808 @@ + CPUSETS + ------- + +Copyright (C) 2004 BULL SA. +Written by Simon.Derr@bull.net + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a tasks current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that tasks cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that tasks cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendents) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that tasks cpuset. + - in sched.c migrate_all_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that tasks cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the tasks cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpus: list of CPUs in that cpuset + - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpusets nodes + - cpu_exclusive flag: is cpu placement exclusive? + - mem_exclusive flag: is memory placement exclusive? + - mem_hardwall flag: is memory allocation hardwalled + - memory_pressure: measure of how much paging pressure in cpuset + +In addition, the root cpuset only has the following file: + - memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_map using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendent, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'memory_spread_page' and +'memory_spread_slab'. + +If the per-cpuset boolean flag file 'memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the tasks NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the tasks NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing tasks memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'memory_spread_page' turns on a per-process flag +PF_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PF_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'memory_spread_slab' turns on the flag +PF_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current tasks mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +everytime. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searchs all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + + -1 : no request. use system default or follow request of others. + 0 : no search. + 1 : search siblings (hyperthreads in a core). + 2 : search cores in a package. + 3 : search cpus in a node [= system wide on non-NUMA system] + ( 4 : search nodes in a chunk of node [on NUMA system] ) + ( 5 : search system wide [on NUMA system] ) + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'sched_load_balance' of a cpuset +is disabled, then 'sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not will be depend on your situation. +Don't modify this file if you are not sure. + +If your situation is: + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. +then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the tasks cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its numa placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the tasks +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +but the processor placement is not updated, until that tasks pid is +rewritten to the 'tasks' file of its cpuset. This is done to avoid +impacting the scheduler code in the kernel with a check for changes +in a tasks processor placement. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the tasks new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'memory_migrate' is set true, then if that cpusets +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the tasks prior cpuset, or in the cpusets +prior 'mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current tasks cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /dev/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /dev/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /dev/cpuset + cd /dev/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +In the future, a C library interface to cpusets will likely be +available. For now, the only way to query or modify cpusets is +via the cpuset file system, using the various cd, mkdir, echo, cat, +rmdir commands from the shell, or their equivalent from C. + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /dev/cpuset + +Then under /dev/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /dev/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /dev/cpuset: +# cd /dev/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. +# cd my_cpuset + +In this directory you can find several files: +# ls +cpu_exclusive memory_migrate mems tasks +cpus memory_pressure notify_on_release +mem_exclusive memory_spread_page sched_load_balance +mem_hardwall memory_spread_slab sched_relax_domain_level + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpus + +Add some mems: +# /bin/echo 0-7 > mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' +# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt new file mode 100644 index 000000000000..7cc6e6a60672 --- /dev/null +++ b/Documentation/cgroups/devices.txt @@ -0,0 +1,52 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /cgroups/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt new file mode 100644 index 000000000000..19533f93b7a2 --- /dev/null +++ b/Documentation/cgroups/memcg_test.txt @@ -0,0 +1,342 @@ +Memory Resource Controller(Memcg) Implementation Memo. +Last Updated: 2008/12/15 +Base Kernel Version: based on 2.6.28-rc8-mm. + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroups/memory.txt) + +0. How to record usage ? + 2 objects are used. + + page_cgroup ....an object per page. + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_newpage_charge() + Called at new page fault and Copy-On-Write. + + mem_cgroup_try_charge_swapin() + Called at do_swap_page() (page fault on swap entry) and swapoff. + Followed by charge-commit-cancel protocol. (With swap accounting) + At commit, a charge recorded in swap_cgroup is removed. + + mem_cgroup_cache_charge() + Called at add_to_page_cache() + + mem_cgroup_cache_charge_swapin() + Called at shmem's swapin. + + mem_cgroup_prepare_migration() + Called before migration. "extra" charge is done and followed by + charge-commit-cancel protocol. + At commit, charge against oldpage or newpage will be committed. + +2. Uncharge + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge_page() + Called when an anonymous page is fully unmapped. I.e., mapcount goes + to 0. If the page is SwapCache, uncharge is delayed until + mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_cache_page() + Called when a page-cache is deleted from radix-tree. If the page is + SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_swapcache() + Called when SwapCache is removed from radix-tree. The charge itself + is moved to swap_cgroup. (If mem+swap controller is disabled, no + charge to swap occurs.) + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + + mem_cgroup_end_migration(old, new) + At success of migration old is uncharged (if necessary), a charge + to new page is committed. At failure, charge to old page is committed. + +3. charge-commit-cancel + In some case, we can't know this "charge" is valid or not at charging + (because of races). + To handle such case, there are charge-commit-cancel functions. + mem_cgroup_try_charge_XXX + mem_cgroup_commit_charge_XXX + mem_cgroup_cancel_charge_XXX + these are used in swap-in and migration. + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the function checks the page should be charged or not + and set flags or avoid charging.(usage -= PAGE_SIZE) + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + It is charged right after it's allocated before doing any page table + related operations. Of course, it's uncharged when another page is used + for the fault address. + + At freeing anonymous page (by exit() or munmap()), zap_pte() is called + and pages for ptes are freed one by one.(see mm/memory.c). Uncharges + are done at page_remove_rmap() when page_mapcount() goes down to 0. + + Another page freeing is by page-reclaim (vmscan.c) and anonymous + pages are swapped out. In this case, the page is marked as + PageSwapCache(). uncharge() routine doesn't uncharge the page marked + as SwapCache(). It's delayed until __delete_from_swap_cache(). + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + This swap-in is one of the most complicated work. In do_swap_page(), + following events occur when pte is unchanged. + + (1) the page (SwapCache) is looked up. + (2) lock_page() + (3) try_charge_swapin() + (4) reuse_swap_page() (may call delete_swap_cache()) + (5) commit_charge_swapin() + (6) swap_free(). + + Considering following situation for example. + + (A) The page has not been charged before (2) and reuse_swap_page() + doesn't call delete_from_swap_cache(). + (B) The page has not been charged before (2) and reuse_swap_page() + calls delete_from_swap_cache(). + (C) The page has been charged before (2) and reuse_swap_page() doesn't + call delete_from_swap_cache(). + (D) The page has been charged before (2) and reuse_swap_page() calls + delete_from_swap_cache(). + + memory.usage/memsw.usage changes to this page/swp_entry will be + Case (A) (B) (C) (D) + Event + Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 + =========================================== + (3) +1/+1 +1/+1 +1/+1 +1/+1 + (4) - 0/ 0 - -1/ 0 + (5) 0/-1 0/ 0 -1/-1 0/ 0 + (6) - 0/-1 - 0/-1 + =========================================== + Result 1/ 1 1/ 1 1/ 1 1/ 1 + + In any cases, charges to this page should be 1/ 1. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + At (b), the page is marked as SwapCache and not uncharged. + At (d), the page is removed from SwapCache and a charge in page_cgroup + is moved to swap_cgroup. + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + Here, a charge in swap_cgroup disappears. + +5. Page Cache + Page Cache is charged at + - add_to_page_cache_locked(). + + uncharged at + - __remove_from_page_cache(). + + The logic is very clear. (About migration, see below) + Note: __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache + Memcg's charge/uncharge have special handlers of shmem. The best way + to understand shmem's page state transition is to read mm/shmem.c. + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + It's uncharged when + - A page is removed from radix-tree and not SwapCache. + - When SwapCache is removed, a charge is moved to swap_cgroup. + - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup + disappears. + +7. Page Migration + One of the most complicated functions is page-migration-handler. + Memcg has 2 routines. Assume that we are migrating a page's contents + from OLDPAGE to NEWPAGE. + + Usual migration logic is.. + (a) remove the page from LRU. + (b) allocate NEWPAGE (migration target) + (c) lock by lock_page(). + (d) unmap all mappings. + (e-1) If necessary, replace entry in radix-tree. + (e-2) move contents of a page. + (f) map all mappings again. + (g) pushback the page to LRU. + (-) OLDPAGE will be freed. + + Before (g), memcg should complete all necessary charge/uncharge to + NEWPAGE/OLDPAGE. + + The point is.... + - If OLDPAGE is anonymous, all charges will be dropped at (d) because + try_to_unmap() drops all mapcount and the page will not be + SwapCache. + + - If OLDPAGE is SwapCache, charges will be kept at (g) because + __delete_from_swap_cache() isn't called at (e-1) + + - If OLDPAGE is page-cache, charges will be kept at (g) because + remove_from_swap_cache() isn't called at (e-1) + + memcg provides following hooks. + + - mem_cgroup_prepare_migration(OLDPAGE) + Called after (b) to account a charge (usage += PAGE_SIZE) against + memcg which OLDPAGE belongs to. + + - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) + Called after (f) before (g). + If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already + charged, a charge by prepare_migration() is automatically canceled. + If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. + + But zap_pte() (by exit or munmap) can be called while migration, + we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). + +8. LRU + Each memcg has its own private LRU. Now, it's handling is under global + VM's control (means that it's handled under global zone->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under zone->lru_lock(). + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. + + Tests for racy cases. + + 9.1 Small limit to memcg. + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + + 9.2 Shmem + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + + 9.3 Migration + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration. + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset. + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + 9.4 Memory hotplug. + memory hotplug test is one of good test. + to offline memory, do following. + # echo offline > /sys/devices/system/memory/memoryXXX/state + (XXX is the place of memory) + This is an easy way to test page migration, too. + + 9.5 mkdir/rmdir + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following. + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running. + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + + 9.6 Mount with other subsystems. + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example) + # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt new file mode 100644 index 000000000000..e1501964df1e --- /dev/null +++ b/Documentation/cgroups/memory.txt @@ -0,0 +1,399 @@ +Memory Resource Controller + +NOTE: The Memory Resource Controller has been generically been referred +to as the memory controller in this document. Do not confuse memory controller +used here with the memory controller that is used in hardware. + +Salient features + +a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages +b. The infrastructure allows easy addition of other types of memory to control +c. Provides *zero overhead* for non memory controller users +d. Provides a double LRU: global memory pressure causes reclaim from the + global LRU; a cgroup on hitting a limit, reclaims from the per + cgroup LRU + +NOTE: Swap Cache (unmapped) is not accounted now. + +Benefits and Purpose of the memory controller + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with limited amount of memory, this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases, find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the res_counter. The res_counter +tracks the current memory usage and limit of the group of processes associated +with the controller. Each cgroup has a memory controller specific data +structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (res_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge() is invoked to setup +the necessary data structures and check if the cgroup that is being charged +is over its limit. If it is then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +allocated and associated with the page. This routine also adds the page to +the per cgroup LRU. + +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +(some pages which never be reclaimable and will not be on global LRU + are not accounted. we just accounts pages under usual vm management.) + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +A RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. + +At page migration, accounting information is kept. + +Note: we just account pages-on-lru because our purpose is to control amount +of used pages. not-on-lru pages are tend to be out-of-control from vm view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + + +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +global LRU, mem+swap limit is better than just limiting swap from OS point +of view. + +2.5 Reclaim + +Each cgroup maintains a per cgroup LRU that consists of an active +and inactive list. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per cgroup LRU +list. + +2. Locking + +The memory controller uses the following hierarchy + +1. zone->lru_lock is used for selecting pages to be isolated +2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) +3. lock_page_cgroup() is used to protect page->page_cgroup + +3. User Interface + +0. Configuration + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_RESOURCE_COUNTERS +c. Enable CONFIG_CGROUP_MEM_RES_CTLR + +1. Prepare the cgroups +# mkdir -p /cgroups +# mount -t cgroup none /cgroups -o memory + +2. Make the new group and move bash into it +# mkdir /cgroups/0 +# echo $$ > /cgroups/0/tasks + +Since now we're in the 0 cgroup, +We can alter the memory limit: +# echo 4M > /cgroups/0/memory.limit_in_bytes + +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, +mega or gigabytes. + +# cat /cgroups/0/memory.limit_in_bytes +4194304 + +NOTE: The interface has now changed to display the usage in bytes +instead of pages + +We can check the usage: +# cat /cgroups/0/memory.usage_in_bytes +1216512 + +A successful write to this file does not guarantee a successful set of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel. + +# echo 1 > memory.limit_in_bytes +# cat memory.limit_in_bytes +4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing + +Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. +Apart from that v6 has been tested with several applications and regular +daily use. The controller has also been tested on the PPC64, x86_64 and +UML platforms. + +4.1 Troubleshooting + +Sometimes a user might find that the application under a cgroup is +terminated. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +4.2 Task migration + +When a task migrates from one cgroup to another, it's charge is not +carried forward. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +4.3 Removing a cgroup + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. +Such charges are freed(at default) or moved to its parent. When moved, +both of RSS and CACHES are moved to parent. +If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + + +5. Misc. interfaces. + +5.1 force_empty + memory.force_empty interface is provided to make cgroup's memory usage empty. + You can use this interface only when the cgroup has no tasks. + When writing anything to this + + # echo 0 > memory.force_empty + + Almost all pages tracked by this memcg will be unmapped and freed. Some of + pages cannot be freed because it's locked or in-use. Such pages are moved + to parent and this cgroup will be empty. But this may return -EBUSY in + some too busy case. + + Typical use case of this interface is that calling this before rmdir(). + Because rmdir() moves all pages to parent, some out-of-use page caches can be + moved to the parent. If you want to avoid that, force_empty will be useful. + +5.2 stat file + memory.stat file includes following statistics (now) + cache - # of pages from page-cache and shmem. + rss - # of pages from anonymous memory. + pgpgin - # of event of charging + pgpgout - # of event of uncharging + active_anon - # of pages on active lru of anon, shmem. + inactive_anon - # of pages on active lru of anon, shmem + active_file - # of pages on active lru of file-cache + inactive_file - # of pages on inactive lru of file cache + unevictable - # of pages cannot be reclaimed.(mlocked etc) + + Below is depend on CONFIG_DEBUG_VM. + inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) + recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + + Memo: + recent_rotated means recent frequency of lru rotation. + recent_scanned means recent # of scans to lru. + showing for better debug please see the code for meanings. + + +5.3 swappiness + Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. + + Following cgroup's swapiness can't be changed. + - root cgroup (uses /proc/sys/vm/swappiness). + - a cgroup which uses hierarchy and it has child cgroup. + - a cgroup which uses hierarchy and not the root of hierarchy. + + +6. Hierarchy support + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim + +The memory controller by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup + +# echo 1 > memory.use_hierarchy + +The feature can be disabled by + +# echo 0 > memory.use_hierarchy + +NOTE1: Enabling/disabling will fail if the cgroup already has other +cgroups created below it. + +NOTE2: This feature can be enabled/disabled per subtree. + +7. TODO + +1. Add support for accounting huge pages (as a separate controller) +2. Make per-cgroup scanner reclaim not-shared pages first +3. Teach controller to account for shared-pages +4. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt new file mode 100644 index 000000000000..f196ac1d7d25 --- /dev/null +++ b/Documentation/cgroups/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + c. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + This is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_ + max_usage max_usage_in_ + limit limit_in_ + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + c. Compile and run :) diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt deleted file mode 100644 index bb775fbe43d7..000000000000 --- a/Documentation/controllers/cpuacct.txt +++ /dev/null @@ -1,32 +0,0 @@ -CPU Accounting Controller -------------------------- - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem. - -# mkdir /cgroups -# mount -t cgroup -ocpuacct none /cgroups - -With the above step, the initial or the parent accounting group -becomes visible at /cgroups. At bootup, this group includes all the -tasks in the system. /cgroups/tasks lists the tasks in this cgroup. -/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by -this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /cgroups. - -# cd /cgroups -# mkdir g1 -# echo $$ > g1 - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/cgroups/cpuacct.usage also. diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt deleted file mode 100644 index 7cc6e6a60672..000000000000 --- a/Documentation/controllers/devices.txt +++ /dev/null @@ -1,52 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. However -when a device access is removed from a parent it will not also be -removed from the child(ren). - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /cgroups/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /cgroups/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /cgroups/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendent of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. diff --git a/Documentation/controllers/memcg_test.txt b/Documentation/controllers/memcg_test.txt deleted file mode 100644 index 08d4d3ea0d79..000000000000 --- a/Documentation/controllers/memcg_test.txt +++ /dev/null @@ -1,342 +0,0 @@ -Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2008/12/15 -Base Kernel Version: based on 2.6.28-rc8-mm. - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/controllers/memory.txt) - -0. How to record usage ? - 2 objects are used. - - page_cgroup ....an object per page. - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_newpage_charge() - Called at new page fault and Copy-On-Write. - - mem_cgroup_try_charge_swapin() - Called at do_swap_page() (page fault on swap entry) and swapoff. - Followed by charge-commit-cancel protocol. (With swap accounting) - At commit, a charge recorded in swap_cgroup is removed. - - mem_cgroup_cache_charge() - Called at add_to_page_cache() - - mem_cgroup_cache_charge_swapin() - Called at shmem's swapin. - - mem_cgroup_prepare_migration() - Called before migration. "extra" charge is done and followed by - charge-commit-cancel protocol. - At commit, charge against oldpage or newpage will be committed. - -2. Uncharge - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge_page() - Called when an anonymous page is fully unmapped. I.e., mapcount goes - to 0. If the page is SwapCache, uncharge is delayed until - mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_cache_page() - Called when a page-cache is deleted from radix-tree. If the page is - SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_swapcache() - Called when SwapCache is removed from radix-tree. The charge itself - is moved to swap_cgroup. (If mem+swap controller is disabled, no - charge to swap occurs.) - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - - mem_cgroup_end_migration(old, new) - At success of migration old is uncharged (if necessary), a charge - to new page is committed. At failure, charge to old page is committed. - -3. charge-commit-cancel - In some case, we can't know this "charge" is valid or not at charging - (because of races). - To handle such case, there are charge-commit-cancel functions. - mem_cgroup_try_charge_XXX - mem_cgroup_commit_charge_XXX - mem_cgroup_cancel_charge_XXX - these are used in swap-in and migration. - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the function checks the page should be charged or not - and set flags or avoid charging.(usage -= PAGE_SIZE) - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - It is charged right after it's allocated before doing any page table - related operations. Of course, it's uncharged when another page is used - for the fault address. - - At freeing anonymous page (by exit() or munmap()), zap_pte() is called - and pages for ptes are freed one by one.(see mm/memory.c). Uncharges - are done at page_remove_rmap() when page_mapcount() goes down to 0. - - Another page freeing is by page-reclaim (vmscan.c) and anonymous - pages are swapped out. In this case, the page is marked as - PageSwapCache(). uncharge() routine doesn't uncharge the page marked - as SwapCache(). It's delayed until __delete_from_swap_cache(). - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - This swap-in is one of the most complicated work. In do_swap_page(), - following events occur when pte is unchanged. - - (1) the page (SwapCache) is looked up. - (2) lock_page() - (3) try_charge_swapin() - (4) reuse_swap_page() (may call delete_swap_cache()) - (5) commit_charge_swapin() - (6) swap_free(). - - Considering following situation for example. - - (A) The page has not been charged before (2) and reuse_swap_page() - doesn't call delete_from_swap_cache(). - (B) The page has not been charged before (2) and reuse_swap_page() - calls delete_from_swap_cache(). - (C) The page has been charged before (2) and reuse_swap_page() doesn't - call delete_from_swap_cache(). - (D) The page has been charged before (2) and reuse_swap_page() calls - delete_from_swap_cache(). - - memory.usage/memsw.usage changes to this page/swp_entry will be - Case (A) (B) (C) (D) - Event - Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 - =========================================== - (3) +1/+1 +1/+1 +1/+1 +1/+1 - (4) - 0/ 0 - -1/ 0 - (5) 0/-1 0/ 0 -1/-1 0/ 0 - (6) - 0/-1 - 0/-1 - =========================================== - Result 1/ 1 1/ 1 1/ 1 1/ 1 - - In any cases, charges to this page should be 1/ 1. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - At (b), the page is marked as SwapCache and not uncharged. - At (d), the page is removed from SwapCache and a charge in page_cgroup - is moved to swap_cgroup. - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - Here, a charge in swap_cgroup disappears. - -5. Page Cache - Page Cache is charged at - - add_to_page_cache_locked(). - - uncharged at - - __remove_from_page_cache(). - - The logic is very clear. (About migration, see below) - Note: __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache - Memcg's charge/uncharge have special handlers of shmem. The best way - to understand shmem's page state transition is to read mm/shmem.c. - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - It's uncharged when - - A page is removed from radix-tree and not SwapCache. - - When SwapCache is removed, a charge is moved to swap_cgroup. - - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup - disappears. - -7. Page Migration - One of the most complicated functions is page-migration-handler. - Memcg has 2 routines. Assume that we are migrating a page's contents - from OLDPAGE to NEWPAGE. - - Usual migration logic is.. - (a) remove the page from LRU. - (b) allocate NEWPAGE (migration target) - (c) lock by lock_page(). - (d) unmap all mappings. - (e-1) If necessary, replace entry in radix-tree. - (e-2) move contents of a page. - (f) map all mappings again. - (g) pushback the page to LRU. - (-) OLDPAGE will be freed. - - Before (g), memcg should complete all necessary charge/uncharge to - NEWPAGE/OLDPAGE. - - The point is.... - - If OLDPAGE is anonymous, all charges will be dropped at (d) because - try_to_unmap() drops all mapcount and the page will not be - SwapCache. - - - If OLDPAGE is SwapCache, charges will be kept at (g) because - __delete_from_swap_cache() isn't called at (e-1) - - - If OLDPAGE is page-cache, charges will be kept at (g) because - remove_from_swap_cache() isn't called at (e-1) - - memcg provides following hooks. - - - mem_cgroup_prepare_migration(OLDPAGE) - Called after (b) to account a charge (usage += PAGE_SIZE) against - memcg which OLDPAGE belongs to. - - - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) - Called after (f) before (g). - If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already - charged, a charge by prepare_migration() is automatically canceled. - If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. - - But zap_pte() (by exit or munmap) can be called while migration, - we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). - -8. LRU - Each memcg has its own private LRU. Now, it's handling is under global - VM's control (means that it's handled under global zone->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone->lru_lock(). - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. - - Tests for racy cases. - - 9.1 Small limit to memcg. - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - - 9.2 Shmem - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - - 9.3 Migration - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration. - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset. - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - 9.4 Memory hotplug. - memory hotplug test is one of good test. - to offline memory, do following. - # echo offline > /sys/devices/system/memory/memoryXXX/state - (XXX is the place of memory) - This is an easy way to test page migration, too. - - 9.5 mkdir/rmdir - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following. - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt deleted file mode 100644 index e1501964df1e..000000000000 --- a/Documentation/controllers/memory.txt +++ /dev/null @@ -1,399 +0,0 @@ -Memory Resource Controller - -NOTE: The Memory Resource Controller has been generically been referred -to as the memory controller in this document. Do not confuse memory controller -used here with the memory controller that is used in hardware. - -Salient features - -a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages -b. The infrastructure allows easy addition of other types of memory to control -c. Provides *zero overhead* for non memory controller users -d. Provides a double LRU: global memory pressure causes reclaim from the - global LRU; a cgroup on hitting a limit, reclaims from the per - cgroup LRU - -NOTE: Swap Cache (unmapped) is not accounted now. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with limited amount of memory, this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases, find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the res_counter. The res_counter -tracks the current memory usage and limit of the group of processes associated -with the controller. Each cgroup has a memory controller specific data -structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (res_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge() is invoked to setup -the necessary data structures and check if the cgroup that is being charged -is over its limit. If it is then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -allocated and associated with the page. This routine also adds the page to -the per cgroup LRU. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -(some pages which never be reclaimable and will not be on global LRU - are not accounted. we just accounts pages under usual vm management.) - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -A RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. - -At page migration, accounting information is kept. - -Note: we just account pages-on-lru because our purpose is to control amount -of used pages. not-on-lru pages are tend to be out-of-control from vm view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - - -2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -usage of mem+swap is limited by memsw.limit_in_bytes. - -Note: why 'mem+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -mem+swap. - -In other words, when we want to limit the usage of swap without affecting -global LRU, mem+swap limit is better than just limiting swap from OS point -of view. - -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU that consists of an active -and inactive list. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per cgroup LRU -list. - -2. Locking - -The memory controller uses the following hierarchy - -1. zone->lru_lock is used for selecting pages to be isolated -2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) -3. lock_page_cgroup() is used to protect page->page_cgroup - -3. User Interface - -0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_CGROUP_MEM_RES_CTLR - -1. Prepare the cgroups -# mkdir -p /cgroups -# mount -t cgroup none /cgroups -o memory - -2. Make the new group and move bash into it -# mkdir /cgroups/0 -# echo $$ > /cgroups/0/tasks - -Since now we're in the 0 cgroup, -We can alter the memory limit: -# echo 4M > /cgroups/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. - -# cat /cgroups/0/memory.limit_in_bytes -4194304 - -NOTE: The interface has now changed to display the usage in bytes -instead of pages - -We can check the usage: -# cat /cgroups/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful set of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. -Apart from that v6 has been tested with several applications and regular -daily use. The controller has also been tested on the PPC64, x86_64 and -UML platforms. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -4.2 Task migration - -When a task migrates from one cgroup to another, it's charge is not -carried forward. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. -Such charges are freed(at default) or moved to its parent. When moved, -both of RSS and CACHES are moved to parent. -If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. - You can use this interface only when the cgroup has no tasks. - When writing anything to this - - # echo 0 > memory.force_empty - - Almost all pages tracked by this memcg will be unmapped and freed. Some of - pages cannot be freed because it's locked or in-use. Such pages are moved - to parent and this cgroup will be empty. But this may return -EBUSY in - some too busy case. - - Typical use case of this interface is that calling this before rmdir(). - Because rmdir() moves all pages to parent, some out-of-use page caches can be - moved to the parent. If you want to avoid that, force_empty will be useful. - -5.2 stat file - memory.stat file includes following statistics (now) - cache - # of pages from page-cache and shmem. - rss - # of pages from anonymous memory. - pgpgin - # of event of charging - pgpgout - # of event of uncharging - active_anon - # of pages on active lru of anon, shmem. - inactive_anon - # of pages on active lru of anon, shmem - active_file - # of pages on active lru of file-cache - inactive_file - # of pages on inactive lru of file cache - unevictable - # of pages cannot be reclaimed.(mlocked etc) - - Below is depend on CONFIG_DEBUG_VM. - inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) - recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) - recent_rotated_file - VM internal parameter. (see mm/vmscan.c) - recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) - recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - - Memo: - recent_rotated means recent frequency of lru rotation. - recent_scanned means recent # of scans to lru. - showing for better debug please see the code for meanings. - - -5.3 swappiness - Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. - - Following cgroup's swapiness can't be changed. - - root cgroup (uses /proc/sys/vm/swappiness). - - a cgroup which uses hierarchy and it has child cgroup. - - a cgroup which uses hierarchy and not the root of hierarchy. - - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -The memory controller by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if the cgroup already has other -cgroups created below it. - -NOTE2: This feature can be enabled/disabled per subtree. - -7. TODO - -1. Add support for accounting huge pages (as a separate controller) -2. Make per-cgroup scanner reclaim not-shared pages first -3. Teach controller to account for shared-pages -4. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt deleted file mode 100644 index f196ac1d7d25..000000000000 --- a/Documentation/controllers/resource_counter.txt +++ /dev/null @@ -1,181 +0,0 @@ - - The Resource Counter - -The resource counter, declared at include/linux/res_counter.h, -is supposed to facilitate the resource management by controllers -by providing common stuff for accounting. - -This "stuff" includes the res_counter structure and routines -to work with it. - - - -1. Crucial parts of the res_counter structure - - a. unsigned long long usage - - The usage value shows the amount of a resource that is consumed - by a group at a given time. The units of measurement should be - determined by the controller that uses this counter. E.g. it can - be bytes, items or any other unit the controller operates on. - - b. unsigned long long max_usage - - The maximal value of the usage over time. - - This value is useful when gathering statistical information about - the particular group, as it shows the actual resource requirements - for a particular group, not just some usage snapshot. - - c. unsigned long long limit - - The maximal allowed amount of resource to consume by the group. In - case the group requests for more resources, so that the usage value - would exceed the limit, the resource allocation is rejected (see - the next section). - - d. unsigned long long failcnt - - The failcnt stands for "failures counter". This is the number of - resource allocation attempts that failed. - - c. spinlock_t lock - - Protects changes of the above values. - - - -2. Basic accounting routines - - a. void res_counter_init(struct res_counter *rc) - - Initializes the resource counter. As usual, should be the first - routine called for a new counter. - - b. int res_counter_charge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is about to be allocated it has to be accounted - with the appropriate resource counter (controller should determine - which one to use on its own). This operation is called "charging". - - This is not very important which operation - resource allocation - or charging - is performed first, but - * if the allocation is performed first, this may create a - temporary resource over-usage by the time resource counter is - charged; - * if the charging is performed first, then it should be uncharged - on error path (if the one is called). - - c. void res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is released (freed) it should be de-accounted - from the resource counter it was accounted to. This is called - "uncharging". - - The _locked routines imply that the res_counter->lock is taken. - - - 2.1 Other accounting routines - - There are more routines that may help you with common needs, like - checking whether the limit is reached or resetting the max_usage - value. They are all declared in include/linux/res_counter.h. - - - -3. Analyzing the resource counter registrations - - a. If the failcnt value constantly grows, this means that the counter's - limit is too tight. Either the group is misbehaving and consumes too - many resources, or the configuration is not suitable for the group - and the limit should be increased. - - b. The max_usage value can be used to quickly tune the group. One may - set the limits to maximal values and either load the container with - a common pattern or leave one for a while. After this the max_usage - value shows the amount of memory the container would require during - its common activity. - - Setting the limit a bit above this value gives a pretty good - configuration that works in most of the cases. - - c. If the max_usage is much less than the limit, but the failcnt value - is growing, then the group tries to allocate a big chunk of resource - at once. - - d. If the max_usage is much less than the limit, but the failcnt value - is 0, then this group is given too high limit, that it does not - require. It is better to lower the limit a bit leaving more resource - for other groups. - - - -4. Communication with the control groups subsystem (cgroups) - -All the resource controllers that are using cgroups and resource counters -should provide files (in the cgroup filesystem) to work with the resource -counter fields. They are recommended to adhere to the following rules: - - a. File names - - Field name File name - --------------------------------------------------- - usage usage_in_ - max_usage max_usage_in_ - limit limit_in_ - failcnt failcnt - lock no file :) - - b. Reading from file should show the corresponding field value in the - appropriate format. - - c. Writing to file - - Field Expected behavior - ---------------------------------- - usage prohibited - max_usage reset to usage - limit set the limit - failcnt reset to zero - - - -5. Usage example - - a. Declare a task group (take a look at cgroups subsystem for this) and - fold a res_counter into it - - struct my_group { - struct res_counter res; - - - } - - b. Put hooks in resource allocation/release paths - - int alloc_something(...) - { - if (res_counter_charge(res_counter_ptr, amount) < 0) - return -ENOMEM; - - - } - - void release_something(...) - { - res_counter_uncharge(res_counter_ptr, amount); - - - } - - In order to keep the usage value self-consistent, both the - "res_counter_ptr" and the "amount" in release_something() should be - the same as they were in the alloc_something() when the releasing - resource was allocated. - - c. Provide the way to read res_counter values and set them (the cgroups - still can help with it). - - c. Compile and run :) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt deleted file mode 100644 index 5c86c258c791..000000000000 --- a/Documentation/cpusets.txt +++ /dev/null @@ -1,808 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a tasks current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that tasks cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting tasks mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that tasks cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that tasks cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendents) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that tasks cpuset. - - in sched.c migrate_all_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that tasks cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the tasks cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpus: list of CPUs in that cpuset - - mems: list of Memory Nodes in that cpuset - - memory_migrate flag: if set, move pages to cpusets nodes - - cpu_exclusive flag: is cpu placement exclusive? - - mem_exclusive flag: is memory placement exclusive? - - mem_hardwall flag: is memory allocation hardwalled - - memory_pressure: measure of how much paging pressure in cpuset - -In addition, the root cpuset only has the following file: - - memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_map using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendent, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'memory_spread_page' and -'memory_spread_slab'. - -If the per-cpuset boolean flag file 'memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the tasks NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the tasks NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing tasks memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'memory_spread_page' turns on a per-process flag -PF_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PF_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'memory_spread_slab' turns on the flag -PF_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current tasks mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendent cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside it cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all -the CPUs that must be load balanced. - -Whenever the 'sched_load_balance' flag changes, or CPUs come or go -from a cpuset with this flag enabled, or a cpuset with this flag -enabled is removed, the cpuset code builds a new such partition and -passes it to the scheduler sched domain setup code, to have the sched -domains rebuilt as necessary. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (cpumask_t) in the partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -everytime. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searchs all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - ( 4 : search nodes in a chunk of node [on NUMA system] ) - ( 5 : search system wide [on NUMA system] ) - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'sched_load_balance' of a cpuset -is disabled, then 'sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not will be depend on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the tasks cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its numa placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the tasks -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a tasks pid is written to a cpusets 'tasks' file, in either its -current cpuset or another cpuset, then its allowed CPU placement is -changed immediately. If such a task had been bound to some subset -of its cpuset using the sched_setaffinity() call, the task will be -allowed to run on any CPU allowed in its new cpuset, negating the -affect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -but the processor placement is not updated, until that tasks pid is -rewritten to the 'tasks' file of its cpuset. This is done to avoid -impacting the scheduler code in the kernel with a check for changes -in a tasks processor placement. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'mems' subsequently changes. -If the cpuset flag file 'memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the tasks new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'memory_migrate' is set true, then if that cpusets -'mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the tasks prior cpuset, or in the cpusets -prior 'mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current tasks cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /dev/cpuset - 2) mount -t cgroup -ocpuset cpuset /dev/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /dev/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /dev/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /dev/cpuset - cd /dev/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpus - /bin/echo 1 > mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -In the future, a C library interface to cpusets will likely be -available. For now, the only way to query or modify cpusets is -via the cpuset file system, using the various cd, mkdir, echo, cat, -rmdir commands from the shell, or their equivalent from C. - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /dev/cpuset - -Then under /dev/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /dev/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /dev/cpuset: -# cd /dev/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. -# cd my_cpuset - -In this directory you can find several files: -# ls -cpu_exclusive memory_migrate mems tasks -cpus memory_pressure notify_on_release -mem_exclusive memory_spread_page sched_load_balance -mem_hardwall memory_spread_slab sched_relax_domain_level - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpus - -Add some mems: -# /bin/echo 0-7 > mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /dev/cpuset - -is equivalent to - -mount -t cgroup -ocpuset X /dev/cpuset -echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' -# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 8398ca4ff4ed..6f33593e59e2 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -231,7 +231,7 @@ CPU bandwidth control purposes: This options needs CONFIG_CGROUPS to be defined, and lets the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this filesystem. + Documentation/cgroups/cgroups.txt for more information about this filesystem. Only one of these options to group tasks can be chosen and not both. diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index dede0a2cfc45..4c5bcf6ca7e8 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -9,7 +9,7 @@ * * Author: Pavel Emelianov * - * See Documentation/controllers/resource_counter.txt for more + * See Documentation/cgroups/resource_counter.txt for more * info about what this counter is. */ diff --git a/init/Kconfig b/init/Kconfig index 56fd93c63c77..2af83825634e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -323,8 +323,8 @@ config CGROUP_SCHED This option allows you to create arbitrary task groups using the "cgroup" pseudo filesystem and control the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups.txt for more information - on "cgroup" pseudo filesystem. + Refer to Documentation/cgroups/cgroups.txt for more + information on "cgroup" pseudo filesystem. endchoice @@ -335,10 +335,9 @@ menuconfig CGROUPS use with process control subsystems such as Cpusets, CFS, memory controls or device isolation. See - - Documentation/cpusets.txt (Cpusets) - Documentation/scheduler/sched-design-CFS.txt (CFS) - - Documentation/cgroups/ (features for grouping, isolation) - - Documentation/controllers/ (features for resource control) + - Documentation/cgroups/ (features for grouping, isolation + and resource control) Say N if unsure. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 647c77a88fcb..a85678865c5e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -568,7 +568,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From 3eabdb76a03bbe8f556162738c264dbfb24cff6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: jbd: fix missing kernel-doc Fix jbd header file kernel-doc notation: Warning(linux-2.6.28-git13//include/linux/jbd.h:823): No description found for parameter 'j_average_commit_time' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/jbd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 6384b19efe64..64246dce5663 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -614,6 +614,8 @@ struct transaction_s * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write + * @j_average_commit_time: the average amount of time in nanoseconds it + * takes to commit a transaction to the disk. * @j_private: An opaque pointer to fs-private information. */ -- cgit v1.2.3 From 6ae301e85c9c58d2f430a8a7057ce488b7ff76df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: resources: fix parameter name and kernel-doc Fix __request_region() parameter kernel-doc notation and parameter name: Warning(linux-2.6.28-git10//kernel/resource.c:627): No description found for parameter 'flags' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 ++- kernel/resource.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index f6bb2ca8e3ba..32e4b2f72294 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -143,7 +143,8 @@ static inline unsigned long resource_type(struct resource *res) extern struct resource * __request_region(struct resource *, resource_size_t start, - resource_size_t n, const char *name, int relaxed); + resource_size_t n, + const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) diff --git a/kernel/resource.c b/kernel/resource.c index ca6a1536b205..fd5d7d574bb9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res) * @start: resource start address * @n: resource region size * @name: reserving caller's ID string + * @flags: IO resource flags */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, -- cgit v1.2.3 From 1bcbf31337391a2f54ef6c1e8871c2de5944a7dc Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 15 Jan 2009 13:51:03 -0800 Subject: btrfs & squashfs: Move btrfs and squashfsto's magic number to Use the standard magic.h for btrfs and squashfs. Signed-off-by: Qinghuang Feng Cc: Phillip Lougher Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/super.c | 2 +- fs/squashfs/squashfs_fs.h | 1 - fs/squashfs/super.c | 1 + include/linux/magic.h | 2 ++ 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a14b495532f..7256cf242eb0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -51,7 +52,6 @@ #include "export.h" #include "compression.h" -#define BTRFS_SUPER_MAGIC 0x9123683E static struct super_operations btrfs_super_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 6840da1bf21e..283daafc568e 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -26,7 +26,6 @@ #define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE #define SQUASHFS_MAJOR 4 #define SQUASHFS_MINOR 0 -#define SQUASHFS_MAGIC 0x73717368 #define SQUASHFS_START 0 /* size of metadata (inode and directory) blocks */ diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index a0466d7467b2..071df5b5b491 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/include/linux/magic.h b/include/linux/magic.h index 439f6f3cb0c4..0b4df7eba852 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -10,11 +10,13 @@ #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 #define TMPFS_MAGIC 0x01021994 +#define SQUASHFS_MAGIC 0x73717368 #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 #define XENFS_SUPER_MAGIC 0xabba1974 #define EXT4_SUPER_MAGIC 0xEF53 +#define BTRFS_SUPER_MAGIC 0x9123683E #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 #define JFFS2_SUPER_MAGIC 0x72b6 -- cgit v1.2.3 From 00bfddaf7f68a6551319b536f052040c370756b0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 15 Jan 2009 13:51:26 -0800 Subject: include of is preferred over Impact: fix 15 make headers_check warnings: include of is preferred over Signed-off-by: Jaswinder Singh Rajput Cc: Ingo Molnar Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/agpgart.h | 1 - include/linux/atm_idt77105.h | 2 +- include/linux/capi.h | 2 +- include/linux/connector.h | 2 +- include/linux/cyclades.h | 2 -- include/linux/fb.h | 2 +- include/linux/if_pppol2tp.h | 2 +- include/linux/if_pppox.h | 2 +- include/linux/input.h | 2 +- include/linux/joystick.h | 2 +- include/linux/kvm.h | 2 +- include/linux/loop.h | 2 +- include/linux/matroxfb.h | 2 +- include/linux/phantom.h | 2 +- include/linux/radeonfb.h | 2 +- 15 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/agpgart.h b/include/linux/agpgart.h index c8fdb6e658e1..110c600c885f 100644 --- a/include/linux/agpgart.h +++ b/include/linux/agpgart.h @@ -52,7 +52,6 @@ #ifndef __KERNEL__ #include -#include struct agp_version { __u16 major; diff --git a/include/linux/atm_idt77105.h b/include/linux/atm_idt77105.h index 05621cf20709..8b724000aa50 100644 --- a/include/linux/atm_idt77105.h +++ b/include/linux/atm_idt77105.h @@ -7,7 +7,7 @@ #ifndef LINUX_ATM_IDT77105_H #define LINUX_ATM_IDT77105_H -#include +#include #include #include diff --git a/include/linux/capi.h b/include/linux/capi.h index fdebaaa9f66e..65100d6cb89b 100644 --- a/include/linux/capi.h +++ b/include/linux/capi.h @@ -12,7 +12,7 @@ #ifndef __LINUX_CAPI_H__ #define __LINUX_CAPI_H__ -#include +#include #include #ifndef __KERNEL__ #include diff --git a/include/linux/connector.h b/include/linux/connector.h index 5c7f9468f753..34f2789d9b9b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -22,7 +22,7 @@ #ifndef __CONNECTOR_H #define __CONNECTOR_H -#include +#include #define CN_IDX_CONNECTOR 0xffffffff #define CN_VAL_CONNECTOR 0xffffffff diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 2d3d1e04ba92..d06fbf286346 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -150,8 +150,6 @@ struct CYZ_BOOT_CTRL { * architectures and compilers. */ -#include - typedef __u64 ucdouble; /* 64 bits, unsigned */ typedef __u32 uclong; /* 32 bits, unsigned */ typedef __u16 ucshort; /* 16 bits, unsigned */ diff --git a/include/linux/fb.h b/include/linux/fb.h index 1ee63df5be92..818fe21257e8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -1,7 +1,7 @@ #ifndef _LINUX_FB_H #define _LINUX_FB_H -#include +#include #include struct dentry; diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index a7d6a2234b31..c7a66882b6d0 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -15,7 +15,7 @@ #ifndef __LINUX_IF_PPPOL2TP_H #define __LINUX_IF_PPPOL2TP_H -#include +#include #ifdef __KERNEL__ #include diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 6fb7f1788570..30c88b2245ff 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -17,7 +17,7 @@ #define __LINUX_IF_PPPOX_H -#include +#include #include #ifdef __KERNEL__ diff --git a/include/linux/input.h b/include/linux/input.h index 9a6355f74db2..1249a0c20a38 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #endif /* diff --git a/include/linux/joystick.h b/include/linux/joystick.h index b5e051295a67..9e20c29c1e14 100644 --- a/include/linux/joystick.h +++ b/include/linux/joystick.h @@ -27,7 +27,7 @@ * Vojtech Pavlik, Ucitelska 1576, Prague 8, 182 00 Czech Republic */ -#include +#include #include /* diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 35525ac63337..5715f1907601 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -7,7 +7,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ -#include +#include #include #include #include diff --git a/include/linux/loop.h b/include/linux/loop.h index 46169a7b559b..6ffd6db5bb0d 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -80,7 +80,7 @@ enum { }; #include /* for __kernel_old_dev_t */ -#include /* for __u64 */ +#include /* for __u64 */ /* Backwards compatibility version */ struct loop_info { diff --git a/include/linux/matroxfb.h b/include/linux/matroxfb.h index ae5b09493062..404f678e734b 100644 --- a/include/linux/matroxfb.h +++ b/include/linux/matroxfb.h @@ -2,7 +2,7 @@ #define __LINUX_MATROXFB_H__ #include -#include +#include #include struct matroxioc_output_mode { diff --git a/include/linux/phantom.h b/include/linux/phantom.h index 02268c54c250..94dd6645c60a 100644 --- a/include/linux/phantom.h +++ b/include/linux/phantom.h @@ -10,7 +10,7 @@ #ifndef __PHANTOM_H #define __PHANTOM_H -#include +#include /* PHN_(G/S)ET_REG param */ struct phm_reg { diff --git a/include/linux/radeonfb.h b/include/linux/radeonfb.h index 5bd8975ed78e..8c4bbdecc44f 100644 --- a/include/linux/radeonfb.h +++ b/include/linux/radeonfb.h @@ -2,7 +2,7 @@ #define __LINUX_RADEONFB_H__ #include -#include +#include #define ATY_RADEON_LCD_ON 0x00000001 #define ATY_RADEON_CRT_ON 0x00000002 -- cgit v1.2.3 From 94be9a58d7e683ac3c1df1858a17f09ebade8da0 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 16 Jan 2009 10:17:09 -0500 Subject: [libata] get-identity ioctl: Fix use of invalid memory pointer for SAS drivers. Caught by Ke Wei (and team?) at Marvell. Also, move the ata_scsi_ioctl export to libata-scsi.c, as that seems to be the general trend. Acked-by: James Bottomley Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 1 - drivers/ata/libata-scsi.c | 17 +++++++++++++---- drivers/scsi/ipr.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- include/linux/libata.h | 2 ++ 5 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 71218d76d75e..552ecae13434 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -6638,7 +6638,6 @@ EXPORT_SYMBOL_GPL(ata_dev_pair); EXPORT_SYMBOL_GPL(ata_port_disable); EXPORT_SYMBOL_GPL(ata_ratelimit); EXPORT_SYMBOL_GPL(ata_wait_register); -EXPORT_SYMBOL_GPL(ata_scsi_ioctl); EXPORT_SYMBOL_GPL(ata_scsi_queuecmd); EXPORT_SYMBOL_GPL(ata_scsi_slave_config); EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 9e92107691f2..a1a6e6298c33 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -423,9 +423,9 @@ int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, * RETURNS: * Zero on success, negative errno on error. */ -static int ata_get_identity(struct scsi_device *sdev, void __user *arg) +static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev, + void __user *arg) { - struct ata_port *ap = ata_shost_to_port(sdev->host); struct ata_device *dev = ata_scsi_find_dev(ap, sdev); u16 __user *dst = arg; char buf[40]; @@ -645,7 +645,8 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) return rc; } -int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) +int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev, + int cmd, void __user *arg) { int val = -EINVAL, rc = -EINVAL; @@ -663,7 +664,7 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) return 0; case HDIO_GET_IDENTITY: - return ata_get_identity(scsidev, arg); + return ata_get_identity(ap, scsidev, arg); case HDIO_DRIVE_CMD: if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) @@ -682,6 +683,14 @@ int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) return rc; } +EXPORT_SYMBOL_GPL(ata_sas_scsi_ioctl); + +int ata_scsi_ioctl(struct scsi_device *scsidev, int cmd, void __user *arg) +{ + return ata_sas_scsi_ioctl(ata_shost_to_port(scsidev->host), + scsidev, cmd, arg); +} +EXPORT_SYMBOL_GPL(ata_scsi_ioctl); /** * ata_scsi_qc_new - acquire new ata_queued_cmd reference diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 841f460edbc4..07829009a8be 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4912,7 +4912,7 @@ static int ipr_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) if (res && ipr_is_gata(res)) { if (cmd == HDIO_GET_IDENTITY) return -ENOTTY; - return ata_scsi_ioctl(sdev, cmd, arg); + return ata_sas_scsi_ioctl(res->sata_port->ap, sdev, cmd, arg); } return -EINVAL; diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 744838780ada..1c558d3bce18 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -717,7 +717,7 @@ int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) struct domain_device *dev = sdev_to_domain_dev(sdev); if (dev_is_sata(dev)) - return ata_scsi_ioctl(sdev, cmd, arg); + return ata_sas_scsi_ioctl(dev->sata_dev.ap, sdev, cmd, arg); return -EINVAL; } diff --git a/include/linux/libata.h b/include/linux/libata.h index b6b8a7f3ec66..73b69c7071c5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -927,6 +927,8 @@ extern void ata_host_init(struct ata_host *, struct device *, extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg); extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); +extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, + int cmd, void __user *arg); extern void ata_sas_port_destroy(struct ata_port *); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); -- cgit v1.2.3 From 3ada9c126499dd4700dcdbd5b9fe8110ad17f578 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 15 Jan 2009 17:45:31 -0800 Subject: libata: Add another column to the ata_timing table. The forthcoming OCTEON SOC Compact Flash driver needs an additional timing value that was not available in the ata_timing table. I add a new column for dmack_hold time. The values were obtained from the Compact Flash specification Rev 4.1. Signed-off-by: David Daney Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 72 ++++++++++++++++++++++++----------------------- include/linux/libata.h | 9 ++++-- 2 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 552ecae13434..88c242856dae 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3029,33 +3029,33 @@ int sata_set_spd(struct ata_link *link) */ static const struct ata_timing ata_timing[] = { -/* { XFER_PIO_SLOW, 120, 290, 240, 960, 290, 240, 960, 0 }, */ - { XFER_PIO_0, 70, 290, 240, 600, 165, 150, 600, 0 }, - { XFER_PIO_1, 50, 290, 93, 383, 125, 100, 383, 0 }, - { XFER_PIO_2, 30, 290, 40, 330, 100, 90, 240, 0 }, - { XFER_PIO_3, 30, 80, 70, 180, 80, 70, 180, 0 }, - { XFER_PIO_4, 25, 70, 25, 120, 70, 25, 120, 0 }, - { XFER_PIO_5, 15, 65, 25, 100, 65, 25, 100, 0 }, - { XFER_PIO_6, 10, 55, 20, 80, 55, 20, 80, 0 }, - - { XFER_SW_DMA_0, 120, 0, 0, 0, 480, 480, 960, 0 }, - { XFER_SW_DMA_1, 90, 0, 0, 0, 240, 240, 480, 0 }, - { XFER_SW_DMA_2, 60, 0, 0, 0, 120, 120, 240, 0 }, - - { XFER_MW_DMA_0, 60, 0, 0, 0, 215, 215, 480, 0 }, - { XFER_MW_DMA_1, 45, 0, 0, 0, 80, 50, 150, 0 }, - { XFER_MW_DMA_2, 25, 0, 0, 0, 70, 25, 120, 0 }, - { XFER_MW_DMA_3, 25, 0, 0, 0, 65, 25, 100, 0 }, - { XFER_MW_DMA_4, 25, 0, 0, 0, 55, 20, 80, 0 }, - -/* { XFER_UDMA_SLOW, 0, 0, 0, 0, 0, 0, 0, 150 }, */ - { XFER_UDMA_0, 0, 0, 0, 0, 0, 0, 0, 120 }, - { XFER_UDMA_1, 0, 0, 0, 0, 0, 0, 0, 80 }, - { XFER_UDMA_2, 0, 0, 0, 0, 0, 0, 0, 60 }, - { XFER_UDMA_3, 0, 0, 0, 0, 0, 0, 0, 45 }, - { XFER_UDMA_4, 0, 0, 0, 0, 0, 0, 0, 30 }, - { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 20 }, - { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 15 }, +/* { XFER_PIO_SLOW, 120, 290, 240, 960, 290, 240, 0, 960, 0 }, */ + { XFER_PIO_0, 70, 290, 240, 600, 165, 150, 0, 600, 0 }, + { XFER_PIO_1, 50, 290, 93, 383, 125, 100, 0, 383, 0 }, + { XFER_PIO_2, 30, 290, 40, 330, 100, 90, 0, 240, 0 }, + { XFER_PIO_3, 30, 80, 70, 180, 80, 70, 0, 180, 0 }, + { XFER_PIO_4, 25, 70, 25, 120, 70, 25, 0, 120, 0 }, + { XFER_PIO_5, 15, 65, 25, 100, 65, 25, 0, 100, 0 }, + { XFER_PIO_6, 10, 55, 20, 80, 55, 20, 0, 80, 0 }, + + { XFER_SW_DMA_0, 120, 0, 0, 0, 480, 480, 50, 960, 0 }, + { XFER_SW_DMA_1, 90, 0, 0, 0, 240, 240, 30, 480, 0 }, + { XFER_SW_DMA_2, 60, 0, 0, 0, 120, 120, 20, 240, 0 }, + + { XFER_MW_DMA_0, 60, 0, 0, 0, 215, 215, 20, 480, 0 }, + { XFER_MW_DMA_1, 45, 0, 0, 0, 80, 50, 5, 150, 0 }, + { XFER_MW_DMA_2, 25, 0, 0, 0, 70, 25, 5, 120, 0 }, + { XFER_MW_DMA_3, 25, 0, 0, 0, 65, 25, 5, 100, 0 }, + { XFER_MW_DMA_4, 25, 0, 0, 0, 55, 20, 5, 80, 0 }, + +/* { XFER_UDMA_SLOW, 0, 0, 0, 0, 0, 0, 0, 0, 150 }, */ + { XFER_UDMA_0, 0, 0, 0, 0, 0, 0, 0, 0, 120 }, + { XFER_UDMA_1, 0, 0, 0, 0, 0, 0, 0, 0, 80 }, + { XFER_UDMA_2, 0, 0, 0, 0, 0, 0, 0, 0, 60 }, + { XFER_UDMA_3, 0, 0, 0, 0, 0, 0, 0, 0, 45 }, + { XFER_UDMA_4, 0, 0, 0, 0, 0, 0, 0, 0, 30 }, + { XFER_UDMA_5, 0, 0, 0, 0, 0, 0, 0, 0, 20 }, + { XFER_UDMA_6, 0, 0, 0, 0, 0, 0, 0, 0, 15 }, { 0xFF } }; @@ -3065,14 +3065,15 @@ static const struct ata_timing ata_timing[] = { static void ata_timing_quantize(const struct ata_timing *t, struct ata_timing *q, int T, int UT) { - q->setup = EZ(t->setup * 1000, T); - q->act8b = EZ(t->act8b * 1000, T); - q->rec8b = EZ(t->rec8b * 1000, T); - q->cyc8b = EZ(t->cyc8b * 1000, T); - q->active = EZ(t->active * 1000, T); - q->recover = EZ(t->recover * 1000, T); - q->cycle = EZ(t->cycle * 1000, T); - q->udma = EZ(t->udma * 1000, UT); + q->setup = EZ(t->setup * 1000, T); + q->act8b = EZ(t->act8b * 1000, T); + q->rec8b = EZ(t->rec8b * 1000, T); + q->cyc8b = EZ(t->cyc8b * 1000, T); + q->active = EZ(t->active * 1000, T); + q->recover = EZ(t->recover * 1000, T); + q->dmack_hold = EZ(t->dmack_hold * 1000, T); + q->cycle = EZ(t->cycle * 1000, T); + q->udma = EZ(t->udma * 1000, UT); } void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, @@ -3084,6 +3085,7 @@ void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, if (what & ATA_TIMING_CYC8B ) m->cyc8b = max(a->cyc8b, b->cyc8b); if (what & ATA_TIMING_ACTIVE ) m->active = max(a->active, b->active); if (what & ATA_TIMING_RECOVER) m->recover = max(a->recover, b->recover); + if (what & ATA_TIMING_DMACK_HOLD) m->dmack_hold = max(a->dmack_hold, b->dmack_hold); if (what & ATA_TIMING_CYCLE ) m->cycle = max(a->cycle, b->cycle); if (what & ATA_TIMING_UDMA ) m->udma = max(a->udma, b->udma); } diff --git a/include/linux/libata.h b/include/linux/libata.h index 73b69c7071c5..2c6bd66209ff 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -401,12 +401,14 @@ enum { ATA_TIMING_CYC8B, ATA_TIMING_ACTIVE = (1 << 4), ATA_TIMING_RECOVER = (1 << 5), - ATA_TIMING_CYCLE = (1 << 6), - ATA_TIMING_UDMA = (1 << 7), + ATA_TIMING_DMACK_HOLD = (1 << 6), + ATA_TIMING_CYCLE = (1 << 7), + ATA_TIMING_UDMA = (1 << 8), ATA_TIMING_ALL = ATA_TIMING_SETUP | ATA_TIMING_ACT8B | ATA_TIMING_REC8B | ATA_TIMING_CYC8B | ATA_TIMING_ACTIVE | ATA_TIMING_RECOVER | - ATA_TIMING_CYCLE | ATA_TIMING_UDMA, + ATA_TIMING_DMACK_HOLD | ATA_TIMING_CYCLE | + ATA_TIMING_UDMA, }; enum ata_xfer_mask { @@ -866,6 +868,7 @@ struct ata_timing { unsigned short cyc8b; /* t0 for 8-bit I/O */ unsigned short active; /* t2 or tD */ unsigned short recover; /* t2i or tK */ + unsigned short dmack_hold; /* tj */ unsigned short cycle; /* t0 */ unsigned short udma; /* t2CYCTYP/2 */ }; -- cgit v1.2.3 From 08ec8c3878cea0bf91f2ba3c0badf44b383752d0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Jan 2009 11:57:00 -0500 Subject: jbd2: On a __journal_expect() assertion failure printk "JBD2", not "EXT3-fs" Otherwise it can be very confusing to find a "EXT3-fs: " failure in the middle of EXT4-fs failures, and it makes it harder to track the source of the failure. Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b45109c61fba..b28b37eb11c6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh); int val = (expr); \ if (!val) { \ printk(KERN_ERR \ - "EXT3-fs unexpected failure: %s;\n",# expr); \ + "JBD2 unexpected failure: %s: %s;\n", \ + __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ -- cgit v1.2.3 From cc33412fb1f11613e20f9dfc2919a77ecd63fbc4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Jan 2009 17:23:05 +0100 Subject: quota: Improve locking We implement dqget() and dqput() that need neither dqonoff_mutex nor dqptr_sem. Then move dqget() and dqput() calls so that they are not called from under dqptr_sem. This is important because filesystem callbacks aren't called from under dqptr_sem which used to cause *lots* of problems with lock ranking (and with OCFS2 they became close to unsolvable). The patch also removes two functions which were introduced solely because OCFS2 needed them to cope with the old locking scheme. As time showed, they were not enough for OCFS2 anyway and it would be unnecessary work to adapt them to the new locking scheme in which they aren't needed. As a result OCFS2 needs the following patch to compile properly with quotas. Sorry to any bisecters which hit this in advance. Signed-off-by: Jan Kara --- fs/dquot.c | 218 ++++++++++++++++++++++++++--------------------- include/linux/quotaops.h | 2 - 2 files changed, 122 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/fs/dquot.c b/fs/dquot.c index 48c0571f831d..bca3cac4bee7 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -87,14 +87,17 @@ #define __DQUOT_PARANOIA /* - * There are two quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats and also dqstats structure containing statistics about the - * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures - * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. + * There are three quota SMP locks. dq_list_lock protects all lists with quotas + * and quota formats, dqstats structure containing statistics about the lists + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and + * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). + * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects + * modifications of quota state (on quotaon and quotaoff) and readers who care + * about latest values take it as well. * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock @@ -103,12 +106,7 @@ * operation is just reading pointers from inode (or not using them at all) the * read lock is enough. If pointers are altered function must hold write lock * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). If operation is holding - * reference to dquot in other way (e.g. quotactl ops) it must be guarded by - * dqonoff_mutex. - * This locking assures that: - * a) update/access to dquot pointers in inode is serialized - * b) everyone is guarded against invalidate_dquots() + * for altering the flag i_mutex is also needed). * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -122,10 +120,17 @@ * Lock ordering (including related VFS locks) is the following: * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > * dqio_mutex + * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > + * dqptr_sem. But filesystem has to count with the fact that functions such as + * dquot_alloc_space() acquire dqptr_sem and they usually have to be called + * from inside a transaction to keep filesystem consistency after a crash. Also + * filesystems usually want to do some IO on dquot from ->mark_dirty which is + * called with dqptr_sem held. * i_mutex on quota files is special (it's below dqio_mutex) */ static DEFINE_SPINLOCK(dq_list_lock); +static DEFINE_SPINLOCK(dq_state_lock); DEFINE_SPINLOCK(dq_data_lock); static char *quotatypes[] = INITQFNAMES; @@ -428,7 +433,7 @@ static inline void do_destroy_dquot(struct dquot *dquot) * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being * just deleted or pruned by prune_icache() (those are not attached to any - * list). We have to wait for such users. + * list) or parallel quotactl call. We have to wait for such users. */ static void invalidate_dquots(struct super_block *sb, int type) { @@ -600,7 +605,6 @@ static struct shrinker dqcache_shrinker = { /* * Put reference to dquot * NOTE: If you change this function please check whether dqput_blocks() works right... - * MUST be called with either dqptr_sem or dqonoff_mutex held */ void dqput(struct dquot *dquot) { @@ -696,37 +700,31 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) return dquot; } -/* - * Check whether dquot is in memory. - * MUST be called with either dqptr_sem or dqonoff_mutex held - */ -int dquot_is_cached(struct super_block *sb, unsigned int id, int type) -{ - unsigned int hashent = hashfn(sb, id, type); - int ret = 0; - - if (!sb_has_quota_active(sb, type)) - return 0; - spin_lock(&dq_list_lock); - if (find_dquot(hashent, sb, id, type) != NODQUOT) - ret = 1; - spin_unlock(&dq_list_lock); - return ret; -} - /* * Get reference to dquot - * MUST be called with either dqptr_sem or dqonoff_mutex held + * + * Locking is slightly tricky here. We are guarded from parallel quotaoff() + * destroying our dquot by: + * a) checking for quota flags under dq_list_lock and + * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); - struct dquot *dquot, *empty = NODQUOT; + struct dquot *dquot = NODQUOT, *empty = NODQUOT; if (!sb_has_quota_active(sb, type)) return NODQUOT; we_slept: spin_lock(&dq_list_lock); + spin_lock(&dq_state_lock); + if (!sb_has_quota_active(sb, type)) { + spin_unlock(&dq_state_lock); + spin_unlock(&dq_list_lock); + goto out; + } + spin_unlock(&dq_state_lock); + if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { if (empty == NODQUOT) { spin_unlock(&dq_list_lock); @@ -735,6 +733,7 @@ we_slept: goto we_slept; } dquot = empty; + empty = NODQUOT; dquot->dq_id = id; /* all dquots go on the inuse_list */ put_inuse(dquot); @@ -749,8 +748,6 @@ we_slept: dqstats.cache_hits++; dqstats.lookups++; spin_unlock(&dq_list_lock); - if (empty) - do_destroy_dquot(empty); } /* Wait for dq_lock - after this we know that either dquot_release() is already * finished or it will be canceled due to dq_count > 1 test */ @@ -758,11 +755,15 @@ we_slept: /* Read the dquot and instantiate it (everything done only if needed) */ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { dqput(dquot); - return NODQUOT; + dquot = NODQUOT; + goto out; } #ifdef __DQUOT_PARANOIA BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ #endif +out: + if (empty) + do_destroy_dquot(empty); return dquot; } @@ -1198,63 +1199,76 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) } /* * Initialize quota pointers in inode - * Transaction must be started at entry + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. */ int dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; int cnt, ret = 0; + struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT }; + struct super_block *sb = inode->i_sb; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + /* First get references to structures we might need. */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + switch (cnt) { + case USRQUOTA: + id = inode->i_uid; + break; + case GRPQUOTA: + id = inode->i_gid; + break; + } + got[cnt] = dqget(sb, id, cnt); + } + + down_write(&sb_dqopt(sb)->dqptr_sem); /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(sb, cnt)) + continue; if (inode->i_dquot[cnt] == NODQUOT) { - switch (cnt) { - case USRQUOTA: - id = inode->i_uid; - break; - case GRPQUOTA: - id = inode->i_gid; - break; - } - inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt); + inode->i_dquot[cnt] = got[cnt]; + got[cnt] = NODQUOT; } } out_err: - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up_write(&sb_dqopt(sb)->dqptr_sem); + /* Drop unused references */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(got[cnt]); return ret; } /* * Release all quotas referenced by inode - * Transaction must be started at an entry */ -int dquot_drop_locked(struct inode *inode) +int dquot_drop(struct inode *inode) { int cnt; + struct dquot *put[MAXQUOTAS]; + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] != NODQUOT) { - dqput(inode->i_dquot[cnt]); - inode->i_dquot[cnt] = NODQUOT; - } + put[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NODQUOT; } - return 0; -} - -int dquot_drop(struct inode *inode) -{ - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - dquot_drop_locked(inode); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(put[cnt]); return 0; } @@ -1470,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) qsize_t space; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, - chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; + int cnt, ret = QUOTA_OK; + int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, + chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; @@ -1479,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return QUOTA_OK; - /* Clear the arrays */ + /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - transfer_to[cnt] = transfer_from[cnt] = NODQUOT; + transfer_from[cnt] = NODQUOT; + transfer_to[cnt] = NODQUOT; warntype_to[cnt] = QUOTA_NL_NOWARN; - } - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } - /* First build the transfer_to list - here we can block on - * reading/instantiating of dquots. We know that the transaction for - * us was already started so we don't violate lock ranking here */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { switch (cnt) { case USRQUOTA: if (!chuid) @@ -1507,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) break; } } + + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ + if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + goto put_all; + } spin_lock(&dq_data_lock); space = inode_get_bytes(inode); /* Build the transfer_from list and check the limits */ @@ -1517,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt) == NO_QUOTA) - goto warn_put_all; + goto over_quota; } /* @@ -1545,28 +1557,37 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) inode->i_dquot[cnt] = transfer_to[cnt]; } - ret = QUOTA_OK; -warn_put_all: spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (transfer_from[cnt]) mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) + if (transfer_to[cnt]) { mark_dquot_dirty(transfer_to[cnt]); + /* The reference we got is transferred to the inode */ + transfer_to[cnt] = NODQUOT; + } } +warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); - +put_all: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) - dqput(transfer_from[cnt]); - if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT) - dqput(transfer_to[cnt]); + dqput(transfer_from[cnt]); + dqput(transfer_to[cnt]); } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; +over_quota: + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Clear dquot pointers we don't want to dqput() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_from[cnt] = NODQUOT; + ret = NO_QUOTA; + goto warn_put_all; } /* Wrapper for transferring ownership of an inode */ @@ -1651,19 +1672,24 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) continue; if (flags & DQUOT_SUSPENDED) { + spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); } else { + spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); iput(dqopt->files[cnt]); dqopt->files[cnt] = NULL; continue; } + spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ @@ -1830,7 +1856,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&inode->i_mutex); + spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); add_dquot_ref(sb, type); mutex_unlock(&dqopt->dqonoff_mutex); @@ -1872,9 +1900,11 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) } inode = dqopt->files[type]; dqopt->files[type] = NULL; + spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); + spin_unlock(&dq_state_lock); mutex_unlock(&dqopt->dqonoff_mutex); flags = dquot_generic_flag(flags, type); @@ -1952,7 +1982,9 @@ int vfs_quota_enable(struct inode *inode, int type, int format_id, ret = -EBUSY; goto out_lock; } + spin_lock(&dq_state_lock); sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); out_lock: mutex_unlock(&dqopt->dqonoff_mutex); return ret; @@ -2039,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!(dquot = dqget(sb, id, type))) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + dquot = dqget(sb, id, type); + if (dquot == NODQUOT) return -ESRCH; - } do_get_dqblk(dquot, di); dqput(dquot); - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } @@ -2130,7 +2160,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d struct dquot *dquot; int rc; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); dquot = dqget(sb, id, type); if (!dquot) { rc = -ESRCH; @@ -2139,7 +2168,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d rc = do_set_dqblk(dquot, di); dqput(dquot); out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return rc; } @@ -2370,11 +2398,9 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); -EXPORT_SYMBOL(dquot_drop_locked); EXPORT_SYMBOL(vfs_dq_drop); EXPORT_SYMBOL(dqget); EXPORT_SYMBOL(dqput); -EXPORT_SYMBOL(dquot_is_cached); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 21b781a3350f..0b35b3a1be05 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -24,10 +24,8 @@ void sync_dquots(struct super_block *sb, int type); int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); -int dquot_drop_locked(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); -int dquot_is_cached(struct super_block *sb, unsigned int id, int type); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); -- cgit v1.2.3 From aa8c6c93747f7b55fa11e1624fec8ca33763a805 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 16 Jan 2009 21:54:43 +0100 Subject: PCI PM: Restore standard config registers of all devices early There is a problem in our handling of suspend-resume of PCI devices that many of them have their standard config registers restored with interrupts enabled and they are put into the full power state with interrupts enabled as well. This may lead to the following scenario: * an interrupt vector is shared between two or more devices * one device is resumed earlier and generates an interrupt * the interrupt handler of another device tries to handle it and attempts to access the device the config space of which hasn't been restored yet and/or which still is in a low power state * the system crashes as a result To prevent this from happening we should restore the standard configuration registers of all devices with interrupts disabled and we should put them into the D0 power state right after that. Unfortunately, this cannot be done using the existing pci_set_power_state(), because it can sleep. Also, to do it we have to make sure that the config spaces of all devices were actually saved during suspend. Signed-off-by: Rafael J. Wysocki Acked-by: Linus Torvalds Signed-off-by: Jesse Barnes --- drivers/pci/pci-driver.c | 91 ++++++++++++++---------------------------------- drivers/pci/pci.c | 63 +++++++++++++++++++++++++++++---- drivers/pci/pci.h | 6 ++++ include/linux/pci.h | 5 +++ 4 files changed, 95 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index c697f2680856..9de07b75b993 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -355,17 +355,27 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) int i = 0; if (drv && drv->suspend) { + pci_dev->state_saved = false; + i = drv->suspend(pci_dev, state); suspend_report_result(drv->suspend, i); - } else { - pci_save_state(pci_dev); - /* - * This is for compatibility with existing code with legacy PM - * support. - */ - pci_pm_set_unknown_state(pci_dev); + if (i) + return i; + + if (pci_dev->state_saved) + goto Fixup; + + if (WARN_ON_ONCE(pci_dev->current_state != PCI_D0)) + goto Fixup; } + pci_save_state(pci_dev); + /* + * This is for compatibility with existing code with legacy PM support. + */ + pci_pm_set_unknown_state(pci_dev); + + Fixup: pci_fixup_device(pci_fixup_suspend, pci_dev); return i; @@ -386,81 +396,34 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) static int pci_legacy_resume_early(struct device *dev) { - int error = 0; struct pci_dev * pci_dev = to_pci_dev(dev); struct pci_driver * drv = pci_dev->driver; - pci_fixup_device(pci_fixup_resume_early, pci_dev); - - if (drv && drv->resume_early) - error = drv->resume_early(pci_dev); - return error; + return drv && drv->resume_early ? + drv->resume_early(pci_dev) : 0; } static int pci_legacy_resume(struct device *dev) { - int error; struct pci_dev * pci_dev = to_pci_dev(dev); struct pci_driver * drv = pci_dev->driver; pci_fixup_device(pci_fixup_resume, pci_dev); - if (drv && drv->resume) { - error = drv->resume(pci_dev); - } else { - /* restore the PCI config space */ - pci_restore_state(pci_dev); - error = pci_pm_reenable_device(pci_dev); - } - return error; + return drv && drv->resume ? + drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); } /* Auxiliary functions used by the new power management framework */ -static int pci_restore_standard_config(struct pci_dev *pci_dev) -{ - struct pci_dev *parent = pci_dev->bus->self; - int error = 0; - - /* Check if the device's bus is operational */ - if (!parent || parent->current_state == PCI_D0) { - pci_restore_state(pci_dev); - pci_update_current_state(pci_dev, PCI_D0); - } else { - dev_warn(&pci_dev->dev, "unable to restore config, " - "bridge %s in low power state D%d\n", pci_name(parent), - parent->current_state); - pci_dev->current_state = PCI_UNKNOWN; - error = -EAGAIN; - } - - return error; -} - -static bool pci_is_bridge(struct pci_dev *pci_dev) -{ - return !!(pci_dev->subordinate); -} - static void pci_pm_default_resume_noirq(struct pci_dev *pci_dev) { - if (pci_restore_standard_config(pci_dev)) - pci_fixup_device(pci_fixup_resume_early, pci_dev); + pci_restore_standard_config(pci_dev); + pci_fixup_device(pci_fixup_resume_early, pci_dev); } static int pci_pm_default_resume(struct pci_dev *pci_dev) { - /* - * pci_restore_standard_config() should have been called once already, - * but it would have failed if the device's parent bridge had not been - * in power state D0 at that time. Check it and try again if necessary. - */ - if (pci_dev->current_state == PCI_UNKNOWN) { - int error = pci_restore_standard_config(pci_dev); - if (error) - return error; - } - pci_fixup_device(pci_fixup_resume, pci_dev); if (!pci_is_bridge(pci_dev)) @@ -575,11 +538,11 @@ static int pci_pm_resume_noirq(struct device *dev) struct device_driver *drv = dev->driver; int error = 0; + pci_pm_default_resume_noirq(pci_dev); + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume_early(dev); - pci_pm_default_resume_noirq(pci_dev); - if (drv && drv->pm && drv->pm->resume_noirq) error = drv->pm->resume_noirq(dev); @@ -730,11 +693,11 @@ static int pci_pm_restore_noirq(struct device *dev) struct device_driver *drv = dev->driver; int error = 0; + pci_pm_default_resume_noirq(pci_dev); + if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume_early(dev); - pci_pm_default_resume_noirq(pci_dev); - if (drv && drv->pm && drv->pm->restore_noirq) error = drv->pm->restore_noirq(dev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e491fdedf705..17bd9325a245 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -22,7 +22,7 @@ #include /* isa_dma_bridge_buggy */ #include "pci.h" -unsigned int pci_pm_d3_delay = 10; +unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; #ifdef CONFIG_PCI_DOMAINS int pci_domains_supported = 1; @@ -426,6 +426,7 @@ static inline int platform_pci_sleep_wake(struct pci_dev *dev, bool enable) * given PCI device * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. + * @wait: If 'true', wait for the device to change its power state * * RETURN VALUE: * -EINVAL if the requested state is invalid. @@ -435,7 +436,7 @@ static inline int platform_pci_sleep_wake(struct pci_dev *dev, bool enable) * 0 if device's power state has been successfully changed. */ static int -pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) +pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state, bool wait) { u16 pmcsr; bool need_restore = false; @@ -480,8 +481,10 @@ pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) break; case PCI_UNKNOWN: /* Boot-up */ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) == PCI_D3hot - && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET)) + && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET)) { need_restore = true; + wait = true; + } /* Fall-through: force to D0 */ default: pmcsr = 0; @@ -491,12 +494,15 @@ pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) /* enter specified state */ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); + if (!wait) + return 0; + /* Mandatory power management transition delays */ /* see PCI PM 1.1 5.6.1 table 18 */ if (state == PCI_D3hot || dev->current_state == PCI_D3hot) msleep(pci_pm_d3_delay); else if (state == PCI_D2 || dev->current_state == PCI_D2) - udelay(200); + udelay(PCI_PM_D2_DELAY); dev->current_state = state; @@ -515,7 +521,7 @@ pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) if (need_restore) pci_restore_bars(dev); - if (dev->bus->self) + if (wait && dev->bus->self) pcie_aspm_pm_state_change(dev->bus->self); return 0; @@ -585,7 +591,7 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3)) return 0; - error = pci_raw_set_power_state(dev, state); + error = pci_raw_set_power_state(dev, state, true); if (state > PCI_D0 && platform_pci_power_manageable(dev)) { /* Allow the platform to finalize the transition */ @@ -730,6 +736,7 @@ pci_save_state(struct pci_dev *dev) /* XXX: 100% dword access ok here? */ for (i = 0; i < 16; i++) pci_read_config_dword(dev, i * 4,&dev->saved_config_space[i]); + dev->state_saved = true; if ((i = pci_save_pcie_state(dev)) != 0) return i; if ((i = pci_save_pcix_state(dev)) != 0) @@ -1373,6 +1380,50 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) "unable to preallocate PCI-X save buffer\n"); } +/** + * pci_restore_standard_config - restore standard config registers of PCI device + * @dev: PCI device to handle + * + * This function assumes that the device's configuration space is accessible. + * If the device needs to be powered up, the function will wait for it to + * change the state. + */ +int pci_restore_standard_config(struct pci_dev *dev) +{ + pci_power_t prev_state; + int error; + + pci_restore_state(dev); + pci_update_current_state(dev, PCI_D0); + + prev_state = dev->current_state; + if (prev_state == PCI_D0) + return 0; + + error = pci_raw_set_power_state(dev, PCI_D0, false); + if (error) + return error; + + if (pci_is_bridge(dev)) { + if (prev_state > PCI_D1) + mdelay(PCI_PM_BUS_WAIT); + } else { + switch(prev_state) { + case PCI_D3cold: + case PCI_D3hot: + mdelay(pci_pm_d3_delay); + break; + case PCI_D2: + udelay(PCI_PM_D2_DELAY); + break; + } + } + + dev->current_state = PCI_D0; + + return 0; +} + /** * pci_enable_ari - enable ARI forwarding if hardware support it * @dev: the PCI device diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1351bb4addde..26ddf78ac300 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -49,6 +49,12 @@ extern void pci_disable_enabled_device(struct pci_dev *dev); extern void pci_pm_init(struct pci_dev *dev); extern void platform_pci_wakeup_init(struct pci_dev *dev); extern void pci_allocate_cap_save_buffers(struct pci_dev *dev); +extern int pci_restore_standard_config(struct pci_dev *dev); + +static inline bool pci_is_bridge(struct pci_dev *pci_dev) +{ + return !!(pci_dev->subordinate); +} extern int pci_user_read_config_byte(struct pci_dev *dev, int where, u8 *val); extern int pci_user_read_config_word(struct pci_dev *dev, int where, u16 *val); diff --git a/include/linux/pci.h b/include/linux/pci.h index 80f8b8b65fde..48890cf3f96e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -117,6 +117,10 @@ typedef int __bitwise pci_power_t; #define PCI_UNKNOWN ((pci_power_t __force) 5) #define PCI_POWER_ERROR ((pci_power_t __force) -1) +#define PCI_PM_D2_DELAY 200 +#define PCI_PM_D3_WAIT 10 +#define PCI_PM_BUS_WAIT 50 + /** The pci_channel state describes connectivity between the CPU and * the pci device. If some PCI bus between here and the pci device * has crashed or locked up, this info is reflected here. @@ -252,6 +256,7 @@ struct pci_dev { unsigned int ari_enabled:1; /* ARI forwarding */ unsigned int is_managed:1; unsigned int is_pcie:1; + unsigned int state_saved:1; pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ -- cgit v1.2.3 From c50331e8be32eaba5e1949f98c70d50b891262db Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 19 Jan 2009 15:33:14 -0700 Subject: dmaengine: dma_issue_pending_all == nop when CONFIG_DMA_ENGINE=n The device list will always be empty in this configuration, so no need to walk the list. Reported-by: Ingo Molnar Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 64dea2ab326c..c4a560e72ab7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -390,11 +390,16 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); #ifdef CONFIG_DMA_ENGINE enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); +void dma_issue_pending_all(void); #else static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { return DMA_SUCCESS; } +static inline void dma_issue_pending_all(void) +{ + do { } while (0); +} #endif /* --- DMA device --- */ @@ -403,7 +408,6 @@ int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); -void dma_issue_pending_all(void); #define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); void dma_release_channel(struct dma_chan *chan); -- cgit v1.2.3 From ef560682a97491f62ef538931a4861b57d66c52c Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Mon, 19 Jan 2009 15:36:21 -0700 Subject: dmaengine: add async_tx_clear_ack() macro To complete the DMA_CTRL_ACK handling API add a async_tx_clear_ack() macro. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index c4a560e72ab7..34f124d7fb94 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -287,6 +287,11 @@ static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) tx->flags |= DMA_CTRL_ACK; } +static inline void async_tx_clear_ack(struct dma_async_tx_descriptor *tx) +{ + tx->flags &= ~DMA_CTRL_ACK; +} + static inline bool async_tx_test_ack(struct dma_async_tx_descriptor *tx) { return (tx->flags & DMA_CTRL_ACK) == DMA_CTRL_ACK; -- cgit v1.2.3 From 8adb711f3668b034e7b956fac951ed08b53e0d55 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Tue, 20 Jan 2009 12:17:28 -0800 Subject: debugfs: introduce stub for debugfs_create_size_t() when DEBUG_FS=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toralf Förster reported a build failure in the WiMAX stack when CONFIG_DEBUG_FS=n http://linuxwimax.org/pipermail/wimax/2009-January/000449.html This is due to debugfs_create_size_t() missing an stub that returns -ENODEV when the DEBUGFS subsystem is not configured in (like the rest of the debugfs API). This patch adds said stub. Reported-by: Toralf Förster Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 23936b16426b..0f5c33b0bd3e 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -162,6 +162,13 @@ static inline struct dentry *debugfs_create_x32(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } +struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, + size_t *value) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_bool(const char *name, mode_t mode, struct dentry *parent, u32 *value) -- cgit v1.2.3 From c0e69a5bbc6fc74184aa043aadb9a53bc58f953b Mon Sep 17 00:00:00 2001 From: Jesper Nilsson Date: Wed, 14 Jan 2009 11:19:08 +0100 Subject: klist.c: bit 0 in pointer can't be used as flag The commit a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 (klist: don't iterate over deleted entries) introduces use of the low bit in a pointer to indicate if the knode is dead or not, assuming that this bit is always free. This is not true for all architectures, CRIS for example may align data on byte borders. The result is a bunch of warnings on bootup, devices not being added correctly etc, reported by Hinko Kocevar : ------------[ cut here ]------------ WARNING: at lib/klist.c:62 () Modules linked in: Stack from c1fe1cf0: c01cc7f4 c1fe1d11 c000eb4e c000e4de 00000000 00000000 c1f4f78f c1f50c2d c01d008c c1fdd1a0 c1fdd1a0 c1fe1d38 c0192954 c1fe0000 00000000 c1fe1dc0 00000002 7fffffff c1fe1da8 c0192d50 c1fe1dc0 00000002 7fffffff c1ff9fcc Call Trace: [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] <4>---[ end trace 4eaa2a86a8e2da22 ]--- ------------[ cut here ]------------ Repeat ad nauseam. Wed, Jan 14, 2009 at 12:11:32AM +0100, Bastien ROUCARIES wrote: > Perhaps using a pointerhackalign trick on this structure where > #define pointerhackalign(x) __attribute__ ((aligned (x))) > and declare > struct klist_node { > ... > } pointerhackalign(2); > > Because __attribute__ ((aligned (x))) could only increase alignment > it will safe to do that and serve as documentation purpose :) That works, but we need to do it not for the struct klist_node, but for the struct we insert into the void * in klist_node, which is struct klist. Reported-by: Hinko Kocevar Signed-off-by: Jesper Nilsson Cc: stable Signed-off-by: Greg Kroah-Hartman --- include/linux/klist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/klist.h b/include/linux/klist.h index d5a27af9dba5..e91a4e59b771 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -22,7 +22,7 @@ struct klist { struct list_head k_list; void (*get)(struct klist_node *); void (*put)(struct klist_node *); -}; +} __attribute__ ((aligned (4))); #define KLIST_INIT(_name, _get, _put) \ { .k_lock = __SPIN_LOCK_UNLOCKED(_name.k_lock), \ -- cgit v1.2.3 From 0b491eee46012772cbf029450d123e933c2e7940 Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Wed, 21 Jan 2009 12:35:43 -0800 Subject: usbnet: allow type check of devdbg arguments in non-debug build Improve usbnet's devdbg to always type-check diagnostic arguments, like dev_dbg (device.h). This makes no change to the resulting size of usbnet modules. This patch also removes an #ifdef DEBUG directive from rndis_wlan so it's devdbg statements are always type-checked at compile time. Signed-off-by: Steve Glendinning Signed-off-by: David Brownell Signed-off-by: David S. Miller --- drivers/net/wireless/rndis_wlan.c | 2 -- include/linux/usb/usbnet.h | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 607ce9f61b54..ed93ac41297f 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -1649,9 +1649,7 @@ static char *rndis_translate_scan(struct net_device *dev, char *end_buf, struct ndis_80211_bssid_ex *bssid) { -#ifdef DEBUG struct usbnet *usbdev = netdev_priv(dev); -#endif u8 *ie; char *current_val; int bssid_len, ie_len, i; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index ba09fe88adda..7d3822243074 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -197,7 +197,9 @@ extern int usbnet_nway_reset(struct net_device *net); #define devdbg(usbnet, fmt, arg...) \ printk(KERN_DEBUG "%s: " fmt "\n" , (usbnet)->net->name , ## arg) #else -#define devdbg(usbnet, fmt, arg...) do {} while(0) +#define devdbg(usbnet, fmt, arg...) \ + ({ if (0) printk(KERN_DEBUG "%s: " fmt "\n" , (usbnet)->net->name , \ + ## arg); 0; }) #endif #define deverr(usbnet, fmt, arg...) \ -- cgit v1.2.3 From 336f6c322d87806ef93afad6308ac65083a865e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Jan 2009 09:50:44 +0100 Subject: debugobjects: add and use INIT_WORK_ON_STACK Impact: Fix debugobjects warning debugobject enabled kernels spit out a warning in hpet code due to a workqueue which is initialized on stack. Add INIT_WORK_ON_STACK() which calls init_timer_on_stack() and use it in hpet. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 3 ++- include/linux/workqueue.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index cd759ad90690..64d5ad0b8add 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -628,11 +628,12 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK(&work.work, hpet_work); + INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); wait_for_completion(&work.complete); + destroy_timer_on_stack(&work.work.timer); break; case CPU_DEAD: if (hdev) { diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b36291130f22..20b59eb1facd 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -124,6 +124,12 @@ struct execute_work { init_timer_deferrable(&(_work)->timer); \ } while (0) +#define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ + do { \ + INIT_WORK(&(_work)->work, (_func)); \ + init_timer_on_stack(&(_work)->timer); \ + } while (0) + /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question -- cgit v1.2.3 From e4d866cdea24543ee16ce6d07d80c513e86ba983 Mon Sep 17 00:00:00 2001 From: "JosephChan@via.com.tw" Date: Fri, 23 Jan 2009 15:37:39 +0800 Subject: [libata] pata_via: support VX855, future chips whose IDE controller use 0x0571 It supports VX855 and future chips whose IDE controller uses PCI ID 0x0571. Signed-off-by: Joseph Chan Acked-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/pata_via.c | 22 +++++++++++++++++----- include/linux/pci_ids.h | 4 ++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 681169c9c640..79a6c9a0b721 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -86,6 +86,10 @@ enum { VIA_SATA_PATA = 0x800, /* SATA/PATA combined configuration */ }; +enum { + VIA_IDFLAG_SINGLE = (1 << 0), /* single channel controller) */ +}; + /* * VIA SouthBridge chips. */ @@ -97,8 +101,12 @@ static const struct via_isa_bridge { u8 rev_max; u16 flags; } via_isa_bridges[] = { + { "vx855", PCI_DEVICE_ID_VIA_VX855, 0x00, 0x2f, + VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, { "vx800", PCI_DEVICE_ID_VIA_VX800, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, + { "vt8261", PCI_DEVICE_ID_VIA_8261, 0x00, 0x2f, + VIA_UDMA_133 | VIA_BAD_AST }, { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, @@ -122,6 +130,8 @@ static const struct via_isa_bridge { { "vt82c586", PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f, VIA_UDMA_NONE | VIA_SET_FIFO }, { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK }, { "vt82c576", PCI_DEVICE_ID_VIA_82C576, 0x00, 0x2f, VIA_UDMA_NONE | VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID }, + { "vtxxxx", PCI_DEVICE_ID_VIA_ANON, 0x00, 0x2f, + VIA_UDMA_133 | VIA_BAD_AST }, { NULL } }; @@ -460,6 +470,7 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id) static int printed_version; u8 enable; u32 timing; + unsigned long flags = id->driver_data; int rc; if (!printed_version++) @@ -469,9 +480,13 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; + if (flags & VIA_IDFLAG_SINGLE) + ppi[1] = &ata_dummy_port_info; + /* To find out how the IDE will behave and what features we actually have to look at the bridge not the IDE controller */ - for (config = via_isa_bridges; config->id; config++) + for (config = via_isa_bridges; config->id != PCI_DEVICE_ID_VIA_ANON; + config++) if ((isa = pci_get_device(PCI_VENDOR_ID_VIA + !!(config->flags & VIA_BAD_ID), config->id, NULL))) { @@ -482,10 +497,6 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_dev_put(isa); } - if (!config->id) { - printk(KERN_WARNING "via: Unknown VIA SouthBridge, disabling.\n"); - return -ENODEV; - } pci_dev_put(isa); if (!(config->flags & VIA_NO_ENABLES)) { @@ -587,6 +598,7 @@ static const struct pci_device_id via[] = { { PCI_VDEVICE(VIA, 0x1571), }, { PCI_VDEVICE(VIA, 0x3164), }, { PCI_VDEVICE(VIA, 0x5324), }, + { PCI_VDEVICE(VIA, 0xC409), VIA_IDFLAG_SINGLE }, { }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d56ad9c21c09..febc10ed3858 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1357,6 +1357,7 @@ #define PCI_DEVICE_ID_VIA_8783_0 0x3208 #define PCI_DEVICE_ID_VIA_8237 0x3227 #define PCI_DEVICE_ID_VIA_8251 0x3287 +#define PCI_DEVICE_ID_VIA_8261 0x3402 #define PCI_DEVICE_ID_VIA_8237A 0x3337 #define PCI_DEVICE_ID_VIA_8237S 0x3372 #define PCI_DEVICE_ID_VIA_SATA_EIDE 0x5324 @@ -1366,10 +1367,13 @@ #define PCI_DEVICE_ID_VIA_CX700 0x8324 #define PCI_DEVICE_ID_VIA_CX700_IDE 0x0581 #define PCI_DEVICE_ID_VIA_VX800 0x8353 +#define PCI_DEVICE_ID_VIA_VX855 0x8409 #define PCI_DEVICE_ID_VIA_8371_1 0x8391 #define PCI_DEVICE_ID_VIA_82C598_1 0x8598 #define PCI_DEVICE_ID_VIA_838X_1 0xB188 #define PCI_DEVICE_ID_VIA_83_87XX_1 0xB198 +#define PCI_DEVICE_ID_VIA_C409_IDE 0XC409 +#define PCI_DEVICE_ID_VIA_ANON 0xFFFF #define PCI_VENDOR_ID_SIEMENS 0x110A #define PCI_DEVICE_ID_SIEMENS_DSCC4 0x2102 -- cgit v1.2.3 From e2ba42ed0f8ba174302ebfabfa063fb456d5d6f5 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 26 Jan 2009 21:19:52 +0100 Subject: i2c: Delete 10 unused driver IDs Signed-off-by: Jean Delvare --- include/linux/i2c-id.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 01d67ba9e985..82b7eee22636 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -40,9 +40,7 @@ #define I2C_DRIVERID_SAA7185B 13 /* video encoder */ #define I2C_DRIVERID_SAA7110 22 /* video decoder */ #define I2C_DRIVERID_SAA5249 24 /* SAA5249 and compatibles */ -#define I2C_DRIVERID_PCF8583 25 /* real time clock */ #define I2C_DRIVERID_TDA7432 27 /* Stereo sound processor */ -#define I2C_DRIVERID_TVMIXER 28 /* Mixer driver for tv cards */ #define I2C_DRIVERID_TVAUDIO 29 /* Generic TV sound driver */ #define I2C_DRIVERID_TDA9875 32 /* TV sound decoder chip */ #define I2C_DRIVERID_BT819 40 /* video decoder */ @@ -54,7 +52,6 @@ #define I2C_DRIVERID_SAA7191 57 /* video decoder */ #define I2C_DRIVERID_INDYCAM 58 /* SGI IndyCam */ #define I2C_DRIVERID_OVCAMCHIP 61 /* OmniVision CMOS image sens. */ -#define I2C_DRIVERID_MAX6900 63 /* MAX6900 real-time clock */ #define I2C_DRIVERID_SAA6752HS 67 /* MPEG2 encoder */ #define I2C_DRIVERID_TVEEPROM 68 /* TV EEPROM */ #define I2C_DRIVERID_WM8775 69 /* wm8775 audio processor */ @@ -62,23 +59,16 @@ #define I2C_DRIVERID_CX25840 71 /* cx2584x video encoder */ #define I2C_DRIVERID_SAA7127 72 /* saa7127 video encoder */ #define I2C_DRIVERID_SAA711X 73 /* saa711x video encoders */ -#define I2C_DRIVERID_AKITAIOEXP 74 /* IO Expander on Sharp SL-C1000 */ #define I2C_DRIVERID_INFRARED 75 /* I2C InfraRed on Video boards */ #define I2C_DRIVERID_TVP5150 76 /* TVP5150 video decoder */ #define I2C_DRIVERID_WM8739 77 /* wm8739 audio processor */ #define I2C_DRIVERID_UPD64083 78 /* upd64083 video processor */ #define I2C_DRIVERID_UPD64031A 79 /* upd64031a video processor */ #define I2C_DRIVERID_SAA717X 80 /* saa717x video encoder */ -#define I2C_DRIVERID_DS1672 81 /* Dallas/Maxim DS1672 RTC */ #define I2C_DRIVERID_BT866 85 /* Conexant bt866 video encoder */ #define I2C_DRIVERID_KS0127 86 /* Samsung ks0127 video decoder */ #define I2C_DRIVERID_TLV320AIC23B 87 /* TI TLV320AIC23B audio codec */ -#define I2C_DRIVERID_WM8731 89 /* Wolfson WM8731 audio codec */ -#define I2C_DRIVERID_WM8750 90 /* Wolfson WM8750 audio codec */ -#define I2C_DRIVERID_WM8753 91 /* Wolfson WM8753 audio codec */ -#define I2C_DRIVERID_LM4857 92 /* LM4857 Audio Amplifier */ #define I2C_DRIVERID_VP27SMPX 93 /* Panasonic VP27s tuner internal MPX */ -#define I2C_DRIVERID_CS4270 94 /* Cirrus Logic 4270 audio codec */ #define I2C_DRIVERID_M52790 95 /* Mitsubishi M52790SP/FP AV switch */ #define I2C_DRIVERID_CS5345 96 /* cs5345 audio processor */ -- cgit v1.2.3 From 1745522ccbabd990bfc7511861aa9fa98287cba0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 26 Jan 2009 21:19:52 +0100 Subject: i2c: Delete many unused adapter IDs Signed-off-by: Jean Delvare --- drivers/gpu/drm/i915/intel_i2c.c | 4 --- drivers/i2c/busses/i2c-acorn.c | 1 - drivers/i2c/busses/i2c-ali1535.c | 1 - drivers/i2c/busses/i2c-ali1563.c | 1 - drivers/i2c/busses/i2c-ali15x3.c | 1 - drivers/i2c/busses/i2c-amd756.c | 1 - drivers/i2c/busses/i2c-amd8111.c | 1 - drivers/i2c/busses/i2c-au1550.c | 1 - drivers/i2c/busses/i2c-bfin-twi.c | 1 - drivers/i2c/busses/i2c-elektor.c | 1 - drivers/i2c/busses/i2c-hydra.c | 1 - drivers/i2c/busses/i2c-i801.c | 1 - drivers/i2c/busses/i2c-ibm_iic.c | 1 - drivers/i2c/busses/i2c-iop3xx.c | 1 - drivers/i2c/busses/i2c-ixp2000.c | 1 - drivers/i2c/busses/i2c-mpc.c | 1 - drivers/i2c/busses/i2c-mv64xxx.c | 1 - drivers/i2c/busses/i2c-nforce2.c | 1 - drivers/i2c/busses/i2c-parport-light.c | 1 - drivers/i2c/busses/i2c-parport.c | 1 - drivers/i2c/busses/i2c-pca-isa.c | 1 - drivers/i2c/busses/i2c-piix4.c | 1 - drivers/i2c/busses/i2c-sibyte.c | 2 -- drivers/i2c/busses/i2c-sis5595.c | 1 - drivers/i2c/busses/i2c-sis630.c | 1 - drivers/i2c/busses/i2c-sis96x.c | 1 - drivers/i2c/busses/i2c-via.c | 1 - drivers/i2c/busses/i2c-viapro.c | 1 - drivers/i2c/busses/i2c-voodoo3.c | 2 -- drivers/i2c/busses/scx200_acb.c | 1 - drivers/i2c/busses/scx200_i2c.c | 1 - drivers/ieee1394/pcilynx.c | 1 - drivers/video/aty/radeon_i2c.c | 1 - drivers/video/i810/i810-i2c.c | 1 - drivers/video/intelfb/intelfb_i2c.c | 1 - drivers/video/nvidia/nv_i2c.c | 1 - drivers/video/savage/savagefb-i2c.c | 1 - include/linux/i2c-id.h | 51 ---------------------------------- 38 files changed, 93 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index a5a2f5339e9e..5ee9d4c25753 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -137,10 +137,6 @@ struct intel_i2c_chan *intel_i2c_create(struct drm_device *dev, const u32 reg, chan->reg = reg; snprintf(chan->adapter.name, I2C_NAME_SIZE, "intel drm %s", name); chan->adapter.owner = THIS_MODULE; -#ifndef I2C_HW_B_INTELFB -#define I2C_HW_B_INTELFB I2C_HW_B_I810 -#endif - chan->adapter.id = I2C_HW_B_INTELFB; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &dev->pdev->dev; chan->algo.setsda = set_data; diff --git a/drivers/i2c/busses/i2c-acorn.c b/drivers/i2c/busses/i2c-acorn.c index 75089febbc13..9fee3ca17344 100644 --- a/drivers/i2c/busses/i2c-acorn.c +++ b/drivers/i2c/busses/i2c-acorn.c @@ -83,7 +83,6 @@ static struct i2c_algo_bit_data ioc_data = { }; static struct i2c_adapter ioc_ops = { - .id = I2C_HW_B_IOC, .algo_data = &ioc_data, }; diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c index 9cead9b9458e..981e080b32ae 100644 --- a/drivers/i2c/busses/i2c-ali1535.c +++ b/drivers/i2c/busses/i2c-ali1535.c @@ -476,7 +476,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali1535_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_ALI1535, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali1563.c b/drivers/i2c/busses/i2c-ali1563.c index dd9e796fad69..f70f46582c6c 100644 --- a/drivers/i2c/busses/i2c-ali1563.c +++ b/drivers/i2c/busses/i2c-ali1563.c @@ -386,7 +386,6 @@ static const struct i2c_algorithm ali1563_algorithm = { static struct i2c_adapter ali1563_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_ALI1563, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &ali1563_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c index 234fdde7d40e..39066dee46e3 100644 --- a/drivers/i2c/busses/i2c-ali15x3.c +++ b/drivers/i2c/busses/i2c-ali15x3.c @@ -473,7 +473,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali15x3_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_ALI15X3, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c index 36bee5b9c952..220f4a1eee1d 100644 --- a/drivers/i2c/busses/i2c-amd756.c +++ b/drivers/i2c/busses/i2c-amd756.c @@ -298,7 +298,6 @@ static const struct i2c_algorithm smbus_algorithm = { struct i2c_adapter amd756_smbus = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_AMD756, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd8111.c b/drivers/i2c/busses/i2c-amd8111.c index 3972208876b3..edab51973bf5 100644 --- a/drivers/i2c/busses/i2c-amd8111.c +++ b/drivers/i2c/busses/i2c-amd8111.c @@ -387,7 +387,6 @@ static int __devinit amd8111_probe(struct pci_dev *dev, smbus->adapter.owner = THIS_MODULE; snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "SMBus2 AMD8111 adapter at %04x", smbus->base); - smbus->adapter.id = I2C_HW_SMBUS_AMD8111; smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; diff --git a/drivers/i2c/busses/i2c-au1550.c b/drivers/i2c/busses/i2c-au1550.c index 66a04c2c660f..f78ce523e3db 100644 --- a/drivers/i2c/busses/i2c-au1550.c +++ b/drivers/i2c/busses/i2c-au1550.c @@ -400,7 +400,6 @@ i2c_au1550_probe(struct platform_device *pdev) priv->xfer_timeout = 200; priv->ack_timeout = 200; - priv->adap.id = I2C_HW_AU1550_PSC; priv->adap.nr = pdev->id; priv->adap.algo = &au1550_algo; priv->adap.algo_data = priv; diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c index 3fd2c417c1e0..fc548b3d002e 100644 --- a/drivers/i2c/busses/i2c-bfin-twi.c +++ b/drivers/i2c/busses/i2c-bfin-twi.c @@ -651,7 +651,6 @@ static int i2c_bfin_twi_probe(struct platform_device *pdev) iface->timeout_timer.data = (unsigned long)iface; p_adap = &iface->adap; - p_adap->id = I2C_HW_BLACKFIN; p_adap->nr = pdev->id; strlcpy(p_adap->name, pdev->name, sizeof(p_adap->name)); p_adap->algo = &bfin_twi_algorithm; diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index 0ed3ccb81b63..448b4bf35eb7 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -202,7 +202,6 @@ static struct i2c_algo_pcf_data pcf_isa_data = { static struct i2c_adapter pcf_isa_ops = { .owner = THIS_MODULE, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, - .id = I2C_HW_P_ELEK, .algo_data = &pcf_isa_data, .name = "i2c-elektor", }; diff --git a/drivers/i2c/busses/i2c-hydra.c b/drivers/i2c/busses/i2c-hydra.c index 648aa7baff83..bec9b845dd16 100644 --- a/drivers/i2c/busses/i2c-hydra.c +++ b/drivers/i2c/busses/i2c-hydra.c @@ -102,7 +102,6 @@ static struct i2c_algo_bit_data hydra_bit_data = { static struct i2c_adapter hydra_adap = { .owner = THIS_MODULE, .name = "Hydra i2c", - .id = I2C_HW_B_HYDRA, .algo_data = &hydra_bit_data, }; diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 526625eaa84b..230238df56c4 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -556,7 +556,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter i801_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_I801, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 651f2f1ae5b7..88f0db73b364 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -746,7 +746,6 @@ static int __devinit iic_probe(struct of_device *ofdev, adap->dev.parent = &ofdev->dev; strlcpy(adap->name, "IBM IIC", sizeof(adap->name)); i2c_set_adapdata(adap, dev); - adap->id = I2C_HW_OCP; adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->algo = &iic_algo; adap->timeout = 1; diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index fc2714ac0c0f..3190690c26ce 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -480,7 +480,6 @@ iop3xx_i2c_probe(struct platform_device *pdev) } memcpy(new_adapter->name, pdev->name, strlen(pdev->name)); - new_adapter->id = I2C_HW_IOP3XX; new_adapter->owner = THIS_MODULE; new_adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; new_adapter->dev.parent = &pdev->dev; diff --git a/drivers/i2c/busses/i2c-ixp2000.c b/drivers/i2c/busses/i2c-ixp2000.c index 05d72e981353..8e8467970481 100644 --- a/drivers/i2c/busses/i2c-ixp2000.c +++ b/drivers/i2c/busses/i2c-ixp2000.c @@ -116,7 +116,6 @@ static int ixp2000_i2c_probe(struct platform_device *plat_dev) drv_data->algo_data.udelay = 6; drv_data->algo_data.timeout = 100; - drv_data->adapter.id = I2C_HW_B_IXP2000, strlcpy(drv_data->adapter.name, plat_dev->dev.driver->name, sizeof(drv_data->adapter.name)); drv_data->adapter.algo_data = &drv_data->algo_data, diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index a9a45fcc8544..aedbbe6618db 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -310,7 +310,6 @@ static const struct i2c_algorithm mpc_algo = { static struct i2c_adapter mpc_ops = { .owner = THIS_MODULE, .name = "MPC adapter", - .id = I2C_HW_MPC107, .algo = &mpc_algo, .timeout = 1, }; diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index 9e8118d2fe64..eeda276f8f16 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -527,7 +527,6 @@ mv64xxx_i2c_probe(struct platform_device *pd) goto exit_unmap_regs; } drv_data->adapter.dev.parent = &pd->dev; - drv_data->adapter.id = I2C_HW_MV64XXX; drv_data->adapter.algo = &mv64xxx_i2c_algo; drv_data->adapter.owner = THIS_MODULE; drv_data->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 3b19bc41a60b..05af6cd7f270 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -355,7 +355,6 @@ static int __devinit nforce2_probe_smb (struct pci_dev *dev, int bar, return -EBUSY; } smbus->adapter.owner = THIS_MODULE; - smbus->adapter.id = I2C_HW_SMBUS_NFORCE2; smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; diff --git a/drivers/i2c/busses/i2c-parport-light.c b/drivers/i2c/busses/i2c-parport-light.c index b2b8380f6602..322c5691e38e 100644 --- a/drivers/i2c/busses/i2c-parport-light.c +++ b/drivers/i2c/busses/i2c-parport-light.c @@ -115,7 +115,6 @@ static struct i2c_algo_bit_data parport_algo_data = { static struct i2c_adapter parport_adapter = { .owner = THIS_MODULE, .class = I2C_CLASS_HWMON, - .id = I2C_HW_B_LP, .algo_data = &parport_algo_data, .name = "Parallel port adapter (light)", }; diff --git a/drivers/i2c/busses/i2c-parport.c b/drivers/i2c/busses/i2c-parport.c index a257cd5cd134..0d8998610c74 100644 --- a/drivers/i2c/busses/i2c-parport.c +++ b/drivers/i2c/busses/i2c-parport.c @@ -164,7 +164,6 @@ static void i2c_parport_attach (struct parport *port) /* Fill the rest of the structure */ adapter->adapter.owner = THIS_MODULE; adapter->adapter.class = I2C_CLASS_HWMON; - adapter->adapter.id = I2C_HW_B_LP; strlcpy(adapter->adapter.name, "Parallel port adapter", sizeof(adapter->adapter.name)); adapter->algo_data = parport_algo_data; diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index 9eb76268ec78..4aa8138cb0a9 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -101,7 +101,6 @@ static struct i2c_algo_pca_data pca_isa_data = { static struct i2c_adapter pca_isa_ops = { .owner = THIS_MODULE, - .id = I2C_HW_A_ISA, .algo_data = &pca_isa_data, .name = "PCA9564 ISA Adapter", .timeout = 100, diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index eaa9b387543e..761f9dd53620 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -403,7 +403,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter piix4_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_PIIX4, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sibyte.c b/drivers/i2c/busses/i2c-sibyte.c index 4ddefbf238e9..98b1ec489159 100644 --- a/drivers/i2c/busses/i2c-sibyte.c +++ b/drivers/i2c/busses/i2c-sibyte.c @@ -155,7 +155,6 @@ static struct i2c_algo_sibyte_data sibyte_board_data[2] = { static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, - .id = I2C_HW_SIBYTE, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[0], @@ -164,7 +163,6 @@ static struct i2c_adapter sibyte_board_adapter[2] = { }, { .owner = THIS_MODULE, - .id = I2C_HW_SIBYTE, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[1], diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c index 8ce2daff985c..f320ab27da46 100644 --- a/drivers/i2c/busses/i2c-sis5595.c +++ b/drivers/i2c/busses/i2c-sis5595.c @@ -365,7 +365,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis5595_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_SIS5595, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index 9c9c016ff2b5..50c3610e6028 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -464,7 +464,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis630_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_SIS630, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis96x.c b/drivers/i2c/busses/i2c-sis96x.c index f1bba6396641..7e1594b40579 100644 --- a/drivers/i2c/busses/i2c-sis96x.c +++ b/drivers/i2c/busses/i2c-sis96x.c @@ -241,7 +241,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis96x_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_SIS96X, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-via.c b/drivers/i2c/busses/i2c-via.c index 29cef0433f34..8b24f192103a 100644 --- a/drivers/i2c/busses/i2c-via.c +++ b/drivers/i2c/busses/i2c-via.c @@ -83,7 +83,6 @@ static struct i2c_algo_bit_data bit_data = { static struct i2c_adapter vt586b_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_B_VIA, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .name = "VIA i2c", .algo_data = &bit_data, diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index 9f194d9efd91..02e6f724b05f 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -321,7 +321,6 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter vt596_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_SMBUS_VIA2, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-voodoo3.c b/drivers/i2c/busses/i2c-voodoo3.c index 1d4ae26ba73d..1a474acc0ddd 100644 --- a/drivers/i2c/busses/i2c-voodoo3.c +++ b/drivers/i2c/busses/i2c-voodoo3.c @@ -163,7 +163,6 @@ static struct i2c_algo_bit_data voo_i2c_bit_data = { static struct i2c_adapter voodoo3_i2c_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_B_VOO, .class = I2C_CLASS_TV_ANALOG, .name = "I2C Voodoo3/Banshee adapter", .algo_data = &voo_i2c_bit_data, @@ -180,7 +179,6 @@ static struct i2c_algo_bit_data voo_ddc_bit_data = { static struct i2c_adapter voodoo3_ddc_adapter = { .owner = THIS_MODULE, - .id = I2C_HW_B_VOO, .class = I2C_CLASS_DDC, .name = "DDC Voodoo3/Banshee adapter", .algo_data = &voo_ddc_bit_data, diff --git a/drivers/i2c/busses/scx200_acb.c b/drivers/i2c/busses/scx200_acb.c index ed794b145a11..648ecc6f60e6 100644 --- a/drivers/i2c/busses/scx200_acb.c +++ b/drivers/i2c/busses/scx200_acb.c @@ -440,7 +440,6 @@ static __init struct scx200_acb_iface *scx200_create_iface(const char *text, i2c_set_adapdata(adapter, iface); snprintf(adapter->name, sizeof(adapter->name), "%s ACB%d", text, index); adapter->owner = THIS_MODULE; - adapter->id = I2C_HW_SMBUS_SCX200; adapter->algo = &scx200_acb_algorithm; adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adapter->dev.parent = dev; diff --git a/drivers/i2c/busses/scx200_i2c.c b/drivers/i2c/busses/scx200_i2c.c index e4c98539c517..162b74a04886 100644 --- a/drivers/i2c/busses/scx200_i2c.c +++ b/drivers/i2c/busses/scx200_i2c.c @@ -82,7 +82,6 @@ static struct i2c_algo_bit_data scx200_i2c_data = { static struct i2c_adapter scx200_i2c_ops = { .owner = THIS_MODULE, .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, - .id = I2C_HW_B_SCX200, .algo_data = &scx200_i2c_data, .name = "NatSemi SCx200 I2C", }; diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c index dc15cadb06ef..38f712036201 100644 --- a/drivers/ieee1394/pcilynx.c +++ b/drivers/ieee1394/pcilynx.c @@ -1419,7 +1419,6 @@ static int __devinit add_card(struct pci_dev *dev, i2c_ad = kzalloc(sizeof(*i2c_ad), GFP_KERNEL); if (!i2c_ad) FAIL("failed to allocate I2C adapter memory"); - i2c_ad->id = I2C_HW_B_PCILYNX; strlcpy(i2c_ad->name, "PCILynx I2C", sizeof(i2c_ad->name)); i2c_adapter_data = bit_data; i2c_ad->algo_data = &i2c_adapter_data; diff --git a/drivers/video/aty/radeon_i2c.c b/drivers/video/aty/radeon_i2c.c index 2c5567175dca..359fc64e761a 100644 --- a/drivers/video/aty/radeon_i2c.c +++ b/drivers/video/aty/radeon_i2c.c @@ -72,7 +72,6 @@ static int radeon_setup_i2c_bus(struct radeon_i2c_chan *chan, const char *name) snprintf(chan->adapter.name, sizeof(chan->adapter.name), "radeonfb %s", name); chan->adapter.owner = THIS_MODULE; - chan->adapter.id = I2C_HW_B_RADEON; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->rinfo->pdev->dev; chan->algo.setsda = radeon_gpio_setsda; diff --git a/drivers/video/i810/i810-i2c.c b/drivers/video/i810/i810-i2c.c index 7787c3322ffb..9dd55e5324a1 100644 --- a/drivers/video/i810/i810-i2c.c +++ b/drivers/video/i810/i810-i2c.c @@ -90,7 +90,6 @@ static int i810_setup_i2c_bus(struct i810fb_i2c_chan *chan, const char *name) chan->adapter.owner = THIS_MODULE; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->par->dev->dev; - chan->adapter.id = I2C_HW_B_I810; chan->algo.setsda = i810i2c_setsda; chan->algo.setscl = i810i2c_setscl; chan->algo.getsda = i810i2c_getsda; diff --git a/drivers/video/intelfb/intelfb_i2c.c b/drivers/video/intelfb/intelfb_i2c.c index 5d896b81f4e0..b3065492bb20 100644 --- a/drivers/video/intelfb/intelfb_i2c.c +++ b/drivers/video/intelfb/intelfb_i2c.c @@ -111,7 +111,6 @@ static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, "intelfb %s", name); chan->adapter.class = class; chan->adapter.owner = THIS_MODULE; - chan->adapter.id = I2C_HW_B_INTELFB; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->dinfo->pdev->dev; chan->algo.setsda = intelfb_gpio_setsda; diff --git a/drivers/video/nvidia/nv_i2c.c b/drivers/video/nvidia/nv_i2c.c index 6fd7cb8f9b8e..6aaddb4f6788 100644 --- a/drivers/video/nvidia/nv_i2c.c +++ b/drivers/video/nvidia/nv_i2c.c @@ -87,7 +87,6 @@ static int nvidia_setup_i2c_bus(struct nvidia_i2c_chan *chan, const char *name, strcpy(chan->adapter.name, name); chan->adapter.owner = THIS_MODULE; - chan->adapter.id = I2C_HW_B_NVIDIA; chan->adapter.class = i2c_class; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->par->pci_dev->dev; diff --git a/drivers/video/savage/savagefb-i2c.c b/drivers/video/savage/savagefb-i2c.c index 783d4adffb93..574b29e9f8f2 100644 --- a/drivers/video/savage/savagefb-i2c.c +++ b/drivers/video/savage/savagefb-i2c.c @@ -137,7 +137,6 @@ static int savage_setup_i2c_bus(struct savagefb_i2c_chan *chan, if (chan->par) { strcpy(chan->adapter.name, name); chan->adapter.owner = THIS_MODULE; - chan->adapter.id = I2C_HW_B_SAVAGE; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->par->pcidev->dev; chan->algo.udelay = 10; diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 82b7eee22636..1ffc23bc5d1e 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -79,74 +79,23 @@ */ /* --- Bit algorithm adapters */ -#define I2C_HW_B_LP 0x010000 /* Parallel port Philips style */ #define I2C_HW_B_BT848 0x010005 /* BT848 video boards */ -#define I2C_HW_B_VIA 0x010007 /* Via vt82c586b */ -#define I2C_HW_B_HYDRA 0x010008 /* Apple Hydra Mac I/O */ -#define I2C_HW_B_I810 0x01000a /* Intel I810 */ -#define I2C_HW_B_VOO 0x01000b /* 3dfx Voodoo 3 / Banshee */ -#define I2C_HW_B_SCX200 0x01000e /* Nat'l Semi SCx200 I2C */ #define I2C_HW_B_RIVA 0x010010 /* Riva based graphics cards */ -#define I2C_HW_B_IOC 0x010011 /* IOC bit-wiggling */ -#define I2C_HW_B_IXP2000 0x010016 /* GPIO on IXP2000 systems */ #define I2C_HW_B_ZR36067 0x010019 /* Zoran-36057/36067 based boards */ -#define I2C_HW_B_PCILYNX 0x01001a /* TI PCILynx I2C adapter */ #define I2C_HW_B_CX2388x 0x01001b /* connexant 2388x based tv cards */ -#define I2C_HW_B_NVIDIA 0x01001c /* nvidia framebuffer driver */ -#define I2C_HW_B_SAVAGE 0x01001d /* savage framebuffer driver */ -#define I2C_HW_B_RADEON 0x01001e /* radeon framebuffer driver */ #define I2C_HW_B_EM28XX 0x01001f /* em28xx video capture cards */ #define I2C_HW_B_CX2341X 0x010020 /* Conexant CX2341X MPEG encoder cards */ -#define I2C_HW_B_INTELFB 0x010021 /* intel framebuffer driver */ #define I2C_HW_B_CX23885 0x010022 /* conexant 23885 based tv cards (bus1) */ #define I2C_HW_B_AU0828 0x010023 /* auvitek au0828 usb bridge */ -/* --- PCF 8584 based algorithms */ -#define I2C_HW_P_ELEK 0x020002 /* Elektor ISA Bus inteface card */ - -/* --- PCA 9564 based algorithms */ -#define I2C_HW_A_ISA 0x1a0000 /* generic ISA Bus interface card */ - -/* --- PowerPC on-chip adapters */ -#define I2C_HW_OCP 0x120000 /* IBM on-chip I2C adapter */ - -/* --- Broadcom SiByte adapters */ -#define I2C_HW_SIBYTE 0x150000 - /* --- SGI adapters */ #define I2C_HW_SGI_VINO 0x160000 -/* --- XSCALE on-chip adapters */ -#define I2C_HW_IOP3XX 0x140000 - -/* --- Au1550 PSC adapters adapters */ -#define I2C_HW_AU1550_PSC 0x1b0000 - /* --- SMBus only adapters */ -#define I2C_HW_SMBUS_PIIX4 0x040000 -#define I2C_HW_SMBUS_ALI15X3 0x040001 -#define I2C_HW_SMBUS_VIA2 0x040002 -#define I2C_HW_SMBUS_I801 0x040004 -#define I2C_HW_SMBUS_AMD756 0x040005 -#define I2C_HW_SMBUS_SIS5595 0x040006 -#define I2C_HW_SMBUS_ALI1535 0x040007 -#define I2C_HW_SMBUS_SIS630 0x040008 -#define I2C_HW_SMBUS_SIS96X 0x040009 -#define I2C_HW_SMBUS_AMD8111 0x04000a -#define I2C_HW_SMBUS_SCX200 0x04000b -#define I2C_HW_SMBUS_NFORCE2 0x04000c #define I2C_HW_SMBUS_W9968CF 0x04000d #define I2C_HW_SMBUS_OV511 0x04000e /* OV511(+) USB 1.1 webcam ICs */ #define I2C_HW_SMBUS_OV518 0x04000f /* OV518(+) USB 1.1 webcam ICs */ #define I2C_HW_SMBUS_CAFE 0x040012 /* Marvell 88ALP01 "CAFE" cam */ -#define I2C_HW_SMBUS_ALI1563 0x040013 - -/* --- MCP107 adapter */ -#define I2C_HW_MPC107 0x0d0000 - -/* --- Embedded adapters */ -#define I2C_HW_MV64XXX 0x190000 -#define I2C_HW_BLACKFIN 0x190001 /* ADI Blackfin I2C TWI driver */ /* --- Miscellaneous adapters */ #define I2C_HW_SAA7146 0x060000 /* SAA7146 video decoder bus */ -- cgit v1.2.3 From 5fb4523afbffae5a5cec4989ee4c9fbc3dbdef33 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 26 Jan 2009 21:19:57 +0100 Subject: i2c: Warn on deprecated binding model use Let the kernel developers know that i2c_attach_client() and i2c_detach_client() are deprecated and should no longer be used. Drivers using these should be converted to the standard device driver binding model (probe and remove methods.) Signed-off-by: Jean Delvare Acked-by: Ben Dooks --- include/linux/i2c.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 20873d402467..fcfbfea3af72 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -151,7 +151,7 @@ struct i2c_driver { * has been dynamically allocated by the driver in the function above, * it must be freed here. (LEGACY I2C DRIVERS ONLY) */ - int (*detach_client)(struct i2c_client *); + int (*detach_client)(struct i2c_client *) __deprecated; /* Standard driver model interfaces, for "new style" i2c drivers. * With the driver model, device enumeration is NEVER done by drivers; @@ -429,8 +429,10 @@ static inline int i2c_add_driver(struct i2c_driver *driver) return i2c_register_driver(THIS_MODULE, driver); } -extern int i2c_attach_client(struct i2c_client *); -extern int i2c_detach_client(struct i2c_client *); +/* These are deprecated, your driver should use the standard .probe() + * and .remove() methods instead. */ +extern int __deprecated i2c_attach_client(struct i2c_client *); +extern int __deprecated i2c_detach_client(struct i2c_client *); extern struct i2c_client *i2c_use_client(struct i2c_client *client); extern void i2c_release_client(struct i2c_client *client); -- cgit v1.2.3 From 3121a48d87a580f369eeb26aa0a075142274a353 Mon Sep 17 00:00:00 2001 From: Krzysztof Hałasa Date: Mon, 26 Jan 2009 12:30:12 -0800 Subject: net: Fix linux/if_frad.h's suitability for userspace. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The userspace interfaces are protected by CONFIG_* ifdefs and that of course can't work. Reported by Jaswinder Singh Rajput. Signed-off-by: Krzysztof Hałasa Signed-off-by: David S. Miller --- include/linux/if_frad.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 5c34240de746..60e16a551dd6 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -26,8 +26,6 @@ #include -#if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE) - /* Structures and constants associated with the DLCI device driver */ struct dlci_add @@ -127,6 +125,8 @@ struct frad_conf #ifdef __KERNEL__ +#if defined(CONFIG_DLCI) || defined(CONFIG_DLCI_MODULE) + /* these are the fields of an RFC 1490 header */ struct frhdr { @@ -190,12 +190,10 @@ struct frad_local int buffer; /* current buffer for S508 firmware */ }; -#endif /* __KERNEL__ */ - #endif /* CONFIG_DLCI || CONFIG_DLCI_MODULE */ -#ifdef __KERNEL__ extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); -#endif + +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 5ee810072175042775e39bdd3eaaa68884c27805 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 26 Jan 2009 19:21:26 -0800 Subject: Fix "multiple definition of `debugfs_create_size_t'" Introduced by 8adb711f3668b034e7b956fac951ed08b53e0d55 ("debugfs: introduce stub for debugfs_create_size_t() when DEBUG_FS=n") and due to a simple missing "static inline". Reported-and-tested-by: Jeff Chua Acked-by: Greg KH Signed-off-by: Linus Torvalds --- include/linux/debugfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 0f5c33b0bd3e..af0e01d4c663 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -162,7 +162,7 @@ static inline struct dentry *debugfs_create_x32(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } -struct dentry *debugfs_create_size_t(const char *name, mode_t mode, +static inline struct dentry *debugfs_create_size_t(const char *name, mode_t mode, struct dentry *parent, size_t *value) { -- cgit v1.2.3 From abfe2d7b915c872f3a1fd203267cedebf90daa45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Jan 2009 20:54:54 +0100 Subject: Hibernation: Introduce system_entering_hibernation Introduce boolean function system_entering_hibernation() returning 'true' during the last phase of hibernation, in which devices are being put into low power states and the sleep state (for example, ACPI S4) is finally entered. Some device drivers need such a function to check if the system is in the final phase of hibernation. In particular, some SATA drivers are going to use it for blacklisting systems in which the disks should not be spun down during the last phase of hibernation (the BIOS will do that anyway). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jeff Garzik --- include/linux/suspend.h | 2 ++ kernel/power/disk.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2b409c44db83..c7d9bb1832ba 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -237,6 +237,7 @@ extern int hibernate_nvs_alloc(void); extern void hibernate_nvs_free(void); extern void hibernate_nvs_save(void); extern void hibernate_nvs_restore(void); +extern bool system_entering_hibernation(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} @@ -252,6 +253,7 @@ static inline int hibernate_nvs_alloc(void) { return 0; } static inline void hibernate_nvs_free(void) {} static inline void hibernate_nvs_save(void) {} static inline void hibernate_nvs_restore(void) {} +static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_PM_SLEEP diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 45e8541ab7e3..432ee575c9ee 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops) mutex_unlock(&pm_mutex); } +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { @@ -411,6 +419,7 @@ int hibernation_platform_enter(void) if (error) goto Close; + entering_platform_hibernation = true; suspend_console(); error = device_suspend(PMSG_HIBERNATE); if (error) { @@ -445,6 +454,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: + entering_platform_hibernation = false; device_resume(PMSG_RESTORE); resume_console(); Close: -- cgit v1.2.3 From d7b1956fed33d30c4815e848fd7a143722916868 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Jan 2009 20:55:50 +0100 Subject: DMI: Introduce dmi_first_match to make the interface more flexible Some notebooks from HP have the problem that their BIOSes attempt to spin down hard drives before entering ACPI system states S4 and S5. This leads to a yo-yo effect during system power-off shutdown and the last phase of hibernation when the disk is first spun down by the kernel and then almost immediately turned on and off by the BIOS. This, in turn, may result in shortening the disk's life times. To prevent this from happening we can blacklist the affected systems using DMI information. However, only the on-board controlles should be blacklisted and their PCI slot numbers can be used for this purpose. Unfortunately the existing interface for checking DMI information of the system is not very convenient for this purpose, because to use it, we would have to define special callback functions or create a separate struct dmi_system_id table for each blacklisted system. To overcome this difficulty introduce a new function dmi_first_match() returning a pointer to the first entry in an array of struct dmi_system_id elements that matches the system DMI information. Then, we can use this pointer to access the entry's .driver_data field containing the additional information, such as the PCI slot number, allowing us to do the desired blacklisting. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jeff Garzik --- drivers/firmware/dmi_scan.c | 74 +++++++++++++++++++++++++++++++++------------ include/linux/dmi.h | 1 + 2 files changed, 56 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index d76adfea5df7..8f0f7c449305 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -414,6 +414,29 @@ void __init dmi_scan_machine(void) dmi_initialized = 1; } +/** + * dmi_matches - check if dmi_system_id structure matches system DMI data + * @dmi: pointer to the dmi_system_id structure to check + */ +static bool dmi_matches(const struct dmi_system_id *dmi) +{ + int i; + + WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n"); + + for (i = 0; i < ARRAY_SIZE(dmi->matches); i++) { + int s = dmi->matches[i].slot; + if (s == DMI_NONE) + continue; + if (dmi_ident[s] + && strstr(dmi_ident[s], dmi->matches[i].substr)) + continue; + /* No match */ + return false; + } + return true; +} + /** * dmi_check_system - check system DMI data * @list: array of dmi_system_id structures to match against @@ -429,31 +452,44 @@ void __init dmi_scan_machine(void) */ int dmi_check_system(const struct dmi_system_id *list) { - int i, count = 0; - const struct dmi_system_id *d = list; - - WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n"); - - while (d->ident) { - for (i = 0; i < ARRAY_SIZE(d->matches); i++) { - int s = d->matches[i].slot; - if (s == DMI_NONE) - continue; - if (dmi_ident[s] && strstr(dmi_ident[s], d->matches[i].substr)) - continue; - /* No match */ - goto fail; + int count = 0; + const struct dmi_system_id *d; + + for (d = list; d->ident; d++) + if (dmi_matches(d)) { + count++; + if (d->callback && d->callback(d)) + break; } - count++; - if (d->callback && d->callback(d)) - break; -fail: d++; - } return count; } EXPORT_SYMBOL(dmi_check_system); +/** + * dmi_first_match - find dmi_system_id structure matching system DMI data + * @list: array of dmi_system_id structures to match against + * All non-null elements of the list must match + * their slot's (field index's) data (i.e., each + * list string must be a substring of the specified + * DMI slot's string data) to be considered a + * successful match. + * + * Walk the blacklist table until the first match is found. Return the + * pointer to the matching entry or NULL if there's no match. + */ +const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list) +{ + const struct dmi_system_id *d; + + for (d = list; d->ident; d++) + if (dmi_matches(d)) + return d; + + return NULL; +} +EXPORT_SYMBOL(dmi_first_match); + /** * dmi_get_system_info - return DMI data value * @field: data index (see enum dmi_field) diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 34161907b2f8..aea23105d3ed 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -38,6 +38,7 @@ struct dmi_device { #ifdef CONFIG_DMI extern int dmi_check_system(const struct dmi_system_id *list); +const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); extern const struct dmi_device * dmi_find_device(int type, const char *name, const struct dmi_device *from); -- cgit v1.2.3 From 2a6e58d2731dcc05dafa7f976d935e0f0627fcd7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Jan 2009 20:56:43 +0100 Subject: SATA: Blacklisting of systems that spin off disks during ACPI power off Introduce new libata flags ATA_FLAG_NO_POWEROFF_SPINDOWN and ATA_FLAG_NO_HIBERNATE_SPINDOWN that, if set, will prevent disks from being spun off during system power off and hibernation, respectively (to handle the hibernation case we need the new system state SYSTEM_HIBERNATE_ENTER that can be checked against by libata, in analogy with SYSTEM_POWER_OFF). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jeff Garzik --- drivers/ata/libata-scsi.c | 20 +++++++++++++++++--- include/linux/libata.h | 2 ++ 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index a1a6e6298c33..3c4c5ae277ba 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "libata.h" @@ -1303,6 +1304,17 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc) tf->command = ATA_CMD_VERIFY; /* READ VERIFY */ } else { + /* Some odd clown BIOSen issue spindown on power off (ACPI S4 + * or S5) causing some drives to spin up and down again. + */ + if ((qc->ap->flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) && + system_state == SYSTEM_POWER_OFF) + goto skip; + + if ((qc->ap->flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) && + system_entering_hibernation()) + goto skip; + /* XXX: This is for backward compatibility, will be * removed. Read Documentation/feature-removal-schedule.txt * for more info. @@ -1326,8 +1338,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc) scmd->scsi_done = qc->scsidone; qc->scsidone = ata_delayed_done; } - scmd->result = SAM_STAT_GOOD; - return 1; + goto skip; } /* Issue ATA STANDBY IMMEDIATE command */ @@ -1343,10 +1354,13 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc) return 0; -invalid_fld: + invalid_fld: ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x0); /* "Invalid field in cbd" */ return 1; + skip: + scmd->result = SAM_STAT_GOOD; + return 1; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 2c6bd66209ff..bca3ba25f52a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -187,6 +187,8 @@ enum { ATA_FLAG_PIO_POLLING = (1 << 9), /* use polling PIO if LLD * doesn't handle PIO interrupts */ ATA_FLAG_NCQ = (1 << 10), /* host supports NCQ */ + ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */ + ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */ ATA_FLAG_DEBUGMSG = (1 << 13), ATA_FLAG_IGN_SIMPLEX = (1 << 15), /* ignore SIMPLEX */ ATA_FLAG_NO_IORDY = (1 << 16), /* controller lacks iordy */ -- cgit v1.2.3 From 57064d213d2e44654d4f13c66df135b5e7389a26 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Fri, 23 Jan 2009 12:43:38 -0800 Subject: PCI: irq and pci_ids patch for Intel Tigerpoint DeviceIDs This patch adds the Intel Tigerpoint LPC Controller DeviceIDs. Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 4064345cf144..fecbce6e7d7c 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -572,6 +572,7 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH7_1: case PCI_DEVICE_ID_INTEL_ICH7_30: case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_TGP_LPC: case PCI_DEVICE_ID_INTEL_ESB2_0: case PCI_DEVICE_ID_INTEL_ICH8_0: case PCI_DEVICE_ID_INTEL_ICH8_1: diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d56ad9c21c09..6f260b50253b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2421,6 +2421,7 @@ #define PCI_DEVICE_ID_INTEL_ICH7_0 0x27b8 #define PCI_DEVICE_ID_INTEL_ICH7_1 0x27b9 #define PCI_DEVICE_ID_INTEL_ICH7_30 0x27b0 +#define PCI_DEVICE_ID_INTEL_TGP_LPC 0x27bc #define PCI_DEVICE_ID_INTEL_ICH7_31 0x27bd #define PCI_DEVICE_ID_INTEL_ICH7_17 0x27da #define PCI_DEVICE_ID_INTEL_ICH7_19 0x27dd -- cgit v1.2.3 From 1cf3eb2ff6b0844c678f2f48d0053b9d12b7da67 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 27 Jan 2009 23:48:59 +0200 Subject: kmalloc: return NULL instead of link failure The SLAB kmalloc with a constant value isn't consistent with the other implementations because it bails out with __you_cannot_kmalloc_that_much rather than returning NULL and properly allowing the caller to fall back to vmalloc or take other action. This doesn't happen with a non-constant value or with SLOB or SLUB. Starting with 2.6.28, I've been seeing build failures on s390x. This is due to init_section_page_cgroup trying to allocate 2.5MB when the max size for a kmalloc on s390x is 2MB. It's failing because the value is constant. The workarounds at the call size are ugly and the caller shouldn't have to change behavior depending on what the backend of the API is. So, this patch eliminates the link failure and returns NULL like the other implementations. Signed-off-by: Jeff Mahoney Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Cc: Nick Piggin Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5eb8ebe..6ca6a7b66d75 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -43,10 +43,7 @@ static inline void *kmalloc(size_t size, gfp_t flags) i++; #include #undef CACHE - { - extern void __you_cannot_kmalloc_that_much(void); - __you_cannot_kmalloc_that_much(); - } + return NULL; found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) @@ -77,10 +74,7 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) i++; #include #undef CACHE - { - extern void __you_cannot_kmalloc_that_much(void); - __you_cannot_kmalloc_that_much(); - } + return NULL; found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) -- cgit v1.2.3 From 40413dcb7b273bda681dca38e6ff0bbb3728ef11 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 22 Jan 2009 01:58:36 +0300 Subject: Fix longstanding "error: storage size of '__mod_dmi_device_table' isn't known" gcc 3.4.6 doesn't like MODULE_DEVICE_TABLE(dmi, x) expansion enough to error out. Shut it up in a most simple way. Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- include/linux/mod_devicetable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 97b91d1abb43..fde86671f48f 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -443,6 +443,13 @@ struct dmi_system_id { struct dmi_strmatch matches[4]; void *driver_data; }; +/* + * struct dmi_device_id appears during expansion of + * "MODULE_DEVICE_TABLE(dmi, x)". Compiler doesn't look inside it + * but this is enough for gcc 3.4.6 to error out: + * error: storage size of '__mod_dmi_device_table' isn't known + */ +#define dmi_device_id dmi_system_id #endif #define DMI_MATCH(a, b) { a, b } -- cgit v1.2.3 From 11e76ae0f3a82bbb6c06df8af2167af8b96a0584 Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Thu, 8 Jan 2009 12:52:19 -0800 Subject: USB: add kernel-doc for wusb_dev in struct usb_device Reported by Randy Dunlap from a warning on the v2.6.29 merge window. Signed-off-by: Inaky Perez-Gonzalez Cc: David Vrabel Cc: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 85ee9be9361e..88079fd60235 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -418,6 +418,8 @@ struct usb_tt; * @autosuspend_disabled: autosuspend disabled by the user * @autoresume_disabled: autoresume disabled by the user * @skip_sys_resume: skip the next system resume + * @wusb_dev: if this is a Wireless USB device, link to the WUSB + * specific data for the device. * * Notes: * Usbcore drivers should not set usbdev->state directly. Instead use -- cgit v1.2.3 From d8204ee2ad1c9babd7e33d4c118ec99a78a8442e Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Wed, 28 Jan 2009 00:07:20 -0600 Subject: dmi: Fix build breakage Commit d7b1956fed33d30c4815e848fd7a143722916868 ("DMI: Introduce dmi_first_match to make the interface more flexible") introduced compile errors like the following when !CONFIG_DMI drivers/ata/sata_sil.c: In function 'sil_broken_system_poweroff': drivers/ata/sata_sil.c:713: error: implicit declaration of function 'dmi_first_match' drivers/ata/sata_sil.c:713: warning: initialization makes pointer from integer without a cast We just need a dummy version of dmi_first_match() to fix this all up. Signed-off-by: Kumar Gala Signed-off-by: Linus Torvalds --- include/linux/dmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index aea23105d3ed..d741b9ceb0e0 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -65,6 +65,8 @@ static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } static inline bool dmi_match(enum dmi_field f, const char *str) { return false; } +static inline const struct dmi_system_id * + dmi_first_match(const struct dmi_system_id *list) { return NULL; } #endif -- cgit v1.2.3 From dc19835df6c47ff676ad6c98722d5e529db5d74c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 29 Jan 2009 14:25:08 -0800 Subject: kprobes: fix module compilation error with CONFIG_KPROBES=n Define kprobes related data structures even if CONFIG_KPROBES is not set. This fixes compilation errors which occur if CONFIG_KPROBES is not set, in kprobe using modules. [akpm@linux-foundation.org: fix build for non-kprobes-supporting architectures] Reviewed-by: Ananth N Mavinakayanahalli Signed-off-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index d6ea19e314bb..32851eef48f0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -49,6 +49,13 @@ /* Attach to insert probes on any functions which should be ignored*/ #define __kprobes __attribute__((__section__(".kprobes.text"))) notrace +#else /* CONFIG_KPROBES */ +typedef int kprobe_opcode_t; +struct arch_specific_insn { + int dummy; +}; +#define __kprobes notrace +#endif /* CONFIG_KPROBES */ struct kprobe; struct pt_regs; @@ -131,23 +138,6 @@ struct jprobe { /* For backward compatibility with old code using JPROBE_ENTRY() */ #define JPROBE_ENTRY(handler) (handler) -DECLARE_PER_CPU(struct kprobe *, current_kprobe); -DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -#ifdef CONFIG_KRETPROBES -extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs); -extern int arch_trampoline_kprobe(struct kprobe *p); -#else /* CONFIG_KRETPROBES */ -static inline void arch_prepare_kretprobe(struct kretprobe *rp, - struct pt_regs *regs) -{ -} -static inline int arch_trampoline_kprobe(struct kprobe *p) -{ - return 0; -} -#endif /* CONFIG_KRETPROBES */ /* * Function-return probe - * Note: @@ -188,6 +178,25 @@ struct kprobe_blackpoint { unsigned long range; }; +#ifdef CONFIG_KPROBES +DECLARE_PER_CPU(struct kprobe *, current_kprobe); +DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#ifdef CONFIG_KRETPROBES +extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs); +extern int arch_trampoline_kprobe(struct kprobe *p); +#else /* CONFIG_KRETPROBES */ +static inline void arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ +} +static inline int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} +#endif /* CONFIG_KRETPROBES */ + extern struct kretprobe_blackpoint kretprobe_blacklist[]; static inline void kretprobe_assert(struct kretprobe_instance *ri, @@ -264,10 +273,6 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); #else /* CONFIG_KPROBES */ -#define __kprobes notrace -struct jprobe; -struct kretprobe; - static inline struct kprobe *get_kprobe(void *addr) { return NULL; -- cgit v1.2.3 From e5d9a90c36e05dd080704ea58328c00f64facdc1 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 29 Jan 2009 14:25:18 -0800 Subject: alpha: use syscall wrappers Convert OSF syscalls and add alpha specific SYSCALL_ALIAS() macro. Signed-off-by: Ivan Kokshaysky Cc: Richard Henderson Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 1 + arch/alpha/kernel/entry.S | 2 +- arch/alpha/kernel/osf_sys.c | 113 ++++++++++++++++++++------------------------ arch/alpha/kernel/signal.c | 18 +++---- arch/alpha/kernel/systbls.S | 52 ++++++++++---------- include/linux/syscalls.h | 5 ++ 6 files changed, 92 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 6110197757a3..9fb8aae5c391 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -8,6 +8,7 @@ config ALPHA select HAVE_AOUT select HAVE_IDE select HAVE_OPROFILE + select HAVE_SYSCALL_WRAPPERS help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index aa2e50cf9857..e4a54b615894 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -933,7 +933,7 @@ sys_execve: osf_sigprocmask: .prologue 0 mov $sp, $18 - jmp $31, do_osf_sigprocmask + jmp $31, sys_osf_sigprocmask .end osf_sigprocmask .align 4 diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 18a3ea1aac51..ae41f097864b 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -54,8 +54,7 @@ extern int do_pipe(int *); * identical to OSF as we don't return 0 on success, but doing otherwise * would require changes to libc. Hopefully this is good enough. */ -asmlinkage unsigned long -osf_brk(unsigned long brk) +SYSCALL_DEFINE1(osf_brk, unsigned long, brk) { unsigned long retval = sys_brk(brk); if (brk && brk != retval) @@ -66,9 +65,9 @@ osf_brk(unsigned long brk) /* * This is pure guess-work.. */ -asmlinkage int -osf_set_program_attributes(unsigned long text_start, unsigned long text_len, - unsigned long bss_start, unsigned long bss_len) +SYSCALL_DEFINE4(osf_set_program_attributes, unsigned long, text_start, + unsigned long, text_len, unsigned long, bss_start, + unsigned long, bss_len) { struct mm_struct *mm; @@ -146,9 +145,9 @@ Efault: return -EFAULT; } -asmlinkage int -osf_getdirentries(unsigned int fd, struct osf_dirent __user *dirent, - unsigned int count, long __user *basep) +SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd, + struct osf_dirent __user *, dirent, unsigned int, count, + long __user *, basep) { int error; struct file *file; @@ -177,9 +176,9 @@ osf_getdirentries(unsigned int fd, struct osf_dirent __user *dirent, #undef NAME_OFFSET -asmlinkage unsigned long -osf_mmap(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(osf_mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, unsigned long, fd, + unsigned long, off) { struct file *file = NULL; unsigned long ret = -EBADF; @@ -254,8 +253,8 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, return error; } -asmlinkage int -osf_statfs(char __user *pathname, struct osf_statfs __user *buffer, unsigned long bufsiz) +SYSCALL_DEFINE3(osf_statfs, char __user *, pathname, + struct osf_statfs __user *, buffer, unsigned long, bufsiz) { struct path path; int retval; @@ -268,8 +267,8 @@ osf_statfs(char __user *pathname, struct osf_statfs __user *buffer, unsigned lon return retval; } -asmlinkage int -osf_fstatfs(unsigned long fd, struct osf_statfs __user *buffer, unsigned long bufsiz) +SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd, + struct osf_statfs __user *, buffer, unsigned long, bufsiz) { struct file *file; int retval; @@ -368,8 +367,8 @@ osf_procfs_mount(char *dirname, struct procfs_args __user *args, int flags) return do_mount("", dirname, "proc", flags, NULL); } -asmlinkage int -osf_mount(unsigned long typenr, char __user *path, int flag, void __user *data) +SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, + int, flag, void __user *, data) { int retval = -EINVAL; char *name; @@ -399,8 +398,7 @@ osf_mount(unsigned long typenr, char __user *path, int flag, void __user *data) return retval; } -asmlinkage int -osf_utsname(char __user *name) +SYSCALL_DEFINE1(osf_utsname, char __user *, name) { int error; @@ -423,14 +421,12 @@ osf_utsname(char __user *name) return error; } -asmlinkage unsigned long -sys_getpagesize(void) +SYSCALL_DEFINE0(getpagesize) { return PAGE_SIZE; } -asmlinkage unsigned long -sys_getdtablesize(void) +SYSCALL_DEFINE0(getdtablesize) { return sysctl_nr_open; } @@ -438,8 +434,7 @@ sys_getdtablesize(void) /* * For compatibility with OSF/1 only. Use utsname(2) instead. */ -asmlinkage int -osf_getdomainname(char __user *name, int namelen) +SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen) { unsigned len; int i; @@ -527,8 +522,8 @@ enum pl_code { PL_DEL = 5, PL_FDEL = 6 }; -asmlinkage long -osf_proplist_syscall(enum pl_code code, union pl_args __user *args) +SYSCALL_DEFINE2(osf_proplist_syscall, enum pl_code, code, + union pl_args __user *, args) { long error; int __user *min_buf_size_ptr; @@ -567,8 +562,8 @@ osf_proplist_syscall(enum pl_code code, union pl_args __user *args) return error; } -asmlinkage int -osf_sigstack(struct sigstack __user *uss, struct sigstack __user *uoss) +SYSCALL_DEFINE2(osf_sigstack, struct sigstack __user *, uss, + struct sigstack __user *, uoss) { unsigned long usp = rdusp(); unsigned long oss_sp = current->sas_ss_sp + current->sas_ss_size; @@ -608,8 +603,7 @@ osf_sigstack(struct sigstack __user *uss, struct sigstack __user *uoss) return error; } -asmlinkage long -osf_sysinfo(int command, char __user *buf, long count) +SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count) { char *sysinfo_table[] = { utsname()->sysname, @@ -647,9 +641,8 @@ osf_sysinfo(int command, char __user *buf, long count) return err; } -asmlinkage unsigned long -osf_getsysinfo(unsigned long op, void __user *buffer, unsigned long nbytes, - int __user *start, void __user *arg) +SYSCALL_DEFINE5(osf_getsysinfo, unsigned long, op, void __user *, buffer, + unsigned long, nbytes, int __user *, start, void __user *, arg) { unsigned long w; struct percpu_struct *cpu; @@ -705,9 +698,8 @@ osf_getsysinfo(unsigned long op, void __user *buffer, unsigned long nbytes, return -EOPNOTSUPP; } -asmlinkage unsigned long -osf_setsysinfo(unsigned long op, void __user *buffer, unsigned long nbytes, - int __user *start, void __user *arg) +SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer, + unsigned long, nbytes, int __user *, start, void __user *, arg) { switch (op) { case SSI_IEEE_FP_CONTROL: { @@ -880,8 +872,8 @@ jiffies_to_timeval32(unsigned long jiffies, struct timeval32 *value) value->tv_sec = jiffies / HZ; } -asmlinkage int -osf_gettimeofday(struct timeval32 __user *tv, struct timezone __user *tz) +SYSCALL_DEFINE2(osf_gettimeofday, struct timeval32 __user *, tv, + struct timezone __user *, tz) { if (tv) { struct timeval ktv; @@ -896,8 +888,8 @@ osf_gettimeofday(struct timeval32 __user *tv, struct timezone __user *tz) return 0; } -asmlinkage int -osf_settimeofday(struct timeval32 __user *tv, struct timezone __user *tz) +SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, + struct timezone __user *, tz) { struct timespec kts; struct timezone ktz; @@ -916,8 +908,7 @@ osf_settimeofday(struct timeval32 __user *tv, struct timezone __user *tz) return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } -asmlinkage int -osf_getitimer(int which, struct itimerval32 __user *it) +SYSCALL_DEFINE2(osf_getitimer, int, which, struct itimerval32 __user *, it) { struct itimerval kit; int error; @@ -929,8 +920,8 @@ osf_getitimer(int which, struct itimerval32 __user *it) return error; } -asmlinkage int -osf_setitimer(int which, struct itimerval32 __user *in, struct itimerval32 __user *out) +SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in, + struct itimerval32 __user *, out) { struct itimerval kin, kout; int error; @@ -952,8 +943,8 @@ osf_setitimer(int which, struct itimerval32 __user *in, struct itimerval32 __use } -asmlinkage int -osf_utimes(char __user *filename, struct timeval32 __user *tvs) +SYSCALL_DEFINE2(osf_utimes, char __user *, filename, + struct timeval32 __user *, tvs) { struct timespec tv[2]; @@ -979,9 +970,8 @@ osf_utimes(char __user *filename, struct timeval32 __user *tvs) #define MAX_SELECT_SECONDS \ ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -asmlinkage int -osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, - struct timeval32 __user *tvp) +SYSCALL_DEFINE5(osf_select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval32 __user *, tvp) { struct timespec end_time, *to = NULL; if (tvp) { @@ -1026,8 +1016,7 @@ struct rusage32 { long ru_nivcsw; /* involuntary " */ }; -asmlinkage int -osf_getrusage(int who, struct rusage32 __user *ru) +SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) { struct rusage32 r; @@ -1053,9 +1042,8 @@ osf_getrusage(int who, struct rusage32 __user *ru) return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } -asmlinkage long -osf_wait4(pid_t pid, int __user *ustatus, int options, - struct rusage32 __user *ur) +SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options, + struct rusage32 __user *, ur) { struct rusage r; long ret, err; @@ -1101,8 +1089,8 @@ osf_wait4(pid_t pid, int __user *ustatus, int options, * seems to be a timeval pointer, and I suspect the second * one is the time remaining.. Ho humm.. No documentation. */ -asmlinkage int -osf_usleep_thread(struct timeval32 __user *sleep, struct timeval32 __user *remain) +SYSCALL_DEFINE2(osf_usleep_thread, struct timeval32 __user *, sleep, + struct timeval32 __user *, remain) { struct timeval tmp; unsigned long ticks; @@ -1155,8 +1143,7 @@ struct timex32 { int :32; int :32; int :32; int :32; }; -asmlinkage int -sys_old_adjtimex(struct timex32 __user *txc_p) +SYSCALL_DEFINE1(old_adjtimex, struct timex32 __user *, txc_p) { struct timex txc; int ret; @@ -1267,8 +1254,8 @@ osf_fix_iov_len(const struct iovec __user *iov, unsigned long count) return 0; } -asmlinkage ssize_t -osf_readv(unsigned long fd, const struct iovec __user * vector, unsigned long count) +SYSCALL_DEFINE3(osf_readv, unsigned long, fd, + const struct iovec __user *, vector, unsigned long, count) { if (unlikely(personality(current->personality) == PER_OSF4)) if (osf_fix_iov_len(vector, count)) @@ -1276,8 +1263,8 @@ osf_readv(unsigned long fd, const struct iovec __user * vector, unsigned long co return sys_readv(fd, vector, count); } -asmlinkage ssize_t -osf_writev(unsigned long fd, const struct iovec __user * vector, unsigned long count) +SYSCALL_DEFINE3(osf_writev, unsigned long, fd, + const struct iovec __user *, vector, unsigned long, count) { if (unlikely(personality(current->personality) == PER_OSF4)) if (osf_fix_iov_len(vector, count)) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 410af4f3140e..df65eaa84c4c 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -51,8 +52,8 @@ static void do_signal(struct pt_regs *, struct switch_stack *, * Note that we don't need to acquire the kernel lock for SMP * operation, as all of this is local to this thread. */ -asmlinkage unsigned long -do_osf_sigprocmask(int how, unsigned long newmask, struct pt_regs *regs) +SYSCALL_DEFINE3(osf_sigprocmask, int, how, unsigned long, newmask, + struct pt_regs *, regs) { unsigned long oldmask = -EINVAL; @@ -81,9 +82,9 @@ do_osf_sigprocmask(int how, unsigned long newmask, struct pt_regs *regs) return oldmask; } -asmlinkage int -osf_sigaction(int sig, const struct osf_sigaction __user *act, - struct osf_sigaction __user *oact) +SYSCALL_DEFINE3(osf_sigaction, int, sig, + const struct osf_sigaction __user *, act, + struct osf_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; @@ -112,10 +113,9 @@ osf_sigaction(int sig, const struct osf_sigaction __user *act, return ret; } -asmlinkage long -sys_rt_sigaction(int sig, const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize, void __user *restorer) +SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize, void __user *, restorer) { struct k_sigaction new_ka, old_ka; int ret; diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index 9d9e3a98bb95..95c9aef1c106 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -17,7 +17,7 @@ sys_call_table: .quad sys_write .quad alpha_ni_syscall /* 5 */ .quad sys_close - .quad osf_wait4 + .quad sys_osf_wait4 .quad alpha_ni_syscall .quad sys_link .quad sys_unlink /* 10 */ @@ -27,11 +27,11 @@ sys_call_table: .quad sys_mknod .quad sys_chmod /* 15 */ .quad sys_chown - .quad osf_brk + .quad sys_osf_brk .quad alpha_ni_syscall .quad sys_lseek .quad sys_getxpid /* 20 */ - .quad osf_mount + .quad sys_osf_mount .quad sys_umount .quad sys_setuid .quad sys_getxuid @@ -53,7 +53,7 @@ sys_call_table: .quad alpha_ni_syscall /* 40 */ .quad sys_dup .quad sys_alpha_pipe - .quad osf_set_program_attributes + .quad sys_osf_set_program_attributes .quad alpha_ni_syscall .quad sys_open /* 45 */ .quad alpha_ni_syscall @@ -81,7 +81,7 @@ sys_call_table: .quad sys_newlstat .quad alpha_ni_syscall .quad alpha_ni_syscall /* 70 */ - .quad osf_mmap + .quad sys_osf_mmap .quad alpha_ni_syscall .quad sys_munmap .quad sys_mprotect @@ -94,17 +94,17 @@ sys_call_table: .quad sys_setgroups /* 80 */ .quad alpha_ni_syscall .quad sys_setpgid - .quad osf_setitimer + .quad sys_osf_setitimer .quad alpha_ni_syscall .quad alpha_ni_syscall /* 85 */ - .quad osf_getitimer + .quad sys_osf_getitimer .quad sys_gethostname .quad sys_sethostname .quad sys_getdtablesize .quad sys_dup2 /* 90 */ .quad sys_newfstat .quad sys_fcntl - .quad osf_select + .quad sys_osf_select .quad sys_poll .quad sys_fsync /* 95 */ .quad sys_setpriority @@ -123,22 +123,22 @@ sys_call_table: .quad alpha_ni_syscall .quad alpha_ni_syscall /* 110 */ .quad sys_sigsuspend - .quad osf_sigstack + .quad sys_osf_sigstack .quad sys_recvmsg .quad sys_sendmsg .quad alpha_ni_syscall /* 115 */ - .quad osf_gettimeofday - .quad osf_getrusage + .quad sys_osf_gettimeofday + .quad sys_osf_getrusage .quad sys_getsockopt .quad alpha_ni_syscall #ifdef CONFIG_OSF4_COMPAT - .quad osf_readv /* 120 */ - .quad osf_writev + .quad sys_osf_readv /* 120 */ + .quad sys_osf_writev #else .quad sys_readv /* 120 */ .quad sys_writev #endif - .quad osf_settimeofday + .quad sys_osf_settimeofday .quad sys_fchown .quad sys_fchmod .quad sys_recvfrom /* 125 */ @@ -154,7 +154,7 @@ sys_call_table: .quad sys_socketpair /* 135 */ .quad sys_mkdir .quad sys_rmdir - .quad osf_utimes + .quad sys_osf_utimes .quad alpha_ni_syscall .quad alpha_ni_syscall /* 140 */ .quad sys_getpeername @@ -172,16 +172,16 @@ sys_call_table: .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall /* 155 */ - .quad osf_sigaction + .quad sys_osf_sigaction .quad alpha_ni_syscall .quad alpha_ni_syscall - .quad osf_getdirentries - .quad osf_statfs /* 160 */ - .quad osf_fstatfs + .quad sys_osf_getdirentries + .quad sys_osf_statfs /* 160 */ + .quad sys_osf_fstatfs .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall - .quad osf_getdomainname /* 165 */ + .quad sys_osf_getdomainname /* 165 */ .quad sys_setdomainname .quad alpha_ni_syscall .quad alpha_ni_syscall @@ -224,7 +224,7 @@ sys_call_table: .quad sys_semctl .quad sys_semget /* 205 */ .quad sys_semop - .quad osf_utsname + .quad sys_osf_utsname .quad sys_lchown .quad sys_shmat .quad sys_shmctl /* 210 */ @@ -258,23 +258,23 @@ sys_call_table: .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall /* 240 */ - .quad osf_sysinfo + .quad sys_osf_sysinfo .quad alpha_ni_syscall .quad alpha_ni_syscall - .quad osf_proplist_syscall + .quad sys_osf_proplist_syscall .quad alpha_ni_syscall /* 245 */ .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall /* 250 */ - .quad osf_usleep_thread + .quad sys_osf_usleep_thread .quad alpha_ni_syscall .quad alpha_ni_syscall .quad sys_sysfs .quad alpha_ni_syscall /* 255 */ - .quad osf_getsysinfo - .quad osf_setsysinfo + .quad sys_osf_getsysinfo + .quad sys_osf_setsysinfo .quad alpha_ni_syscall .quad alpha_ni_syscall .quad alpha_ni_syscall /* 260 */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 16875f89e6a7..0eda02ff2414 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -108,9 +108,14 @@ struct old_linux_dirent; asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) #else +#ifdef CONFIG_ALPHA +#define SYSCALL_ALIAS(alias, name) \ + asm ( #alias " = " #name "\n\t.globl " #alias) +#else #define SYSCALL_ALIAS(alias, name) \ asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) #endif +#endif #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -- cgit v1.2.3 From 804b3c28a4e4fa1c224571bf76edb534b9c4b1ed Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 29 Jan 2009 14:25:21 -0800 Subject: cgroups: add cpu_relax() calls in css_tryget() and cgroup_clear_css_refs() css_tryget() and cgroup_clear_css_refs() contain polling loops; these loops should have cpu_relax calls in them to reduce cross-cache traffic. Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e267e62827bb..e4e8e117d27d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -99,6 +99,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) while (!atomic_inc_not_zero(&css->refcnt)) { if (test_bit(CSS_REMOVED, &css->flags)) return false; + cpu_relax(); } return true; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0066092de19a..492215d67fa5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2509,7 +2509,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int refcnt; - do { + while (1) { /* We can only remove a CSS with a refcnt==1 */ refcnt = atomic_read(&css->refcnt); if (refcnt > 1) { @@ -2523,7 +2523,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) * css_tryget() to spin until we set the * CSS_REMOVED bits or abort */ - } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) + break; + cpu_relax(); + } } done: for_each_subsys(cgrp->root, ss) { -- cgit v1.2.3 From 9df04e1f25effde823a600e755b51475d438f56b Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 29 Jan 2009 14:25:26 -0800 Subject: epoll: drop max_user_instances and rely only on max_user_watches Linus suggested to put limits where the money is, and max_user_watches already does that w/out the need of max_user_instances. That has the advantage to mitigate the potential DoS while allowing pretty generous default behavior. Allowing top 4% of low memory (per user) to be allocated in epoll watches, we have: LOMEM MAX_WATCHES (per user) 512MB ~178000 1GB ~356000 2GB ~712000 A box with 512MB of lomem, will meet some challenge in hitting 180K watches, socket buffers math teaches us. No more max_user_instances limits then. Signed-off-by: Davide Libenzi Cc: Willy Tarreau Cc: Michael Kerrisk Cc: Bron Gondwana Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 22 ++++------------------ include/linux/sched.h | 1 - 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ba2f9ec71192..011b9b8c90c6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -234,8 +234,6 @@ struct ep_pqueue { /* * Configuration options available inside /proc/sys/fs/epoll/ */ -/* Maximum number of epoll devices, per user */ -static int max_user_instances __read_mostly; /* Maximum number of epoll watched descriptors, per user */ static int max_user_watches __read_mostly; @@ -260,14 +258,6 @@ static struct kmem_cache *pwq_cache __read_mostly; static int zero; ctl_table epoll_table[] = { - { - .procname = "max_user_instances", - .data = &max_user_instances, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero, - }, { .procname = "max_user_watches", .data = &max_user_watches, @@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); - atomic_dec(&ep->user->epoll_devs); free_uid(ep->user); kfree(ep); } @@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep) struct eventpoll *ep; user = get_current_user(); - error = -EMFILE; - if (unlikely(atomic_read(&user->epoll_devs) >= - max_user_instances)) - goto free_uid; error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) @@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); - atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void) struct sysinfo si; si_meminfo(&si); - max_user_instances = 128; - max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + /* + * Allows top 4% of lomem to be allocated for epoll watches (per user). + */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 02e16d207304..5a7c76388731 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,7 +630,6 @@ struct user_struct { atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif #ifdef CONFIG_EPOLL - atomic_t epoll_devs; /* The number of epoll descriptors currently open */ atomic_t epoll_watches; /* The number of file descriptors currently watched */ #endif #ifdef CONFIG_POSIX_MQUEUE -- cgit v1.2.3 From 7b24fc4d7eb611da367dea3aad45473050aacd6c Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sun, 4 Jan 2009 02:43:38 -0500 Subject: block: Don't verify integrity metadata on read error If we get an I/O error on a read request there is no point in doing a verify pass on the integrity buffer. Adjust the completion path accordingly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 25 +++++++++++++++---------- include/linux/bio.h | 1 - 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 77ebc3c263d6..8396d741f804 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -465,7 +465,7 @@ static int bio_integrity_verify(struct bio *bio) if (ret) { kunmap_atomic(kaddr, KM_USER0); - break; + return ret; } sectors = bv->bv_len / bi->sector_size; @@ -493,18 +493,13 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - int error = bip->bip_error; + int error; - if (bio_integrity_verify(bio)) { - clear_bit(BIO_UPTODATE, &bio->bi_flags); - error = -EIO; - } + error = bio_integrity_verify(bio); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - - if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio_endio(bio, error); } /** @@ -525,7 +520,17 @@ void bio_integrity_endio(struct bio *bio, int error) BUG_ON(bip->bip_bio != bio); - bip->bip_error = error; + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. + */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 18462c5b8fff..18fc4a281a7b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -312,7 +312,6 @@ struct bio_integrity_payload { void *bip_buf; /* generated integrity data */ bio_end_io_t *bip_end_io; /* saved I/O completion fn */ - int bip_error; /* saved I/O error */ unsigned int bip_size; unsigned short bip_pool; /* pool the ivec came from */ -- cgit v1.2.3 From 16642eb68216d8e0e136a99e514e9166e7125838 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 5 Jan 2009 10:18:53 +0100 Subject: Fix small typo in bio.h's documentation Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 18fc4a281a7b..5175aa3103c6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -144,7 +144,7 @@ struct bio { * bit 1 -- rw-ahead when set * bit 2 -- barrier * Insert a serialization point in the IO queue, forcing previously - * submitted IO to be completed before this oen is issued. + * submitted IO to be completed before this one is issued. * bit 3 -- synchronous I/O hint: the block layer will unplug immediately * Note that this does NOT indicate that the IO itself is sync, just * that the block layer will not postpone issue of this IO by plugging. -- cgit v1.2.3 From 213d9417fec62ef4c3675621b9364a667954d4dd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Jan 2009 09:16:05 +0100 Subject: block: seperate bio/request unplug and sync bits Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- include/linux/bio.h | 18 +++++++++++------- include/linux/blkdev.h | 2 ++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index a824e49c0d0a..9e2e86fb78b8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1125,6 +1125,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; + if (bio_unplug(bio)) + req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; @@ -1141,6 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret, nr_sectors; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); + const int unplug = bio_unplug(bio); int rw_flags; nr_sectors = bio_sectors(bio); @@ -1244,7 +1247,7 @@ get_rq: blk_plug_device(q); add_request(q, req); out: - if (sync || blk_queue_nonrot(q)) + if (unplug || blk_queue_nonrot(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5175aa3103c6..f53568c5852e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -163,12 +163,15 @@ struct bio { #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ #define BIO_RW_BARRIER 2 -#define BIO_RW_SYNC 3 -#define BIO_RW_META 4 -#define BIO_RW_DISCARD 5 -#define BIO_RW_FAILFAST_DEV 6 -#define BIO_RW_FAILFAST_TRANSPORT 7 -#define BIO_RW_FAILFAST_DRIVER 8 +#define BIO_RW_SYNCIO 3 +#define BIO_RW_UNPLUG 4 +#define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 +#define BIO_RW_FAILFAST_DEV 7 +#define BIO_RW_FAILFAST_TRANSPORT 8 +#define BIO_RW_FAILFAST_DRIVER 9 + +#define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) /* * upper 16 bits of bi_rw define the io priority of this bio @@ -194,7 +197,8 @@ struct bio { #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) -#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) +#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNCIO)) +#define bio_unplug(bio) ((bio)->bi_rw & (1 << BIO_RW_UNPLUG)) #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) #define bio_failfast_transport(bio) \ ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 044467ef7b11..75426e4b8cdf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ enum rq_flag_bits { __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ + __REQ_UNPLUG, /* unplug queue on submission */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) +#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define BLK_MAX_CDB 16 -- cgit v1.2.3 From 1dfa17f4ab8543d82caf4d36636b93916a18f456 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Jan 2009 09:21:49 +0100 Subject: block: add bio_rw_flagged() for testing bio->bi_rw The existing functions for checking bio->bi_rw are badly named. So lets mirror what we do for bio->bi_flags testing, use a properly named function so that it's immediately obvious what is being tested. Maintain compatability names for the old macros, eventually we'll get rid of these. Signed-off-by: Jens Axboe --- include/linux/bio.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index f53568c5852e..0942765cf8c0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -173,6 +173,24 @@ struct bio { #define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) +#define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) + +/* + * Old defines, these should eventually be replaced by direct usage of + * bio_rw_flagged() + */ +#define bio_barrier(bio) bio_rw_flagged(bio, BIO_RW_BARRIER) +#define bio_sync(bio) bio_rw_flagged(bio, BIO_RW_SYNCIO) +#define bio_unplug(bio) bio_rw_flagged(bio, BIO_RW_UNPLUG) +#define bio_failfast_dev(bio) bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV) +#define bio_failfast_transport(bio) \ + bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT) +#define bio_failfast_driver(bio) \ + bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER) +#define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD) +#define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META) +#define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD) + /* * upper 16 bits of bi_rw define the io priority of this bio */ @@ -196,16 +214,6 @@ struct bio { #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) -#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNCIO)) -#define bio_unplug(bio) ((bio)->bi_rw & (1 << BIO_RW_UNPLUG)) -#define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) -#define bio_failfast_transport(bio) \ - ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) -#define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) -#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) -#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) -#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) static inline unsigned int bio_cur_sectors(struct bio *bio) -- cgit v1.2.3 From a229fc61ef0ee3c30fd193beee0eeb87410227f1 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 19 Jan 2009 10:37:38 +0100 Subject: include/linux: Add bsg.h to the Kernel exported headers bsg.h in current form is perfectly suitable for user-mode consumption. It is needed together with scsi/sg.h for applications that want to interface with the bsg driver. Currently the few projects that use it would copy it over into the projects. But that is not acceptable for projects that need to provide source and devel packages for distros. This should also be submitted to stable 2.6.28 and 2.6.27 since bsg had a stable API since these Kernels and distro users will need the header for these kernels a swell Signed-off-by: Boaz Harrosh Acked-by: FUJITA Tomonori CC: stable@kernel.org Signed-off-by: Jens Axboe --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 12e9a2957caf..2124c063a7ef 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -41,6 +41,7 @@ header-y += baycom.h header-y += bfs_fs.h header-y += blkpg.h header-y += bpqether.h +header-y += bsg.h header-y += can.h header-y += cdk.h header-y += chio.h -- cgit v1.2.3 From bc58ba9468d94d62c56ab9b47173583ec140b165 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jan 2009 10:54:44 +0100 Subject: block: add sysfs file for controlling io stats accounting This allows us to turn off disk stat accounting completely, for the cases where the 0.5-1% reduction in system time is important. Signed-off-by: Jens Axboe --- block/blk-core.c | 90 ++++++++++++++++++++++++++++++-------------------- block/blk-sysfs.c | 28 ++++++++++++++++ include/linux/blkdev.h | 6 ++++ 3 files changed, 88 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index ae75c047f45d..ca69f3d94100 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,11 +64,12 @@ static struct workqueue_struct *kblockd_workqueue; static void drive_stat_acct(struct request *rq, int new_io) { + struct gendisk *disk = rq->rq_disk; struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; - if (!blk_fs_request(rq) || !rq->rq_disk) + if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue)) return; cpu = part_stat_lock(); @@ -599,8 +600,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; - q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | - 1 << QUEUE_FLAG_STACKABLE); + q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); @@ -1663,6 +1663,55 @@ void blkdev_dequeue_request(struct request *req) } EXPORT_SYMBOL(blkdev_dequeue_request); +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_queue_io_stat(disk->queue)) + return; + + if (blk_fs_request(req)) { + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(req->rq_disk, req->sector); + part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_account_io_done(struct request *req) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_queue_io_stat(disk->queue)) + return; + + /* + * Account IO completion. bar_rq isn't accounted as a normal + * IO on queueing nor completion. Accounting the containing + * request is enough. + */ + if (blk_fs_request(req) && req != &req->q->bar_rq) { + unsigned long duration = jiffies - req->start_time; + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(disk, req->sector); + + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part); + + part_stat_unlock(); + } +} + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1698,16 +1747,7 @@ static int __end_that_request_first(struct request *req, int error, (unsigned long long)req->sector); } - if (blk_fs_request(req) && req->rq_disk) { - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); - part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); - part_stat_unlock(); - } + blk_account_io_completion(req, nr_bytes); total_bytes = bio_nbytes = 0; while ((bio = req->bio) != NULL) { @@ -1787,8 +1827,6 @@ static int __end_that_request_first(struct request *req, int error, */ static void end_that_request_last(struct request *req, int error) { - struct gendisk *disk = req->rq_disk; - if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -1800,27 +1838,7 @@ static void end_that_request_last(struct request *req, int error) blk_delete_timer(req); - /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. - */ - if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { - unsigned long duration = jiffies - req->start_time; - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, req->sector); - - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part); - - part_stat_unlock(); - } + blk_account_io_done(req); if (req->end_io) req->end_io(req, error); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b538ab8dbbd1..e29ddfc73cf4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -197,6 +197,27 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_iostats_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_io_stat(q), page); +} + +static ssize_t queue_iostats_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long stats; + ssize_t ret = queue_var_store(&stats, page, count); + + spin_lock_irq(q->queue_lock); + if (stats) + queue_flag_set(QUEUE_FLAG_IO_STAT, q); + else + queue_flag_clear(QUEUE_FLAG_IO_STAT, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -249,6 +270,12 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = { .store = queue_rq_affinity_store, }; +static struct queue_sysfs_entry queue_iostats_entry = { + .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR }, + .show = queue_iostats_show, + .store = queue_iostats_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -259,6 +286,7 @@ static struct attribute *default_attrs[] = { &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, + &queue_iostats_entry.attr, NULL, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 75426e4b8cdf..d08c4b8219a6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -451,6 +451,11 @@ struct request_queue #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ +#define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ + +#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_CLUSTER) | \ + 1 << QUEUE_FLAG_STACKABLE) static inline int queue_is_locked(struct request_queue *q) { @@ -567,6 +572,7 @@ enum { #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) -- cgit v1.2.3 From 9d6aa4c7ece26652fcbfe37bd45679eac5f69347 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:50:25 +0530 Subject: headers_check fix: can/bcm.h fix the following 'make headers_check' warning: usr/include/linux/can/bcm.h:29: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/can/bcm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/bcm.h b/include/linux/can/bcm.h index 7f293273c444..1432b278c52d 100644 --- a/include/linux/can/bcm.h +++ b/include/linux/can/bcm.h @@ -14,6 +14,8 @@ #ifndef CAN_BCM_H #define CAN_BCM_H +#include + /** * struct bcm_msg_head - head of messages to/from the broadcast manager * @opcode: opcode, see enum below. -- cgit v1.2.3 From 15cf98ad2965aaefaa2f85332535ff39e48f9f4e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:53:38 +0530 Subject: headers_check fix: dvb/audio.h fix the following 'make headers_check' warning: usr/include/linux/dvb/audio.h:133: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dvb/audio.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dvb/audio.h b/include/linux/dvb/audio.h index 89412e18f571..bb0df2aaebfa 100644 --- a/include/linux/dvb/audio.h +++ b/include/linux/dvb/audio.h @@ -24,12 +24,7 @@ #ifndef _DVBAUDIO_H_ #define _DVBAUDIO_H_ -#ifdef __KERNEL__ #include -#else -#include -#endif - typedef enum { AUDIO_SOURCE_DEMUX, /* Select the demux as the main source */ -- cgit v1.2.3 From c86629c855800071314810881d1d2fb226ca9ec9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:55:32 +0530 Subject: headers_check fix: dvb/dmx.h fix the following 'make headers_check' warnings: usr/include/linux/dvb/dmx.h:27: include of is preferred over usr/include/linux/dvb/dmx.h:90: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dvb/dmx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dvb/dmx.h b/include/linux/dvb/dmx.h index 402fb7a8d922..fef943738a24 100644 --- a/include/linux/dvb/dmx.h +++ b/include/linux/dvb/dmx.h @@ -24,7 +24,7 @@ #ifndef _DVBDMX_H_ #define _DVBDMX_H_ -#include +#include #ifdef __KERNEL__ #include #else -- cgit v1.2.3 From de189f078ee4ae74944e6827dff184a3ef1fc89b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:56:41 +0530 Subject: headers_check fix: dvb/frontend.h fix the following 'make headers_check' warnings: usr/include/linux/dvb/frontend.h:29: include of is preferred over usr/include/linux/dvb/frontend.h:76: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dvb/frontend.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h index 55026b1a40bd..51c8d2d49e42 100644 --- a/include/linux/dvb/frontend.h +++ b/include/linux/dvb/frontend.h @@ -26,8 +26,7 @@ #ifndef _DVBFRONTEND_H_ #define _DVBFRONTEND_H_ -#include - +#include typedef enum fe_type { FE_QPSK, -- cgit v1.2.3 From 8996be9de98a9362a3192b866dd8ab9930e28ad9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:58:00 +0530 Subject: headers_check fix: dvb/net.h fix the following 'make headers_check' warnings: usr/include/linux/dvb/net.h:27: include of is preferred over usr/include/linux/dvb/net.h:31: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dvb/net.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dvb/net.h b/include/linux/dvb/net.h index 5be474bf0d2b..f451e7eb0b0b 100644 --- a/include/linux/dvb/net.h +++ b/include/linux/dvb/net.h @@ -24,8 +24,7 @@ #ifndef _DVBNET_H_ #define _DVBNET_H_ -#include - +#include struct dvb_net_if { __u16 pid; -- cgit v1.2.3 From b852d36b86902abb272b0f2dd7a56dd2d17ea88c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 19:59:53 +0530 Subject: headers_check fix: dvb/video.h fix the following 'make headers_check' warnings: usr/include/linux/dvb/video.h:29: include of is preferred over usr/include/linux/dvb/video.h:102: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dvb/video.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dvb/video.h b/include/linux/dvb/video.h index 50839fe9e39e..bd49c3ebf916 100644 --- a/include/linux/dvb/video.h +++ b/include/linux/dvb/video.h @@ -24,17 +24,14 @@ #ifndef _DVBVIDEO_H_ #define _DVBVIDEO_H_ -#include - -#ifdef __KERNEL__ #include +#ifdef __KERNEL__ +#include #else -#include #include #include #endif - typedef enum { VIDEO_FORMAT_4_3, /* Select 4:3 format */ VIDEO_FORMAT_16_9, /* Select 16:9 format. */ -- cgit v1.2.3 From 9df27bab62e60d1f786abd0599af4a5e3192a784 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:00:47 +0530 Subject: headers_check fix: netfilter/xt_conntrack.h fix the following 'make headers_check' warning: usr/include/linux/netfilter/xt_conntrack.h:40: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/netfilter/xt_conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h index f3fd83e46bab..8f5345275393 100644 --- a/include/linux/netfilter/xt_conntrack.h +++ b/include/linux/netfilter/xt_conntrack.h @@ -5,6 +5,7 @@ #ifndef _XT_CONNTRACK_H #define _XT_CONNTRACK_H +#include #include #define XT_CONNTRACK_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1)) -- cgit v1.2.3 From 3187cedf158687432cdf152eeee205f7b149f4ef Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:03:06 +0530 Subject: headers_check fix: nfsd/export.h fix the following 'make headers_check' warning: usr/include/linux/nfsd/export.h:13: include of is preferred over Signed-off-by: Jaswinder Singh Rajput --- include/linux/nfsd/export.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 5431512b2757..bcd0201589f8 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -10,9 +10,8 @@ #ifndef NFSD_EXPORT_H #define NFSD_EXPORT_H -#include -#ifdef __KERNEL__ # include +#ifdef __KERNEL__ # include #endif -- cgit v1.2.3 From 9e87b1e53f3c72c1196dc22cb359b5d6188a3729 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:04:03 +0530 Subject: headers_check fix: nfsd/nfsfh.h fix the following 'make headers_check' warnings: usr/include/linux/nfsd/nfsfh.h:17: include of is preferred over usr/include/linux/nfsd/nfsfh.h:28: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/nfsd/nfsfh.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index b2e093870bc6..fa317f6c154b 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -14,9 +14,8 @@ #ifndef _LINUX_NFSD_FH_H #define _LINUX_NFSD_FH_H -#include -#ifdef __KERNEL__ # include +#ifdef __KERNEL__ # include # include #endif -- cgit v1.2.3 From 03cf1e0c3b4ee4ef51dc7eb197a4d098ad4873af Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:05:27 +0530 Subject: headers_check fix: nfsd/syscall.h fix the following 'make headers_check' warnings: usr/include/linux/nfsd/syscall.h:12: include of is preferred over usr/include/linux/nfsd/syscall.h:104: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/nfsd/syscall.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index 4e439765b705..7a3b565b898f 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -9,9 +9,8 @@ #ifndef NFSD_SYSCALL_H #define NFSD_SYSCALL_H -#include -#ifdef __KERNEL__ # include +#ifdef __KERNEL__ # include #endif #include -- cgit v1.2.3 From bcf74582af3feca80ec96cc21d0a26c938d1863e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:06:44 +0530 Subject: headers_check fix: raid/md_p.h fix the following 'make headers_check' warning: usr/include/linux/raid/md_p.h:85: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/raid/md_p.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 9491026afe66..6ba830fa8538 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -15,6 +15,8 @@ #ifndef _MD_P_H #define _MD_P_H +#include + /* * RAID superblock. * -- cgit v1.2.3 From 550e978aa52e2ac3c493e8a0b36b368ade6dd2b4 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:07:35 +0530 Subject: headers_check fix: spi/spidev.h fix the following 'make headers_check' warning: usr/include/linux/spi/spidev.h:83: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/spi/spidev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/spidev.h b/include/linux/spi/spidev.h index c93ef9d42a01..95251ccd5a07 100644 --- a/include/linux/spi/spidev.h +++ b/include/linux/spi/spidev.h @@ -22,6 +22,7 @@ #ifndef SPIDEV_H #define SPIDEV_H +#include /* User space versions of kernel symbols for SPI clocking modes, * matching -- cgit v1.2.3 From 2d594c0c8aa46beb21be1c5c2b7141f89d206313 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:10:05 +0530 Subject: headers_check fix: tc_act/tc_gact.h fix the following 'make headers_check' warning: usr/include/linux/tc_act/tc_gact.h:19: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_act/tc_gact.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_act/tc_gact.h b/include/linux/tc_act/tc_gact.h index 23a03eb630db..e895c0a39629 100644 --- a/include/linux/tc_act/tc_gact.h +++ b/include/linux/tc_act/tc_gact.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_GACT_H #define __LINUX_TC_GACT_H +#include #include #define TCA_ACT_GACT 5 -- cgit v1.2.3 From 9c536d275823b8a6281894f4f8c2687f60578253 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:10:44 +0530 Subject: headers_check fix: tc_act/tc_mirred.h fix the following 'make headers_check' warning: usr/include/linux/tc_act/tc_mirred.h:16: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_act/tc_mirred.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_act/tc_mirred.h b/include/linux/tc_act/tc_mirred.h index 71d63409d568..0a99ab60d610 100644 --- a/include/linux/tc_act/tc_mirred.h +++ b/include/linux/tc_act/tc_mirred.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_MIR_H #define __LINUX_TC_MIR_H +#include #include #define TCA_ACT_MIRRED 8 -- cgit v1.2.3 From 5dbbf3bcae2f6b5dee1c33b3eeced00bcb6c4f71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:11:22 +0530 Subject: headers_check fix: tc_act/tc_pedit.h fix the following 'make headers_check' warning: usr/include/linux/tc_act/tc_pedit.h:19: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_act/tc_pedit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_act/tc_pedit.h b/include/linux/tc_act/tc_pedit.h index 83e56e32e8e0..54ce9064115a 100644 --- a/include/linux/tc_act/tc_pedit.h +++ b/include/linux/tc_act/tc_pedit.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_PED_H #define __LINUX_TC_PED_H +#include #include #define TCA_ACT_PEDIT 7 -- cgit v1.2.3 From ba3a51e3b899c1bd34c18f84a1c6f7e5f99be850 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:12:01 +0530 Subject: headers_check fix: tc_ematch/tc_em_cmp.h fix the following 'make headers_check' warning: usr/include/linux/tc_ematch/tc_em_cmp.h:8: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_ematch/tc_em_cmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_ematch/tc_em_cmp.h b/include/linux/tc_ematch/tc_em_cmp.h index c7f4d43618fd..38e7f7b25ec2 100644 --- a/include/linux/tc_ematch/tc_em_cmp.h +++ b/include/linux/tc_ematch/tc_em_cmp.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_EM_CMP_H #define __LINUX_TC_EM_CMP_H +#include #include struct tcf_em_cmp -- cgit v1.2.3 From 9976007a13e67b973f94c8d472ed615ac4cf8078 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:12:59 +0530 Subject: headers_check fix: tc_ematch/tc_em_meta.h fix the following 'make headers_check' warning: usr/include/linux/tc_ematch/tc_em_meta.h:18: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_ematch/tc_em_meta.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h index c50d2ba5caf0..dcfb733fa1f6 100644 --- a/include/linux/tc_ematch/tc_em_meta.h +++ b/include/linux/tc_ematch/tc_em_meta.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_EM_META_H #define __LINUX_TC_EM_META_H +#include #include enum -- cgit v1.2.3 From ac836c6f1b17f674e35a7e2784541bb8ab0bce38 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:13:41 +0530 Subject: headers_check fix: tc_ematch/tc_em_nbyte.h fix the following 'make headers_check' warning: usr/include/linux/tc_ematch/tc_em_nbyte.h:8: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_ematch/tc_em_nbyte.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_ematch/tc_em_nbyte.h b/include/linux/tc_ematch/tc_em_nbyte.h index f19d1f58ec9d..9ed8c2e58488 100644 --- a/include/linux/tc_ematch/tc_em_nbyte.h +++ b/include/linux/tc_ematch/tc_em_nbyte.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_EM_NBYTE_H #define __LINUX_TC_EM_NBYTE_H +#include #include struct tcf_em_nbyte -- cgit v1.2.3 From 30f410a6d372f067df3d02e3db328720bf421c81 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:14:36 +0530 Subject: headers_check fix: tc_ematch/tc_em_text.h fix the following 'make headers_check' warning: usr/include/linux/tc_ematch/tc_em_text.h:11: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/tc_ematch/tc_em_text.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tc_ematch/tc_em_text.h b/include/linux/tc_ematch/tc_em_text.h index 7cd43e99c7f5..d12a73a225fc 100644 --- a/include/linux/tc_ematch/tc_em_text.h +++ b/include/linux/tc_ematch/tc_em_text.h @@ -1,6 +1,7 @@ #ifndef __LINUX_TC_EM_TEXT_H #define __LINUX_TC_EM_TEXT_H +#include #include #define TC_EM_TEXT_ALGOSIZ 16 -- cgit v1.2.3 From d8151585690d824ac5b60a94ef86f8bfd61478fa Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:15:47 +0530 Subject: headers_check fix: usb/cdc.h fix the following 'make headers_check' warning: usr/include/linux/usb/cdc.h:50: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/usb/cdc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h index 18a729343ffa..3c86ed25a04c 100644 --- a/include/linux/usb/cdc.h +++ b/include/linux/usb/cdc.h @@ -9,6 +9,8 @@ #ifndef __LINUX_USB_CDC_H #define __LINUX_USB_CDC_H +#include + #define USB_CDC_SUBCLASS_ACM 0x02 #define USB_CDC_SUBCLASS_ETHERNET 0x06 #define USB_CDC_SUBCLASS_WHCM 0x08 -- cgit v1.2.3 From 4c866d444078d931579c50c9ce3133709390287b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:16:33 +0530 Subject: headers_check fix: usb/gadgetfs.h fix the following 'make headers_check' warning: usr/include/linux/usb/gadgetfs.h:21: include of is preferred over Signed-off-by: Jaswinder Singh Rajput --- include/linux/usb/gadgetfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadgetfs.h b/include/linux/usb/gadgetfs.h index ea45f265ec05..612102e4d75e 100644 --- a/include/linux/usb/gadgetfs.h +++ b/include/linux/usb/gadgetfs.h @@ -18,7 +18,7 @@ #ifndef __LINUX_USB_GADGETFS_H #define __LINUX_USB_GADGETFS_H -#include +#include #include #include -- cgit v1.2.3 From bd247b348aaa9f28a53a64df06c69d6f40ff2280 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:20:10 +0530 Subject: headers_check fix: linux/aio_abi.h fix the following 'make headers_check' warning: usr/include/linux/aio_abi.h:58: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/aio_abi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/aio_abi.h b/include/linux/aio_abi.h index 9e0172931315..2c8731664180 100644 --- a/include/linux/aio_abi.h +++ b/include/linux/aio_abi.h @@ -27,6 +27,7 @@ #ifndef __LINUX__AIO_ABI_H #define __LINUX__AIO_ABI_H +#include #include typedef unsigned long aio_context_t; -- cgit v1.2.3 From 85c09569e563cbb9376f10da20ada42107dfef98 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:26:25 +0530 Subject: headers_check fix: linux/atalk.h fix the following 'make headers_check' warning: usr/include/linux/atalk.h:15: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/atalk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index e9ebac2e2ecc..d34c187432ed 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -1,6 +1,7 @@ #ifndef __LINUX_ATALK_H__ #define __LINUX_ATALK_H__ +#include #include /* -- cgit v1.2.3 From f757f603f7d52254120cbfcd967f67f663264c64 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:29:11 +0530 Subject: headers_check fix: linux/atmbr2684.h fix the following 'make headers_check' warning: usr/include/linux/atmbr2684.h:88: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/atmbr2684.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/atmbr2684.h b/include/linux/atmbr2684.h index 52bf72affbba..fdb2629b6189 100644 --- a/include/linux/atmbr2684.h +++ b/include/linux/atmbr2684.h @@ -1,6 +1,7 @@ #ifndef _LINUX_ATMBR2684_H #define _LINUX_ATMBR2684_H +#include #include #include /* For IFNAMSIZ */ -- cgit v1.2.3 From 5d461bfebe4be9ae8d25d4570d4eaa415ca76f0f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:31:19 +0530 Subject: headers_check fix: linux/auto_fs4.h fix the following 'make headers_check' warning: usr/include/linux/auto_fs4.h:132: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/auto_fs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 55fa478bd639..8b49ac48a5b7 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -12,6 +12,7 @@ #define _LINUX_AUTO_FS4_H /* Include common v3 definitions */ +#include #include /* autofs v4 definitions */ -- cgit v1.2.3 From 1da9ebd5abb2e960c4ca4d49f7587e6c76b16ac0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:34:15 +0530 Subject: headers_check fix: linux/bfs_fs.h fix the following 'make headers_check' warning: usr/include/linux/bfs_fs.h:24: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/bfs_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bfs_fs.h b/include/linux/bfs_fs.h index 8ed6dfdcd783..1c0b355aa515 100644 --- a/include/linux/bfs_fs.h +++ b/include/linux/bfs_fs.h @@ -6,6 +6,8 @@ #ifndef _LINUX_BFS_FS_H #define _LINUX_BFS_FS_H +#include + #define BFS_BSIZE_BITS 9 #define BFS_BSIZE (1< Date: Fri, 30 Jan 2009 20:36:52 +0530 Subject: headers_check fix: linux/blktrace_api.h fix the following 'make headers_check' warning: usr/include/linux/blktrace_api.h:96: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/blktrace_api.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 1dba3493d520..25379cba2370 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -1,6 +1,7 @@ #ifndef BLKTRACE_H #define BLKTRACE_H +#include #ifdef __KERNEL__ #include #include -- cgit v1.2.3 From 9fa91d99bfdd9582e43b6b9ab97678c51373c4ae Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:39:30 +0530 Subject: headers_check fix: linux/capability.h fix the following 'make headers_check' warning: usr/include/linux/capability.h:73: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput --- include/linux/capability.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 02bdb768d43b..1b9872556131 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -69,10 +69,6 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32 VFS_CAP_U32_2 #define VFS_CAP_REVISION VFS_CAP_REVISION_2 -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES -extern int file_caps_enabled; -#endif - struct vfs_cap_data { __le32 magic_etc; /* Little endian */ struct { @@ -96,6 +92,10 @@ struct vfs_cap_data { #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +extern int file_caps_enabled; +#endif + typedef struct kernel_cap_struct { __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; -- cgit v1.2.3 From 960066a919f1db57817df6d02e72b01542f1deed Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:41:51 +0530 Subject: headers_check fix: linux/cdrom.h fix the following 'make headers_check' warning: usr/include/linux/cdrom.h:155: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/cdrom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 0b49e08d3cb0..78e904796622 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -11,6 +11,7 @@ #ifndef _LINUX_CDROM_H #define _LINUX_CDROM_H +#include #include /******************************************************* -- cgit v1.2.3 From 59e4cf19ede2d2725c1b336707c1077afdd3cf85 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:43:31 +0530 Subject: headers_check fix: linux/cgroupstats.h fix the following 'make headers_check' warning: usr/include/linux/cgroupstats.h:31: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/cgroupstats.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroupstats.h b/include/linux/cgroupstats.h index 4f53abf6855d..3753c33160d1 100644 --- a/include/linux/cgroupstats.h +++ b/include/linux/cgroupstats.h @@ -15,6 +15,7 @@ #ifndef _LINUX_CGROUPSTATS_H #define _LINUX_CGROUPSTATS_H +#include #include /* -- cgit v1.2.3 From 37eb1f4c3320ed505fbe59a916635b2342c740e4 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:51:30 +0530 Subject: headers_check fix: linux/dlm_plock.h fix the following 'make headers_check' warning: usr/include/linux/dlm_plock.h:25: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dlm_plock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dlm_plock.h b/include/linux/dlm_plock.h index 18d5fdbceb74..2dd21243104f 100644 --- a/include/linux/dlm_plock.h +++ b/include/linux/dlm_plock.h @@ -9,6 +9,8 @@ #ifndef __DLM_PLOCK_DOT_H__ #define __DLM_PLOCK_DOT_H__ +#include + #define DLM_PLOCK_MISC_NAME "dlm_plock" #define DLM_PLOCK_VERSION_MAJOR 1 -- cgit v1.2.3 From 57d1780fab89d3736de0d24189129c17178448f0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:52:54 +0530 Subject: headers_check fix: linux/dn.h fix the following 'make headers_check' warning: usr/include/linux/dn.h:75: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/dn.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dn.h b/include/linux/dn.h index 02bba040fcfb..fe9990823193 100644 --- a/include/linux/dn.h +++ b/include/linux/dn.h @@ -1,6 +1,8 @@ #ifndef _LINUX_DN_H #define _LINUX_DN_H +#include + /* DECnet Data Structures and Constants -- cgit v1.2.3 From 4144147081b9d08e69055a780888fcbb7cfcbb8e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 20:54:27 +0530 Subject: headers_check fix: linux/edd.h fix the following 'make headers_check' warning: usr/include/linux/edd.h:70: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/edd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edd.h b/include/linux/edd.h index 5d747c5cd0fe..4cbd0fe9df08 100644 --- a/include/linux/edd.h +++ b/include/linux/edd.h @@ -30,6 +30,8 @@ #ifndef _LINUX_EDD_H #define _LINUX_EDD_H +#include + #define EDDNR 0x1e9 /* addr of number of edd_info structs at EDDBUF in boot_params - treat this as 1 byte */ #define EDDBUF 0xd00 /* addr of edd_info structs in boot_params */ -- cgit v1.2.3 From bd71b5f734c66ad0134e308036b13d122907b8c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:01:11 +0530 Subject: headers_check fix: linux/efs_fs_sb.h fix the following 'make headers_check' warning: usr/include/linux/efs_fs_sb.h:49: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/efs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efs_fs_sb.h b/include/linux/efs_fs_sb.h index ff1945e37790..a01be90c58cc 100644 --- a/include/linux/efs_fs_sb.h +++ b/include/linux/efs_fs_sb.h @@ -9,6 +9,7 @@ #ifndef __EFS_FS_SB_H__ #define __EFS_FS_SB_H__ +#include #include /* EFS superblock magic numbers */ -- cgit v1.2.3 From 177a858ff8d71a8e7f8b0ef53ff49441e29c8fb1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:04:44 +0530 Subject: headers_check fix: linux/elf-fdpic.h fix the following 'make headers_check' warning: usr/include/linux/elf-fdpic.h:62: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput --- include/linux/elf-fdpic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elf-fdpic.h b/include/linux/elf-fdpic.h index 9f5b7456bff3..7cd2e80cebc8 100644 --- a/include/linux/elf-fdpic.h +++ b/include/linux/elf-fdpic.h @@ -58,11 +58,13 @@ struct elf_fdpic_params { #define ELF_FDPIC_FLAG_PRESENT 0x80000000 /* T if this object is present */ }; +#ifdef __KERNEL__ #ifdef CONFIG_MMU extern void elf_fdpic_arch_lay_out_mm(struct elf_fdpic_params *exec_params, struct elf_fdpic_params *interp_params, unsigned long *start_stack, unsigned long *start_brk); #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_ELF_FDPIC_H */ -- cgit v1.2.3 From f4aa1c30255278b7b50a1cd273c7b4a46f099a90 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:05:50 +0530 Subject: headers_check fix: linux/elf.h fix the following 'make headers_check' warnings: usr/include/linux/elf.h:379: extern's make no sense in userspace usr/include/linux/elf.h:387: extern's make no sense in userspace usr/include/linux/elf.h:401: extern's make no sense in userspace usr/include/linux/elf.h:402: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput --- include/linux/elf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elf.h b/include/linux/elf.h index 0b61ca41a044..45a937be6d38 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -377,6 +377,7 @@ typedef struct elf64_note { Elf64_Word n_type; /* Content type */ } Elf64_Nhdr; +#ifdef __KERNEL__ #if ELF_CLASS == ELFCLASS32 extern Elf32_Dyn _DYNAMIC []; @@ -404,5 +405,5 @@ static inline int elf_coredump_extra_notes_write(struct file *file, extern int elf_coredump_extra_notes_size(void); extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset); #endif - +#endif /* __KERNEL__ */ #endif /* _LINUX_ELF_H */ -- cgit v1.2.3 From 93c1c0e310b56acbd366a43b15260a1775481f24 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:09:18 +0530 Subject: headers_check fix: linux/errqueue.h fix the following 'make headers_check' warning: usr/include/linux/errqueue.h:6: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/errqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index 92f8d4fab32b..ceb1454b6977 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -1,6 +1,8 @@ #ifndef _LINUX_ERRQUEUE_H #define _LINUX_ERRQUEUE_H 1 +#include + struct sock_extended_err { __u32 ee_errno; -- cgit v1.2.3 From 985f302cb42e18912c88a3d2f9d9008844045ee3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:10:52 +0530 Subject: headers_check fix: linux/genetlink.h fix the following 'make headers_check' warning: usr/include/linux/genetlink.h:12: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/genetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index 7da02c93002b..b834ef6d59fa 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -1,6 +1,7 @@ #ifndef __LINUX_GENERIC_NETLINK_H #define __LINUX_GENERIC_NETLINK_H +#include #include #define GENL_NAMSIZ 16 /* length of family name */ -- cgit v1.2.3 From 237416fe05067237f0bcc6370d84c09b52fb776a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:12:38 +0530 Subject: headers_check fix: linux/gfs2_ondisk.h fix the following 'make headers_check' warning: usr/include/linux/gfs2_ondisk.h:109: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/gfs2_ondisk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h index 14d0df0b5749..c56b4bce56d0 100644 --- a/include/linux/gfs2_ondisk.h +++ b/include/linux/gfs2_ondisk.h @@ -10,6 +10,8 @@ #ifndef __GFS2_ONDISK_DOT_H__ #define __GFS2_ONDISK_DOT_H__ +#include + #define GFS2_MAGIC 0x01161970 #define GFS2_BASIC_BLOCK 512 #define GFS2_BASIC_BLOCK_SHIFT 9 -- cgit v1.2.3 From b08ead0527bcfdcab39a347b531701289485b484 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:15:34 +0530 Subject: headers_check fix: linux/hid.h fix the following 'make headers_check' warnings: usr/include/linux/hid.h:69: extern's make no sense in userspace usr/include/linux/hid.h:76: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 81aa84d60c6b..fa8ee9cef7be 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -791,6 +791,7 @@ dbg_hid(const char *fmt, ...) __FILE__ , ## arg) #endif /* HID_FF */ +#ifdef __KERNEL__ #ifdef CONFIG_HID_COMPAT #define HID_COMPAT_LOAD_DRIVER(name) \ /* prototype to avoid sparse warning */ \ @@ -804,6 +805,7 @@ EXPORT_SYMBOL(hid_compat_##name) extern void hid_compat_##name(void); \ hid_compat_##name(); \ } while (0) +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From c244ae5b16dc31b5bea67e6d6e9d6ff654aee781 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:17:06 +0530 Subject: headers_check fix: linux/hiddev.h fix the following 'make headers_check' warning: usr/include/linux/hiddev.h:40: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/hiddev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hiddev.h b/include/linux/hiddev.h index c760ae0eb6a1..bb6f58baf319 100644 --- a/include/linux/hiddev.h +++ b/include/linux/hiddev.h @@ -27,6 +27,8 @@ * Vojtech Pavlik, Ucitelska 1576, Prague 8, 182 00 Czech Republic */ +#include + /* * The event structure itself */ -- cgit v1.2.3 From 1cc49ae2e6d241e5cfc2c52e3329f5ef8dd42f18 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:18:37 +0530 Subject: headers_check fix: linux/icmpv6.h fix the following 'make headers_check' warning: usr/include/linux/icmpv6.h:8: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/icmpv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index a93a8dd33118..10d701eec484 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -1,6 +1,7 @@ #ifndef _LINUX_ICMPV6_H #define _LINUX_ICMPV6_H +#include #include struct icmp6hdr { -- cgit v1.2.3 From 680ee0bd2a9625965812c1209476168fd0704a00 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:21:01 +0530 Subject: headers_check fix: linux/if_addr.h fix the following 'make headers_check' warning: usr/include/linux/if_addr.h:8: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_addr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h index 43f3bedaafd3..a60c821be44c 100644 --- a/include/linux/if_addr.h +++ b/include/linux/if_addr.h @@ -1,6 +1,7 @@ #ifndef __LINUX_IF_ADDR_H #define __LINUX_IF_ADDR_H +#include #include struct ifaddrmsg -- cgit v1.2.3 From 1759cb994c3ff51e69268379da1cdd96048a8268 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:25:00 +0530 Subject: headers_check fix: linux/if_addrlabel.h fix the following 'make headers_check' warning: usr/include/linux/if_addrlabel.h:15: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_addrlabel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_addrlabel.h b/include/linux/if_addrlabel.h index 9fe79c95dd28..89571f65d6de 100644 --- a/include/linux/if_addrlabel.h +++ b/include/linux/if_addrlabel.h @@ -10,6 +10,8 @@ #ifndef __LINUX_IF_ADDRLABEL_H #define __LINUX_IF_ADDRLABEL_H +#include + struct ifaddrlblmsg { __u8 ifal_family; /* Address family */ -- cgit v1.2.3 From ba7161387e82fbbdc4b49533aa1345bb7befda13 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:26:35 +0530 Subject: headers_check fix: linux/if_fc.h fix the following 'make headers_check' warning: usr/include/linux/if_fc.h:37: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_fc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_fc.h b/include/linux/if_fc.h index 376a34ea4723..6ed7f1bf35c8 100644 --- a/include/linux/if_fc.h +++ b/include/linux/if_fc.h @@ -20,6 +20,7 @@ #ifndef _LINUX_IF_FC_H #define _LINUX_IF_FC_H +#include #define FC_ALEN 6 /* Octets in one ethernet addr */ #define FC_HLEN (sizeof(struct fch_hdr)+sizeof(struct fcllc)) -- cgit v1.2.3 From b06e936939931c5acb1ca5dfe1d02b4d2f7cb11f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:27:59 +0530 Subject: headers_check fix: linux/if_hippi.h fix the following 'make headers_check' warning: usr/include/linux/if_hippi.h:82: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_hippi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h index f0f23516bb59..4a7c9940b080 100644 --- a/include/linux/if_hippi.h +++ b/include/linux/if_hippi.h @@ -22,6 +22,7 @@ #ifndef _LINUX_IF_HIPPI_H #define _LINUX_IF_HIPPI_H +#include #include /* -- cgit v1.2.3 From 85db53102dbf0816e9c5426c9322a64759e7166b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:29:20 +0530 Subject: headers_check fix: linux/if_link.h fix the following 'make headers_check' warning: usr/include/linux/if_link.h:9: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f9032c88716a..176c5182c515 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -1,6 +1,7 @@ #ifndef _LINUX_IF_LINK_H #define _LINUX_IF_LINK_H +#include #include /* The struct should be in sync with struct net_device_stats */ -- cgit v1.2.3 From 0fe5a8fe0c145a6ff8f3daacd32f1824d0c021a9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:31:29 +0530 Subject: headers_check fix: linux/if_ppp.h fix the following 'make headers_check' warning: usr/include/linux/if_ppp.h:96: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_ppp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_ppp.h b/include/linux/if_ppp.h index c3b1f8562709..fcef103aa3f6 100644 --- a/include/linux/if_ppp.h +++ b/include/linux/if_ppp.h @@ -33,6 +33,7 @@ #ifndef _IF_PPP_H_ #define _IF_PPP_H_ +#include #include /* -- cgit v1.2.3 From 84ad40ebbaeb22fc665b1f307d32128c46e8d42d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:32:52 +0530 Subject: headers_check fix: linux/if_strip.h fix the following 'make headers_check' warning: usr/include/linux/if_strip.h:22: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_strip.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_strip.h b/include/linux/if_strip.h index fb5c5c98442f..6526a6235832 100644 --- a/include/linux/if_strip.h +++ b/include/linux/if_strip.h @@ -18,6 +18,8 @@ #ifndef __LINUX_STRIP_H #define __LINUX_STRIP_H +#include + typedef struct { __u8 c[6]; } MetricomAddress; -- cgit v1.2.3 From 65863dbc0833e06b905679f61450f05a68bae4c2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:34:36 +0530 Subject: headers_check fix: linux/if_tr.h fix the following 'make headers_check' warning: usr/include/linux/if_tr.h:37: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/if_tr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h index 5bcec8b2c5e2..fc23aeb0f201 100644 --- a/include/linux/if_tr.h +++ b/include/linux/if_tr.h @@ -19,6 +19,7 @@ #ifndef _LINUX_IF_TR_H #define _LINUX_IF_TR_H +#include #include /* For __be16 */ /* IEEE 802.5 Token-Ring magic constants. The frame sizes omit the preamble -- cgit v1.2.3 From de8b0bcafabfb4400aa028282293ce7d52307433 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:36:04 +0530 Subject: headers_check fix: linux/igmp.h fix the following 'make headers_check' warning: usr/include/linux/igmp.h:31: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/igmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f734a0ba0698..92fbd8cbd68f 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -16,6 +16,7 @@ #ifndef _LINUX_IGMP_H #define _LINUX_IGMP_H +#include #include /* -- cgit v1.2.3 From 2df005b75ab910f789f099f81bb70b3aa37203a7 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:42:47 +0530 Subject: headers_check fix: linux/inet_diag.h fix the following 'make headers_check' warning: usr/include/linux/inet_diag.h:16: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/inet_diag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 6e8bc548635a..bc8c49022084 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -1,6 +1,8 @@ #ifndef _INET_DIAG_H_ #define _INET_DIAG_H_ 1 +#include + /* Just some random number */ #define TCPDIAG_GETSOCK 18 #define DCCPDIAG_GETSOCK 19 -- cgit v1.2.3 From 217a2291570b1e4c28cb6e4cd099707e456a09b8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:44:19 +0530 Subject: headers_check fix: linux/ip6_tunnel.h fix the following 'make headers_check' warning: include/linux/ip6_tunnel.h:21: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/ip6_tunnel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ip6_tunnel.h b/include/linux/ip6_tunnel.h index 1e7cc4af40de..acb9ad684d63 100644 --- a/include/linux/ip6_tunnel.h +++ b/include/linux/ip6_tunnel.h @@ -1,6 +1,8 @@ #ifndef _IP6_TUNNEL_H #define _IP6_TUNNEL_H +#include + #define IPV6_TLV_TNL_ENCAP_LIMIT 4 #define IPV6_DEFAULT_TNL_ENCAP_LIMIT 4 -- cgit v1.2.3 From 5c6aa2badf1b97ead5ffec8094f0c6236e0c07c5 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:46:43 +0530 Subject: headers_check fix: linux/ipv6.h fix the following 'make headers_check' warning: usr/include/linux/ipv6.h:26: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0b816cae533e..476d9464ac82 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -1,6 +1,7 @@ #ifndef _IPV6_H #define _IPV6_H +#include #include #include -- cgit v1.2.3 From e5144de521417b0f0eea74ece89acd437ecd32c9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:47:52 +0530 Subject: headers_check fix: linux/ipv6_route.h fix the following 'make headers_check' warning: usr/include/linux/ipv6_route.h:42: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/ipv6_route.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipv6_route.h b/include/linux/ipv6_route.h index b323ff577967..1e7d8af2defe 100644 --- a/include/linux/ipv6_route.h +++ b/include/linux/ipv6_route.h @@ -13,6 +13,8 @@ #ifndef _LINUX_IPV6_ROUTE_H #define _LINUX_IPV6_ROUTE_H +#include + #define RTF_DEFAULT 0x00010000 /* default - learned via ND */ #define RTF_ALLONLINK 0x00020000 /* (deprecated and will be removed) fallback, no routers on link */ -- cgit v1.2.3 From d6d20f54847e27ed886e8285c208368ef3d42abb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:49:07 +0530 Subject: headers_check fix: linux/ipx.h fix the following 'make headers_check' warning: usr/include/linux/ipx.h:13: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/ipx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipx.h b/include/linux/ipx.h index eb19b4ea84f4..aabb1d294025 100644 --- a/include/linux/ipx.h +++ b/include/linux/ipx.h @@ -1,5 +1,6 @@ #ifndef _IPX_H_ #define _IPX_H_ +#include #include #include #define IPX_NODE_LEN 6 -- cgit v1.2.3 From df9c04ed3ff455aa5cb7c4bcddf4544fe54cfa33 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:50:39 +0530 Subject: headers_check fix: linux/irda.h fix the following 'make headers_check' warning: usr/include/linux/irda.h:127: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/irda.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irda.h b/include/linux/irda.h index 28f88ecba344..00bdad0e8515 100644 --- a/include/linux/irda.h +++ b/include/linux/irda.h @@ -25,6 +25,8 @@ #ifndef KERNEL_IRDA_H #define KERNEL_IRDA_H +#include + /* Please do *not* add any #include in this file, this file is * included as-is in user space. * Please fix the calling file to properly included needed files before -- cgit v1.2.3 From 4b7ae34277608a30346d076beb44cbc466aa73e5 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:54:23 +0530 Subject: headers_check fix: linux/minix_fs.h fix the following 'make headers_check' warning: usr/include/linux/minix_fs.h:34: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/minix_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/minix_fs.h b/include/linux/minix_fs.h index 0e39745f5111..13fe09e0576a 100644 --- a/include/linux/minix_fs.h +++ b/include/linux/minix_fs.h @@ -1,6 +1,7 @@ #ifndef _LINUX_MINIX_FS_H #define _LINUX_MINIX_FS_H +#include #include /* -- cgit v1.2.3 From 8ef342021a55e4237e593c7f6304d0caa7bf1232 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:56:32 +0530 Subject: headers_check fix: linux/msdos_fs.h fix the following 'make headers_check' warning: usr/include/linux/msdos_fs.h:100: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/msdos_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index e0a9b207920d..ce38f1caa5e1 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -1,6 +1,7 @@ #ifndef _LINUX_MSDOS_FS_H #define _LINUX_MSDOS_FS_H +#include #include #include -- cgit v1.2.3 From ee79a6415f911801eb7804704ac130088281b2d8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:58:19 +0530 Subject: headers_check fix: linux/neighbour.h fix the following 'make headers_check' warning: usr/include/linux/neighbour.h:8: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/neighbour.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h index bd3bbf668cdb..8730d5dae1bc 100644 --- a/include/linux/neighbour.h +++ b/include/linux/neighbour.h @@ -1,6 +1,7 @@ #ifndef __LINUX_NEIGHBOUR_H #define __LINUX_NEIGHBOUR_H +#include #include struct ndmsg -- cgit v1.2.3 From a81184c1f8cf8589a00894c20422982defc3f056 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 21:59:48 +0530 Subject: headers_check fix: linux/nfs_idmap.h fix the following 'make headers_check' warning: usr/include/linux/nfs_idmap.h:55: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/nfs_idmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 15a9f3b7289a..91a1c24e0cbf 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -37,6 +37,8 @@ #ifndef NFS_IDMAP_H #define NFS_IDMAP_H +#include + /* XXX from bits/utmp.h */ #define IDMAP_NAMESZ 128 -- cgit v1.2.3 From 06f43adba62f99de101616ffc5d0564aab237111 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:03:25 +0530 Subject: headers_check fix: linux/phonet.h fix the following 'make headers_check' warning: usr/include/linux/phonet.h:50: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/phonet.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phonet.h b/include/linux/phonet.h index 4157faa857b6..ee5e3c9e2bca 100644 --- a/include/linux/phonet.h +++ b/include/linux/phonet.h @@ -23,6 +23,8 @@ #ifndef LINUX_PHONET_H #define LINUX_PHONET_H +#include + /* Automatic protocol selection */ #define PN_PROTO_TRANSPORT 0 /* Phonet datagram socket */ -- cgit v1.2.3 From ed307444d8f276d7052400c47d9f4c5393997c42 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:05:32 +0530 Subject: headers_check fix: linux/pkt_cls.h fix the following 'make headers_check' warning: linux/pkt_cls.h:122: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index e6aa8482ad7a..3c842edff388 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -1,6 +1,7 @@ #ifndef __LINUX_PKT_CLS_H #define __LINUX_PKT_CLS_H +#include #include /* I think i could have done better macros ; for now this is stolen from -- cgit v1.2.3 From b8adfd3c753b47c47f626e032da7999530c316f0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:07:05 +0530 Subject: headers_check fix: linux/pkt_sched.h fix the following 'make headers_check' warning: usr/include/linux/pkt_sched.h:32: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/pkt_sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index e3f133adba78..b2648e8e4987 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -1,6 +1,8 @@ #ifndef __LINUX_PKT_SCHED_H #define __LINUX_PKT_SCHED_H +#include + /* Logical priority bands not depending on specific packet scheduler. Every scheduler will map them to real traffic classes, if it has no more precise mechanism to classify packets. -- cgit v1.2.3 From 7260a91666a3149181e7b78bbf73beebbb04f8fa Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:09:17 +0530 Subject: headers_check fix: linux/ppp_defs.h fix the following 'make headers_check' warning: usr/include/linux/ppp_defs.h:50: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/ppp_defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h index 6e8adc77522c..1c866bda2018 100644 --- a/include/linux/ppp_defs.h +++ b/include/linux/ppp_defs.h @@ -25,6 +25,8 @@ * OR MODIFICATIONS. */ +#include + /* * ==FILEVERSION 20000114== * -- cgit v1.2.3 From 68622c61dc7971382f5d69cd5d881e618ea30414 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:11:32 +0530 Subject: headers_check fix: linux/random.h fix the following 'make headers_check' warning: usr/include/linux/random.h:39: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/random.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 407ea3646f8f..25d02fe5c9b5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -7,6 +7,7 @@ #ifndef _LINUX_RANDOM_H #define _LINUX_RANDOM_H +#include #include #include -- cgit v1.2.3 From a788fd53aec9a439f6b8bf57888c30aea1176e1b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:14:02 +0530 Subject: headers_check fix: linux/signalfd.h fix the following 'make headers_check' warning: usr/include/linux/signalfd.h:19: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/signalfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index bef0c46d4713..b363b916c909 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -8,6 +8,7 @@ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H +#include /* For O_CLOEXEC and O_NONBLOCK */ #include -- cgit v1.2.3 From e6faa002be269233bf1e8961e7e0a79ca3f2db8b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:17:53 +0530 Subject: headers_check fix: linux/sound.h fix the following 'make headers_check' warnings: usr/include/linux/sound.h:33: extern's make no sense in userspace usr/include/linux/sound.h:34: extern's make no sense in userspace usr/include/linux/sound.h:35: extern's make no sense in userspace usr/include/linux/sound.h:36: extern's make no sense in userspace usr/include/linux/sound.h:37: extern's make no sense in userspace usr/include/linux/sound.h:39: extern's make no sense in userspace usr/include/linux/sound.h:40: extern's make no sense in userspace usr/include/linux/sound.h:41: extern's make no sense in userspace usr/include/linux/sound.h:42: extern's make no sense in userspace Signed-off-by: Jaswinder Singh Rajput --- include/linux/sound.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sound.h b/include/linux/sound.h index 9e2a94feed6b..44dcf0570432 100644 --- a/include/linux/sound.h +++ b/include/linux/sound.h @@ -25,6 +25,7 @@ #define SND_DEV_AMIDI 13 /* Like /dev/midi (obsolete) */ #define SND_DEV_ADMMIDI 14 /* Like /dev/dmmidi (onsolete) */ +#ifdef __KERNEL__ /* * Sound core interface functions */ @@ -40,3 +41,4 @@ extern void unregister_sound_special(int unit); extern void unregister_sound_mixer(int unit); extern void unregister_sound_midi(int unit); extern void unregister_sound_dsp(int unit); +#endif /* __KERNEL__ */ -- cgit v1.2.3 From 6b6bcd0ed953ae0ed73af4759788fb8384bbaeed Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:20:04 +0530 Subject: headers_check fix: linux/synclink.h fix the following 'make headers_check' warning: usr/include/linux/synclink.h:209: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/synclink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/synclink.h b/include/linux/synclink.h index c844a229acc9..99b8bdb17b2b 100644 --- a/include/linux/synclink.h +++ b/include/linux/synclink.h @@ -13,6 +13,8 @@ #define _SYNCLINK_H_ #define SYNCLINK_H_VERSION 3.6 +#include + #define BIT0 0x0001 #define BIT1 0x0002 #define BIT2 0x0004 -- cgit v1.2.3 From 448314fc968252b0b95f74bbe63fdcaf41e6413d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:21:19 +0530 Subject: headers_check fix: linux/taskstats.h fix the following 'make headers_check' warning: usr/include/linux/taskstats.h:44: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/taskstats.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index 18269e956a71..341dddb55090 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -16,6 +16,8 @@ #ifndef _LINUX_TASKSTATS_H #define _LINUX_TASKSTATS_H +#include + /* Format for per-task data returned to userland when * - a task exits * - listener requests stats for a task -- cgit v1.2.3 From 8b1e3a2f7f84484a8c208671adac39eb148c7d61 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:22:51 +0530 Subject: headers_check fix: linux/video_decoder.h fix the following 'make headers_check' warning: usr/include/linux/video_decoder.h:7: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/video_decoder.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/video_decoder.h b/include/linux/video_decoder.h index 121e26da2c18..e26c0c86a6ea 100644 --- a/include/linux/video_decoder.h +++ b/include/linux/video_decoder.h @@ -1,6 +1,8 @@ #ifndef _LINUX_VIDEO_DECODER_H #define _LINUX_VIDEO_DECODER_H +#include + #define HAVE_VIDEO_DECODER 1 struct video_decoder_capability { /* this name is too long */ -- cgit v1.2.3 From a4c1d7c8c61969667a853d08b039507669463807 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:24:09 +0530 Subject: headers_check fix: linux/video_encoder.h fix the following 'make headers_check' warning: usr/include/linux/video_encoder.h:5: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/video_encoder.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/video_encoder.h b/include/linux/video_encoder.h index 4b0e6907a7b4..b7b6423bbb8a 100644 --- a/include/linux/video_encoder.h +++ b/include/linux/video_encoder.h @@ -1,6 +1,8 @@ #ifndef _LINUX_VIDEO_ENCODER_H #define _LINUX_VIDEO_ENCODER_H +#include + struct video_encoder_capability { /* this name is too long */ __u32 flags; #define VIDEO_ENCODER_PAL 1 /* can encode PAL signal */ -- cgit v1.2.3 From 98be96b85398499212bc77ae3076a69e20368428 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:26:01 +0530 Subject: headers_check fix: linux/videodev.h fix the following 'make headers_check' warning: usr/include/linux/videodev.h:53: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/videodev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev.h b/include/linux/videodev.h index 15a653d41132..837f392fbe97 100644 --- a/include/linux/videodev.h +++ b/include/linux/videodev.h @@ -12,6 +12,7 @@ #ifndef __LINUX_VIDEODEV_H #define __LINUX_VIDEODEV_H +#include #include #include -- cgit v1.2.3 From 982f8184f9a9251ba4e5c6d79ec32d25c0ad3cc8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:27:58 +0530 Subject: headers_check fix: linux/virtio_blk.h fix the following 'make headers_check' warning: usr/include/linux/virtio_blk.h:21: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/virtio_blk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index c1aef85243bf..94c56d29869d 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -2,6 +2,7 @@ #define _LINUX_VIRTIO_BLK_H /* This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. */ +#include #include /* The ID for virtio_block */ -- cgit v1.2.3 From 8697325408d9be18fa24346c346b23fa56c3b190 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:29:33 +0530 Subject: headers_check fix: linux/virtio_console.h fix the following 'make headers_check' warning: usr/include/linux/virtio_console.h:15: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/virtio_console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h index 7615ffcdd555..dc161115ae35 100644 --- a/include/linux/virtio_console.h +++ b/include/linux/virtio_console.h @@ -1,5 +1,6 @@ #ifndef _LINUX_VIRTIO_CONSOLE_H #define _LINUX_VIRTIO_CONSOLE_H +#include #include /* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so * anyone can use the definitions to implement compatible drivers/servers. */ -- cgit v1.2.3 From 9a0e0ac21ca2af4715808b97bd600f0aecd87240 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:31:05 +0530 Subject: headers_check fix: linux/virtio_net.h fix the following 'make headers_check' warning: usr/include/linux/virtio_net.h:28: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/virtio_net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 5cdd0aa8bde9..3efa86c3ecb3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -2,6 +2,7 @@ #define _LINUX_VIRTIO_NET_H /* This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. */ +#include #include /* The ID for virtio_net */ -- cgit v1.2.3 From d5c72d7842c71403bc3d57ca05a8a1f96d81e262 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:39:17 +0530 Subject: headers_check fix: linux/nubus.h fix the following 'make headers_check' warning: usr/include/linux/nubus.h:232: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/nubus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nubus.h b/include/linux/nubus.h index c4355076d1a5..7382af374731 100644 --- a/include/linux/nubus.h +++ b/include/linux/nubus.h @@ -12,6 +12,7 @@ #ifndef LINUX_NUBUS_H #define LINUX_NUBUS_H +#include #ifdef __KERNEL__ #include #endif -- cgit v1.2.3 From 541c94f1d5ac2665fd15f1b827416f8c0b2f55cb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 30 Jan 2009 22:40:32 +0530 Subject: headers_check fix: linux/rtnetlink.h fix the following 'make headers_check' warning: usr/include/linux/rtnetlink.h:328: found __[us]{8,16,32,64} type without #include Signed-off-by: Jaswinder Singh Rajput --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e88f7058b3a1..1e5f6730ff31 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -1,6 +1,7 @@ #ifndef __LINUX_RTNETLINK_H #define __LINUX_RTNETLINK_H +#include #include #include #include -- cgit v1.2.3 From 94df7de0289bc2df3d6e85cd2ece52bf42682f45 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unpluging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify(). Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 1 + kernel/hrtimer.c | 4 ++++ kernel/time/tick-common.c | 26 +++++++++++++++++++------- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index cea153697ec7..3a1dbba4d3ae 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -36,6 +36,7 @@ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_BROADCAST_EXIT, CLOCK_EVT_NOTIFY_SUSPEND, CLOCK_EVT_NOTIFY_RESUME, + CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8fea312ca36c..647a40e2fea1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1608,6 +1608,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 63e05d423a09..21a5ca849514 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -273,6 +273,21 @@ out_bc: return ret; } +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + /* * Shutdown an event device on a given cpu: * @@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = cpumask_first(cpu_online_mask); - - tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : - TICK_DO_TIMER_NONE; - } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, tick_broadcast_oneshot_control(reason); break; + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(dev); tick_shutdown_broadcast(dev); -- cgit v1.2.3 From 0648e10d71c8e510d80772c4cb4220c97e9c34c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Feb 2009 08:43:48 +0100 Subject: block: fix inconsistent parenthesisation of QUEUE_FLAG_DEFAULT Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d08c4b8219a6..dcaa0fd84b02 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -455,7 +455,7 @@ struct request_queue #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ - 1 << QUEUE_FLAG_STACKABLE) + (1 << QUEUE_FLAG_STACKABLE)) static inline int queue_is_locked(struct request_queue *q) { -- cgit v1.2.3 From 20b636bf7c946da260391cd4570b16506f140a2c Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Feb 2009 12:41:07 +0100 Subject: Fix misleading comment in bio.h The comment says "remember to add offset!", but the function already adds it. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 0942765cf8c0..99728320cc05 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -451,8 +451,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; #ifdef CONFIG_HIGHMEM /* - * remember to add offset! and never ever reenable interrupts between a - * bvec_kmap_irq and bvec_kunmap_irq!! + * remember never ever reenable interrupts between a bvec_kmap_irq and + * bvec_kunmap_irq! * * This function MUST be inlined - it plays with the CPU interrupt flags. */ -- cgit v1.2.3 From c52440a69df22dca69794936a91e2fb529a707fb Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Feb 2009 12:41:07 +0100 Subject: bio.h: If they MUST be inlined, then use __always_inline bvec_kmap_irq() and bvec_kunmap_irq() comments say they MUST be inlined, so mark them as __always_inline. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 99728320cc05..2aa283ab062b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -456,7 +456,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; * * This function MUST be inlined - it plays with the CPU interrupt flags. */ -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) +static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, + unsigned long *flags) { unsigned long addr; @@ -472,7 +473,8 @@ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) return (char *) addr + bvec->bv_offset; } -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) +static __always_inline void bvec_kunmap_irq(char *buffer, + unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; -- cgit v1.2.3 From 0afd4a21ba7d75e93fa79cf05d7a21774e149c0f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Feb 2009 13:27:44 -0800 Subject: net: Fix userland breakage wrt. linux/if_tunnel.h Reported by Andrew Walrond Changeset c19e654ddbe3831252f61e76a74d661e1a755530 ("gre: Add netlink interface") added an include of linux/ip.h to linux/if_tunnel.h We can't really let that get exposed to userspace because this conflicts with types defined in netinet/ip.h which userland is almost certainly going to have included either explicitly or implicitly. So guard this include with a __KERNEL__ ifdef. Signed-off-by: David S. Miller --- include/linux/Kbuild | 2 +- include/linux/if_tunnel.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 12e9a2957caf..6a9bb9753235 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -89,7 +89,6 @@ header-y += if_ppp.h header-y += if_slip.h header-y += if_strip.h header-y += if_tun.h -header-y += if_tunnel.h header-y += in_route.h header-y += ioctl.h header-y += ip6_tunnel.h @@ -235,6 +234,7 @@ unifdef-y += if_phonet.h unifdef-y += if_pppol2tp.h unifdef-y += if_pppox.h unifdef-y += if_tr.h +unifdef-y += if_tunnel.h unifdef-y += if_vlan.h unifdef-y += igmp.h unifdef-y += inet_diag.h diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h index aeab2cb32a9c..82c43624c067 100644 --- a/include/linux/if_tunnel.h +++ b/include/linux/if_tunnel.h @@ -2,7 +2,10 @@ #define _IF_TUNNEL_H_ #include + +#ifdef __KERNEL__ #include +#endif #define SIOCGETTUNNEL (SIOCDEVPRIVATE + 0) #define SIOCADDTUNNEL (SIOCDEVPRIVATE + 1) -- cgit v1.2.3 From 720eba31f47aeade8ec130ca7f4353223c49170f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2009 13:31:36 +1030 Subject: modules: Use a better scheme for refcounting Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y) is using a lot of memory. Each 'struct module' contains an [NR_CPUS] array of full cache lines. This patch uses existing infrastructure (percpu_modalloc() & percpu_modfree()) to allocate percpu space for the refcount storage. Instead of wasting NR_CPUS*128 bytes (on i386), we now use nr_cpu_ids*sizeof(local_t) bytes. On a typical distro, where NR_CPUS=8, shiping 2000 modules, we reduce size of module files by about 2 Mbytes. (1Kb per module) Instead of having all refcounters in the same memory node - with TLB misses because of vmalloc() - this new implementation permits to have better NUMA properties, since each CPU will use storage on its preferred node, thanks to percpu storage. Signed-off-by: Eric Dumazet Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- include/linux/module.h | 25 ++++++++++++++++--------- kernel/module.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 4f7ea12463d3..f3b8329eb5b8 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbol); #endif -struct module_ref -{ - local_t count; -} ____cacheline_aligned; - enum module_state { MODULE_STATE_LIVE, @@ -344,8 +339,11 @@ struct module /* Destruction function. */ void (*exit)(void); - /* Reference counts */ - struct module_ref ref[NR_CPUS]; +#ifdef CONFIG_SMP + char *refptr; +#else + local_t ref; +#endif #endif }; #ifndef MODULE_ARCH_INIT @@ -395,13 +393,22 @@ void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) void symbol_put_addr(void *addr); +static inline local_t *__module_ref_addr(struct module *mod, int cpu) +{ +#ifdef CONFIG_SMP + return (local_t *) (mod->refptr + per_cpu_offset(cpu)); +#else + return &mod->ref; +#endif +} + /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ static inline void __module_get(struct module *module) { if (module) { BUG_ON(module_refcount(module) == 0); - local_inc(&module->ref[get_cpu()].count); + local_inc(__module_ref_addr(module, get_cpu())); put_cpu(); } } @@ -413,7 +420,7 @@ static inline int try_module_get(struct module *module) if (module) { unsigned int cpu = get_cpu(); if (likely(module_is_live(module))) - local_inc(&module->ref[cpu].count); + local_inc(__module_ref_addr(module, cpu)); else ret = 0; put_cpu(); diff --git a/kernel/module.c b/kernel/module.c index e8b51d41dd72..ba22484a987e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { - unsigned int i; + int cpu; INIT_LIST_HEAD(&mod->modules_which_use_me); - for (i = 0; i < NR_CPUS; i++) - local_set(&mod->ref[i].count, 0); + for_each_possible_cpu(cpu) + local_set(__module_ref_addr(mod, cpu), 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[raw_smp_processor_id()].count, 1); + local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int i, total = 0; + unsigned int total = 0; + int cpu; - for (i = 0; i < NR_CPUS; i++) - total += local_read(&mod->ref[i].count); + for_each_possible_cpu(cpu) + total += local_read(__module_ref_addr(mod, cpu)); return total; } EXPORT_SYMBOL(module_refcount); @@ -894,7 +895,7 @@ void module_put(struct module *module) { if (module) { unsigned int cpu = get_cpu(); - local_dec(&module->ref[cpu].count); + local_dec(__module_ref_addr(module, cpu)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1464,7 +1465,10 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); - +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + if (mod->refptr) + percpu_modfree(mod->refptr); +#endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto free_mod; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), + mod->name); + if (!mod->refptr) { + err = -ENOMEM; + goto free_mod; + } +#endif if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod, mod->name); if (!percpu) { err = -ENOMEM; - goto free_mod; + goto free_percpu; } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; mod->percpu = percpu; @@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod, free_percpu: if (percpu) percpu_modfree(percpu); +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + percpu_modfree(mod->refptr); +#endif free_mod: kfree(args); free_hdr: -- cgit v1.2.3 From 2999b58b795ad81f10e34bdbbfd2742172f247e4 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 1 Feb 2009 20:46:39 +0400 Subject: ide/libata: fix ata_id_is_cfa() (take 4) When checking for the CFA feature set support, ata_id_is_cfa() tests bit 2 in word 82 of the identify data instead the word 83; it also checks the ATA/PI version support in the word 80 (which the CompactFlash specifications have as reserved), this having no slightest chance to work on the modern CF cards that don't have 0x848A in the word 0... Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik --- include/linux/ata.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index a53318b8cbd0..08a86d5cdf1b 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -731,12 +731,17 @@ static inline int ata_id_current_chs_valid(const u16 *id) static inline int ata_id_is_cfa(const u16 *id) { - if (id[ATA_ID_CONFIG] == 0x848A) /* Standard CF */ + if (id[ATA_ID_CONFIG] == 0x848A) /* Traditional CF */ return 1; - /* Could be CF hiding as standard ATA */ - if (ata_id_major_version(id) >= 3 && - id[ATA_ID_COMMAND_SET_1] != 0xFFFF && - (id[ATA_ID_COMMAND_SET_1] & (1 << 2))) + /* + * CF specs don't require specific value in the word 0 anymore and yet + * they forbid to report the ATA version in the word 80 and require the + * CFA feature set support to be indicated in the word 83 in this case. + * Unfortunately, some cards only follow either of this requirements, + * and while those that don't indicate CFA feature support need some + * sort of quirk list, it seems impractical for the ones that do... + */ + if ((id[ATA_ID_COMMAND_SET_2] & 0xC004) == 0x4004) return 1; return 0; } -- cgit v1.2.3 From 99cf610aa4840d822cdc67d194b23b55010ca9bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Jan 2009 20:31:32 +0900 Subject: libata: clear dev->ering in smarter way dev->ering used to be cleared together with the rest of ata_device in ata_dev_init() which is called whenever a probing event occurs. dev->ering is about to be used to track probing failures so it needs to remain persistent over multiple porbing events. This patch achieves this by doing the following. * Instead of CLEAR_OFFSET, define CLEAR_BEGIN and CLEAR_END and only clear between BEGIN and END. ering is moved after END. The split of persistent area is to allow hotter items remain at the head. * ering is explicitly cleared on ata_dev_disable() and when device attach succeeds. So, ering is persistent throug a device's life time (unless explicitly cleared of course) and also through periods inbetween disablement of an attached device and successful detection of the next one. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 4 ++-- drivers/ata/libata-eh.c | 7 +++++++ include/linux/libata.h | 18 ++++++++++-------- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index d006e5c4768c..564c03c4ebb3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5404,8 +5404,8 @@ void ata_dev_init(struct ata_device *dev) dev->horkage = 0; spin_unlock_irqrestore(ap->lock, flags); - memset((void *)dev + ATA_DEVICE_CLEAR_OFFSET, 0, - sizeof(*dev) - ATA_DEVICE_CLEAR_OFFSET); + memset((void *)dev + ATA_DEVICE_CLEAR_BEGIN, 0, + ATA_DEVICE_CLEAR_END - ATA_DEVICE_CLEAR_BEGIN); dev->pio_mask = UINT_MAX; dev->mwdma_mask = UINT_MAX; dev->udma_mask = UINT_MAX; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index aafe82bf5e2e..90092c1aae53 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1194,6 +1194,11 @@ void ata_dev_disable(struct ata_device *dev) ata_acpi_on_disable(dev); ata_down_xfermask_limit(dev, ATA_DNXFER_FORCE_PIO0 | ATA_DNXFER_QUIET); dev->class++; + + /* From now till the next successful probe, ering is used to + * track probe failures. Clear accumulated device error info. + */ + ata_ering_clear(&dev->ering); } /** @@ -2765,6 +2770,8 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link, readid_flags, dev->id); switch (rc) { case 0: + /* clear error info accumulated during probe */ + ata_ering_clear(&dev->ering); new_mask |= 1 << dev->devno; break; case -ENOENT: diff --git a/include/linux/libata.h b/include/linux/libata.h index bca3ba25f52a..9dcefee5843c 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -580,7 +580,7 @@ struct ata_device { acpi_handle acpi_handle; union acpi_object *gtf_cache; #endif - /* n_sector is used as CLEAR_OFFSET, read comment above CLEAR_OFFSET */ + /* n_sector is CLEAR_BEGIN, read comment above CLEAR_BEGIN */ u64 n_sectors; /* size of device, if ATA */ unsigned int class; /* ATA_DEV_xxx */ unsigned long unpark_deadline; @@ -605,20 +605,22 @@ struct ata_device { u16 heads; /* Number of heads */ u16 sectors; /* Number of sectors per track */ - /* error history */ - int spdn_cnt; - struct ata_ering ering; - union { u16 id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */ u32 gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */ }; + + /* error history */ + int spdn_cnt; + /* ering is CLEAR_END, read comment above CLEAR_END */ + struct ata_ering ering; }; -/* Offset into struct ata_device. Fields above it are maintained - * acress device init. Fields below are zeroed. +/* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are + * cleared to zero on ata_dev_init(). */ -#define ATA_DEVICE_CLEAR_OFFSET offsetof(struct ata_device, n_sectors) +#define ATA_DEVICE_CLEAR_BEGIN offsetof(struct ata_device, n_sectors) +#define ATA_DEVICE_CLEAR_END offsetof(struct ata_device, ering) struct ata_eh_info { struct ata_device *dev; /* offending device */ -- cgit v1.2.3 From 9062712fa9ed13b531dfc2228086650b8bd6a255 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Jan 2009 20:31:36 +0900 Subject: libata: implement HORKAGE_1_5_GBPS and apply it to WD My Book 3Gbps is often much more prone to transmission failures. It's usually okay to let EH handle speed down after transmission failures but some WD My Book drives completely shutdown after certain transmission failures and after it only power cycling can revive them. Combined with the fact that external drives often end up with cable assembly which is longer than usual and more likely to have intervening gender, this makes these drives very likely to shutdown under certain configurations virtually rendering them unusable. This patch implements HOARKGE_1_5_GBPS and applies it to WD My Book such that 1.5Gbps is forced once the device is identified. Please take a look at the following bz for related reports. http://bugzilla.kernel.org/show_bug.cgi?id=9913 Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/libata.h | 1 + 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 3c5965d56c47..9fbf0595f3d4 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2232,6 +2232,40 @@ retry: return rc; } +static int ata_do_link_spd_horkage(struct ata_device *dev) +{ + struct ata_link *plink = ata_dev_phys_link(dev); + u32 target, target_limit; + + if (!sata_scr_valid(plink)) + return 0; + + if (dev->horkage & ATA_HORKAGE_1_5_GBPS) + target = 1; + else + return 0; + + target_limit = (1 << target) - 1; + + /* if already on stricter limit, no need to push further */ + if (plink->sata_spd_limit <= target_limit) + return 0; + + plink->sata_spd_limit = target_limit; + + /* Request another EH round by returning -EAGAIN if link is + * going faster than the target speed. Forward progress is + * guaranteed by setting sata_spd_limit to target_limit above. + */ + if (plink->sata_spd > target) { + ata_dev_printk(dev, KERN_INFO, + "applying link speed limit horkage to %s\n", + sata_spd_string(target)); + return -EAGAIN; + } + return 0; +} + static inline u8 ata_dev_knobble(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; @@ -2322,6 +2356,10 @@ int ata_dev_configure(struct ata_device *dev) return 0; } + rc = ata_do_link_spd_horkage(dev); + if (rc) + return rc; + /* let ACPI work its magic */ rc = ata_acpi_on_devcfg(dev); if (rc) @@ -4223,6 +4261,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* Devices that do not need bridging limits applied */ { "MTRON MSP-SATA*", NULL, ATA_HORKAGE_BRIDGE_OK, }, + /* Devices which aren't very happy with higher link speeds */ + { "WD My Book", NULL, ATA_HORKAGE_1_5_GBPS, }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 9dcefee5843c..5d87bc09a1f5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -380,6 +380,7 @@ enum { ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ + ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 35626129abcd6a7547e84c817ef5b6eff7a8758b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Feb 2009 16:00:29 -0800 Subject: sched: add missing kernel-doc in sched.h Add kernel-doc notation for @lock: include/linux/sched.h:457: No description found for parameter 'lock' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a7c76388731..2127e959e0f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -443,6 +443,7 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * @lock: lock for fields in this struct * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering -- cgit v1.2.3 From 97c44836cdec1ea713a15d84098a1a908157e68f Mon Sep 17 00:00:00 2001 From: "Timothy S. Nelson" Date: Fri, 30 Jan 2009 06:12:47 +1100 Subject: PCI: return error on failure to read PCI ROMs This patch makes the ROM reading code return an error to user space if the size of the ROM read is equal to 0. The patch also emits a warnings if the contents of the ROM are invalid, and documents the effects of the "enable" file on ROM reading. Signed-off-by: Timothy S. Nelson Acked-by: Alex Villacis-Lasso Signed-off-by: Jesse Barnes --- Documentation/filesystems/sysfs-pci.txt | 13 ++++++++++++- arch/ia64/sn/kernel/io_acpi_init.c | 2 +- arch/ia64/sn/kernel/io_init.c | 2 +- drivers/pci/pci-sysfs.c | 4 ++-- drivers/pci/rom.c | 8 +++++--- include/linux/pci.h | 2 +- 6 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 68ef48839c04..9f8740ca3f3b 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -9,6 +9,7 @@ that support it. For example, a given bus might look like this: | |-- class | |-- config | |-- device + | |-- enable | |-- irq | |-- local_cpus | |-- resource @@ -32,6 +33,7 @@ files, each with their own function. class PCI class (ascii, ro) config PCI config space (binary, rw) device PCI device (ascii, ro) + enable Whether the device is enabled (ascii, rw) irq IRQ number (ascii, ro) local_cpus nearby CPU mask (cpumask, ro) resource PCI resource host addresses (ascii, ro) @@ -57,10 +59,19 @@ used to do actual device programming from userspace. Note that some platforms don't support mmapping of certain resources, so be sure to check the return value from any attempted mmap. +The 'enable' file provides a counter that indicates how many times the device +has been enabled. If the 'enable' file currently returns '4', and a '1' is +echoed into it, it will then return '5'. Echoing a '0' into it will decrease +the count. Even when it returns to 0, though, some of the initialisation +may not be reversed. + The 'rom' file is special in that it provides read-only access to the device's ROM file, if available. It's disabled by default, however, so applications should write the string "1" to the file to enable it before attempting a read -call, and disable it following the access by writing "0" to the file. +call, and disable it following the access by writing "0" to the file. Note +that the device must be enabled for a rom read to return data succesfully. +In the event a driver is not bound to the device, it can be enabled using the +'enable' file, documented above. Accessing legacy resources through sysfs ---------------------------------------- diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index c5a214026a77..d0223abbbbd4 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -443,7 +443,7 @@ sn_acpi_slot_fixup(struct pci_dev *dev) size = pci_resource_len(dev, PCI_ROM_RESOURCE); addr = ioremap(pcidev_info->pdi_pio_mapped_addr[PCI_ROM_RESOURCE], size); - image_size = pci_get_rom_size(addr, size); + image_size = pci_get_rom_size(dev, addr, size); dev->resource[PCI_ROM_RESOURCE].start = (unsigned long) addr; dev->resource[PCI_ROM_RESOURCE].end = (unsigned long) addr + image_size - 1; diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c index 4e1801bad83a..e2eb2da60f96 100644 --- a/arch/ia64/sn/kernel/io_init.c +++ b/arch/ia64/sn/kernel/io_init.c @@ -269,7 +269,7 @@ sn_io_slot_fixup(struct pci_dev *dev) rom = ioremap(pci_resource_start(dev, PCI_ROM_RESOURCE), size + 1); - image_size = pci_get_rom_size(rom, size + 1); + image_size = pci_get_rom_size(dev, rom, size + 1); dev->resource[PCI_ROM_RESOURCE].end = dev->resource[PCI_ROM_RESOURCE].start + image_size - 1; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index db7ec14fa719..dfc4e0ddf241 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -768,8 +768,8 @@ pci_read_rom(struct kobject *kobj, struct bin_attribute *bin_attr, return -EINVAL; rom = pci_map_rom(pdev, &size); /* size starts out as PCI window size */ - if (!rom) - return 0; + if (!rom || !size) + return -EIO; if (off >= size) count = 0; diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index 132a78159b60..29cbe47f219f 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -63,7 +63,7 @@ void pci_disable_rom(struct pci_dev *pdev) * The PCI window size could be much larger than the * actual image size. */ -size_t pci_get_rom_size(void __iomem *rom, size_t size) +size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size) { void __iomem *image; int last_image; @@ -72,8 +72,10 @@ size_t pci_get_rom_size(void __iomem *rom, size_t size) do { void __iomem *pds; /* Standard PCI ROMs start out with these bytes 55 AA */ - if (readb(image) != 0x55) + if (readb(image) != 0x55) { + dev_err(&pdev->dev, "Invalid ROM contents\n"); break; + } if (readb(image + 1) != 0xAA) break; /* get the PCI data structure and check its signature */ @@ -159,7 +161,7 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size) * size is much larger than the actual size of the ROM. * True size is important if the ROM is going to be copied. */ - *size = pci_get_rom_size(rom, *size); + *size = pci_get_rom_size(pdev, rom, *size); return rom; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 48890cf3f96e..7bd624bfdcfd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -684,7 +684,7 @@ int pci_enable_rom(struct pci_dev *pdev); void pci_disable_rom(struct pci_dev *pdev); void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom); -size_t pci_get_rom_size(void __iomem *rom, size_t size); +size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size); /* Power management related routines */ int pci_save_state(struct pci_dev *dev); -- cgit v1.2.3 From 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 5 Feb 2009 16:48:24 +1100 Subject: crypto: api - Fix zeroing on free Geert Uytterhoeven pointed out that we're not zeroing all the memory when freeing a transform. This patch fixes it by calling ksize to ensure that we zero everything in sight. Reported-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/api.c | 20 ++++++++++---------- include/linux/crypto.h | 7 ++++++- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/crypto/api.c b/crypto/api.c index 9975a7bd246c..efe77df6863f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -557,34 +557,34 @@ err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_tfm); - + /* - * crypto_free_tfm - Free crypto transform + * crypto_destroy_tfm - Free crypto transform + * @mem: Start of tfm slab * @tfm: Transform to free * - * crypto_free_tfm() frees up the transform and any associated resources, + * This function frees up the transform and any associated resources, * then drops the refcount on the associated algorithm. */ -void crypto_free_tfm(struct crypto_tfm *tfm) +void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; int size; - if (unlikely(!tfm)) + if (unlikely(!mem)) return; alg = tfm->__crt_alg; - size = sizeof(*tfm) + alg->cra_ctxsize; + size = ksize(mem); if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); - memset(tfm, 0, size); - kfree(tfm); + memset(mem, 0, size); + kfree(mem); } - -EXPORT_SYMBOL_GPL(crypto_free_tfm); +EXPORT_SYMBOL_GPL(crypto_destroy_tfm); int crypto_has_alg(const char *name, u32 type, u32 mask) { diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 3bacd71509fb..1f2e9020acc6 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -552,7 +552,12 @@ struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask); struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); -void crypto_free_tfm(struct crypto_tfm *tfm); +void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm); + +static inline void crypto_free_tfm(struct crypto_tfm *tfm) +{ + return crypto_destroy_tfm(tfm, tfm); +} int alg_test(const char *driver, const char *alg, u32 type, u32 mask); -- cgit v1.2.3 From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++++- kernel/exit.c | 3 +++ kernel/fork.c | 3 ++- kernel/signal.c | 8 ++++---- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f4..2e0646a30314 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -559,7 +559,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t cutime, cstime; + cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -567,6 +567,14 @@ struct signal_struct { unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3f1875..efd30ccf3858 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e7721..e8e854a04ad2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/signal.c b/kernel/signal.c index b6b36768b758..2a74fe87c0dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; - struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - thread_group_cputime(tsk, &cputime); - info.si_utime = cputime_to_jiffies(cputime.utime); - info.si_stime = cputime_to_jiffies(cputime.stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) -- cgit v1.2.3 From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:16 +0100 Subject: timers: split process wide cpu clocks/timers Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicity used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run of a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 11 +++--- include/linux/sched.h | 54 +++++++++++++++------------ kernel/itimer.c | 4 +- kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_stats.h | 45 ++++++++++++---------- 5 files changed, 155 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index ea0ea1a4c36f..e752d973fa21 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -48,12 +48,11 @@ extern struct fs_struct init_fs; .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ - .cputime = { .totals = { \ - .utime = cputime_zero, \ - .stime = cputime_zero, \ - .sum_exec_runtime = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ - }, }, \ + .cputimer = { \ + .cputime = INIT_CPUTIME, \ + .running = 0, \ + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + }, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e0646a30314..082d7619b3a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -443,7 +443,6 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * @lock: lock for fields in this struct * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering @@ -454,23 +453,33 @@ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; - spinlock_t lock; }; /* Alternate field names when used to cache expirations. */ #define prof_exp stime #define virt_exp utime #define sched_exp sum_exec_runtime +#define INIT_CPUTIME \ + (struct task_cputime) { \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ + .sum_exec_runtime = 0, \ + } + /** - * struct thread_group_cputime - thread group interval timer counts - * @totals: thread group interval timers; substructure for - * uniprocessor kernel, per-cpu for SMP kernel. + * struct thread_group_cputimer - thread group interval timer counts + * @cputime: thread group interval timers. + * @running: non-zero when there are timers running and + * @cputime receives updates. + * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is - * used for thread group CPU clock calculations. + * used for thread group CPU timer calculations. */ -struct thread_group_cputime { - struct task_cputime totals; +struct thread_group_cputimer { + struct task_cputime cputime; + int running; + spinlock_t lock; }; /* @@ -519,10 +528,10 @@ struct signal_struct { cputime_t it_prof_incr, it_virt_incr; /* - * Thread group totals for process CPU clocks. - * See thread_group_cputime(), et al, for details. + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. */ - struct thread_group_cputime cputime; + struct thread_group_cputimer cputimer; /* Earliest-expiration cache. */ struct task_cputime cputime_expires; @@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock) /* * Thread group CPU time accounting. */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); static inline -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { - struct task_cputime *totals = &tsk->signal->cputime.totals; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - spin_lock_irqsave(&totals->lock, flags); - *times = *totals; - spin_unlock_irqrestore(&totals->lock, flags); + WARN_ON(!cputimer->running); + + spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); } static inline void thread_group_cputime_init(struct signal_struct *sig) { - sig->cputime.totals = (struct task_cputime){ - .utime = cputime_zero, - .stime = cputime_zero, - .sum_exec_runtime = 0, - }; - - spin_lock_init(&sig->cputime.totals.lock); + sig->cputimer.cputime = INIT_CPUTIME; + spin_lock_init(&sig->cputimer.lock); + sig->cputimer.running = 0; } static inline void thread_group_cputime_free(struct signal_struct *sig) diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93dd8bd..58762f7077ec 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime cputime; cputime_t utime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); @@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime times; cputime_t ptime; - thread_group_cputime(tsk, ×); + thread_group_cputimer(tsk, ×); ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..db107c9bbc05 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct sighand_struct *sighand; + struct signal_struct *sig; + struct task_struct *t; + + *times = INIT_CPUTIME; + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + if (!sighand) + goto out; + + sig = tsk->signal; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += t->se.sum_exec_runtime; + + t = next_thread(t); + } while (t != tsk); + + times->utime = cputime_add(times->utime, sig->utime); + times->stime = cputime_add(times->stime, sig->stime); + times->sum_exec_runtime += sig->sum_sched_runtime; +out: + rcu_read_unlock(); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. @@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +/* + * Enable the process wide cpu timer accounting. + * + * serialized using ->sighand->siglock + */ +static void start_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 1; + barrier(); +} + +/* + * Release the process wide timer accounting -- timer stops ticking when + * nobody cares about it. + * + * serialized using ->sighand->siglock + */ +static void stop_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 0; + barrier(); +} + /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) + start_process_timers(p); + listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) + list_empty(&timers[CPUCLOCK_SCHED])) { + stop_process_timers(tsk); return; + } /* * Collect the current process totals. */ - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; @@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; - thread_group_cputime(tsk, &group_sample); + thread_group_cputimer(tsk, &group_sample); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. @@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group(clock_idx, tsk, &now); + start_process_timers(tsk); + cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef8ecab..a8f93dd374e1 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->utime = cputime_add(times->utime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->stime = cputime_add(times->stime, cputime); - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct task_cputime *times; + struct thread_group_cputimer *cputimer; struct signal_struct *sig; sig = tsk->signal; @@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - times = &sig->cputime.totals; + cputimer = &sig->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->sum_exec_runtime += ns; - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + spin_unlock(&cputimer->lock); } -- cgit v1.2.3 From ac7b9004909d03d67016368093e81d37cae72895 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Feb 2009 15:11:59 -0800 Subject: generic swap(): don't return a value from swap() The swap() macro is accidentally retuning the value of its first argument. Change it into a doesn't-return-anything macro before someone goes and relies upon this behaviour. Signed-off-by: Peter Zijlstra Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 343df9ef2412..7fa371898e3e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -480,7 +480,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte) /* * swap - swap value of @a and @b */ -#define swap(a, b) ({ typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; }) +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) /** * container_of - cast a member of a structure out to the containing structure -- cgit v1.2.3 From 1f5e31d7e55ac7fbd4ec5e5b20c8868b0e4564c9 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 4 Feb 2009 15:12:03 -0800 Subject: fbmem: don't call copy_from/to_user() with mutex held Avoid calling copy_from/to_user() with fb_info->lock mutex held in fbmem ioctl(). fb_mmap() is called under mm->mmap_sem (A) held, that also acquires fb_info->lock (B); fb_ioctl() takes fb_info->lock (B) and does copy_from/to_user() that might acquire mm->mmap_sem (A), causing a deadlock. NOTE: it doesn't push down the fb_info->lock in each own driver's fb_ioctl(), so there are still potential deadlocks elsewhere. Signed-off-by: Andrea Righi Cc: Dave Jones Cc: "Rafael J. Wysocki" Cc: Johannes Weiner Cc: Krzysztof Helt Cc: Harvey Harrison Cc: Stefan Richter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/fbcmap.c | 20 +++++--- drivers/video/fbmem.c | 135 +++++++++++++++++++++++++------------------------ include/linux/fb.h | 15 ++++++ 3 files changed, 98 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbcmap.c b/drivers/video/fbcmap.c index 91b78e691505..f53b9f1d6aba 100644 --- a/drivers/video/fbcmap.c +++ b/drivers/video/fbcmap.c @@ -250,10 +250,6 @@ int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info) int rc, size = cmap->len * sizeof(u16); struct fb_cmap umap; - if (cmap->start < 0 || (!info->fbops->fb_setcolreg && - !info->fbops->fb_setcmap)) - return -EINVAL; - memset(&umap, 0, sizeof(struct fb_cmap)); rc = fb_alloc_cmap(&umap, cmap->len, cmap->transp != NULL); if (rc) @@ -262,11 +258,23 @@ int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info) copy_from_user(umap.green, cmap->green, size) || copy_from_user(umap.blue, cmap->blue, size) || (cmap->transp && copy_from_user(umap.transp, cmap->transp, size))) { - fb_dealloc_cmap(&umap); - return -EFAULT; + rc = -EFAULT; + goto out; } umap.start = cmap->start; + if (!lock_fb_info(info)) { + rc = -ENODEV; + goto out; + } + if (cmap->start < 0 || (!info->fbops->fb_setcolreg && + !info->fbops->fb_setcmap)) { + rc = -EINVAL; + goto out1; + } rc = fb_set_cmap(&umap, info); +out1: + unlock_fb_info(info); +out: fb_dealloc_cmap(&umap); return rc; } diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 756efeb91abc..cfd9dce1ce0b 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1013,132 +1013,139 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_con2fbmap con2fb; + struct fb_cmap cmap_from; struct fb_cmap_user cmap; struct fb_event event; void __user *argp = (void __user *)arg; long ret = 0; - fb = info->fbops; - if (!fb) - return -ENODEV; - switch (cmd) { case FBIOGET_VSCREENINFO: - ret = copy_to_user(argp, &info->var, - sizeof(var)) ? -EFAULT : 0; + if (!lock_fb_info(info)) + return -ENODEV; + var = info->var; + unlock_fb_info(info); + + ret = copy_to_user(argp, &var, sizeof(var)) ? -EFAULT : 0; break; case FBIOPUT_VSCREENINFO: - if (copy_from_user(&var, argp, sizeof(var))) { - ret = -EFAULT; - break; - } + if (copy_from_user(&var, argp, sizeof(var))) + return -EFAULT; + if (!lock_fb_info(info)) + return -ENODEV; acquire_console_sem(); info->flags |= FBINFO_MISC_USEREVENT; ret = fb_set_var(info, &var); info->flags &= ~FBINFO_MISC_USEREVENT; release_console_sem(); - if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) + unlock_fb_info(info); + if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; break; case FBIOGET_FSCREENINFO: - ret = copy_to_user(argp, &info->fix, - sizeof(fix)) ? -EFAULT : 0; + if (!lock_fb_info(info)) + return -ENODEV; + fix = info->fix; + unlock_fb_info(info); + + ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0; break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) - ret = -EFAULT; - else - ret = fb_set_user_cmap(&cmap, info); + return -EFAULT; + ret = fb_set_user_cmap(&cmap, info); break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) - ret = -EFAULT; - else - ret = fb_cmap_to_user(&info->cmap, &cmap); + return -EFAULT; + if (!lock_fb_info(info)) + return -ENODEV; + cmap_from = info->cmap; + unlock_fb_info(info); + ret = fb_cmap_to_user(&cmap_from, &cmap); break; case FBIOPAN_DISPLAY: - if (copy_from_user(&var, argp, sizeof(var))) { - ret = -EFAULT; - break; - } + if (copy_from_user(&var, argp, sizeof(var))) + return -EFAULT; + if (!lock_fb_info(info)) + return -ENODEV; acquire_console_sem(); ret = fb_pan_display(info, &var); release_console_sem(); + unlock_fb_info(info); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) - ret = -EFAULT; + return -EFAULT; break; case FBIO_CURSOR: ret = -EINVAL; break; case FBIOGET_CON2FBMAP: if (copy_from_user(&con2fb, argp, sizeof(con2fb))) - ret = -EFAULT; - else if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) - ret = -EINVAL; - else { - con2fb.framebuffer = -1; - event.info = info; - event.data = &con2fb; - fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, - &event); - ret = copy_to_user(argp, &con2fb, - sizeof(con2fb)) ? -EFAULT : 0; - } + return -EFAULT; + if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) + return -EINVAL; + con2fb.framebuffer = -1; + event.data = &con2fb; + + if (!lock_fb_info(info)) + return -ENODEV; + event.info = info; + fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event); + unlock_fb_info(info); + + ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0; break; case FBIOPUT_CON2FBMAP: - if (copy_from_user(&con2fb, argp, sizeof(con2fb))) { - ret = -EFAULT; - break; - } - if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) { - ret = -EINVAL; - break; - } - if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) { - ret = -EINVAL; - break; - } + if (copy_from_user(&con2fb, argp, sizeof(con2fb))) + return -EFAULT; + if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) + return -EINVAL; + if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) + return -EINVAL; if (!registered_fb[con2fb.framebuffer]) request_module("fb%d", con2fb.framebuffer); if (!registered_fb[con2fb.framebuffer]) { ret = -EINVAL; break; } - event.info = info; event.data = &con2fb; + if (!lock_fb_info(info)) + return -ENODEV; + event.info = info; ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event); + unlock_fb_info(info); break; case FBIOBLANK: + if (!lock_fb_info(info)) + return -ENODEV; acquire_console_sem(); info->flags |= FBINFO_MISC_USEREVENT; ret = fb_blank(info, arg); info->flags &= ~FBINFO_MISC_USEREVENT; release_console_sem(); - break;; + unlock_fb_info(info); + break; default: - if (fb->fb_ioctl == NULL) - ret = -ENOTTY; - else + if (!lock_fb_info(info)) + return -ENODEV; + fb = info->fbops; + if (fb->fb_ioctl) ret = fb->fb_ioctl(info, cmd, arg); + else + ret = -ENOTTY; + unlock_fb_info(info); } return ret; } static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -__acquires(&info->lock) -__releases(&info->lock) { struct inode *inode = file->f_path.dentry->d_inode; int fbidx = iminor(inode); - struct fb_info *info; - long ret; + struct fb_info *info = registered_fb[fbidx]; - info = registered_fb[fbidx]; - mutex_lock(&info->lock); - ret = do_fb_ioctl(info, cmd, arg); - mutex_unlock(&info->lock); - return ret; + return do_fb_ioctl(info, cmd, arg); } #ifdef CONFIG_COMPAT @@ -1257,8 +1264,6 @@ static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -__acquires(&info->lock) -__releases(&info->lock) { struct inode *inode = file->f_path.dentry->d_inode; int fbidx = iminor(inode); @@ -1266,7 +1271,6 @@ __releases(&info->lock) struct fb_ops *fb = info->fbops; long ret = -ENOIOCTLCMD; - mutex_lock(&info->lock); switch(cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: @@ -1292,7 +1296,6 @@ __releases(&info->lock) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } - mutex_unlock(&info->lock); return ret; } #endif diff --git a/include/linux/fb.h b/include/linux/fb.h index 818fe21257e8..31527e17076b 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -960,6 +960,21 @@ extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; extern struct class *fb_class; +static inline int lock_fb_info(struct fb_info *info) +{ + mutex_lock(&info->lock); + if (!info->fbops) { + mutex_unlock(&info->lock); + return 0; + } + return 1; +} + +static inline void unlock_fb_info(struct fb_info *info) +{ + mutex_unlock(&info->lock); +} + static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) { -- cgit v1.2.3 From 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Feb 2009 15:12:14 -0800 Subject: wait: prevent exclusive waiter starvation With exclusive waiters, every process woken up through the wait queue must ensure that the next waiter down the line is woken when it has finished. Interruptible waiters don't do that when aborting due to a signal. And if an aborting waiter is concurrently woken up through the waitqueue, noone will ever wake up the next waiter. This has been observed with __wait_on_bit_lock() used by lock_page_killable(): the first contender on the queue was aborting when the actual lock holder woke it up concurrently. The aborted contender didn't acquire the lock and therefor never did an unlock followed by waking up the next waiter. Add abort_exclusive_wait() which removes the process' wait descriptor from the waitqueue, iff still queued, or wakes up the next waiter otherwise. It does so under the waitqueue lock. Racing with a wake up means the aborting process is either already woken (removed from the queue) and will wake up the next waiter, or it will remove itself from the queue and the concurrent wake up will apply to the next waiter after it. Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and __wait_on_bit_lock() when they were interrupted by other means than a wake up through the queue. [akpm@linux-foundation.org: coding-style fixes] Reported-by: Chris Mason Signed-off-by: Johannes Weiner Mentored-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: Chuck Lever Cc: Nick Piggin Cc: Ingo Molnar Cc: ["after some testing"] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 11 ++++++++-- kernel/sched.c | 4 ++-- kernel/wait.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 63 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index ef609f842fac..a210ede73b56 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,6 +132,8 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -333,16 +335,19 @@ do { \ for (;;) { \ prepare_to_wait_exclusive(&wq, &__wait, \ TASK_INTERRUPTIBLE); \ - if (condition) \ + if (condition) { \ + finish_wait(&wq, &__wait); \ break; \ + } \ if (!signal_pending(current)) { \ schedule(); \ continue; \ } \ ret = -ERESTARTSYS; \ + abort_exclusive_wait(&wq, &__wait, \ + TASK_INTERRUPTIBLE, NULL); \ break; \ } \ - finish_wait(&wq, &__wait); \ } while (0) #define wait_event_interruptible_exclusive(wq, condition) \ @@ -431,6 +436,8 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70d..8ee437a5ec1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index cd87131f2fc2..42a2dbc181c8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +/* + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); +/* + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @state: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and noone wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_common(q, mode, 1, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -177,17 +219,20 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { - int ret = 0; - do { + int ret; + prepare_to_wait_exclusive(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - if ((ret = (*action)(q->key.flags))) - break; - } + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); - return ret; + return 0; } EXPORT_SYMBOL(__wait_on_bit_lock); -- cgit v1.2.3 From 7d8e23df69820e6be42bcc41d441f4860e8c76f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Feb 2009 14:57:51 +0100 Subject: timers: split process wide cpu clocks/timers, remove spurious warning Mike Galbraith reported that the new warning in thread_group_cputimer() triggers en masse with Amarok running. Oleg Nesterov observed: Can't fastpath_timer_check()->thread_group_cputimer() have the false warning too? Suppose we had the timer, then posix_cpu_timer_del() removes this timer, but task_cputime_zero(&sig->cputime_expires) still not true. Remove the spurious debug warning. Reported-by: Mike Galbraith Explained-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 082d7619b3a1..79392916d6c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2208,8 +2208,6 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - WARN_ON(!cputimer->running); - spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); -- cgit v1.2.3 From b4bd07c20ba0c1fa7ad09ba257e0a5cfc2bf6bb3 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Feb 2009 22:06:43 -0800 Subject: net_dma: call dmaengine_get only if NET_DMA enabled Based upon a patch from Atsushi Nemoto -------------------- The commit 649274d993212e7c23c0cb734572c2311c200872 ("net_dma: acquire/release dma channels on ifup/ifdown") added unconditional call of dmaengine_get() to net_dma. The API should be called only if NET_DMA was enabled. -------------------- Signed-off-by: David S. Miller Acked-by: Dan Williams --- include/linux/dmaengine.h | 12 ++++++++++++ net/core/dev.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e0f64c335c8..3e68469c1885 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -282,6 +282,18 @@ static inline void dmaengine_put(void) } #endif +#ifdef CONFIG_NET_DMA +#define net_dmaengine_get() dmaengine_get() +#define net_dmaengine_put() dmaengine_put() +#else +static inline void net_dmaengine_get(void) +{ +} +static inline void net_dmaengine_put(void) +{ +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 5379b0c1190a..a17e00662363 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1090,7 +1090,7 @@ int dev_open(struct net_device *dev) /* * Enable NET_DMA */ - dmaengine_get(); + net_dmaengine_get(); /* * Initialize multicasting status @@ -1172,7 +1172,7 @@ int dev_close(struct net_device *dev) /* * Shutdown NET_DMA */ - dmaengine_put(); + net_dmaengine_put(); return 0; } -- cgit v1.2.3 From 7f9a50a5b89b87f8e754f59ae9968da28be618a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 7 Feb 2009 18:15:56 +1030 Subject: module: remove over-zealous check in __module_get() Impact: fix spurious BUG_ON() triggered under load module_refcount() isn't reliable outside stop_machine(), as demonstrated by Karsten Keil , networking can trigger it under load (an inc on one cpu and dec on another while module_refcount() is tallying can give false results, for example). Almost noone should be using __module_get, but that's another issue. Cc: Karsten Keil Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- include/linux/module.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index f3b8329eb5b8..145a75528cc1 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -407,7 +407,6 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu) static inline void __module_get(struct module *module) { if (module) { - BUG_ON(module_refcount(module) == 0); local_inc(__module_ref_addr(module, get_cpu())); put_cpu(); } -- cgit v1.2.3 From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 20 Jan 2009 15:31:31 +0100 Subject: async: Rename _special -> _domain for clarity. Rename the async_*_special() functions to async_*_domain(), which describes the purpose of these functions much better. [Broke up long lines to silence checkpatch] Signed-off-by: Cornelia Huck Signed-off-by: Arjan van de Ven --- fs/super.c | 4 ++-- include/linux/async.h | 8 +++++--- kernel/async.c | 41 ++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 645e5403f2a0..61dce001dd57 100644 --- a/fs/super.c +++ b/fs/super.c @@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb) /* * wait for asynchronous fs operations to finish before going further */ - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); @@ -470,7 +470,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); diff --git a/include/linux/async.h b/include/linux/async.h index c4ecacd0b327..68a9530196f2 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -17,9 +17,11 @@ typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); -extern async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *list); +extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, + struct list_head *list); extern void async_synchronize_full(void); -extern void async_synchronize_full_special(struct list_head *list); +extern void async_synchronize_full_domain(struct list_head *list); extern void async_synchronize_cookie(async_cookie_t cookie); -extern void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *list); +extern void async_synchronize_cookie_domain(async_cookie_t cookie, + struct list_head *list); diff --git a/kernel/async.c b/kernel/async.c index b5f0d4b94937..e23399d88bac 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -224,22 +224,23 @@ async_cookie_t async_schedule(async_func_ptr *ptr, void *data) EXPORT_SYMBOL_GPL(async_schedule); /** - * async_schedule_special - schedule a function for asynchronous execution with a special running queue + * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @ptr: function to execute asynchronously * @data: data pointer to pass to the function - * @running: list head to add to while running + * @running: running list for the domain * * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_special() functions - * to wait on a special running queue rather than on the global running - * queue. + * @running may be used in the async_synchronize_*_domain() functions + * to wait within a certain synchronization domain rather than globally. + * A synchronization domain is specified via the running queue @running to use. * Note: This function may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, + struct list_head *running) { return __async_schedule(ptr, data, running); } -EXPORT_SYMBOL_GPL(async_schedule_special); +EXPORT_SYMBOL_GPL(async_schedule_domain); /** * async_synchronize_full - synchronize all asynchronous function calls @@ -255,27 +256,29 @@ void async_synchronize_full(void) EXPORT_SYMBOL_GPL(async_synchronize_full); /** - * async_synchronize_full_special - synchronize all asynchronous function calls for a running list + * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @list: running list to synchronize on * - * This function waits until all asynchronous function calls for the running - * list @list have been done. + * This function waits until all asynchronous function calls for the + * synchronization domain specified by the running list @list have been done. */ -void async_synchronize_full_special(struct list_head *list) +void async_synchronize_full_domain(struct list_head *list) { - async_synchronize_cookie_special(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, list); } -EXPORT_SYMBOL_GPL(async_synchronize_full_special); +EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** - * async_synchronize_cookie_special - synchronize asynchronous function calls on a running list with cookie checkpointing + * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint * @running: running list to synchronize on * - * This function waits until all asynchronous function calls for the running - * list @list submitted prior to @cookie have been done. + * This function waits until all asynchronous function calls for the + * synchronization domain specified by the running list @list submitted + * prior to @cookie have been done. */ -void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, + struct list_head *running) { ktime_t starttime, delta, endtime; @@ -295,7 +298,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r (long long)ktime_to_ns(delta) >> 10); } } -EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); +EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); /** * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing @@ -306,7 +309,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); */ void async_synchronize_cookie(async_cookie_t cookie) { - async_synchronize_cookie_special(cookie, &async_running); + async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); -- cgit v1.2.3 From 704126ad81b8cb7d3d70adb9ecb143f4d3fb38af Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 4 Jan 2009 16:28:52 +0800 Subject: VT-d: handle Invalidation Queue Error to avoid system hang When hardware detects any error with a descriptor from the invalidation queue, it stops fetching new descriptors from the queue until software clears the Invalidation Queue Error bit in the Fault Status register. Following fix handles the IQE so the kernel won't be trapped in an infinite loop. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 61 ++++++++++++++++++++++++++++++++------------ drivers/pci/intr_remapping.c | 21 ++++++++------- include/linux/intel-iommu.h | 3 ++- 3 files changed, 59 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 2b4162d9ca30..8d3e9c261061 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -573,19 +573,49 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } +static int qi_check_fault(struct intel_iommu *iommu, int index) +{ + u32 fault; + int head; + struct q_inval *qi = iommu->qi; + int wait_index = (index + 1) % QI_LENGTH; + + fault = readl(iommu->reg + DMAR_FSTS_REG); + + /* + * If IQE happens, the head points to the descriptor associated + * with the error. No new descriptors are fetched until the IQE + * is cleared. + */ + if (fault & DMA_FSTS_IQE) { + head = readl(iommu->reg + DMAR_IQH_REG); + if ((head >> 4) == index) { + memcpy(&qi->desc[index], &qi->desc[wait_index], + sizeof(struct qi_desc)); + __iommu_flush_cache(iommu, &qi->desc[index], + sizeof(struct qi_desc)); + writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); + return -EINVAL; + } + } + + return 0; +} + /* * Submit the queued invalidation descriptor to the remapping * hardware unit and wait for its completion. */ -void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) { + int rc = 0; struct q_inval *qi = iommu->qi; struct qi_desc *hw, wait_desc; int wait_index, index; unsigned long flags; if (!qi) - return; + return 0; hw = qi->desc; @@ -603,7 +633,8 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) hw[index] = *desc; - wait_desc.low = QI_IWD_STATUS_DATA(2) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_STATUS_WRITE | QI_IWD_TYPE; wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]); hw[wait_index] = wait_desc; @@ -614,13 +645,11 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) qi->free_head = (qi->free_head + 2) % QI_LENGTH; qi->free_cnt -= 2; - spin_lock(&iommu->register_lock); /* * update the HW tail register indicating the presence of * new descriptors. */ writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG); - spin_unlock(&iommu->register_lock); while (qi->desc_status[wait_index] != QI_DONE) { /* @@ -630,15 +659,21 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ + rc = qi_check_fault(iommu, index); + if (rc) + goto out; + spin_unlock(&qi->q_lock); cpu_relax(); spin_lock(&qi->q_lock); } - - qi->desc_status[index] = QI_DONE; +out: + qi->desc_status[index] = qi->desc_status[wait_index] = QI_DONE; reclaim_free_desc(qi); spin_unlock_irqrestore(&qi->q_lock, flags); + + return rc; } /* @@ -651,13 +686,13 @@ void qi_global_iec(struct intel_iommu *iommu) desc.low = QI_IEC_TYPE; desc.high = 0; + /* should never fail */ qi_submit_sync(&desc, iommu); } int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type, int non_present_entry_flush) { - struct qi_desc desc; if (non_present_entry_flush) { @@ -671,10 +706,7 @@ int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, | QI_CC_GRAN(type) | QI_CC_TYPE; desc.high = 0; - qi_submit_sync(&desc, iommu); - - return 0; - + return qi_submit_sync(&desc, iommu); } int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -704,10 +736,7 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order); - qi_submit_sync(&desc, iommu); - - return 0; - + return qi_submit_sync(&desc, iommu); } /* diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f78371b22529..45effc5726c0 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -207,7 +207,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) return index; } -static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask) +static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) { struct qi_desc desc; @@ -215,7 +215,7 @@ static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask) | QI_IEC_SELECTIVE; desc.high = 0; - qi_submit_sync(&desc, iommu); + return qi_submit_sync(&desc, iommu); } int map_irq_to_irte_handle(int irq, u16 *sub_handle) @@ -283,6 +283,7 @@ int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index) int modify_irte(int irq, struct irte *irte_modified) { + int rc; int index; struct irte *irte; struct intel_iommu *iommu; @@ -303,14 +304,15 @@ int modify_irte(int irq, struct irte *irte_modified) set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1)); __iommu_flush_cache(iommu, irte, sizeof(*irte)); - qi_flush_iec(iommu, index, 0); - + rc = qi_flush_iec(iommu, index, 0); spin_unlock(&irq_2_ir_lock); - return 0; + + return rc; } int flush_irte(int irq) { + int rc; int index; struct intel_iommu *iommu; struct irq_2_iommu *irq_iommu; @@ -326,10 +328,10 @@ int flush_irte(int irq) index = irq_iommu->irte_index + irq_iommu->sub_handle; - qi_flush_iec(iommu, index, irq_iommu->irte_mask); + rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); spin_unlock(&irq_2_ir_lock); - return 0; + return rc; } struct intel_iommu *map_ioapic_to_ir(int apic) @@ -355,6 +357,7 @@ struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) int free_irte(int irq) { + int rc = 0; int index, i; struct irte *irte; struct intel_iommu *iommu; @@ -375,7 +378,7 @@ int free_irte(int irq) if (!irq_iommu->sub_handle) { for (i = 0; i < (1 << irq_iommu->irte_mask); i++) set_64bit((unsigned long *)irte, 0); - qi_flush_iec(iommu, index, irq_iommu->irte_mask); + rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); } irq_iommu->iommu = NULL; @@ -385,7 +388,7 @@ int free_irte(int irq) spin_unlock(&irq_2_ir_lock); - return 0; + return rc; } static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c4f6c101dbcd..d2e3cbfba14f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -194,6 +194,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) /* FSTS_REG */ #define DMA_FSTS_PPF ((u32)2) #define DMA_FSTS_PFO ((u32)1) +#define DMA_FSTS_IQE (1 << 4) #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ @@ -328,7 +329,7 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int non_present_entry_flush); -extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); +extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t); -- cgit v1.2.3 From a5ef7ca0e2636bad0ccd07b996d775348ae2b65e Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Sun, 8 Feb 2009 17:39:58 -0500 Subject: x86: spinlocks: define dummy __raw_spin_is_contended Architectures other than mips and x86 are not using ticket spinlocks. Therefore, the contention on the lock is meaningless, since there is nobody known to be waiting on it (arguably /fairly/ unfair locks). Dummy it out to return 0 on other architectures. Signed-off-by: Kyle McMartin Acked-by: Ralf Baechle Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/mips/include/asm/spinlock.h | 1 + arch/x86/include/asm/paravirt.h | 1 + arch/x86/include/asm/spinlock.h | 1 + include/linux/spinlock.h | 5 +++++ 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 1a1f320c30d8..0884947ebe27 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -51,6 +51,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock) return (((counters >> 14) - counters) & 0x1fff) > 1; } +#define __raw_spin_is_contended __raw_spin_is_contended static inline void __raw_spin_lock(raw_spinlock_t *lock) { diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ba3e2ff6aedc..c09a14127584 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1402,6 +1402,7 @@ static inline int __raw_spin_is_contended(struct raw_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); } +#define __raw_spin_is_contended __raw_spin_is_contended static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) { diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index d17c91981da2..8247e94ac6b1 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -245,6 +245,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock) { return __ticket_spin_is_contended(lock); } +#define __raw_spin_is_contended __raw_spin_is_contended static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index e0c0fccced46..a0c66a2e00ad 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -124,7 +124,12 @@ do { \ #ifdef CONFIG_GENERIC_LOCKBREAK #define spin_is_contended(lock) ((lock)->break_lock) #else + +#ifdef __raw_spin_is_contended #define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) +#else +#define spin_is_contended(lock) (((void)(lock), 0)) +#endif /*__raw_spin_is_contended*/ #endif /** -- cgit v1.2.3 From 43a990765a9e874350bae1009366d00809dbc9d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Feb 2009 00:00:22 +0100 Subject: sound: Remove OSSlib stuff from linux/soundcard.h Removed OSSlib stuff from linux/soundcard.h to fix the warnings for 'make headers_check'. This patch breaks building against OSSlib with the kernel headers instead of its own headers. It should still work with any version of the library from the 2003 onwards which provide their own headers for the latest interface. Signed-off-by: Arnd Bergmann Cc: Jaswinder Singh Rajput Signed-off-by: Takashi Iwai --- include/linux/soundcard.h | 74 +++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundcard.h b/include/linux/soundcard.h index 523d069c862c..1904afedb82f 100644 --- a/include/linux/soundcard.h +++ b/include/linux/soundcard.h @@ -1045,50 +1045,36 @@ typedef struct mixer_vol_table { */ #define LOCL_STARTAUDIO 1 -#if (!defined(__KERNEL__) && !defined(KERNEL) && !defined(INKERNEL) && !defined(_KERNEL)) || defined(USE_SEQ_MACROS) +#if !defined(__KERNEL__) || defined(USE_SEQ_MACROS) /* * Some convenience macros to simplify programming of the * /dev/sequencer interface * - * These macros define the API which should be used when possible. + * This is a legacy interface for applications written against + * the OSSlib-3.8 style interface. It is no longer possible + * to actually link against OSSlib with this header, but we + * still provide these macros for programs using them. + * + * If you want to use OSSlib, it is recommended that you get + * the GPL version of OSS-4.x and build against that version + * of the header. + * + * We redefine the extern keyword so that make headers_check + * does not complain about SEQ_USE_EXTBUF. */ #define SEQ_DECLAREBUF() SEQ_USE_EXTBUF() void seqbuf_dump(void); /* This function must be provided by programs */ -extern int OSS_init(int seqfd, int buflen); -extern void OSS_seqbuf_dump(int fd, unsigned char *buf, int buflen); -extern void OSS_seq_advbuf(int len, int fd, unsigned char *buf, int buflen); -extern void OSS_seq_needbuf(int len, int fd, unsigned char *buf, int buflen); -extern void OSS_patch_caching(int dev, int chn, int patch, - int fd, unsigned char *buf, int buflen); -extern void OSS_drum_caching(int dev, int chn, int patch, - int fd, unsigned char *buf, int buflen); -extern void OSS_write_patch(int fd, unsigned char *buf, int len); -extern int OSS_write_patch2(int fd, unsigned char *buf, int len); - #define SEQ_PM_DEFINES int __foo_bar___ -#ifdef OSSLIB -# define SEQ_USE_EXTBUF() \ - extern unsigned char *_seqbuf; \ - extern int _seqbuflen;extern int _seqbufptr -# define SEQ_DEFINEBUF(len) SEQ_USE_EXTBUF();static int _requested_seqbuflen=len -# define _SEQ_ADVBUF(len) OSS_seq_advbuf(len, seqfd, _seqbuf, _seqbuflen) -# define _SEQ_NEEDBUF(len) OSS_seq_needbuf(len, seqfd, _seqbuf, _seqbuflen) -# define SEQ_DUMPBUF() OSS_seqbuf_dump(seqfd, _seqbuf, _seqbuflen) - -# define SEQ_LOAD_GMINSTR(dev, instr) \ - OSS_patch_caching(dev, -1, instr, seqfd, _seqbuf, _seqbuflen) -# define SEQ_LOAD_GMDRUM(dev, drum) \ - OSS_drum_caching(dev, -1, drum, seqfd, _seqbuf, _seqbuflen) -#else /* !OSSLIB */ - -# define SEQ_LOAD_GMINSTR(dev, instr) -# define SEQ_LOAD_GMDRUM(dev, drum) - -# define SEQ_USE_EXTBUF() \ - extern unsigned char _seqbuf[]; \ - extern int _seqbuflen;extern int _seqbufptr + +#define SEQ_LOAD_GMINSTR(dev, instr) +#define SEQ_LOAD_GMDRUM(dev, drum) + +#define _SEQ_EXTERN extern +#define SEQ_USE_EXTBUF() \ + _SEQ_EXTERN unsigned char _seqbuf[]; \ + _SEQ_EXTERN int _seqbuflen; _SEQ_EXTERN int _seqbufptr #ifndef USE_SIMPLE_MACROS /* Sample seqbuf_dump() implementation: @@ -1131,7 +1117,6 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); */ #define _SEQ_NEEDBUF(len) /* empty */ #endif -#endif /* !OSSLIB */ #define SEQ_VOLUME_MODE(dev, mode) {_SEQ_NEEDBUF(8);\ _seqbuf[_seqbufptr] = SEQ_EXTENDED;\ @@ -1215,14 +1200,8 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); _CHN_COMMON(dev, MIDI_CHN_PRESSURE, chn, pressure, 0, 0) #define SEQ_SET_PATCH SEQ_PGM_CHANGE -#ifdef OSSLIB -# define SEQ_PGM_CHANGE(dev, chn, patch) \ - {OSS_patch_caching(dev, chn, patch, seqfd, _seqbuf, _seqbuflen); \ - _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0);} -#else -# define SEQ_PGM_CHANGE(dev, chn, patch) \ +#define SEQ_PGM_CHANGE(dev, chn, patch) \ _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0) -#endif #define SEQ_CONTROL(dev, chn, controller, value) \ _CHN_COMMON(dev, MIDI_CTL_CHANGE, chn, controller, 0, value) @@ -1300,19 +1279,12 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len); /* * Patch loading. */ -#ifdef OSSLIB -# define SEQ_WRPATCH(patchx, len) \ - OSS_write_patch(seqfd, (char*)(patchx), len) -# define SEQ_WRPATCH2(patchx, len) \ - OSS_write_patch2(seqfd, (char*)(patchx), len) -#else -# define SEQ_WRPATCH(patchx, len) \ +#define SEQ_WRPATCH(patchx, len) \ {if (_seqbufptr) SEQ_DUMPBUF();\ if (write(seqfd, (char*)(patchx), len)==-1) \ perror("Write patch: /dev/sequencer");} -# define SEQ_WRPATCH2(patchx, len) \ +#define SEQ_WRPATCH2(patchx, len) \ (SEQ_DUMPBUF(), write(seqfd, (char*)(patchx), len)) -#endif #endif #endif -- cgit v1.2.3 From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:15:34 -0500 Subject: jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate() If we race with commit code setting i_transaction to NULL, we could possibly dereference it. Proper locking requires the journal pointer (to access journal->j_list_lock), which we don't have. So we have to change the prototype of the function so that filesystem passes us the journal pointer. Also add a more detailed comment about why the function jbd2_journal_begin_ordered_truncate() does what it does and how it should be used. Thanks to Dan Carpenter for pointing to the suspitious code. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Acked-by: Joel Becker CC: linux-ext4@vger.kernel.org CC: ocfs2-devel@oss.oracle.com CC: mfasheh@suse.de CC: Dan Carpenter --- fs/ext4/inode.c | 6 ++++-- fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/journal.h | 6 ++++-- include/linux/jbd2.h | 3 ++- 4 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be1329..658c4a7f2578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7d..28ce21d8598e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307c..172850a9a12a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b28b37eb11c6..4d248b3f1323 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1150,7 +1150,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +++++--- include/linux/hugetlb.h | 5 +++-- include/linux/mm.h | 3 +-- ipc/shm.c | 8 +++++--- mm/fremap.c | 2 +- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- mm/mmap.c | 38 ++++++++++++++++++++++---------------- mm/mprotect.c | 5 +++-- 8 files changed, 65 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6903d37af037..9b800d97a687 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma)) + len >> huge_page_shift(h), vma, + vma->vm_flags)) goto out; ret = 0; @@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_file_setup(const char *name, size_t size) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; struct file *file; @@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) error = -ENOMEM; if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL)) + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f1d2fba19ea0..af09660001c7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma); + struct vm_area_struct *vma, + int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); extern unsigned long hugepages_treat_as_movable; @@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t); +struct file *hugetlb_file_setup(const char *name, size_t, int); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98b8405..323561582c10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable); + unsigned int vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/ipc/shm.c b/ipc/shm.c index f8f69fad3a27..05d51d2a792c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; + int acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; @@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - /* hugetlb_file_setup takes care of mlock user accounting */ - file = hugetlb_file_setup(name, size); + /* hugetlb_file_setup applies strict accounting */ + if (shmflg & SHM_NORESERVE) + acctflag = VM_NORESERVE; + file = hugetlb_file_setup(name, size, acctflag); shp->mlock_user = current_user(); } else { - int acctflag = 0; /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306a..b6ec85abbb39 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e98304080..207464209546 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258eeb..eb1270bebe67 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. + * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ munmap_back: /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. */ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f4..258197b76fb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) -- cgit v1.2.3 From 1db8508cf483dc1ecf66141f90a7c03659d69512 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 10 Feb 2009 23:27:32 +0100 Subject: hugetlbfs: fix build failure with !CONFIG_HUGETLBFS Fix regression due to 5a6fe125950676015f5108fb71b2a67441755003, "Do not account for the address space used by hugetlbfs using VM_ACCOUNT" which added an argument to the function hugetlb_file_setup() but not to the macro hugetlb_file_setup(). Reported-by: Chris Clayton Signed-off-by: Stefan Richter Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index af09660001c7..03be7f29ca01 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -159,9 +159,9 @@ static inline void set_file_hugepages(struct file *file) } #else /* !CONFIG_HUGETLBFS */ -#define is_file_hugepages(file) 0 -#define set_file_hugepages(file) BUG() -#define hugetlb_file_setup(name,size) ERR_PTR(-ENOSYS) +#define is_file_hugepages(file) 0 +#define set_file_hugepages(file) BUG() +#define hugetlb_file_setup(name,size,acctflag) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ -- cgit v1.2.3 From e672f7db767156bf71adf9c592cfe81b339523d6 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 10 Feb 2009 17:18:17 -0800 Subject: pkt_sched: type should be __u32 in header Using u32 in this header breaks the build of iptables. Signed-off-by: Chuck Ebbert Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index b2648e8e4987..d51a2b3e221e 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -515,7 +515,7 @@ enum struct tc_drr_stats { - u32 deficit; + __u32 deficit; }; #endif -- cgit v1.2.3 From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Feb 2009 16:37:31 +0100 Subject: timers: split process wide cpu clocks/timers, fix To decrease the chance of a missed enable, always enable the timer when we sample it, we'll always disable it when we find that there are no active timers in the jiffy tick. This fixes a flood of warnings reported by Mike Galbraith. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/posix-cpu-timers.c | 42 ++++++++++++++---------------------------- 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 79392916d6c9..5d10fa0b6002 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) unsigned long flags; spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index db107c9bbc05..e5d7bfdfa7d4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct task_cputime cputime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, cputime.utime, cputime.stime, cputime.sum_exec_runtime); } @@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } -/* - * Enable the process wide cpu timer accounting. - * - * serialized using ->sighand->siglock - */ -static void start_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 1; - barrier(); -} - -/* - * Release the process wide timer accounting -- timer stops ticking when - * nobody cares about it. - * - * serialized using ->sighand->siglock - */ -static void stop_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 0; - barrier(); -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); - if (!CPUCLOCK_PERTHREAD(timer->it_clock)) - start_process_timers(p); - listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk, } } +static void stop_process_timers(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + unsigned long flags; + + if (!cputimer->running) + return; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - start_process_timers(tsk); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { -- cgit v1.2.3 From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 11:30:27 +0100 Subject: timers: fix TIMER_ABSTIME for process wide cpu timers The POSIX timer interface allows for absolute time expiry values through the TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock every time we start it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +------------ kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5d10fa0b6002..8981e52c714f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock) * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); - -static inline -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - unsigned long flags; - - spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); -} +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e5d7bfdfa7d4..2313a4cc14ea 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -261,6 +261,40 @@ out: rcu_read_unlock(); } +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + if (!cputimer->running) { + cputimer->running = 1; + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime, &sum); + } + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. -- cgit v1.2.3 From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++------ include/linux/mm.h | 1 + mm/mlock.c | 7 ++++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a5df5f82fb9..5a4c23d89892 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - if (unlikely(child->bts)) { - ds_release_bts(child->bts); - child->bts = NULL; - - ptrace_bts_free_buffer(child); - } + /* + * Ptrace_detach() races with ptrace_untrace() in case + * the child dies and is reaped by another thread. + * + * We only do the memory accounting at this point and + * leave the buffer deallocation and the bts tracer + * release to ptrace_bts_untrace() which will be called + * later on with tasklist_lock held. + */ + release_locked_buffer(child->bts_buffer, child->bts_size); } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98b8405..3d7fb44d7d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); +extern void release_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd4..2b57f7e60390 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } -- cgit v1.2.3 From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 11 Feb 2009 13:04:36 -0800 Subject: cgroups: fix lockdep subclasses overflow I enabled all cgroup subsystems when compiling kernel, and then: # mount -t cgroup -o net_cls xxx /mnt # mkdir /mnt/0 This showed up immediately: BUG: MAX_LOCKDEP_SUBCLASSES too low! turning off the locking correctness validator. It's caused by the cgroup hierarchy lock: for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) mutex_lock_nested(&ss->hierarchy_mutex, i); } Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but MAX_LOCKDEP_SUBCLASSES is 8. This patch uses different lockdep keys for different subsystems. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e4e8e117d27d..499900d0cee7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -378,6 +378,7 @@ struct cgroup_subsys { * - initiating hotplug events */ struct mutex hierarchy_mutex; + struct lock_class_key subsys_key; /* * Link to parent, and list entry in parent's children. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a54ff42874e..e14db9c089b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) - mutex_lock_nested(&ss->hierarchy_mutex, i); + mutex_lock(&ss->hierarchy_mutex); } } @@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; } -- cgit v1.2.3 From 6c5979631b4b03c9288776562c18036765e398c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Feb 2009 13:04:38 -0800 Subject: syscall define: fix uml compile bug With the new system call defines we get this on uml: arch/um/sys-i386/built-in.o: In function `sys_call_table': (.rodata+0x308): undefined reference to `sys_sigprocmask' Reason for this is that uml passes the preprocessor option -Dsigprocmask=kernel_sigprocmask to gcc when compiling the kernel. This causes SYSCALL_DEFINE3(sigprocmask, ...) to be expanded to SYSCALL_DEFINEx(3, kernel_sigprocmask, ...) and finally to a system call named sys_kernel_sigprocmask. However sys_sigprocmask is missing because of this. To avoid macro expansion for the system call name just concatenate the name at first define instead of carrying it through severel levels. This was pointed out by Al Viro. Signed-off-by: Heiko Carstens Cc: Geert Uytterhoeven Cc: Al Viro Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0eda02ff2414..f9f900cfd066 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -95,13 +95,13 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) -#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) -#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__) -#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__) -#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__) -#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__) -#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) -#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) +#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) #ifdef CONFIG_PPC64 #define SYSCALL_ALIAS(alias, name) \ @@ -121,21 +121,21 @@ struct old_linux_dirent; #define SYSCALL_DEFINE(name) static inline long SYSC_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \ - asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \ + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ { \ __SC_TEST##x(__VA_ARGS__); \ - return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \ + return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ } \ - SYSCALL_ALIAS(sys_##name, SyS_##name); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)) + SYSCALL_ALIAS(sys##name, SyS##name); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)) + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ -- cgit v1.2.3 From 1d93e52eb48df986a3c4d5ad8a520bf1f6837367 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2009 08:47:19 -0700 Subject: dmaengine: update kerneldoc Some of the kerneldoc comments in the dmaengine header describe already removed structure members. Remove them. Also add a short description for dma_device->device_is_tx_complete. Signed-off-by: Johannes Weiner Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e68469c1885..087e79acf8c7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -97,7 +97,6 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** * struct dma_chan_percpu - the per-CPU part of struct dma_chan - * @refcount: local_t used for open-coded "bigref" counting * @memcpy_count: transaction counter * @bytes_transferred: byte counter */ @@ -114,9 +113,6 @@ struct dma_chan_percpu { * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs * @dev: class device for sysfs - * @refcount: kref, used in "bigref" slow-mode - * @slow_ref: indicates that the DMA channel is free - * @rcu: the DMA channel's RCU head * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel @@ -211,8 +207,6 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability - * @refcount: reference count - * @done: IO completion struct * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @device_alloc_chan_resources: allocate resources and return the @@ -225,6 +219,7 @@ struct dma_async_tx_descriptor { * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_terminate_all: terminate all pending operations + * @device_is_tx_complete: poll for transaction completion * @device_issue_pending: push pending transactions to hardware */ struct dma_device { -- cgit v1.2.3 From 7a0eb1960e8ddcb68ea631caf16815485af0e228 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 4 ++++ arch/x86/include/asm/kvm.h | 7 +++++++ include/linux/kvm.h | 10 +++++----- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index 68aa6da807c1..bfa86b6af7cd 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -25,6 +25,10 @@ #include +/* Select x86 specific features in */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index d2e3bf3608af..886c9402ec45 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,13 @@ #include #include +/* Select x86 specific features in */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 5715f1907601..0424326f1679 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -58,10 +58,10 @@ struct kvm_irqchip { __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 +#ifdef __KVM_HAVE_PIT struct kvm_pic_state pic; #endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC struct kvm_ioapic_state ioapic; #endif } chip; @@ -384,16 +384,16 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_ASSIGNMENT 17 #endif #define KVM_CAP_IOMMU 18 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_MSI #define KVM_CAP_DEVICE_MSI 20 #endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif -- cgit v1.2.3 From ad8ba2cd44d4d39fb3fe55d5dcc565b19fc3a7fb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:02 +0800 Subject: KVM: Add kvm_arch_sync_events to sync with asynchronize events kvm_arch_sync_events is introduced to quiet down all other events may happen contemporary with VM destroy process, like IRQ handler and work struct for assigned device. For kvm_arch_sync_events is called at the very beginning of kvm_destroy_vm(), so the state of KVM here is legal and can provide a environment to quiet down other events. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 4 ++++ arch/powerpc/kvm/powerpc.c | 4 ++++ arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 4 ++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 1 + 6 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 4e586f6110aa..28f982045f29 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1337,6 +1337,10 @@ static void kvm_release_vm_pages(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2822c8ccfaaf..5f81256287f5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -125,6 +125,10 @@ static void kvmppc_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index be8497186b96..0d33893e1e89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -212,6 +212,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc17546a2406..b0fc079f1bee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4127,6 +4127,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec49d0be7f52..bf6f703642fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); +void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0b6f2f71271f..68e3f1ec1674 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -891,6 +891,7 @@ static void kvm_destroy_vm(struct kvm *kvm) { struct mm_struct *mm = kvm->mm; + kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); -- cgit v1.2.3 From 5955c7a2cfb6a35429adea5dc480002b15ca8cfc Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Wed, 18 Feb 2009 01:33:34 +0100 Subject: Add support for VT6415 PCIE PATA IDE Host Controller Signed-off-by: Zlatko Calusic Signed-off-by: Linus Torvalds --- drivers/ata/pata_via.c | 4 +++- include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 79a6c9a0b721..ba556d3e6963 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -110,7 +110,8 @@ static const struct via_isa_bridge { { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES}, + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, @@ -593,6 +594,7 @@ static int via_reinit_one(struct pci_dev *pdev) #endif static const struct pci_device_id via[] = { + { PCI_VDEVICE(VIA, 0x0415), }, { PCI_VDEVICE(VIA, 0x0571), }, { PCI_VDEVICE(VIA, 0x0581), }, { PCI_VDEVICE(VIA, 0x1571), }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 52a9fe08451c..918391b4b109 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1312,6 +1312,7 @@ #define PCI_DEVICE_ID_VIA_VT3351 0x0351 #define PCI_DEVICE_ID_VIA_VT3364 0x0364 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 +#define PCI_DEVICE_ID_VIA_6415 0x0415 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 #define PCI_DEVICE_ID_VIA_82C561 0x0561 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571 -- cgit v1.2.3 From 92a0acce186cde8ead56c6915d9479773673ea1a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Feb 2009 21:24:05 -0800 Subject: net: Kill skb_truesize_check(), it only catches false-positives. A long time ago we had bugs, primarily in TCP, where we would modify skb->truesize (for TSO queue collapsing) in ways which would corrupt the socket memory accounting. skb_truesize_check() was added in order to try and catch this error more systematically. However this debugging check has morphed into a Frankenstein of sorts and these days it does nothing other than catch false-positives. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 --------- include/net/sock.h | 1 - net/core/skbuff.c | 8 -------- net/core/sock.c | 1 - 4 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cf2cb50f77d1..9dcf956ad18a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -416,15 +416,6 @@ extern void skb_over_panic(struct sk_buff *skb, int len, void *here); extern void skb_under_panic(struct sk_buff *skb, int len, void *here); -extern void skb_truesize_bug(struct sk_buff *skb); - -static inline void skb_truesize_check(struct sk_buff *skb) -{ - int len = sizeof(struct sk_buff) + skb->len; - - if (unlikely((int)skb->truesize < len)) - skb_truesize_bug(skb); -} extern int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int getfrag(void *from, char *to, int offset, diff --git a/include/net/sock.h b/include/net/sock.h index ce3b5b622683..eefeeaf7fc46 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -860,7 +860,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { - skb_truesize_check(skb); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index da74b844f4ea..c6a6b166f8d6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -143,14 +143,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) BUG(); } -void skb_truesize_bug(struct sk_buff *skb) -{ - WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) " - "len=%u, sizeof(sk_buff)=%Zd\n", - skb->truesize, skb->len, sizeof(struct sk_buff)); -} -EXPORT_SYMBOL(skb_truesize_bug); - /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. diff --git a/net/core/sock.c b/net/core/sock.c index 6f2e1337975d..6e4f14d1ef81 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1137,7 +1137,6 @@ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; - skb_truesize_check(skb); atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(skb->sk, skb->truesize); } -- cgit v1.2.3 From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2009 10:25:40 +0100 Subject: block: fix bad definition of BIO_RW_SYNC We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before 213d9417fec62ef4c3675621b9364a667954d4dd. Signed-off-by: Jens Axboe --- block/blktrace.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/md.c | 4 ++-- include/linux/bio.h | 2 -- include/linux/blktrace_api.h | 1 + include/linux/fs.h | 6 +++--- kernel/power/swap.c | 5 +++-- mm/page_io.c | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e4..7cf9d1ff45a0 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -142,7 +142,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= ddir_act[rw & WRITE]; what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, SYNCIO); what |= MASK_TC_BIT(rw, AHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index a34338567a2a..f14813be4eff 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions, struct dpages old_pages = *dp; if (sync) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3073618269ea..0a225da21272 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNC), + .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4495104f6c9f..03b4cd0a6344 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); bio->bi_bdev = bdev; bio->bi_sector = sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 2aa283ab062b..1b16108a5417 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -171,8 +171,6 @@ struct bio { #define BIO_RW_FAILFAST_TRANSPORT 8 #define BIO_RW_FAILFAST_DRIVER 9 -#define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) - #define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 25379cba2370..6e915878e88c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -15,6 +15,7 @@ enum blktrace_cat { BLK_TC_WRITE = 1 << 1, /* writes */ BLK_TC_BARRIER = 1 << 2, /* barrier */ BLK_TC_SYNC = 1 << 3, /* sync IO */ + BLK_TC_SYNCIO = BLK_TC_SYNC, BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ BLK_TC_REQUEUE = 1 << 5, /* requeueing */ BLK_TC_ISSUE = 1 << 6, /* issue */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 6022f44043f2..67857dc16365 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,10 +87,10 @@ struct inodes_stat_t { #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ -#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) -#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) +#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 6da14358537c..505f319e489c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -60,6 +60,7 @@ static struct block_device *resume_bdev; static int submit(int rw, pgoff_t page_off, struct page *page, struct bio **bio_chain) { + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); @@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_get(bio); if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); wait_on_page_locked(page); if (rw == READ) bio_set_pages_dirty(bio); @@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); } return 0; } diff --git a/mm/page_io.c b/mm/page_io.c index dc6ce0afbded..3023c475e041 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3 From 5ca431f9ae8db8c6edb9c64bebe6d6521077afd6 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Wed, 18 Feb 2009 15:29:23 +0100 Subject: netfilter: nfnetlink_log: fix per-rule qthreshold override In NFLOG the per-rule qthreshold should overrides per-instance only it is set. With current code, the per-rule qthreshold is 1 if not set and it overrides the per-instance qthreshold. This patch modifies the default xt_NFLOG threshold from 1 to 0. Thus a value of 0 means there is no per-rule setting and the instance parameter has to apply. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy --- include/linux/netfilter/xt_NFLOG.h | 2 +- net/netfilter/nfnetlink_log.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_NFLOG.h b/include/linux/netfilter/xt_NFLOG.h index cdcd0ed58f7a..4b36aeb46a10 100644 --- a/include/linux/netfilter/xt_NFLOG.h +++ b/include/linux/netfilter/xt_NFLOG.h @@ -2,7 +2,7 @@ #define _XT_NFLOG_TARGET #define XT_NFLOG_DEFAULT_GROUP 0x1 -#define XT_NFLOG_DEFAULT_THRESHOLD 1 +#define XT_NFLOG_DEFAULT_THRESHOLD 0 #define XT_NFLOG_MASK 0x0 diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fa49dc7fe100..580b837e44dd 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -590,8 +590,10 @@ nfulnl_log_packet(u_int8_t pf, qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ - if (qthreshold > li->u.ulog.qthreshold) - qthreshold = li->u.ulog.qthreshold; + if (li->u.ulog.qthreshold) + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + switch (inst->copy_mode) { case NFULNL_COPY_META: -- cgit v1.2.3 From c296861291669f305deef19b78042330d7135017 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 18 Feb 2009 14:48:12 -0800 Subject: vmalloc: add __get_vm_area_caller() We have get_vm_area_caller() and __get_vm_area() but not __get_vm_area_caller() On powerpc, I use __get_vm_area() to separate the ranges of addresses given to vmalloc vs. ioremap (various good reasons for that) so in order to be able to implement the new caller tracking in /proc/vmallocinfo, I need a "_caller" variant of it. (akpm: needed for ongoing powerpc development, so merge it early) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Benjamin Herrenschmidt Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a986..9c0890c7a06a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -84,6 +84,10 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct *__get_vm_area_caller(unsigned long size, + unsigned long flags, + unsigned long start, unsigned long end, + void *caller); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8c..4dd2636d0b92 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1106,6 +1106,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, } EXPORT_SYMBOL_GPL(__get_vm_area); +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + void *caller) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + caller); +} + /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area -- cgit v1.2.3 From 55ec82176eca52e4e0530a82a0eb59160a1a95a1 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 18 Feb 2009 14:48:15 -0800 Subject: vfs: separate FMODE_PREAD/FMODE_PWRITE into separate flags Separate FMODE_PREAD and FMODE_PWRITE into separate flags to reflect the reality that the read and write paths may have independent restrictions. A git grep verifies that these flags are always cleared together so this new behavior will only apply to interfaces that change to clear flags individually. This is required for "seq_file: properly cope with pread", a post-2.6.25 regression fix. [akpm@linux-foundation.org: add comment] Signed-off-by: Paul Turner Cc: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6022f44043f2..5852bd6afbe4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -54,24 +54,30 @@ struct inodes_stat_t { #define MAY_ACCESS 16 #define MAY_OPEN 32 +/* + * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond + * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() + */ + /* file is open for reading */ #define FMODE_READ ((__force fmode_t)1) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)2) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)4) -/* file can be accessed using pread/pwrite */ +/* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)8) -#define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +/* file can be accessed using pwrite */ +#define FMODE_PWRITE ((__force fmode_t)16) /* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)16) +#define FMODE_EXEC ((__force fmode_t)32) /* File is opened with O_NDELAY (only set for block devices) */ -#define FMODE_NDELAY ((__force fmode_t)32) +#define FMODE_NDELAY ((__force fmode_t)64) /* File is opened with O_EXCL (only set for block devices) */ -#define FMODE_EXCL ((__force fmode_t)64) +#define FMODE_EXCL ((__force fmode_t)128) /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ -#define FMODE_WRITE_IOCTL ((__force fmode_t)128) +#define FMODE_WRITE_IOCTL ((__force fmode_t)256) /* * Don't update ctime and mtime. -- cgit v1.2.3 From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001 From: Eric Biederman Date: Wed, 18 Feb 2009 14:48:16 -0800 Subject: seq_file: properly cope with pread Currently seq_read assumes that the offset passed to it is always the offset it passed to user space. In the case pread this assumption is broken and we do the wrong thing when presented with pread. To solve this I introduce an offset cache inside of struct seq_file so we know where our logical file position is. Then in seq_read if we try to read from another offset we reset our data structures and attempt to go to the offset user space wanted. [akpm@linux-foundation.org: restore FMODE_PWRITE] [pjt@google.com: seq_open needs its fmode opened up to take advantage of this] Signed-off-by: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: Paul Turner Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index 5267098532bf..a1a4cfe19210 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op) */ file->f_version = 0; - /* SEQ files support lseek, but not pread/pwrite */ - file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); @@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + /* * seq_file->op->..m_start/m_stop/m_next may do special actions * or optimisations based on the file->f_version, so we want to @@ -230,8 +254,10 @@ Fill: Done: if (!copied) copied = err; - else + else { *ppos += copied; + m->read_pos += copied; + } file->f_version = m->version; mutex_unlock(&m->lock); return copied; @@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (offset < 0) break; retval = offset; - if (offset != file->f_pos) { + if (offset != m->read_pos) { while ((retval=traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->read_pos = 0; m->version = 0; m->index = 0; m->count = 0; } else { + m->read_pos = offset; retval = file->f_pos = offset; } } diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 40ea5058c2ec..f616f31576d7 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -19,6 +19,7 @@ struct seq_file { size_t from; size_t count; loff_t index; + loff_t read_pos; u64 version; struct mutex lock; const struct seq_operations *op; -- cgit v1.2.3 From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: timerfd: add flags check As requested by Michael, add a missing check for valid flags in timerfd_settime(), and make it return EINVAL in case some extra bits are set. Michael said: If this is to be any use to userland apps that want to check flag support (perhaps it is too late already), then the sooner we get it into the kernel the better: 2.6.29 would be good; earlier stables as well would be even better. [akpm@linux-foundation.org: remove unused TFD_FLAGS_SET] Acked-by: Michael Kerrisk Signed-off-by: Davide Libenzi Cc: [2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 12 ++++++------ include/linux/timerfd.h | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a123b8ff3f5..b042bd7034b1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) - return -EINVAL; - if (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME) + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) kfree(ctx); @@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; - if (!timespec_valid(&ktmr.it_value) || + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index 86cb0501d3e2..2d0792983f8c 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -11,13 +11,21 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for timerfd_settime. */ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + */ #define TFD_TIMER_ABSTIME (1 << 0) - -/* Flags for timerfd_create. */ #define TFD_CLOEXEC O_CLOEXEC #define TFD_NONBLOCK O_NONBLOCK +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. */ +#define TFD_SETTIME_FLAGS TFD_TIMER_ABSTIME #endif /* _LINUX_TIMERFD_H */ - -- cgit v1.2.3 From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: mm: task dirty accounting fix YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty). Additionally, there is some inconsistency about when task_dirty_inc is called. It is used for dirty balancing, however it even gets called for __set_page_dirty_no_writeback. So rather than increment it in a set_page_dirty wrapper, move it down to exactly where the dirty page accounting stats are incremented. Cc: YAMAMOTO Takashi Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + include/linux/mm.h | 1 + mm/page-writeback.c | 13 +++---------- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25bc..ff4d1cdd779b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7dc04ff5ab89..10074212a35b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1159,6 +1159,7 @@ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); +void task_dirty_inc(struct task_struct *tsk); /* readahead.c */ #define VM_MAX_READAHEAD 128 /* kbytes */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c84128596ba..74dc57c74349 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -1230,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -1262,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1280,14 +1281,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /* -- cgit v1.2.3 From 287d859222e0adbc67666a6154aaf42d7d5bbb54 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 Feb 2009 14:48:26 -0800 Subject: atmel-mci: fix initialization of dma slave data The conversion of atmel-mci to dma_request_channel missed the initialization of the channel dma_slave information. The filter_fn passed to dma_request_channel is responsible for initializing the channel's private data. This implementation has the additional benefit of enabling a generic client-channel data passing mechanism. Reviewed-by: Atsushi Nemoto Signed-off-by: Dan Williams Acked-by: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dma/dmaengine.c | 2 ++ drivers/dma/dw_dmac.c | 5 ++--- drivers/dma/dw_dmac_regs.h | 2 -- drivers/mmc/host/atmel-mci.c | 5 +++-- include/linux/dmaengine.h | 2 ++ 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a58993011edb..280a9d263eb3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -518,6 +518,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + chan->private = NULL; chan = NULL; } } @@ -536,6 +537,7 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + chan->private = NULL; mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 6b702cc46b3d..a97c07eef7ec 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -560,7 +560,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned long flags) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); - struct dw_dma_slave *dws = dwc->dws; + struct dw_dma_slave *dws = chan->private; struct dw_desc *prev; struct dw_desc *first; u32 ctllo; @@ -790,7 +790,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) cfghi = DWC_CFGH_FIFO_MODE; cfglo = 0; - dws = dwc->dws; + dws = chan->private; if (dws) { /* * We need controller-specific data to set up slave @@ -866,7 +866,6 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_lock_bh(&dwc->lock); list_splice_init(&dwc->free_list, &list); dwc->descs_allocated = 0; - dwc->dws = NULL; /* Disable interrupts */ channel_clear_bit(dw, MASK.XFER, dwc->mask); diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index 00fdd187bb0c..b252b202c5cf 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -139,8 +139,6 @@ struct dw_dma_chan { struct list_head queue; struct list_head free_list; - struct dw_dma_slave *dws; - unsigned int descs_allocated; }; diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 76bfe16c09b1..2b1196e6142c 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1548,9 +1548,10 @@ static bool filter(struct dma_chan *chan, void *slave) { struct dw_dma_slave *dws = slave; - if (dws->dma_dev == chan->device->dev) + if (dws->dma_dev == chan->device->dev) { + chan->private = dws; return true; - else + } else return false; } #endif diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e68469c1885..f0413845f20e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -121,6 +121,7 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @private: private data for certain client-channel associations */ struct dma_chan { struct dma_device *device; @@ -134,6 +135,7 @@ struct dma_chan { struct dma_chan_percpu *local; int client_count; int table_count; + void *private; }; /** -- cgit v1.2.3 From f2dbcfa738368c8a40d4a5f0b65dc9879577cb21 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:32 -0800 Subject: mm: clean up for early_pfn_to_nid() What's happening is that the assertion in mm/page_alloc.c:move_freepages() is triggering: BUG_ON(page_zone(start_page) != page_zone(end_page)); Once I knew this is what was happening, I added some annotations: if (unlikely(page_zone(start_page) != page_zone(end_page))) { printk(KERN_ERR "move_freepages: Bogus zones: " "start_page[%p] end_page[%p] zone[%p]\n", start_page, end_page, zone); printk(KERN_ERR "move_freepages: " "start_zone[%p] end_zone[%p]\n", page_zone(start_page), page_zone(end_page)); printk(KERN_ERR "move_freepages: " "start_pfn[0x%lx] end_pfn[0x%lx]\n", page_to_pfn(start_page), page_to_pfn(end_page)); printk(KERN_ERR "move_freepages: " "start_nid[%d] end_nid[%d]\n", page_to_nid(start_page), page_to_nid(end_page)); ... And here's what I got: move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00] move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00] move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff] move_freepages: start_nid[1] end_nid[0] My memory layout on this box is: [ 0.000000] Zone PFN ranges: [ 0.000000] Normal 0x00000000 -> 0x0081ff5d [ 0.000000] Movable zone start PFN for each node [ 0.000000] early_node_map[8] active PFN ranges [ 0.000000] 0: 0x00000000 -> 0x00020000 [ 0.000000] 1: 0x00800000 -> 0x0081f7ff [ 0.000000] 1: 0x0081f800 -> 0x0081fe50 [ 0.000000] 1: 0x0081fed1 -> 0x0081fed8 [ 0.000000] 1: 0x0081feda -> 0x0081fedb [ 0.000000] 1: 0x0081fedd -> 0x0081fee5 [ 0.000000] 1: 0x0081fee7 -> 0x0081ff51 [ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d So it's a block move in that 0x81f600-->0x81f7ff region which triggers the problem. This patch: Declaration of early_pfn_to_nid() is scattered over per-arch include files, and it seems it's complicated to know when the declaration is used. I think it makes fix-for-memmap-init not easy. This patch moves all declaration to include/linux/mm.h After this, if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use static definition in include/linux/mm.h else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use generic definition in mm/page_alloc.c else -> per-arch back end function will be called. Signed-off-by: KAMEZAWA Hiroyuki Tested-by: KOSAKI Motohiro Reported-by: David Miller Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/mmzone.h | 4 ---- arch/ia64/mm/numa.c | 2 +- arch/x86/include/asm/mmzone_32.h | 2 -- arch/x86/include/asm/mmzone_64.h | 2 -- arch/x86/mm/numa_64.c | 2 +- include/linux/mm.h | 19 ++++++++++++++++--- mm/page_alloc.c | 8 +++++++- 7 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/mmzone.h b/arch/ia64/include/asm/mmzone.h index 34efe88eb849..f2ca32069b3f 100644 --- a/arch/ia64/include/asm/mmzone.h +++ b/arch/ia64/include/asm/mmzone.h @@ -31,10 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -extern int early_pfn_to_nid(unsigned long pfn); -#endif - #ifdef CONFIG_IA64_DIG /* DIG systems are small */ # define MAX_PHYSNODE_ID 8 # define NR_NODE_MEMBLKS (MAX_NUMNODES * 8) diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index b73bf1838e57..5061c3fb6796 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,7 +58,7 @@ paddr_to_nid(unsigned long paddr) * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where * the section resides. */ -int early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 07f1af494ca5..105fb90a0635 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -32,8 +32,6 @@ static inline void get_memcfg_numa(void) get_memcfg_numa_flat(); } -extern int early_pfn_to_nid(unsigned long pfn); - extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a5b3817d4b9e..a29f48c2a322 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -40,8 +40,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) -extern int early_pfn_to_nid(unsigned long pfn); - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f89f89e..f3516da035d1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -145,7 +145,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes, return shift; } -int early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { return phys_to_nid(pfn << PAGE_SHIFT); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 10074212a35b..065cdf8c09fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1041,10 +1041,23 @@ extern void free_bootmem_with_active_regions(int nid, typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -extern int early_pfn_to_nid(unsigned long pfn); -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ + !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) +static inline int __early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} +#else +/* please see mm/page_alloc.c */ +extern int __meminit early_pfn_to_nid(unsigned long pfn); +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* there is a per-arch backend function. */ +extern int __meminit __early_pfn_to_nid(unsigned long pfn); +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b3073854..c5dd74602efc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __meminit early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i; @@ -3005,6 +3005,12 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + return __early_pfn_to_nid(pfn); +} + + /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ for (i = first_active_region_index_in_nid(nid); i != -1; \ -- cgit v1.2.3 From cc2559bccc72767cb446f79b071d96c30c26439b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:33 -0800 Subject: mm: fix memmap init for handling memory hole Now, early_pfn_in_nid(PFN, NID) may returns false if PFN is a hole. and memmap initialization was not done. This was a trouble for sparc boot. To fix this, the PFN should be initialized and marked as PG_reserved. This patch changes early_pfn_in_nid() return true if PFN is a hole. Signed-off-by: KAMEZAWA Hiroyuki Reported-by: David Miller Tested-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/numa.c | 2 +- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 5061c3fb6796..3efea7d0a351 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -70,7 +70,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) return node_memblk[i].nid; } - return 0; + return -1; } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 09c14e213b63..1aca6cebbb78 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1071,7 +1071,7 @@ void sparse_init(void); #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES -#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +bool early_pfn_in_nid(unsigned long pfn, int nid); #else #define early_pfn_in_nid(pfn, nid) (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5dd74602efc..5c44ed49ca93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3000,16 +3000,33 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (start_pfn <= pfn && pfn < end_pfn) return early_node_map[i].nid; } - - return 0; + /* This is a memory hole */ + return -1; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { - return __early_pfn_to_nid(pfn); + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; } +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ -- cgit v1.2.3 From ffa7525c13eb3db0fd19a3e1cffe2ce6f561f5f3 Mon Sep 17 00:00:00 2001 From: Adam Lackorzynski Date: Wed, 18 Feb 2009 14:48:34 -0800 Subject: jsm: additional device support I have a Digi Neo 8 PCI card (114f:00b1) Serial controller: Digi International Digi Neo 8 (rev 05) that works with the jsm driver after using the following patch. Signed-off-by: Adam Lackorzynski Cc: Scott H Kilau Cc: Wendy Xiong Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/jsm/jsm_driver.c | 3 +++ include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/jsm/jsm_driver.c b/drivers/serial/jsm/jsm_driver.c index 92187e28608a..ac79cbe4c2cf 100644 --- a/drivers/serial/jsm/jsm_driver.c +++ b/drivers/serial/jsm/jsm_driver.c @@ -84,6 +84,8 @@ static int jsm_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) brd->pci_dev = pdev; if (pdev->device == PCIE_DEVICE_ID_NEO_4_IBM) brd->maxports = 4; + else if (pdev->device == PCI_DEVICE_ID_DIGI_NEO_8) + brd->maxports = 8; else brd->maxports = 2; @@ -212,6 +214,7 @@ static struct pci_device_id jsm_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45), 0, 0, 2 }, { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45PRI), 0, 0, 3 }, { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_4_IBM), 0, 0, 4 }, + { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_DIGI_NEO_8), 0, 0, 5 }, { 0, } }; MODULE_DEVICE_TABLE(pci, jsm_pci_tbl); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 918391b4b109..114b8192eab9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1445,6 +1445,7 @@ #define PCI_DEVICE_ID_DIGI_DF_M_E 0x0071 #define PCI_DEVICE_ID_DIGI_DF_M_IOM2_A 0x0072 #define PCI_DEVICE_ID_DIGI_DF_M_A 0x0073 +#define PCI_DEVICE_ID_DIGI_NEO_8 0x00B1 #define PCI_DEVICE_ID_NEO_2DB9 0x00C8 #define PCI_DEVICE_ID_NEO_2DB9PRI 0x00C9 #define PCI_DEVICE_ID_NEO_2RJ45 0x00CA -- cgit v1.2.3 From 97bef7dd05563807539122c488a5dd93ed327722 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Wed, 18 Feb 2009 14:48:40 -0800 Subject: Bernhard has moved Since I don't work for SUSE any more and the bwalle@suse.de address is invalid, correct it in the copyright headers and documentation. Signed-off-by: Bernhard Walle Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-firmware-memmap | 2 +- drivers/firmware/memmap.c | 2 +- include/linux/firmware-map.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap index 0d99ee6ae02e..eca0d65087dc 100644 --- a/Documentation/ABI/testing/sysfs-firmware-memmap +++ b/Documentation/ABI/testing/sysfs-firmware-memmap @@ -1,6 +1,6 @@ What: /sys/firmware/memmap/ Date: June 2008 -Contact: Bernhard Walle +Contact: Bernhard Walle Description: On all platforms, the firmware provides a memory map which the kernel reads. The resources from that memory map are registered diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 261b9aa3f248..05aa2d406ac6 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -1,7 +1,7 @@ /* * linux/drivers/firmware/memmap.c * Copyright (C) 2008 SUSE LINUX Products GmbH - * by Bernhard Walle + * by Bernhard Walle * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License v2.0 as published by diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index 6e199c8dfacc..cca686b39123 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -1,7 +1,7 @@ /* * include/linux/firmware-map.h: * Copyright (C) 2008 SUSE LINUX Products GmbH - * by Bernhard Walle + * by Bernhard Walle * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License v2.0 as published by -- cgit v1.2.3 From e4dd61882e2cfe47ea72ecd825671e8e5ae29038 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 18 Feb 2009 23:31:11 -0800 Subject: vlan: Update skb->mac_header in __vlan_put_tag(). After moving mac addresses in __vlan_put_tag() skb->mac_header needs to be updated. Reported-by: Karl Hiramoto Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index f8ff918c208f..e1ff5b14310e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -210,6 +210,7 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci) /* Move the mac addresses to the beginning of the new header. */ memmove(skb->data, skb->data + VLAN_HLEN, 2 * VLAN_ETH_ALEN); + skb->mac_header -= VLAN_HLEN; /* first, the ethernet type */ veth->h_vlan_proto = htons(ETH_P_8021Q); -- cgit v1.2.3 From 3ef0e5ba467366125f04b423f4638baca54a4fc1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 20 Feb 2009 15:38:41 -0800 Subject: slab: introduce kzfree() kzfree() is a wrapper for kfree() that additionally zeroes the underlying memory before releasing it to the slab allocator. Currently there is code which memset()s the memory region of an object before releasing it back to the slab allocator to make sure security-sensitive data are really zeroed out after use. These callsites can then just use kzfree() which saves some code, makes users greppable and allows for a stupid destructor that isn't necessarily aware of the actual object size. Signed-off-by: Johannes Weiner Reviewed-by: Pekka Enberg Cc: Matt Mackall Acked-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 + mm/util.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index f96d13c281e8..24c5602bee99 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -127,6 +127,7 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); +void kzfree(const void *); size_t ksize(const void *); /* diff --git a/mm/util.c b/mm/util.c index cb00b748ce47..37eaccdf3054 100644 --- a/mm/util.c +++ b/mm/util.c @@ -129,6 +129,26 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc); +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate -- cgit v1.2.3 From 01b24fee285b098c74318a8be801ef3711ec18d7 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Fri, 20 Feb 2009 15:38:49 -0800 Subject: spi_bitbang: add more lowlevel function documentation This adds more documentation of the lowlevel API to avoid future bugs. Signed-off-by: Michael Buesch Acked-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/spi/spi_bitbang.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h index bf8de281b4ed..eed4254bd503 100644 --- a/include/linux/spi/spi_bitbang.h +++ b/include/linux/spi/spi_bitbang.h @@ -83,6 +83,13 @@ extern int spi_bitbang_stop(struct spi_bitbang *spi); * int getmiso(struct spi_device *); * void spidelay(unsigned); * + * setsck()'s is_on parameter is a zero/nonzero boolean. + * + * setmosi()'s is_on parameter is a zero/nonzero boolean. + * + * getmiso() is required to return 0 or 1 only. Any other value is invalid + * and will result in improper operation. + * * A non-inlined routine would call bitbang_txrx_*() routines. The * main loop could easily compile down to a handful of instructions, * especially if the delay is a NOP (to run at peak speed). -- cgit v1.2.3 From b6adea334c6c89d5e6c94f9196bbf3a279cb53bd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 20 Feb 2009 15:38:52 -0800 Subject: 8250: fix boot hang with serial console when using with Serial Over Lan port Intel 8257x Ethernet boards have a feature called Serial Over Lan. This feature works by emulating a serial port, and it is detected by kernel as a normal 8250 port. However, this emulation is not perfect, as also noticed on changeset 7500b1f602aad75901774a67a687ee985d85893f. Before this patch, the kernel were trying to check if the serial TX is capable of work using IRQ's. This were done with a code similar this: serial_outp(up, UART_IER, UART_IER_THRI); lsr = serial_in(up, UART_LSR); iir = serial_in(up, UART_IIR); serial_outp(up, UART_IER, 0); if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) up->bugs |= UART_BUG_TXEN; This works fine for other 8250 ports, but, on 8250-emulated SoL port, the chip is a little lazy to down UART_IIR_NO_INT at UART_IIR register. Due to that, UART_BUG_TXEN is sometimes enabled. However, as TX IRQ keeps working, and the TX polling is now enabled, the driver miss-interprets the IRQ received later, hanging up the machine until a key is pressed at the serial console. This is the 6 version of this patch. Previous versions were trying to introduce a large enough delay between serial_outp and serial_in(up, UART_IIR), but not taking forever. However, the needed delay couldn't be safely determined. At the experimental tests, a delay of 1us solves most of the cases, but still hangs sometimes. Increasing the delay to 5us was better, but still doesn't solve. A very high delay of 50 ms seemed to work every time. However, poking around with delays and pray for it to be enough doesn't seem to be a good approach, even for a quirk. So, instead of playing with random large arbitrary delays, let's just disable UART_BUG_TXEN for all SoL ports. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Mauro Carvalho Chehab Cc: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/8250.c | 15 +++++++++++++++ drivers/serial/8250_pci.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 3 +++ include/linux/serial_core.h | 1 + 4 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 0d934bfbdd9b..b4b39811b445 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -2083,6 +2083,20 @@ static int serial8250_startup(struct uart_port *port) serial8250_set_mctrl(&up->port, up->port.mctrl); + /* Serial over Lan (SoL) hack: + Intel 8257x Gigabit ethernet chips have a + 16550 emulation, to be used for Serial Over Lan. + Those chips take a longer time than a normal + serial device to signalize that a transmission + data was queued. Due to that, the above test generally + fails. One solution would be to delay the reading of + iir. However, this is not reliable, since the timeout + is variable. So, let's just don't test if we receive + TX irq. This way, we'll never enable UART_BUG_TXEN. + */ + if (up->port.flags & UPF_NO_TXEN_TEST) + goto dont_test_tx_en; + /* * Do a quick test to see if we receive an * interrupt when we enable the TX irq. @@ -2102,6 +2116,7 @@ static int serial8250_startup(struct uart_port *port) up->bugs &= ~UART_BUG_TXEN; } +dont_test_tx_en: spin_unlock_irqrestore(&up->port.lock, flags); /* diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 536d8e510f66..533f82025adf 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -798,6 +798,21 @@ pci_default_setup(struct serial_private *priv, return setup_port(priv, port, bar, offset, board->reg_shift); } +static int skip_tx_en_setup(struct serial_private *priv, + const struct pciserial_board *board, + struct uart_port *port, int idx) +{ + port->flags |= UPF_NO_TXEN_TEST; + printk(KERN_DEBUG "serial8250: skipping TxEn test for device " + "[%04x:%04x] subsystem [%04x:%04x]\n", + priv->dev->vendor, + priv->dev->device, + priv->dev->subsystem_vendor, + priv->dev->subsystem_device); + + return pci_default_setup(priv, board, port, idx); +} + /* This should be in linux/pci_ids.h */ #define PCI_VENDOR_ID_SBSMODULARIO 0x124B #define PCI_SUBVENDOR_ID_SBSMODULARIO 0x124B @@ -864,6 +879,27 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = { .init = pci_inteli960ni_init, .setup = pci_default_setup, }, + { + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_8257X_SOL, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .setup = skip_tx_en_setup, + }, + { + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_82573L_SOL, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .setup = skip_tx_en_setup, + }, + { + .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_82573E_SOL, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .setup = skip_tx_en_setup, + }, /* * ITE */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 114b8192eab9..aca8c458aa8a 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2323,6 +2323,9 @@ #define PCI_DEVICE_ID_INTEL_82378 0x0484 #define PCI_DEVICE_ID_INTEL_I960 0x0960 #define PCI_DEVICE_ID_INTEL_I960RM 0x0962 +#define PCI_DEVICE_ID_INTEL_8257X_SOL 0x1062 +#define PCI_DEVICE_ID_INTEL_82573E_SOL 0x1085 +#define PCI_DEVICE_ID_INTEL_82573L_SOL 0x108F #define PCI_DEVICE_ID_INTEL_82815_MC 0x1130 #define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132 #define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221 diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 90bbbf0b1161..df9245c7bd3b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -296,6 +296,7 @@ struct uart_port { #define UPF_HARDPPS_CD ((__force upf_t) (1 << 11)) #define UPF_LOW_LATENCY ((__force upf_t) (1 << 13)) #define UPF_BUGGY_UART ((__force upf_t) (1 << 14)) +#define UPF_NO_TXEN_TEST ((__force upf_t) (1 << 15)) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) (1 << 16)) #define UPF_CONS_FLOW ((__force upf_t) (1 << 23)) #define UPF_SHARE_IRQ ((__force upf_t) (1 << 24)) -- cgit v1.2.3 From 216773a787c3c46ef26bf1742c1fdba37d26be45 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 14 Feb 2009 01:59:06 +0100 Subject: Consolidate driver_probe_done() loops into one place there's a few places that currently loop over driver_probe_done(), and I'm about to add another one. This patch abstracts it into a helper to reduce duplication. Signed-off-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki Cc: Len Brown Acked-by: Greg KH Signed-off-by: Linus Torvalds --- drivers/base/dd.c | 17 +++++++++++++++++ include/linux/device.h | 2 ++ init/do_mounts.c | 13 +++++++++---- init/do_mounts_md.c | 5 +++-- 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 315bed8d5e7f..135231239103 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -18,9 +18,11 @@ */ #include +#include #include #include #include +#include #include "base.h" #include "power/power.h" @@ -167,6 +169,21 @@ int driver_probe_done(void) return 0; } +/** + * wait_for_device_probe + * Wait for device probing to be completed. + * + * Note: this function polls at 100 msec intervals. + */ +int wait_for_device_probe(void) +{ + /* wait for the known devices to complete their probing */ + while (driver_probe_done() != 0) + msleep(100); + async_synchronize_full(); + return 0; +} + /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to diff --git a/include/linux/device.h b/include/linux/device.h index 45e5b1921fbb..47f343c7bdda 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -147,6 +147,8 @@ extern void put_driver(struct device_driver *drv); extern struct device_driver *driver_find(const char *name, struct bus_type *bus); extern int driver_probe_done(void); +extern int wait_for_device_probe(void); + /* sysfs interface for exporting driver attributes */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 708105e163df..8d4ff5afc1d8 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -370,10 +370,14 @@ void __init prepare_namespace(void) ssleep(root_delay); } - /* wait for the known devices to complete their probing */ - while (driver_probe_done() != 0) - msleep(100); - async_synchronize_full(); + /* + * wait for the known devices to complete their probing + * + * Note: this is a potential source of long boot delays. + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. + */ + wait_for_device_probe(); md_run_setup(); @@ -399,6 +403,7 @@ void __init prepare_namespace(void) while (driver_probe_done() != 0 || (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) msleep(100); + async_synchronize_full(); } is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index ff95e3192884..9bdddbcb3d6a 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -281,8 +281,9 @@ static void __init autodetect_raid(void) */ printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); - while (driver_probe_done() < 0) - msleep(100); + + wait_for_device_probe(); + fd = sys_open("/dev/md0", 0, 0); if (fd >= 0) { sys_ioctl(fd, RAID_AUTORUN, raid_autopart); -- cgit v1.2.3 From 770824bdc421ff58a64db608294323571c949f4c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 22 Feb 2009 18:38:50 +0100 Subject: PM: Split up sysdev_[suspend|resume] from device_power_[down|up] Move the sysdev_suspend/resume from the callee to the callers, with no real change in semantics, so that we can rework the disabling of interrupts during suspend/hibernation. This is based on an earlier patch from Linus. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/apm_32.c | 4 ++++ drivers/base/base.h | 2 -- drivers/base/power/main.c | 3 --- drivers/xen/manage.c | 8 ++++++++ include/linux/pm.h | 2 ++ kernel/kexec.c | 7 +++++++ kernel/power/disk.c | 11 +++++++++++ kernel/power/main.c | 8 ++++++-- 8 files changed, 38 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 98807bb095ad..266ec6c18b6c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1192,6 +1192,7 @@ static int suspend(int vetoable) device_suspend(PMSG_SUSPEND); local_irq_disable(); device_power_down(PMSG_SUSPEND); + sysdev_suspend(PMSG_SUSPEND); local_irq_enable(); @@ -1208,6 +1209,7 @@ static int suspend(int vetoable) if (err != APM_SUCCESS) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; + sysdev_resume(); device_power_up(PMSG_RESUME); local_irq_enable(); device_resume(PMSG_RESUME); @@ -1228,6 +1230,7 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); + sysdev_suspend(PMSG_SUSPEND); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1235,6 +1238,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); + sysdev_resume(); device_power_up(PMSG_RESUME); local_irq_enable(); } diff --git a/drivers/base/base.h b/drivers/base/base.h index 0a5f055dffba..9f50f1b545dc 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -88,8 +88,6 @@ extern void driver_detach(struct device_driver *drv); extern int driver_probe_device(struct device_driver *drv, struct device *dev); extern void sysdev_shutdown(void); -extern int sysdev_suspend(pm_message_t state); -extern int sysdev_resume(void); extern char *make_class_name(const char *name, struct kobject *kobj); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 670c9d6c1407..2d14f4ae6c01 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -333,7 +333,6 @@ static void dpm_power_up(pm_message_t state) */ void device_power_up(pm_message_t state) { - sysdev_resume(); dpm_power_up(state); } EXPORT_SYMBOL_GPL(device_power_up); @@ -577,8 +576,6 @@ int device_power_down(pm_message_t state) } dev->power.status = DPM_OFF_IRQ; } - if (!error) - error = sysdev_suspend(state); if (error) dpm_power_up(resume_event(state)); return error; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 9b91617b9582..56892a142ee2 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -45,6 +45,13 @@ static int xen_suspend(void *data) err); return err; } + err = sysdev_suspend(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", + err); + device_power_up(PMSG_RESUME); + return err; + } xen_mm_pin_all(); gnttab_suspend(); @@ -61,6 +68,7 @@ static int xen_suspend(void *data) gnttab_resume(); xen_mm_unpin_all(); + sysdev_resume(); device_power_up(PMSG_RESUME); if (!*cancelled) { diff --git a/include/linux/pm.h b/include/linux/pm.h index de2e0a8f6728..24ba5f67b3a3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -381,10 +381,12 @@ struct dev_pm_info { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); +extern int sysdev_resume(void); extern void device_power_up(pm_message_t state); extern void device_resume(pm_message_t state); extern void device_pm_unlock(void); +extern int sysdev_suspend(pm_message_t state); extern int device_power_down(pm_message_t state); extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/kexec.c b/kernel/kexec.c index 8a6d7b08864e..483899578259 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1465,6 +1465,11 @@ int kernel_kexec(void) error = device_power_down(PMSG_FREEZE); if (error) goto Enable_irqs; + + /* Suspend system devices */ + error = sysdev_suspend(PMSG_FREEZE); + if (error) + goto Power_up_devices; } else #endif { @@ -1477,6 +1482,8 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + sysdev_resume(); + Power_up_devices: device_power_up(PMSG_RESTORE); Enable_irqs: local_irq_enable(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 7b40e94b1d42..4a4a206b1979 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -227,6 +227,12 @@ static int create_image(int platform_mode) "aborting hibernation\n"); goto Enable_irqs; } + sysdev_suspend(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + goto Power_up_devices; + } if (hibernation_test(TEST_CORE)) goto Power_up; @@ -242,9 +248,11 @@ static int create_image(int platform_mode) if (!in_suspend) platform_leave(platform_mode); Power_up: + sysdev_resume(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ + Power_up_devices: device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: @@ -335,6 +343,7 @@ static int resume_target_kernel(void) "aborting resume\n"); goto Enable_irqs; } + sysdev_suspend(PMSG_QUIESCE); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = restore_highmem(); @@ -357,6 +366,7 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); + sysdev_resume(); device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); @@ -440,6 +450,7 @@ int hibernation_platform_enter(void) local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { + sysdev_suspend(PMSG_HIBERNATE); hibernation_ops->enter(); /* We should never get here */ while (1); diff --git a/kernel/power/main.c b/kernel/power/main.c index b4d219016b6c..c9632f841f64 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state) goto Done; } - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); + error = sysdev_suspend(PMSG_SUSPEND); + if (!error) { + if (!suspend_test(TEST_CORE)) + error = suspend_ops->enter(state); + sysdev_resume(); + } device_power_up(PMSG_RESUME); Done: -- cgit v1.2.3 From cd97f39b7cdf1c8a9c9f52865eec795b7f0c811d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 24 Feb 2009 19:19:49 +0100 Subject: i2c-dev: Clarify the unit of ioctl I2C_TIMEOUT The unit in which user-space can set the bus timeout value is jiffies for historical reasons (back when HZ was always 100.) This is however not good because user-space doesn't know how long a jiffy lasts. The timeout value should instead be set in a fixed time unit. Given the original value of HZ, this unit should be 10 ms, for compatibility. Signed-off-by: Jean Delvare Acked-by: Wolfram Sang --- drivers/i2c/i2c-dev.c | 6 +++++- include/linux/i2c-dev.h | 2 +- include/linux/i2c.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index c171988a9f51..7e13d2df9af3 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -35,6 +35,7 @@ #include #include #include +#include #include static struct i2c_driver i2cdev_driver; @@ -422,7 +423,10 @@ static long i2cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) client->adapter->retries = arg; break; case I2C_TIMEOUT: - client->adapter->timeout = arg; + /* For historical reasons, user-space sets the timeout + * value in units of 10 ms. + */ + client->adapter->timeout = msecs_to_jiffies(arg * 10); break; default: /* NOTE: returning a fault code here could cause trouble diff --git a/include/linux/i2c-dev.h b/include/linux/i2c-dev.h index 311315b56b61..fd53bfd26470 100644 --- a/include/linux/i2c-dev.h +++ b/include/linux/i2c-dev.h @@ -33,7 +33,7 @@ */ #define I2C_RETRIES 0x0701 /* number of times a device address should be polled when not acknowledging */ -#define I2C_TIMEOUT 0x0702 /* set timeout in jiffies - call with int */ +#define I2C_TIMEOUT 0x0702 /* set timeout in units of 10 ms */ /* NOTE: Slave address is 7 or 10 bits, but 10-bit addresses * are NOT supported! (due to code brokenness) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index fcfbfea3af72..c86c3b07604c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -361,7 +361,7 @@ struct i2c_adapter { struct mutex bus_lock; struct mutex clist_lock; - int timeout; + int timeout; /* in jiffies */ int retries; struct device dev; /* the adapter device */ -- cgit v1.2.3 From 4ab0d47d0ab311eb181532c1ecb6d02905685071 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:12 -0800 Subject: gpu/drm, x86, PAT: io_mapping_create_wc and resource_size_t io_mapping_create_wc should take a resource_size_t parameter in place of unsigned long. With unsigned long, there will be no way to map greater than 4GB address in i386/32 bit. On x86, greater than 4GB addresses cannot be mapped on i386 without PAE. Return error for such a case. Patch also adds a structure for io_mapping, that saves the base, size and type on HAVE_ATOMIC_IOMAP archs, that can be used to verify the offset on io_mapping_map calls. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 3 +++ arch/x86/mm/iomap_32.c | 18 +++++++++++++++++ include/linux/io-mapping.h | 46 +++++++++++++++++++++++++++++++++----------- 3 files changed, 56 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c1f06289b14b..86af26091d6c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -23,6 +23,9 @@ #include #include +int +is_io_mapping_possible(resource_size_t base, unsigned long size); + void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index ca53224fc56c..6c2b1af16926 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -20,6 +20,24 @@ #include #include +#ifdef CONFIG_X86_PAE +int +is_io_mapping_possible(resource_size_t base, unsigned long size) +{ + return 1; +} +#else +int +is_io_mapping_possible(resource_size_t base, unsigned long size) +{ + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; + + return 1; +} +#endif + /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 82df31726a54..cbc2f0cd631b 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -30,11 +30,14 @@ * See Documentation/io_mapping.txt */ -/* this struct isn't actually defined anywhere */ -struct io_mapping; - #ifdef CONFIG_HAVE_ATOMIC_IOMAP +struct io_mapping { + resource_size_t base; + unsigned long size; + pgprot_t prot; +}; + /* * For small address space machines, mapping large objects * into the kernel virtual space isn't practical. Where @@ -43,23 +46,40 @@ struct io_mapping; */ static inline struct io_mapping * -io_mapping_create_wc(unsigned long base, unsigned long size) +io_mapping_create_wc(resource_size_t base, unsigned long size) { - return (struct io_mapping *) base; + struct io_mapping *iomap; + + if (!is_io_mapping_possible(base, size)) + return NULL; + + iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); + if (!iomap) + return NULL; + + iomap->base = base; + iomap->size = size; + iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + return iomap; } static inline void io_mapping_free(struct io_mapping *mapping) { + kfree(mapping); } /* Atomic map/unmap */ static inline void * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { - offset += (unsigned long) mapping; - return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0, - __pgprot(__PAGE_KERNEL_WC)); + resource_size_t phys_addr; + unsigned long pfn; + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; + pfn = (unsigned long) (phys_addr >> PAGE_SHIFT); + return iomap_atomic_prot_pfn(pfn, KM_USER0, mapping->prot); } static inline void @@ -71,8 +91,9 @@ io_mapping_unmap_atomic(void *vaddr) static inline void * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { - offset += (unsigned long) mapping; - return ioremap_wc(offset, PAGE_SIZE); + BUG_ON(offset >= mapping->size); + resource_size_t phys_addr = mapping->base + offset; + return ioremap_wc(phys_addr, PAGE_SIZE); } static inline void @@ -83,9 +104,12 @@ io_mapping_unmap(void *vaddr) #else +/* this struct isn't actually defined anywhere */ +struct io_mapping; + /* Create the io_mapping object*/ static inline struct io_mapping * -io_mapping_create_wc(unsigned long base, unsigned long size) +io_mapping_create_wc(resource_size_t base, unsigned long size) { return (struct io_mapping *) ioremap_wc(base, size); } -- cgit v1.2.3 From 8fed43684174b68f04d01d1210fd00536af790df Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 25 Feb 2009 20:28:24 +0100 Subject: ide: fix refcounting in device drivers During host driver module removal del_gendisk() results in a final put on drive->gendev and freeing the drive by drive_release_dev(). Convert device drivers from using struct kref to use struct device so device driver's object holds reference on ->gendev and prevents drive from prematurely going away. Also fix ->remove methods to not erroneously drop reference on a host driver by using only put_device() instead of ide*_put(). Reported-by: Stanislaw Gruszka Tested-by: Stanislaw Gruszka Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 27 ++++++++++++++++++--------- drivers/ide/ide-cd.h | 2 +- drivers/ide/ide-gd.c | 26 +++++++++++++++++--------- drivers/ide/ide-gd.h | 2 +- drivers/ide/ide-tape.c | 29 +++++++++++++++++++---------- include/linux/ide.h | 2 +- 6 files changed, 57 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 690475b834de..ddfbea41d296 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -55,7 +55,7 @@ static DEFINE_MUTEX(idecd_ref_mutex); -static void ide_cd_release(struct kref *); +static void ide_cd_release(struct device *); static struct cdrom_info *ide_cd_get(struct gendisk *disk) { @@ -67,7 +67,7 @@ static struct cdrom_info *ide_cd_get(struct gendisk *disk) if (ide_device_get(cd->drive)) cd = NULL; else - kref_get(&cd->kref); + get_device(&cd->dev); } mutex_unlock(&idecd_ref_mutex); @@ -79,7 +79,7 @@ static void ide_cd_put(struct cdrom_info *cd) ide_drive_t *drive = cd->drive; mutex_lock(&idecd_ref_mutex); - kref_put(&cd->kref, ide_cd_release); + put_device(&cd->dev); ide_device_put(drive); mutex_unlock(&idecd_ref_mutex); } @@ -1798,15 +1798,17 @@ static void ide_cd_remove(ide_drive_t *drive) ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); ide_proc_unregister_driver(drive, info->driver); - + device_del(&info->dev); del_gendisk(info->disk); - ide_cd_put(info); + mutex_lock(&idecd_ref_mutex); + put_device(&info->dev); + mutex_unlock(&idecd_ref_mutex); } -static void ide_cd_release(struct kref *kref) +static void ide_cd_release(struct device *dev) { - struct cdrom_info *info = to_ide_drv(kref, cdrom_info); + struct cdrom_info *info = to_ide_drv(dev, cdrom_info); struct cdrom_device_info *devinfo = &info->devinfo; ide_drive_t *drive = info->drive; struct gendisk *g = info->disk; @@ -2005,7 +2007,12 @@ static int ide_cd_probe(ide_drive_t *drive) ide_init_disk(g, drive); - kref_init(&info->kref); + info->dev.parent = &drive->gendev; + info->dev.release = ide_cd_release; + dev_set_name(&info->dev, dev_name(&drive->gendev)); + + if (device_register(&info->dev)) + goto out_free_disk; info->drive = drive; info->driver = &ide_cdrom_driver; @@ -2019,7 +2026,7 @@ static int ide_cd_probe(ide_drive_t *drive) g->driverfs_dev = &drive->gendev; g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE; if (ide_cdrom_setup(drive)) { - ide_cd_release(&info->kref); + put_device(&info->dev); goto failed; } @@ -2029,6 +2036,8 @@ static int ide_cd_probe(ide_drive_t *drive) add_disk(g); return 0; +out_free_disk: + put_disk(g); out_free_cd: kfree(info); failed: diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index ac40d6cb90a2..c878bfcf1116 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -80,7 +80,7 @@ struct cdrom_info { ide_drive_t *drive; struct ide_driver *driver; struct gendisk *disk; - struct kref kref; + struct device dev; /* Buffer for table of contents. NULL if we haven't allocated a TOC buffer for this device yet. */ diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 7857b209c6df..047109419902 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -25,7 +25,7 @@ module_param(debug_mask, ulong, 0644); static DEFINE_MUTEX(ide_disk_ref_mutex); -static void ide_disk_release(struct kref *); +static void ide_disk_release(struct device *); static struct ide_disk_obj *ide_disk_get(struct gendisk *disk) { @@ -37,7 +37,7 @@ static struct ide_disk_obj *ide_disk_get(struct gendisk *disk) if (ide_device_get(idkp->drive)) idkp = NULL; else - kref_get(&idkp->kref); + get_device(&idkp->dev); } mutex_unlock(&ide_disk_ref_mutex); return idkp; @@ -48,7 +48,7 @@ static void ide_disk_put(struct ide_disk_obj *idkp) ide_drive_t *drive = idkp->drive; mutex_lock(&ide_disk_ref_mutex); - kref_put(&idkp->kref, ide_disk_release); + put_device(&idkp->dev); ide_device_put(drive); mutex_unlock(&ide_disk_ref_mutex); } @@ -66,17 +66,18 @@ static void ide_gd_remove(ide_drive_t *drive) struct gendisk *g = idkp->disk; ide_proc_unregister_driver(drive, idkp->driver); - + device_del(&idkp->dev); del_gendisk(g); - drive->disk_ops->flush(drive); - ide_disk_put(idkp); + mutex_lock(&ide_disk_ref_mutex); + put_device(&idkp->dev); + mutex_unlock(&ide_disk_ref_mutex); } -static void ide_disk_release(struct kref *kref) +static void ide_disk_release(struct device *dev) { - struct ide_disk_obj *idkp = to_ide_drv(kref, ide_disk_obj); + struct ide_disk_obj *idkp = to_ide_drv(dev, ide_disk_obj); ide_drive_t *drive = idkp->drive; struct gendisk *g = idkp->disk; @@ -348,7 +349,12 @@ static int ide_gd_probe(ide_drive_t *drive) ide_init_disk(g, drive); - kref_init(&idkp->kref); + idkp->dev.parent = &drive->gendev; + idkp->dev.release = ide_disk_release; + dev_set_name(&idkp->dev, dev_name(&drive->gendev)); + + if (device_register(&idkp->dev)) + goto out_free_disk; idkp->drive = drive; idkp->driver = &ide_gd_driver; @@ -373,6 +379,8 @@ static int ide_gd_probe(ide_drive_t *drive) add_disk(g); return 0; +out_free_disk: + put_disk(g); out_free_idkp: kfree(idkp); failed: diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h index a86779f0756b..b604bdd318a1 100644 --- a/drivers/ide/ide-gd.h +++ b/drivers/ide/ide-gd.h @@ -17,7 +17,7 @@ struct ide_disk_obj { ide_drive_t *drive; struct ide_driver *driver; struct gendisk *disk; - struct kref kref; + struct device dev; unsigned int openers; /* protected by BKL for now */ /* Last failed packet command */ diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d7ecd3c79757..bb450a7608c2 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -169,7 +169,7 @@ typedef struct ide_tape_obj { ide_drive_t *drive; struct ide_driver *driver; struct gendisk *disk; - struct kref kref; + struct device dev; /* * failed_pc points to the last failed packet command, or contains @@ -267,7 +267,7 @@ static DEFINE_MUTEX(idetape_ref_mutex); static struct class *idetape_sysfs_class; -static void ide_tape_release(struct kref *); +static void ide_tape_release(struct device *); static struct ide_tape_obj *ide_tape_get(struct gendisk *disk) { @@ -279,7 +279,7 @@ static struct ide_tape_obj *ide_tape_get(struct gendisk *disk) if (ide_device_get(tape->drive)) tape = NULL; else - kref_get(&tape->kref); + get_device(&tape->dev); } mutex_unlock(&idetape_ref_mutex); return tape; @@ -290,7 +290,7 @@ static void ide_tape_put(struct ide_tape_obj *tape) ide_drive_t *drive = tape->drive; mutex_lock(&idetape_ref_mutex); - kref_put(&tape->kref, ide_tape_release); + put_device(&tape->dev); ide_device_put(drive); mutex_unlock(&idetape_ref_mutex); } @@ -308,7 +308,7 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i) mutex_lock(&idetape_ref_mutex); tape = idetape_devs[i]; if (tape) - kref_get(&tape->kref); + get_device(&tape->dev); mutex_unlock(&idetape_ref_mutex); return tape; } @@ -2256,15 +2256,17 @@ static void ide_tape_remove(ide_drive_t *drive) idetape_tape_t *tape = drive->driver_data; ide_proc_unregister_driver(drive, tape->driver); - + device_del(&tape->dev); ide_unregister_region(tape->disk); - ide_tape_put(tape); + mutex_lock(&idetape_ref_mutex); + put_device(&tape->dev); + mutex_unlock(&idetape_ref_mutex); } -static void ide_tape_release(struct kref *kref) +static void ide_tape_release(struct device *dev) { - struct ide_tape_obj *tape = to_ide_drv(kref, ide_tape_obj); + struct ide_tape_obj *tape = to_ide_drv(dev, ide_tape_obj); ide_drive_t *drive = tape->drive; struct gendisk *g = tape->disk; @@ -2407,7 +2409,12 @@ static int ide_tape_probe(ide_drive_t *drive) ide_init_disk(g, drive); - kref_init(&tape->kref); + tape->dev.parent = &drive->gendev; + tape->dev.release = ide_tape_release; + dev_set_name(&tape->dev, dev_name(&drive->gendev)); + + if (device_register(&tape->dev)) + goto out_free_disk; tape->drive = drive; tape->driver = &idetape_driver; @@ -2436,6 +2443,8 @@ static int ide_tape_probe(ide_drive_t *drive) return 0; +out_free_disk: + put_disk(g); out_free_tape: kfree(tape); failed: diff --git a/include/linux/ide.h b/include/linux/ide.h index 194da5a4b0d6..fe235b65207e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -663,7 +663,7 @@ typedef struct ide_drive_s ide_drive_t; #define to_ide_device(dev) container_of(dev, ide_drive_t, gendev) #define to_ide_drv(obj, cont_type) \ - container_of(obj, struct cont_type, kref) + container_of(obj, struct cont_type, dev) #define ide_drv_g(disk, cont_type) \ container_of((disk)->private_data, struct cont_type, driver) -- cgit v1.2.3 From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2009 18:03:42 -0800 Subject: rcu: Teach RCU that idle task is not quiscent state at boot This patch fixes a bug located by Vegard Nossum with the aid of kmemcheck, updated based on review comments from Nick Piggin, Ingo Molnar, and Andrew Morton. And cleans up the variable-name and function-name language. ;-) The boot CPU runs in the context of its idle thread during boot-up. During this time, idle_cpu(0) will always return nonzero, which will fool Classic and Hierarchical RCU into deciding that a large chunk of the boot-up sequence is a big long quiescent state. This in turn causes RCU to prematurely end grace periods during this time. This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks() function to ignore the idle task as a quiescent state until the system has started up the scheduler in rest_init(), introducing a new non-API function rcu_idle_now_means_idle() to inform RCU of this transition. RCU maintains an internal rcu_idle_cpu_truthful variable to track this state, which is then used by rcu_check_callback() to determine if it should believe idle_cpu(). Because this patch has the effect of disallowing RCU grace periods during long stretches of the boot-up sequence, this patch also introduces Josh Triplett's UP-only optimization that makes synchronize_rcu() be a no-op if num_online_cpus() returns 1. This allows boot-time code that calls synchronize_rcu() to proceed normally. Note, however, that RCU callbacks registered by call_rcu() will likely queue up until later in the boot sequence. Although rcuclassic and rcutree can also use this same optimization after boot completes, rcupreempt must restrict its use of this optimization to the portion of the boot sequence before the scheduler starts up, given that an rcupreempt RCU read-side critical section may be preeempted. In addition, this patch takes Nick Piggin's suggestion to make the system_state global variable be __read_mostly. Changes since v4: o Changes the name of the introduced function and variable to be less emotional. ;-) Changes since v3: o WARN_ON(nr_context_switches() > 0) to verify that RCU switches out of boot-time mode before the first context switch, as suggested by Nick Piggin. Changes since v2: o Created rcu_blocking_is_gp() internal-to-RCU API that determines whether a call to synchronize_rcu() is itself a grace period. o The definition of rcu_blocking_is_gp() for rcuclassic and rcutree checks to see if but a single CPU is online. o The definition of rcu_blocking_is_gp() for rcupreempt checks to see both if but a single CPU is online and if the system is still in early boot. This allows rcupreempt to again work correctly if running on a single CPU after booting is complete. o Added check to rcupreempt's synchronize_sched() for there being but one online CPU. Tested all three variants both SMP and !SMP, booted fine, passed a short rcutorture test on both x86 and Power. Located-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 6 ++++++ include/linux/rcupdate.h | 4 ++++ include/linux/rcupreempt.h | 15 +++++++++++++++ include/linux/rcutree.h | 6 ++++++ init/main.c | 3 ++- kernel/rcuclassic.c | 4 ++-- kernel/rcupdate.c | 12 ++++++++++++ kernel/rcupreempt.c | 3 +++ kernel/rcutree.c | 4 ++-- 9 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index f3f697df1d71..80044a4f3ab9 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) +/* A context switch is a grace period for rcuclassic. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 921340a7b71c..528343e6da51 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,6 +52,9 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +/* Internal to kernel, but needed by rcupreempt.h. */ +extern int rcu_scheduler_active; + #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 3e05c09b54a2..74304b4538d8 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ +/* + * A context switch is a grace period for rcupreempt synchronize_rcu() + * only during early boot, before the scheduler has been initialized. + * So, how the heck do we get a context switch? Well, if the caller + * invokes synchronize_rcu(), they are willing to accept a context + * switch, so we simply pretend that one happened. + * + * After boot, there might be a blocked or preempted task in an RCU + * read-side critical section, so we cannot then take the fastpath. + */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1 && !rcu_scheduler_active; +} + #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d4368b7975c3..a722fb67bb2d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ +/* A context switch is a grace period for rcutree. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUTREE_H */ diff --git a/init/main.c b/init/main.c index 844209453c02..83697e160b3a 100644 --- a/init/main.c +++ b/init/main.c @@ -97,7 +97,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state; +enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); /* @@ -463,6 +463,7 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); + rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index bd5a9003497c..654c640a6b9c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d92a76a881aa..cae8a059cf47 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,6 +44,7 @@ #include #include #include +#include enum rcu_barrier { RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -175,3 +181,9 @@ void __init rcu_init(void) __rcu_init(); } +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 33cfc50781f9..5d59e850fb71 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; + if (num_online_cpus() == 1) + return; /* blocking is gp if only one CPU! */ + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fd602a6f6f..97ce31579ec0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user -- cgit v1.2.3 From 1e42807918d17e8c93bf14fbb74be84b141334c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Feb 2009 09:03:10 +0100 Subject: block: reduce stack footprint of blk_recount_segments() blk_recalc_rq_segments() requires a request structure passed in, which we don't have from blk_recount_segments(). So the latter allocates one on the stack, using > 400 bytes of stack for that. This can cause us to spill over one page of stack from ext4 at least: 0) 4560 400 blk_recount_segments+0x43/0x62 1) 4160 32 bio_phys_segments+0x1c/0x24 2) 4128 32 blk_rq_bio_prep+0x2a/0xf9 3) 4096 32 init_request_from_bio+0xf9/0xfe 4) 4064 112 __make_request+0x33c/0x3f6 5) 3952 144 generic_make_request+0x2d1/0x321 6) 3808 64 submit_bio+0xb9/0xc3 7) 3744 48 submit_bh+0xea/0x10e 8) 3696 368 ext4_mb_init_cache+0x257/0xa6a [ext4] 9) 3328 288 ext4_mb_regular_allocator+0x421/0xcd9 [ext4] 10) 3040 160 ext4_mb_new_blocks+0x211/0x4b4 [ext4] 11) 2880 336 ext4_ext_get_blocks+0xb61/0xd45 [ext4] 12) 2544 96 ext4_get_blocks_wrap+0xf2/0x200 [ext4] 13) 2448 80 ext4_da_get_block_write+0x6e/0x16b [ext4] 14) 2368 352 mpage_da_map_blocks+0x7e/0x4b3 [ext4] 15) 2016 352 ext4_da_writepages+0x2ce/0x43c [ext4] 16) 1664 32 do_writepages+0x2d/0x3c 17) 1632 144 __writeback_single_inode+0x162/0x2cd 18) 1488 96 generic_sync_sb_inodes+0x1e3/0x32b 19) 1392 16 sync_sb_inodes+0xe/0x10 20) 1376 48 writeback_inodes+0x69/0xb3 21) 1328 208 balance_dirty_pages_ratelimited_nr+0x187/0x2f9 22) 1120 224 generic_file_buffered_write+0x1d4/0x2c4 23) 896 176 __generic_file_aio_write_nolock+0x35f/0x393 24) 720 80 generic_file_aio_write+0x6c/0xc8 25) 640 80 ext4_file_write+0xa9/0x137 [ext4] 26) 560 320 do_sync_write+0xf0/0x137 27) 240 48 vfs_write+0xb3/0x13c 28) 192 64 sys_write+0x4c/0x74 29) 128 128 system_call_fastpath+0x16/0x1b Split the segment counting out into a __blk_recalc_rq_segments() helper to avoid allocating an onstack request just for checking the physical segment count. Signed-off-by: Jens Axboe --- block/blk-merge.c | 94 ++++++++++++++++++++++++++++---------------------- include/linux/blkdev.h | 2 ++ 2 files changed, 55 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index b92f5b0866b0..a104593e70c3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -38,72 +38,84 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect) } } -void blk_recalc_rq_segments(struct request *rq) +static unsigned int __blk_recalc_rq_segments(struct request_queue *q, + struct bio *bio, + unsigned int *seg_size_ptr) { - int nr_phys_segs; unsigned int phys_size; struct bio_vec *bv, *bvprv = NULL; - int seg_size; - int cluster; - struct req_iterator iter; - int high, highprv = 1; - struct request_queue *q = rq->q; + int cluster, i, high, highprv = 1; + unsigned int seg_size, nr_phys_segs; + struct bio *fbio; - if (!rq->bio) - return; + if (!bio) + return 0; + fbio = bio; cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); seg_size = 0; phys_size = nr_phys_segs = 0; - rq_for_each_segment(bv, rq, iter) { - /* - * the trick here is making sure that a high page is never - * considered part of another segment, since that might - * change with the bounce page. - */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; - if (high || highprv) - goto new_segment; - if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) - goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + for_each_bio(bio) { + bio_for_each_segment(bv, bio, i) { + /* + * the trick here is making sure that a high page is + * never considered part of another segment, since that + * might change with the bounce page. + */ + high = page_to_pfn(bv->bv_page) > q->bounce_pfn; + if (high || highprv) goto new_segment; + if (cluster) { + if (seg_size + bv->bv_len > q->max_segment_size) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + goto new_segment; + + seg_size += bv->bv_len; + bvprv = bv; + continue; + } +new_segment: + if (nr_phys_segs == 1 && seg_size > + fbio->bi_seg_front_size) + fbio->bi_seg_front_size = seg_size; - seg_size += bv->bv_len; + nr_phys_segs++; bvprv = bv; - continue; + seg_size = bv->bv_len; + highprv = high; } -new_segment: - if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) - rq->bio->bi_seg_front_size = seg_size; - - nr_phys_segs++; - bvprv = bv; - seg_size = bv->bv_len; - highprv = high; } - if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) + if (seg_size_ptr) + *seg_size_ptr = seg_size; + + return nr_phys_segs; +} + +void blk_recalc_rq_segments(struct request *rq) +{ + unsigned int seg_size = 0, phys_segs; + + phys_segs = __blk_recalc_rq_segments(rq->q, rq->bio, &seg_size); + + if (phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) rq->bio->bi_seg_front_size = seg_size; if (seg_size > rq->biotail->bi_seg_back_size) rq->biotail->bi_seg_back_size = seg_size; - rq->nr_phys_segments = nr_phys_segs; + rq->nr_phys_segments = phys_segs; } void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct request rq; struct bio *nxt = bio->bi_next; - rq.q = q; - rq.bio = rq.biotail = bio; + bio->bi_next = NULL; - blk_recalc_rq_segments(&rq); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, NULL); bio->bi_next = nxt; - bio->bi_phys_segments = rq.nr_phys_segments; bio->bi_flags |= (1 << BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dcaa0fd84b02..465d6babc847 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -708,6 +708,8 @@ struct req_iterator { }; /* This should not be used directly - use rq_for_each_segment */ +#define for_each_bio(_bio) \ + for (; _bio; _bio = _bio->bi_next) #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) -- cgit v1.2.3 From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 27 Feb 2009 15:13:54 +0530 Subject: sched: don't allow setuid to succeed if the user does not have rt bandwidth Impact: fix hung task with certain (non-default) rt-limit settings Corey Hickey reported that on using setuid to change the uid of a rt process, the process would be unkillable and not be running. This is because there was no rt runtime for that user group. Add in a check to see if a user can attach an rt task to its task group. On failure, return EINVAL, which is also returned in CONFIG_CGROUP_SCHED. Reported-by: Corey Hickey Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 13 +++++++++++-- kernel/sys.c | 31 ++++++++++++++++++++----------- kernel/user.c | 18 ++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..8c216e057c94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/kernel/sched.c b/kernel/sched.c index c3baa9653d1d..8e2558c2ba67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -559,7 +559,7 @@ error: abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/kernel/user.c b/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). -- cgit v1.2.3 From 5170836679185357dc1b7660bad13287b39e1e33 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Feb 2009 14:03:03 -0800 Subject: Fix recursive lock in free_uid()/free_user_ns() free_uid() and free_user_ns() are corecursive when CONFIG_USER_SCHED=n, but free_user_ns() is called from free_uid() by way of uid_hash_remove(), which requires uidhash_lock to be held. free_user_ns() then calls free_uid() to complete the destruction. Fix this by deferring the destruction of the user_namespace. Signed-off-by: David Howells Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/user_namespace.h | 1 + kernel/user_namespace.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 315bcd375224..cc4f45361dbb 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -13,6 +13,7 @@ struct user_namespace { struct kref kref; struct hlist_head uidhash_table[UIDHASH_SZ]; struct user_struct *creator; + struct work_struct destroyer; }; extern struct user_namespace init_user_ns; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 79084311ee57..076c7c8215b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -60,12 +60,25 @@ int create_user_ns(struct cred *new) return 0; } -void free_user_ns(struct kref *kref) +/* + * Deferred destructor for a user namespace. This is required because + * free_user_ns() may be called with uidhash_lock held, but we need to call + * back to free_uid() which will want to take the lock again. + */ +static void free_user_ns_work(struct work_struct *work) { - struct user_namespace *ns; - - ns = container_of(kref, struct user_namespace, kref); + struct user_namespace *ns = + container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); kfree(ns); } + +void free_user_ns(struct kref *kref) +{ + struct user_namespace *ns = + container_of(kref, struct user_namespace, kref); + + INIT_WORK(&ns->destroyer, free_user_ns_work); + schedule_work(&ns->destroyer); +} EXPORT_SYMBOL(free_user_ns); -- cgit v1.2.3 From 5c2522218059ca1f4174a568923b988aad3ddfda Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Fri, 27 Feb 2009 10:01:36 +0000 Subject: net headers: cleanup dcbnl.h 1) add an include for 2) change dcbmsg.dcb_family from unsigned char to __u8 to be more consistent with use of kernel types Signed-off-by: Chris Leech Acked-by: Sam Ravnborg Signed-off-by: David S. Miller --- include/linux/dcbnl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h index b0ef274e0031..7d2e10006188 100644 --- a/include/linux/dcbnl.h +++ b/include/linux/dcbnl.h @@ -20,10 +20,12 @@ #ifndef __LINUX_DCBNL_H__ #define __LINUX_DCBNL_H__ +#include + #define DCB_PROTO_VERSION 1 struct dcbmsg { - unsigned char dcb_family; + __u8 dcb_family; __u8 cmd; __u16 dcb_pad; }; -- cgit v1.2.3 From 709ab3261e3ed789c0bb31c6ab53c9eccb276522 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Fri, 27 Feb 2009 10:01:42 +0000 Subject: net headers: export dcbnl.h The DCB netlink interface is required for building the userspace tools available at e1000.sourceforge.net Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index b97cdc516a8f..106c3ba50844 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -52,6 +52,7 @@ header-y += const.h header-y += cgroupstats.h header-y += cramfs_fs.h header-y += cycx_cfm.h +header-y += dcbnl.h header-y += dlmconstants.h header-y += dlm_device.h header-y += dlm_netlink.h -- cgit v1.2.3 From 5ce04e3de8c36ba37c56e94e3c4dc7973c7f546c Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Sun, 1 Mar 2009 08:53:27 -0800 Subject: fix warning in io_mapping_map_wc() Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- include/linux/io-mapping.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index cbc2f0cd631b..0adb0f91568c 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -91,8 +91,11 @@ io_mapping_unmap_atomic(void *vaddr) static inline void * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { + resource_size_t phys_addr; + BUG_ON(offset >= mapping->size); - resource_size_t phys_addr = mapping->base + offset; + phys_addr = mapping->base + offset; + return ioremap_wc(phys_addr, PAGE_SIZE); } -- cgit v1.2.3 From 9d40bbda599def1e1d155d7f7dca14fe8744bd2b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Mar 2009 23:46:25 -0800 Subject: vlan: Fix vlan-in-vlan crashes. As analyzed by Patrick McHardy, vlan needs to reset it's netdev_ops pointer in it's ->init() function but this leaves the compat method pointers stale. Add a netdev_resync_ops() and call it from the vlan code. Any other driver which changes ->netdev_ops after register_netdevice() will need to call this new function after doing so too. With help from Patrick McHardy. Tested-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/8021q/vlan_dev.c | 3 ++- net/core/dev.c | 56 ++++++++++++++++++++++++++++------------------- 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec54785d34f9..659366734f3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1079,6 +1079,7 @@ extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); extern int init_dummy_netdev(struct net_device *dev); +extern void netdev_resync_ops(struct net_device *dev); extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4a19acd3a32b..1b34135cf990 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) - err = ops->ndo_neigh_setup(dev, pa); + err = ops->ndo_neigh_setup(real_dev, pa); return err; } @@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; dev->netdev_ops = &vlan_netdev_ops; } + netdev_resync_ops(dev); if (is_vlan_dev(real_dev)) subclass = 1; diff --git a/net/core/dev.c b/net/core/dev.c index 2dd484ed3dbb..f1129706ce7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4282,6 +4282,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/* Some devices need to (re-)set their netdev_ops inside + * ->init() or similar. If that happens, we have to setup + * the compat pointers again. + */ +void netdev_resync_ops(struct net_device *dev) +{ +#ifdef CONFIG_COMPAT_NET_DEV_OPS + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif +#endif +} +EXPORT_SYMBOL(netdev_resync_ops); + /** * register_netdevice - register a network device * @dev: device to register @@ -4326,28 +4359,7 @@ int register_netdevice(struct net_device *dev) * This is temporary until all network devices are converted. */ if (dev->netdev_ops) { - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif + netdev_resync_ops(dev); } else { char drivername[64]; pr_info("%s (%s): not using net_device_ops yet\n", -- cgit v1.2.3 From e7d3ef13d52a126438f687a1a32da65ff926ed57 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 4 Mar 2009 11:59:46 -0800 Subject: libata: change drive ready wait after hard reset to 5s This fixes problems during resume with drives that take longer than 1s to be ready. The ATA-6 spec appears to allow 5 seconds for a drive to be ready. On one affected system, this patch changes "PM: resume devices took..." message from 17 seconds to 4 seconds, and gets rid of a lot of ugly timeout/error messages. Without this patch, the libata code moves on after 1s, tries to send a soft reset (which the drive doesn't see because it isn't ready) which also times out, then an IDENTIFY command is sent to the drive which times out, and finally the error handler will try to send another hard reset which will finally get things working. Signed-off-by: Stuart Hayes Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5d87bc09a1f5..dd818c7decd7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -275,7 +275,7 @@ enum { * advised to wait only for the following duration before * doing SRST. */ - ATA_TMOUT_PMP_SRST_WAIT = 1000, + ATA_TMOUT_PMP_SRST_WAIT = 5000, /* ATA bus states */ BUS_UNKNOWN = 0, -- cgit v1.2.3 From 5825627c9463581fd9e70f8285685889ae5bb9bb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 27 Feb 2009 17:35:43 +0900 Subject: libata: fix dma_unmap_sg misuse libata passes the returned value of dma_map_sg() to dma_unmap_sg(),which is the misuse of dma_unmap_sg(). DMA-mapping.txt says: To unmap a scatterlist, just call: pci_unmap_sg(pdev, sglist, nents, direction); Again, make sure DMA activity has already finished. PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be the _same_ one you passed into the pci_map_sg call, it should _NOT_ be the 'count' value _returned_ from the pci_map_sg call. Signed-off-by: FUJITA Tomonori Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 4 ++-- include/linux/libata.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 9fbf0595f3d4..5e324cea3019 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4612,7 +4612,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc) VPRINTK("unmapping %u sg elements\n", qc->n_elem); if (qc->n_elem) - dma_unmap_sg(ap->dev, sg, qc->n_elem, dir); + dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir); qc->flags &= ~ATA_QCFLAG_DMAMAP; qc->sg = NULL; @@ -4727,7 +4727,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc) return -1; DPRINTK("%d sg elements mapped\n", n_elem); - + qc->orig_n_elem = qc->n_elem; qc->n_elem = n_elem; qc->flags |= ATA_QCFLAG_DMAMAP; diff --git a/include/linux/libata.h b/include/linux/libata.h index dd818c7decd7..fbf064e13ad5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -530,6 +530,7 @@ struct ata_queued_cmd { unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; unsigned int n_elem; + unsigned int orig_n_elem; int dma_dir; -- cgit v1.2.3 From 84bda12af31f930e4200c5244aa111de2485d7b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 Mar 2009 18:53:26 +0900 Subject: libata: align ap->sector_buf ap->sector_buf is used as DMA target and should at least be aligned on cacheline. This caused problems on some embedded machines. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index fbf064e13ad5..dc18b87ed722 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -751,7 +751,8 @@ struct ata_port { acpi_handle acpi_handle; struct ata_acpi_gtm __acpi_init_gtm; /* use ata_acpi_init_gtm() */ #endif - u8 sector_buf[ATA_SECT_SIZE]; /* owned by EH */ + /* owned by EH */ + u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* The following initializer overrides a method to NULL whether one of -- cgit v1.2.3 From 849d7130001ab740a5a4778a561049841fdd77c9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 5 Mar 2009 16:10:57 +0100 Subject: ide: allow to wrap interrupt handler Signed-off-by: Stanislaw Gruszka Cc: Andrew Victor [bart: minor checkpatch.pl / CodingStyle fixups] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 1 + drivers/ide/ide-probe.c | 7 ++++++- include/linux/ide.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9ff90cb1dbf1..a9a6c208288a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1162,6 +1162,7 @@ out_early: return irq_ret; } +EXPORT_SYMBOL_GPL(ide_intr); /** * ide_do_drive_cmd - issue IDE special command diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index ce0818a993f6..ee8e3e7cad51 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -950,6 +950,7 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) static int init_irq (ide_hwif_t *hwif) { struct ide_io_ports *io_ports = &hwif->io_ports; + irq_handler_t irq_handler; int sa = 0; mutex_lock(&ide_cfg_mtx); @@ -959,6 +960,10 @@ static int init_irq (ide_hwif_t *hwif) hwif->timer.function = &ide_timer_expiry; hwif->timer.data = (unsigned long)hwif; + irq_handler = hwif->host->irq_handler; + if (irq_handler == NULL) + irq_handler = ide_intr; + #if defined(__mc68000__) sa = IRQF_SHARED; #endif /* __mc68000__ */ @@ -969,7 +974,7 @@ static int init_irq (ide_hwif_t *hwif) if (io_ports->ctl_addr) hwif->tp_ops->set_irq(hwif, 1); - if (request_irq(hwif->irq, &ide_intr, sa, hwif->name, hwif)) + if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; if (!hwif->rqsize) { diff --git a/include/linux/ide.h b/include/linux/ide.h index fe235b65207e..e0cedfe9fad4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -866,6 +866,7 @@ struct ide_host { unsigned int n_ports; struct device *dev[2]; unsigned int (*init_chipset)(struct pci_dev *); + irq_handler_t irq_handler; unsigned long host_flags; void *host_priv; ide_hwif_t *cur_port; /* for hosts requiring serialization */ -- cgit v1.2.3 From ebcad5aaea26da3cb2ca90b7f31a67a027eb60db Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 5 Mar 2009 16:10:59 +0100 Subject: remove stale comment from HDIO_GET_IDENTITY returns 256 words currently. Noticed by Norman Diamond. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index c37e9241fae7..ed21bd3dbd25 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -511,7 +511,6 @@ struct hd_driveid { unsigned short words69_70[2]; /* reserved words 69-70 * future command overlap and queuing */ - /* HDIO_GET_IDENTITY currently returns only words 0 through 70 */ unsigned short words71_74[4]; /* reserved words 71-74 * for IDENTIFY PACKET DEVICE command */ -- cgit v1.2.3 From d42ad15b759d05a87f22b484af63987eff38ea88 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Mar 2009 17:20:55 +0100 Subject: ata: add CFA specific identify data words Declare CFA specific identify data words 162 and 163 for future use. Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik [bart: update patch summary/description] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 08a86d5cdf1b..9a061accd8b8 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -89,6 +89,8 @@ enum { ATA_ID_DLF = 128, ATA_ID_CSFO = 129, ATA_ID_CFA_POWER = 160, + ATA_ID_CFA_KEY_MGMT = 162, + ATA_ID_CFA_MODES = 163, ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), -- cgit v1.2.3 From ab96ddec7213004b632d24dc2cdcd2df5f16f50b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 7 Mar 2009 13:39:22 -0800 Subject: Input: serio - fix protocol number for TouchIT213 Protocol 0x37 has been reserved for iNexio devices and Sahara was supposed to get 0x38. Reported-by: Claudio Nieder Signed-off-by: Dmitry Torokhov --- include/linux/serio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serio.h b/include/linux/serio.h index 1bcb357a01a1..e0417e4d3f15 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -212,7 +212,7 @@ static inline void serio_unpin_driver(struct serio *serio) #define SERIO_FUJITSU 0x35 #define SERIO_ZHENHUA 0x36 #define SERIO_INEXIO 0x37 -#define SERIO_TOUCHIT213 0x37 +#define SERIO_TOUCHIT213 0x38 #define SERIO_W8001 0x39 #endif -- cgit v1.2.3 From 129f8ae9b1b5be94517da76009ea956e89104ce8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 9 Mar 2009 15:07:33 -0400 Subject: Revert "[CPUFREQ] Disable sysfs ui for p4-clockmod." This reverts commit e088e4c9cdb618675874becb91b2fd581ee707e6. Removing the sysfs interface for p4-clockmod was flagged as a regression in bug 12826. Course of action: - Find out the remaining causes of overheating, and fix them if possible. ACPI should be doing the right thing automatically. If it isn't, we need to fix that. - mark p4-clockmod ui as deprecated - try again with the removal in six months. It's not really feasible to printk about the deprecation, because it needs to happen at all the sysfs entry points, which means adding a lot of strcmp("p4-clockmod".. calls to the core, which.. bleuch. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - drivers/cpufreq/cpufreq.c | 51 +++++++++++-------------------- include/linux/cpufreq.h | 1 - 3 files changed, 18 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9e..3178c3acd97e 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b55cb67435bd..d6daf3c507d3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -754,11 +754,6 @@ static struct kobj_type ktype_cpufreq = { .release = cpufreq_sysfs_release, }; -static struct kobj_type ktype_empty_cpufreq = { - .sysfs_ops = &sysfs_ops, - .release = cpufreq_sysfs_release, -}; - /** * cpufreq_add_dev - add a CPU device @@ -892,36 +887,26 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); /* prepare interface data */ - if (!cpufreq_driver->hide_interface) { - ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, - &sys_dev->kobj, "cpufreq"); + ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj, + "cpufreq"); + if (ret) + goto err_out_driver_exit; + + /* set up files for this cpu device */ + drv_attr = cpufreq_driver->attr; + while ((drv_attr) && (*drv_attr)) { + ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) goto err_out_driver_exit; - - /* set up files for this cpu device */ - drv_attr = cpufreq_driver->attr; - while ((drv_attr) && (*drv_attr)) { - ret = sysfs_create_file(&policy->kobj, - &((*drv_attr)->attr)); - if (ret) - goto err_out_driver_exit; - drv_attr++; - } - if (cpufreq_driver->get) { - ret = sysfs_create_file(&policy->kobj, - &cpuinfo_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - if (cpufreq_driver->target) { - ret = sysfs_create_file(&policy->kobj, - &scaling_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - } else { - ret = kobject_init_and_add(&policy->kobj, &ktype_empty_cpufreq, - &sys_dev->kobj, "cpufreq"); + drv_attr++; + } + if (cpufreq_driver->get) { + ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); + if (ret) + goto err_out_driver_exit; + } + if (cpufreq_driver->target) { + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) goto err_out_driver_exit; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 384b38d3e8e2..161042746afc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -234,7 +234,6 @@ struct cpufreq_driver { int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); struct freq_attr **attr; - bool hide_interface; }; /* flags */ -- cgit v1.2.3 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- include/linux/nfs_xdr.h | 2 ++ include/linux/nfsacl.h | 3 +++ 4 files changed, 39 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef62557c87d..6bbf0e6daad2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. */ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cdddec1432..6cdeacffde46 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. */ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a550b528319f..2e5f00066afd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -406,6 +406,8 @@ struct nfs3_setaclargs { int mask; struct posix_acl * acl_access; struct posix_acl * acl_default; + size_t len; + unsigned int npages; struct page ** pages; }; diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index 54487a99beb8..43011b69297c 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -37,6 +37,9 @@ #define NFSACL_MAXPAGES ((2*(8+12*NFS_ACL_MAX_ENTRIES) + PAGE_SIZE-1) \ >> PAGE_SHIFT) +#define NFS_ACL_MAX_ENTRIES_INLINE (5) +#define NFS_ACL_INLINE_BUFSIZE ((2*(2+3*NFS_ACL_MAX_ENTRIES_INLINE)) << 2) + static inline unsigned int nfsacl_size(struct posix_acl *acl_access, struct posix_acl *acl_default) { -- cgit v1.2.3 From 78851e1aa4c3b796d5f0bb11b445016726302b44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:19 -0400 Subject: NLM: Shrink the IPv4-only version of nlm_cmp_addr() Clean up/micro-optimatization: Make the AF_INET-only version of nlm_cmp_addr() smaller. This matches the style of nlm_privileged_requester(), and makes the AF_INET-only version of nlm_cmp_addr() nearly the same size as it was before IPv6 support. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/lockd/lockd.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index aa6fe7026de7..51855dfd8adb 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -346,6 +346,7 @@ static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { @@ -353,6 +354,13 @@ static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + return 0; +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ /* * Compare two host addresses -- cgit v1.2.3 From 76e6eee03353f01bfca707d4dbb1f10a4ee27dc0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Mar 2009 14:35:43 -0600 Subject: cpumask: tsk_cpumask for accessing the struct task_struct's cpus_allowed. This allows us to change the representation (to a dangling bitmap or cpumask_var_t) without breaking all the callers: they can use tsk_cpumask() now and won't see a difference as the changes roll into linux-next. Signed-off-by: Rusty Russell --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8c216e057c94..011db2f4c94c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1419,6 +1419,9 @@ struct task_struct { #endif }; +/* Future-safe accessor for struct task_struct's cpus_allowed. */ +#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH -- cgit v1.2.3 From 45e575ab9bfada5a5ef1b6174f8e749b1ecf0864 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Mar 2009 14:35:44 -0600 Subject: cpumask: mm_cpumask for accessing the struct mm_struct's cpu_vm_mask. This allows us to change the representation (to a dangling bitmap or cpumask_var_t) without breaking all the callers: they can use mm_cpumask() now and won't see a difference as the changes roll into linux-next. Signed-off-by: Rusty Russell --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 92915e81443f..d84feb7bdbf0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -276,4 +276,7 @@ struct mm_struct { #endif }; +/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ +#define mm_cpumask(mm) (&(mm)->cpu_vm_mask) + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 446c92b2901bedb3725d29b4e73def8aba623ffc Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 12 Mar 2009 18:03:16 +0100 Subject: [ARM] 5421/1: ftrace: fix crash due to tracing of __naked functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for the following crash observed in 2.6.29-rc3: http://lkml.org/lkml/2009/1/29/150 On ARM it doesn't make sense to trace a naked function because then mcount is called without stack and frame pointer being set up and there is no chance to restore the lr register to the value before mcount was called. Reported-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke Cc: Abhishek Sagar Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Uwe Kleine-König Signed-off-by: Russell King --- arch/arm/kernel/fiq.c | 4 ++-- arch/arm/mm/copypage-feroceon.c | 2 +- arch/arm/mm/copypage-v3.c | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v4wb.c | 2 +- arch/arm/mm/copypage-v4wt.c | 2 +- arch/arm/mm/copypage-xsc3.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- include/linux/compiler-gcc.h | 10 +++++++++- 9 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index 36f81d967979..6ff7919613d7 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -88,7 +88,7 @@ void set_fiq_handler(void *start, unsigned int length) * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void __naked set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -106,7 +106,7 @@ void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void __naked get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( diff --git a/arch/arm/mm/copypage-feroceon.c b/arch/arm/mm/copypage-feroceon.c index c3ba6a94da0c..70997d5bee2d 100644 --- a/arch/arm/mm/copypage-feroceon.c +++ b/arch/arm/mm/copypage-feroceon.c @@ -13,7 +13,7 @@ #include #include -static void __attribute__((naked)) +static void __naked feroceon_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-v3.c b/arch/arm/mm/copypage-v3.c index 70ed96c8af8e..de9c06854ad7 100644 --- a/arch/arm/mm/copypage-v3.c +++ b/arch/arm/mm/copypage-v3.c @@ -15,7 +15,7 @@ * * FIXME: do we need to handle cache stuff... */ -static void __attribute__((naked)) +static void __naked v3_copy_user_page(void *kto, const void *kfrom) { asm("\n\ diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 1601698b9800..7370a7142b04 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_highpage that does the right thing. */ -static void __attribute__((naked)) +static void __naked mc_copy_user_page(void *from, void *to) { asm volatile( diff --git a/arch/arm/mm/copypage-v4wb.c b/arch/arm/mm/copypage-v4wb.c index 3ec93dab7656..9ab098414227 100644 --- a/arch/arm/mm/copypage-v4wb.c +++ b/arch/arm/mm/copypage-v4wb.c @@ -22,7 +22,7 @@ * instruction. If your processor does not supply this, you have to write your * own copy_user_highpage that does the right thing. */ -static void __attribute__((naked)) +static void __naked v4wb_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-v4wt.c b/arch/arm/mm/copypage-v4wt.c index 0f1188efae45..300efafd6643 100644 --- a/arch/arm/mm/copypage-v4wt.c +++ b/arch/arm/mm/copypage-v4wt.c @@ -20,7 +20,7 @@ * dirty data in the cache. However, we do have to ensure that * subsequent reads are up to date. */ -static void __attribute__((naked)) +static void __naked v4wt_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-xsc3.c b/arch/arm/mm/copypage-xsc3.c index 39a994542cad..bc4525f5ab23 100644 --- a/arch/arm/mm/copypage-xsc3.c +++ b/arch/arm/mm/copypage-xsc3.c @@ -29,7 +29,7 @@ * if we eventually end up using our copied page. * */ -static void __attribute__((naked)) +static void __naked xsc3_mc_copy_user_page(void *kto, const void *kfrom) { asm("\ diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index d18f2397ee2d..76824d3e966a 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void __naked mc_copy_user_page(void *from, void *to) { /* diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1514d534deeb..a3ed7cb8ca34 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -52,7 +52,15 @@ #define __deprecated __attribute__((deprecated)) #define __packed __attribute__((packed)) #define __weak __attribute__((weak)) -#define __naked __attribute__((naked)) + +/* + * it doesn't make sense on ARM (currently the only user of __naked) to trace + * naked functions because then mcount is called without stack and frame pointer + * being set up and there is no chance to restore the lr register to the value + * before mcount was called. + */ +#define __naked __attribute__((naked)) notrace + #define __noreturn __attribute__((noreturn)) /* -- cgit v1.2.3 From 5d82720a7f41f0c877e026c7d17e3bf20ccdbae0 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 13 Mar 2009 21:16:13 +0100 Subject: ide: save the returned value of dma_map_sg dma_map_sg could return a value different to 'nents' argument of dma_map_sg so the ide stack needs to save it for the later usage (e.g. for_each_sg). The ide stack also needs to save the original sg_nents value for pci_unmap_sg. Signed-off-by: FUJITA Tomonori [bart: backport to Linus' tree] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-dma.c | 12 +++++++++--- include/linux/ide.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 72ebab0bc755..059c90bb5ad2 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -128,6 +128,7 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) { ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; + int i; ide_map_sg(drive, rq); @@ -136,8 +137,13 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) else hwif->sg_dma_direction = DMA_TO_DEVICE; - return dma_map_sg(hwif->dev, sg, hwif->sg_nents, - hwif->sg_dma_direction); + i = dma_map_sg(hwif->dev, sg, hwif->sg_nents, hwif->sg_dma_direction); + if (i) { + hwif->orig_sg_nents = hwif->sg_nents; + hwif->sg_nents = i; + } + + return i; } EXPORT_SYMBOL_GPL(ide_build_sglist); @@ -156,7 +162,7 @@ void ide_destroy_dmatable(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; - dma_unmap_sg(hwif->dev, hwif->sg_table, hwif->sg_nents, + dma_unmap_sg(hwif->dev, hwif->sg_table, hwif->orig_sg_nents, hwif->sg_dma_direction); } EXPORT_SYMBOL_GPL(ide_destroy_dmatable); diff --git a/include/linux/ide.h b/include/linux/ide.h index e0cedfe9fad4..25087aead657 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -797,6 +797,7 @@ typedef struct hwif_s { struct scatterlist *sg_table; int sg_max_nents; /* Maximum number of entries in it */ int sg_nents; /* Current number of entries in it */ + int orig_sg_nents; int sg_dma_direction; /* dma transfer direction */ /* data phase of the active command (currently only valid for PIO/DMA) */ -- cgit v1.2.3 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- include/linux/bio.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b0144da11..fe2b1aa2464e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c4d582..cf747378b977 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 1b16108a5417..d8bd43bfdcf5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -531,7 +531,7 @@ extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); -extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); extern void bio_integrity_init_slab(void); @@ -542,7 +542,7 @@ extern void bio_integrity_init_slab(void); #define bioset_integrity_create(a, b) (0) #define bio_integrity_prep(a) (0) #define bio_integrity_enabled(a) (0) -#define bio_integrity_clone(a, b, c) (0) +#define bio_integrity_clone(a, b, c,d ) (0) #define bioset_integrity_free(a) do { } while (0) #define bio_integrity_free(a, b) do { } while (0) #define bio_integrity_endio(a, b) do { } while (0) -- cgit v1.2.3 From 76a67ec6fb79ff3570dcb5342142c16098299911 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 16 Mar 2009 18:34:20 -0400 Subject: nfsd: nfsd should drop CAP_MKNOD for non-root Since creating a device node is normally an operation requiring special privilege, Igor Zhbanov points out that it is surprising (to say the least) that a client can, for example, create a device node on a filesystem exported with root_squash. So, make sure CAP_MKNOD is among the capabilities dropped when an nfsd thread handles a request from a non-root user. Reported-by: Igor Zhbanov Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- include/linux/capability.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 1b9872556131..4864a43b2b45 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -393,8 +393,10 @@ struct cpu_vfs_cap_data { # define CAP_FULL_SET ((kernel_cap_t){{ ~0, ~0 }}) # define CAP_INIT_EFF_SET ((kernel_cap_t){{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0, CAP_FS_MASK_B1 } }) -# define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ - CAP_FS_MASK_B1 } }) +# define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ + | CAP_TO_MASK(CAP_SYS_RESOURCE) \ + | CAP_TO_MASK(CAP_MKNOD), \ + CAP_FS_MASK_B1 } }) #endif /* _KERNEL_CAPABILITY_U32S != 2 */ -- cgit v1.2.3 From e2fc4d19292ef2eb208f76976ddc3320cc5839b6 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Sat, 21 Mar 2009 13:31:23 -0700 Subject: dca: add missing copyright/license headers In two dca files copyright and license headers are missing. This patch adds them there. Signed-off-by: Maciej Sosnowski Signed-off-by: David S. Miller --- drivers/dca/dca-sysfs.c | 21 +++++++++++++++++++++ include/linux/dca.h | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/drivers/dca/dca-sysfs.c b/drivers/dca/dca-sysfs.c index bb538b9690e0..ee916c9857ee 100644 --- a/drivers/dca/dca-sysfs.c +++ b/drivers/dca/dca-sysfs.c @@ -1,3 +1,24 @@ +/* + * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + #include #include #include diff --git a/include/linux/dca.h b/include/linux/dca.h index b00a753eda53..9c20c7e87d0a 100644 --- a/include/linux/dca.h +++ b/include/linux/dca.h @@ -1,3 +1,23 @@ +/* + * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ #ifndef DCA_H #define DCA_H /* DCA Provider API */ -- cgit v1.2.3