From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used to stop people accidentally using these vars directly. Now that we have sparse, we can use that instead (next patch). tj: * Updated to convert stuff which was missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- include/linux/percpu-defs.h | 18 ++++++------------ include/linux/percpu.h | 5 ++--- include/linux/vmstat.h | 8 ++++---- 3 files changed, 12 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5a5d6ce4bd55..ee99f6c2cdcd 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -1,12 +1,6 @@ #ifndef _LINUX_PERCPU_DEFS_H #define _LINUX_PERCPU_DEFS_H -/* - * Determine the real variable name from the name visible in the - * kernel sources. - */ -#define per_cpu_var(var) per_cpu__##var - /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the @@ -56,24 +50,24 @@ */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ - extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \ - __typeof__(type) per_cpu__##name + __typeof__(type) name #else /* * Normal declaration and definition macros. */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ - extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \ - __typeof__(type) per_cpu__##name + __typeof__(type) name #endif /* @@ -137,8 +131,8 @@ /* * Intermodule exports for per-CPU variables. */ -#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) -#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var) #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 522f421ec213..e12410e55e05 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -182,7 +182,7 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #ifndef percpu_read # define percpu_read(var) \ ({ \ - typeof(per_cpu_var(var)) __tmp_var__; \ + typeof(var) __tmp_var__; \ __tmp_var__ = get_cpu_var(var); \ put_cpu_var(var); \ __tmp_var__; \ @@ -253,8 +253,7 @@ do { \ /* * Optimized manipulation for memory allocated through the per cpu - * allocator or for addresses of per cpu variables (can be determined - * using per_cpu_var(xx). + * allocator or for addresses of per cpu variables. * * These operations guarantee exclusivity of access for other operations * on the *same* processor. 
The assumption is that per cpu data is only accessed by a single processor instance (the current one). diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d85889710f9b..3e489fda11a1 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -76,22 +76,22 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { - __this_cpu_inc(per_cpu_var(vm_event_states).event[item]); + __this_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { - this_cpu_inc(per_cpu_var(vm_event_states).event[item]); + this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { - __this_cpu_add(per_cpu_var(vm_event_states).event[item], delta); + __this_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { - this_cpu_add(per_cpu_var(vm_event_states).event[item], delta); + this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); -- cgit v1.2.3 From f7b64fe806029e0a0454df132eec3c5ab576102c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: make access macros universal Now that the per_cpu__ prefix is gone, there's no distinction between static and dynamic percpu variables. Make get_cpu_var() take dynamic percpu variables, and ensure that all macros have parentheses around the parameter evaluation and evaluate the variable parameter only once, so that any expression which evaluates to a percpu address can be used safely. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e12410e55e05..f965f833a643 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -27,10 +27,13 @@ * we force a syntax error here if it isn't. */ #define get_cpu_var(var) (*({ \ - extern int simple_identifier_##var(void); \ preempt_disable(); \ &__get_cpu_var(var); })) -#define put_cpu_var(var) preempt_enable() + +#define put_cpu_var(var) do { \ + (void)(var); \ + preempt_enable(); \ +} while (0) #ifdef CONFIG_SMP @@ -182,17 +185,19 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #ifndef percpu_read # define percpu_read(var) \ ({ \ - typeof(var) __tmp_var__; \ - __tmp_var__ = get_cpu_var(var); \ - put_cpu_var(var); \ - __tmp_var__; \ + typeof(var) *pr_ptr__ = &(var); \ + typeof(var) pr_ret__; \ + pr_ret__ = get_cpu_var(*pr_ptr__); \ + put_cpu_var(*pr_ptr__); \ + pr_ret__; \ }) #endif #define __percpu_generic_to_op(var, val, op) \ do { \ - get_cpu_var(var) op val; \ - put_cpu_var(var); \ + typeof(var) *pgto_ptr__ = &(var); \ + get_cpu_var(*pgto_ptr__) op val; \ + put_cpu_var(*pgto_ptr__); \ } while (0) #ifndef percpu_write @@ -304,7 +309,7 @@ do { \ #define _this_cpu_generic_to_op(pcp, val, op) \ do { \ preempt_disable(); \ - *__this_cpu_ptr(&pcp) op val; \ + *__this_cpu_ptr(&(pcp)) op val; \ preempt_enable(); \ } while (0) -- cgit v1.2.3 From e0fdb0e050eae331046385643618f12452aa7e73 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: add __percpu for sparse. We have to make __kernel "__attribute__((address_space(0)))" so we can cast to it. tj: * put_cpu_var() update. * Annotations added to dynamic allocator interface. 
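As an illustration (an editorial sketch, not part of the original patch; the variable names are made up), this is the kind of misuse the annotation lets sparse catch:

	unsigned long __percpu *ctr = alloc_percpu(unsigned long);

	*ctr = 0;			/* sparse warns: dereference of noderef expression */
	*per_cpu_ptr(ctr, cpu) = 0;	/* ok: the accessor casts the address space away */
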
Signed-off-by: Rusty Russell Cc: Al Viro Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 4 +++- include/linux/compiler.h | 4 +++- include/linux/percpu-defs.h | 2 +- include/linux/percpu.h | 18 +++++++++++------- 4 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index ca6f0491412b..fded453fd25c 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -41,7 +41,9 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; * Only S390 provides its own means of moving the pointer. */ #ifndef SHIFT_PERCPU_PTR -#define SHIFT_PERCPU_PTR(__p, __offset) RELOC_HIDE((__p), (__offset)) +/* Weird cast keeps both GCC and sparse happy. */ +#define SHIFT_PERCPU_PTR(__p, __offset) \ + RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) #endif /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 04fb5135b4e1..abba8045c6ef 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -5,7 +5,7 @@ #ifdef __CHECKER__ # define __user __attribute__((noderef, address_space(1))) -# define __kernel /* default address space */ +# define __kernel __attribute__((address_space(0))) # define __safe __attribute__((safe)) # define __force __attribute__((force)) # define __nocast __attribute__((nocast)) @@ -15,6 +15,7 @@ # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) +# define __percpu __attribute__((noderef, address_space(3))) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else @@ -32,6 +33,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __acquire(x) (void)0 # define __release(x) (void)0 # define __cond_lock(x,c) (c) +# define __percpu #endif #ifdef __KERNEL__ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ee99f6c2cdcd..0fa0cb524250 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -12,7 +12,7 @@ * that section. */ #define __PCPU_ATTRS(sec) \ - __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ PER_CPU_ATTRIBUTES #define __PCPU_DUMMY_ATTRS \ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f965f833a643..2c0d31a3f6b6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -30,8 +30,12 @@ preempt_disable(); \ &__get_cpu_var(var); })) +/* + * The weird & is necessary because sparse considers (void)(var) to be + * a direct dereference of percpu variable (var). 
+ */ #define put_cpu_var(var) do { \ - (void)(var); \ + (void)&(var); \ preempt_enable(); \ } while (0) @@ -130,9 +134,9 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) -extern void *__alloc_reserved_percpu(size_t size, size_t align); -extern void *__alloc_percpu(size_t size, size_t align); -extern void free_percpu(void *__pdata); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void __percpu *__pdata); #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void __init setup_per_cpu_areas(void); @@ -142,7 +146,7 @@ extern void __init setup_per_cpu_areas(void); #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size, size_t align) +static inline void __percpu *__alloc_percpu(size_t size, size_t align) { /* * Can't easily make larger alignment work with kmalloc. WARN @@ -153,7 +157,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) return kzalloc(size, GFP_KERNEL); } -static inline void free_percpu(void *p) +static inline void free_percpu(void __percpu *p) { kfree(p); } @@ -168,7 +172,7 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #endif /* CONFIG_SMP */ #define alloc_percpu(type) \ - (typeof(type) *)__alloc_percpu(sizeof(type), __alignof__(type)) + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) /* * Optional methods for optimized non-lvalue per-cpu variable access. -- cgit v1.2.3 From 545695fb41da117928ab946067a42d9e15fd009d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: make accessors check for percpu pointer in sparse The previous patch made sparse warn about percpu variables being used directly without going through percpu accessors. This patch implements the other half - checking whether a non-percpu variable is passed into percpu accessors. Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: Al Viro --- include/asm-generic/percpu.h | 6 ++++-- include/linux/percpu-defs.h | 20 ++++++++++++++++++-- include/linux/percpu.h | 2 ++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index fded453fd25c..04f91c2d3f7b 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -42,8 +42,10 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; */ #ifndef SHIFT_PERCPU_PTR /* Weird cast keeps both GCC and sparse happy. */ -#define SHIFT_PERCPU_PTR(__p, __offset) \ - RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) +#define SHIFT_PERCPU_PTR(__p, __offset) ({ \ + __verify_pcpu_ptr((__p)); \ + RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \ +}) #endif /* diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0fa0cb524250..1fa36eb54b6a 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -18,6 +18,16 @@ #define __PCPU_DUMMY_ATTRS \ __attribute__((section(".discard"), unused)) +/* + * Macro which verifies @ptr is a percpu pointer without evaluating + * @ptr. This is to be used in percpu accessors to verify that the + * input parameter is a percpu pointer. 
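+ * (Editorial note on the mechanism: it works by assigning
+ * (typeof(ptr))NULL to a void __percpu * local; sparse then complains
+ * about the assignment unless @ptr carries the __percpu address space,
+ * while the compiler optimizes the dead store away, so no code is
+ * generated at runtime.)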
+ */ +#define __verify_pcpu_ptr(ptr) do { \ + void __percpu *__vpp_verify = (typeof(ptr))NULL; \ + (void)__vpp_verify; \ +} while (0) + /* * s390 and alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external @@ -129,10 +139,16 @@ __aligned(PAGE_SIZE) /* - * Intermodule exports for per-CPU variables. + * Intermodule exports for per-CPU variables. sparse forgets about + * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to + * noop if __CHECKER__. */ +#ifndef __CHECKER__ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var) - +#else +#define EXPORT_PER_CPU_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) +#endif #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2c0d31a3f6b6..42878f0cd0e2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -237,6 +237,7 @@ extern void __bad_size_call_parameter(void); #define __pcpu_size_call_return(stem, variable) \ ({ typeof(variable) pscr_ret__; \ + __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable);break; \ case 2: pscr_ret__ = stem##2(variable);break; \ @@ -250,6 +251,7 @@ extern void __bad_size_call_parameter(void); #define __pcpu_size_call(stem, variable, ...) \ do { \ + __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: stem##1(variable, __VA_ARGS__);break; \ case 2: stem##2(variable, __VA_ARGS__);break; \ -- cgit v1.2.3 From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 20 Nov 2009 20:13:39 +0100 Subject: [LogFS] add new flash file system This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/logfs.txt | 241 ++++ fs/Kconfig | 1 + fs/Makefile | 1 + fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 ++ fs/logfs/dev_bdev.c | 263 ++++ fs/logfs/dev_mtd.c | 253 ++++ fs/logfs/dir.c | 818 +++++++++++++ fs/logfs/file.c | 263 ++++ fs/logfs/gc.c | 730 ++++++++++++ fs/logfs/inode.c | 417 +++++++ fs/logfs/journal.c | 879 ++++++++++++++ fs/logfs/logfs.h | 722 +++++++++++ fs/logfs/logfs_abi.h | 627 ++++++++++ fs/logfs/readwrite.c | 2246 +++++++++++++++++++++++++++++++++++ fs/logfs/segment.c | 924 ++++++++++++++ fs/logfs/super.c | 634 ++++++++++ include/linux/btree-128.h | 109 ++ include/linux/btree-type.h | 147 +++ include/linux/btree.h | 243 ++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/btree.c | 797 +++++++++++++ 25 files changed, 10446 insertions(+) create mode 100644 Documentation/filesystems/logfs.txt create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c create mode 100644 include/linux/btree-128.h create mode 100644 include/linux/btree-type.h create mode 100644 include/linux/btree.h create mode 100644 lib/btree.c (limited to 'include/linux') diff --git a/Documentation/filesystems/00-INDEX 
b/Documentation/filesystems/00-INDEX index f15621ee5599..d362aa543b27 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -62,6 +62,8 @@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. +logfs.txt + - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..e64c94ba401a --- /dev/null +++ b/Documentation/filesystems/logfs.txt @@ -0,0 +1,241 @@ + +The LogFS Flash Filesystem +========================== + +Specification +============= + +Superblocks +----------- + +Two superblocks exist at the beginning and end of the filesystem. +Each superblock is 256 Bytes large, with another 3840 Bytes reserved +for future purposes, making a total of 4096 Bytes. + +Superblock locations may differ for MTD and block devices. On MTD the +first non-bad block contains a superblock in the first 4096 Bytes and +the last non-bad block contains a superblock in the last 4096 Bytes. +On block devices, the first 4096 Bytes of the device contain the first +superblock and the last aligned 4096 Byte-block contains the second +superblock. + +For the most part, the superblocks can be considered read-only. They +are written only to correct errors detected within the superblocks, +move the journal and change the filesystem parameters through tunefs. +As a result, the superblock does not contain any fields that require +constant updates, like the amount of free space, etc. + +Segments +-------- + +The space in the device is split up into equal-sized segments. +Segments are the primary write unit of LogFS. Within each segment, +writes happen from front (low addresses) to back (high addresses). If +only a partial segment has been written, the segment number, the +current position within it and optionally a write buffer are stored in +the journal. + +Segments are erased as a whole. Therefore Garbage Collection may be +required to completely free a segment before it can be erased. + +Journal +-------- + +The journal contains all global information about the filesystem that +is subject to frequent change. At mount time, it has to be scanned +for the most recent commit entry, which contains a list of pointers to +all currently valid entries. + +Object Store +------------ + +All space except for the superblocks and journal is part of the object +store. Each segment contains a segment header and a number of +objects, each consisting of the object header and the payload. +Objects are either inodes, directory entries (dentries), file data +blocks or indirect blocks. + +Levels +------ + +Garbage collection (GC) may fail if all data is written +indiscriminately. One requirement of GC is that data is separated +roughly according to the distance between the tree root and the data. +Effectively that means all file data is on level 0, indirect blocks +are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, +respectively. Inode file data is on level 6 for the inodes and 7-11 +for indirect blocks. + +Each segment contains objects of a single level only. As a result, +each level requires its own separate segment to be open for writing. + +Inode File +---------- + +All inodes are stored in a special file, the inode file. The single +exception is the inode file's inode (the master inode), which for obvious +reasons is stored in the journal instead. Instead of data blocks, the +leaf nodes of the inode file are inodes. + +Aliases +------- + +Writes in LogFS are done by means of a wandering tree. A naïve +implementation would require that for each write of a block, all +parent blocks are written as well, since the block pointers have +changed. Such an implementation would not be very efficient. + +In LogFS, the block pointer changes are cached in the journal by means +of alias entries. Each alias consists of its logical address - inode +number, block index, level and child number (index into block) - and +the changed data. Any 8-byte word can be changed in this manner. + +Currently aliases are used for block pointers, file size, file used +bytes and the height of an inode's indirect tree.
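+
+As a made-up illustration of the idea (the exact on-disk encoding
+lives in logfs_abi.h and differs in detail): if child pointer 7 in
+the 1x indirect block of inode #42, block index 300, now points at a
+newly written block, the journal records roughly (ino=42, bix=300,
+level=1, child_no=7, val=<new block pointer>) instead of rewriting
+the indirect block and all of its parents.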
+
+Segment Aliases
+---------------
+
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tuple. GC will later empty this
+segment and the alias can be removed again. This is used on MTD only.
+
+Vim
+---
+
+By cleverly predicting the lifetime of data, it is possible to
+separate long-living data from short-living data and thereby reduce
+the GC overhead later. Each type of distinct life expectancy (vim) can
+have a separate segment open for writing. Each (level, vim) tuple can
+be open just once. If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+
+Indirect Tree
+-------------
+
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers. One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+
+Another difference is the addressing of indirect blocks. In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode. In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+
+Compression
+-----------
+
+Both file data and metadata can be compressed. Compression for file
+data can be enabled with chattr +c and disabled with chattr -c. Doing
+so has no effect on existing data, but new data will be stored
+accordingly. New inodes will inherit the compression flag of the
+parent directory.
+
+Metadata is always compressed. However, the space accounting ignores
+this and charges for the uncompressed size. Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously. Even on a 100% full medium, GC may
+not consume any extra space, so the compression gains are lost space
+to the user.
+
+However, they are not lost space to the filesystem internals. By
+cheating the user for those bytes, the filesystem gained some slack
+space and GC will run less often and faster.
+
+Garbage Collection and Wear Leveling
+------------------------------------
+
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold. The best (known) candidate is picked based
+on the least amount of valid data contained in the segment. All
+remaining valid data is copied elsewhere, thereby invalidating it.
+
+The GC code also checks for aliases and writes them back if their
+number gets too large.
+
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection. If a stale segment's erase count is significantly
+lower than the active segments' erase counts, it will be picked. Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+
+Values for "occasionally" and "significantly lower" are compile-time
+constants.
+
+Hashed directories
+------------------
+
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash. In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used. For each table, the hash value
+modulo the table size gives the table index.
+
+Table sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively. So the first
+table contains 16 entries, the second 512-16, etc.
+
+The last table is special in several ways. First its size depends on
+the effective 32bit limit on telldir/seekdir cookies. Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31. Secondly the table contains hash buckets with 16
+entries each.
+
+Using single-entry buckets would result in birthday "attacks". At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x
+collisions necessary to overflow a bucket, but testing showed that in
+10,000 runs the lowest directory fill before a bucket overflow was
+188,057,130 entries with an average of 315,149,915 entries. So for
+directory sizes of up to a million, bucket overflows should be
+virtually impossible under normal circumstances.
+
+With carefully chosen filenames, it is obviously possible to cause an
+overflow with just 21 entries (4 higher tables + 16 entries + 1). So
+there may be a security concern if a malicious user has write access
+to a directory.
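+
+As an illustrative walk-through of this scheme (the numbers are just
+worked arithmetic, not taken from the implementation): a name whose
+hash is 0x12345678 is first tried at index 0x12345678 % 16 = 8 in the
+direct area. If that slot already holds a different name, the next
+candidate is index 16 + 0x12345678 % (512 - 16) = 472 in the first
+indirect table, and so on through the larger tables.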
+
+Open For Discussion
+===================
+
+Device Address Space
+--------------------
+
+A device address space is used for caching. Both block devices and
+MTD provide functions to either read a single page or write a segment.
+Partial segments may be written for data integrity, but where possible
+complete segments are written for performance on simple block device
+flash media.
+
+Meta Inodes
+-----------
+
+Inodes are stored in the inode file, which is just a regular file for
+most purposes. At umount time, however, the inode file needs to
+remain open until all dirty inodes are written. So
+generic_shutdown_super() may not close this inode, but shouldn't
+complain about remaining inodes due to the inode file either. The same
+goes for the mapping inode of the device address space.
+
+Currently logfs uses a hack that essentially copies part of fs/inode.c
+code over. A general solution would be preferred.
+
+Indirect block mapping
+----------------------
+
+With compression, the block device (or mapping inode) cannot be used
+to cache indirect blocks. Some other place is required. Currently
+logfs uses the top half of each inode's address space. The low 8TB
+(on 32bit) are filled with file data, the high 8TB are used for
+indirect blocks. 
+ +One problem is that 16TB files created on 64bit systems actually have +data in the top 8TB. But files >16TB would cause problems anyway, so +only the limit has changed. diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a5..7405f071be67 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -177,6 +177,7 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index af6d04700d9c..c3633aa46911 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 000000000000..daf9a9b32dd3 --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 000000000000..4820027787ee --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 000000000000..44bbfd249abc --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = 
zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 000000000000..58a057b6e1af --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + generic_unplug_device(bdev_get_queue(bdev)); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + 
bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = to >> PAGE_SHIFT; + int i, nr_pages = len >> PAGE_SHIFT; + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + for (i = 0; i < nr_pages; i++) { + page = find_get_page(mapping, index + i); + if (page) { + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + } + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. 
*/ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct super_block *sb) +{ + close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + struct block_device *bdev; + + bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + } + + return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 000000000000..68e99d046c23 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,253 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an exercise in futility! 
+ */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct 
logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 000000000000..89104e6f81c4 --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,818 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in separate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a user's point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a user's point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a user's point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode and a dentry. 
If we get interrupted +between steps 1 and 2, we delete both the dentry and the inode. If +we get interrupted between steps 2 and 3, we delete just the inode. +In either case, the remaining objects are deleted on next mount. From +a user's point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x8000_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * an error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + switch (round) { + case 0: + return hash % I0_BLOCKS; + case 1: + return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS); + case 2: + return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS); + case 3: + return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS); + case 4 ... 
19: + return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) + return -ENOENT; + if (IS_ERR(page)) + return PTR_ERR(page); + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has its own dir_walk code. 
I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory is full. 
But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. + */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, NULL); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + atomic_inc(&inode->i_count); + 
inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. 
locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. */ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .permission = logfs_permission, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 000000000000..370f367a933e --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, 
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+		return 0;
+	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned end = start + len;
+
+		/* Reading beyond i_size is simple: memset to zero */
+		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		return 0;
+	}
+	return logfs_readpage_nolock(page);
+}
+
+static int logfs_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied, struct page *page,
+		void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = page->index;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned end = start + copied;
+	int ret = 0;
+
+	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(page->index > I3_BLOCKS);
+
+	if (copied < len) {
+		/*
+		 * Short write of a non-initialized page. Just tell userspace
+		 * to retry the entire page.
+		 */
+		if (!PageUptodate(page)) {
+			copied = 0;
+			goto out;
+		}
+	}
+	if (copied == 0)
+		goto out; /* FIXME: do we need to update inode? */
+
+	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+		mark_inode_dirty_sync(inode);
+	}
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		if (!get_page_reserve(inode, page))
+			__set_page_dirty_nobuffers(page);
+		else
+			ret = logfs_write_buf(inode, page, WF_LOCK);
+	}
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret ? ret : copied;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	ret = logfs_readpage_nolock(page);
+	unlock_page(page);
+	return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);
+	end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	err = logfs_write_buf(inode, page, WF_LOCK);
+	if (err)
+		set_page_dirty(page);
+	else
+		clear_radix_tree_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	u64 bix;
+	level_t level;
+
+	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level != 0)
+		return __logfs_writepage(page);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page);
+
+	/* Is the page fully outside i_size?
(truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size. It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped. "A file is mapped
+	 * in multiples of the page size. For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __logfs_writepage(page);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned long offset)
+{
+	move_page_to_btree(page);
+	BUG_ON(PagePrivate(page) || page->private);
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+	return 0; /* None of these are easy to release */
+}
+
+
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned int oldflags, flags;
+	int err;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS:
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = get_user(flags, (int __user *)arg);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = li->li_flags;
+		flags &= LOGFS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+		li->li_flags = flags;
+		mutex_unlock(&inode->i_mutex);
+
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+		return 0;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	/* FIXME: write anchor */
+	super->s_devops->sync(sb);
+	return 0;
+}
+
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = 0;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		err = logfs_truncate(inode, attr->ia_size);
+	attr->ia_valid &= ~ATTR_SIZE;
+
+	if (!err)
+		err = inode_change_ok(inode, attr);
+	if (!err)
+		err = inode_setattr(inode, attr);
+	return err;
+}
+
+const struct inode_operations logfs_reg_iops = {
+	.setattr	= logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.llseek		= generic_file_llseek,
+	.mmap		= generic_file_readonly_mmap,
+	.open		= generic_file_open,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+};
+
+const struct address_space_operations logfs_reg_aops = {
+	.invalidatepage	= logfs_invalidatepage,
+	.readpage	= logfs_readpage,
+	.releasepage	= logfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.writepage	= logfs_writepage,
+	.writepages	= generic_writepages,
+	.write_begin	= logfs_write_begin,
+	.write_end	= logfs_write_end,
+};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..b3656c44190e
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
+/*
+ * fs/logfs/gc.c - garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel
+ */
+#include "logfs.h"
+#include
+
+/*
+ * Wear leveling needs to kick in when the difference between low erase
+ * counts and high
erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. 
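
The se.ec_level field read in logfs_valid_bytes() below packs two values into one 32-bit word: the low four bits hold the GC level, the remaining bits the erase count. A host-order sketch of the encoding (helper names invented; on the medium the field is stored big-endian):

static unsigned int pack_ec_level(unsigned int ec, unsigned int level)
{
	return (ec << 4) | (level & 0xf);	/* level in bits 0-3, ec above */
}

static void unpack_ec_level(unsigned int ec_level, unsigned int *ec,
		unsigned int *level)
{
	*ec = ec_level >> 4;
	*level = ec_level & 0xf;
}
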
+ */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the 
best segment for garbage collection. Main criterion is
+ * the segment requiring the least effort to clean. Secondary
+ * criterion is to GC on the lowest level available.
+ *
+ * So we search the least effort segment on the lowest level first,
+ * then move up and pick another segment iff it requires significantly
+ * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+
+	for (i = max_dist; i >= 0; i--) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
+			cand = this;
+	}
+	return cand;
+}
+
+static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	gc_level_t gc_level;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	if (!cand) {
+		log_gc("GC attempted, but no candidate found\n");
+		return 0;
+	}
+
+	segno = cand->segno;
+	dist = cand->dist;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	free_candidate(sb, cand);
+	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
+			segno, (u64)segno << super->s_segshift,
+			dist, no_free_segments(sb), valid,
+			super->s_free_bytes);
+	cleaned = logfs_gc_segment(sb, segno, dist);
+	log_gc("GC segment #%02x complete - now %x valid\n", segno,
+			valid - cleaned);
+	BUG_ON(cleaned != valid);
+	return 1;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct gc_candidate *cand;
+
+	cand = get_candidate(sb);
+	if (cand)
+		remove_from_list(cand);
+	return __logfs_gc_once(sb, cand);
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+			/* Break out of the loop. We want to read a single
+			 * block from the segment size on next invocation if
+			 * SCAN_RATIO is set to match block size
+			 */
+			break;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data. LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed. An actual endless loop is an obvious bug
+ * and should be reported as such.
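
Returning to get_candidate() above: a segment from another list only displaces the current candidate when it is cheaper to clean by at least one maximum-sized object. A standalone sketch of the comparison, taking LOGFS_MAX_OBJECTSIZE to be 8 KiB purely for illustration:

/* A new candidate wins only if it has max_objectsize fewer valid bytes:
 * with max_objectsize = 8192, 20000 valid bytes beats 30000
 * (20000 + 8192 <= 30000), but 25000 does not (25000 + 8192 > 30000). */
static int should_replace(unsigned int new_valid, unsigned int cur_valid,
		unsigned int max_objectsize)
{
	return new_valid + max_objectsize <= cur_valid;
}
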
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int round, progress, last_progress = 0;
+
+	if (no_free_segments(sb) >= target &&
+			super->s_no_object_aliases < MAX_OBJ_ALIASES)
+		return;
+
+	log_gc("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+
+		/* Sync in-memory state with on-medium state in case they
+		 * diverged */
+		logfs_write_anchor(super->s_master_inode);
+		round += logfs_scan_some(sb);
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+		continue;
+
+		/*
+		 * The goto logic is nasty, I just don't know a better way to
+		 * code it. GC is supposed to ensure two things:
+		 * 1. Enough free segments are available.
+		 * 2. The number of aliases is bounded.
+		 * When 1. is achieved, we take a look at 2. and write back
+		 * some alias-containing blocks, if necessary. However, after
+		 * each such write we need to go back to 1., as writes can
+		 * consume free segments.
+		 */
+write_alias:
+		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
+			return;
+		if (list_empty(&super->s_object_alias)) {
+			/* All aliases are still in btree */
+			return;
+		}
+		log_gc("Write back one alias\n");
+		block = list_entry(super->s_object_alias.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+		/*
+		 * To round off the nasty goto logic, we reset round here. It
+		 * is a safety-net for GC not making any progress and limited
+		 * to something reasonably small. If we incremented it for
+		 * every single alias, the loop could terminate rather quickly.
+		 */
+		round = 0;
+	}
+	LOGFS_BUG(sb);
+}
+
+static int wl_ratelimit(struct super_block *sb, u64 *next_event)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (*next_event < super->s_gec) {
+		*next_event = super->s_gec + WL_RATELIMIT;
+		return 0;
+	}
+	return 1;
+}
+
+static void logfs_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *wl_cand, *free_cand;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
+		return;
+
+	wl_cand = first_in_list(&super->s_ec_list);
+	if (!wl_cand)
+		return;
+	free_cand = first_in_list(&super->s_free_list);
+	if (!free_cand)
+		return;
+
+	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
+		remove_from_list(wl_cand);
+		__logfs_gc_once(sb, wl_cand);
+	}
+}
+
+/*
+ * The journal needs wear leveling as well. But moving the journal is an
+ * expensive operation so we try to avoid it as much as possible. And if we
+ * have to do it, we move the whole journal, not individual segments.
+ *
+ * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
+ * calculations. First we check whether moving the journal would be a
+ * significant improvement. That means that a) the current journal segments
+ * have more wear than the future journal segments and b) the current journal
+ * segments have more wear than normal ostore segments.
+ * Rationale for b) is that we don't have to move the journal if it is aging
+ * less than the ostore, even if the reserve segments age even less (they are
+ * excluded from wear leveling, after all).
+ * Next we check that the superblocks have less wear than the journal. Since
+ * moving the journal requires writing the superblocks, we have to protect the
+ * superblocks even more than the journal.
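
The resulting check, implemented in logfs_journal_wl_pass() below, compares the least-worn journal segment against the most-worn reserve or superblock segment. A standalone sketch with invented erase counts:

/* With WL_DELTA = 397: min_journal_ec = 1000, max_reserve_ec = 150
 * gives 1000 > 150 + 2 * 397 = 944, so the journal would be moved. */
static int journal_needs_move(unsigned int min_journal_ec,
		unsigned int max_reserve_ec, unsigned int wl_delta)
{
	return min_journal_ec > max_reserve_ec + 2 * wl_delta;
}
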
+ * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(super->s_master_inode); + __logfs_gc_pass(sb, logfs_super(sb)->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + struct logfs_object_header oh; + u32 segno = area->a_segno; + u32 ofs = area->a_used_bytes; + __be32 crc; + int err; + + if (!area->a_is_open) + return 0; + + for (ofs = area->a_used_bytes; + ofs <= super->s_segsize - sizeof(oh); + ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { + err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); + if (err) + return err; + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); + if (crc != oh.crc) { + printk(KERN_INFO "interrupted header at %llx\n", + dev_ofs(sb, segno, ofs)); + return 0; + } + } + if (ofs != area->a_used_bytes) { + printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", + ofs - area->a_used_bytes, + dev_ofs(sb, segno, area->a_used_bytes)); + area->a_used_bytes = ofs; + } + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 
1);
+	return 0;
+}
+
+static void logfs_cleanup_list(struct super_block *sb,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	while (list->count) {
+		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
+				rb_node);
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+	BUG_ON(list->rb_tree.rb_node);
+}
+
+void logfs_cleanup_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	if (!super->s_free_list.count)
+		return;
+
+	/*
+	 * FIXME: The btree may still contain a single empty node. So we
+	 * call the grim visitor to clean up that mess. Btree code should
+	 * do it for us, really.
+	 */
+	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
+	logfs_cleanup_list(sb, &super->s_free_list);
+	logfs_cleanup_list(sb, &super->s_reserve_list);
+	for_each_area(i)
+		logfs_cleanup_list(sb, &super->s_low_list[i]);
+	logfs_cleanup_list(sb, &super->s_ec_list);
+}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..6d08b3762641
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
+/*
+ * fs/logfs/inode.c - inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel
+ */
+#include "logfs.h"
+#include
+#include
+
+/*
+ * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
+ * on the medium. It therefore also lacks a method to store the previous
+ * generation number for deleted inodes. Instead a single generation number
+ * is stored which will be used for new inodes. Being just a 32bit counter,
+ * this can obviously wrap relatively quickly. So we only reuse inodes if we
+ * know that a fair number of inodes can be created before we have to increment
+ * the generation again - effectively adding some bits to the counter.
+ * But being too aggressive here means we keep a very large and very sparse
+ * inode file, wasting space on indirect blocks.
+ * So what is a good value? Beats me. 64k seems moderately bad on both
+ * fronts, so let's use that for now...
+ *
+ * NFS sucks, as everyone already knows.
+ */
+#define INOS_PER_WRAP (0x10000)
+
+/*
+ * Logfs' requirement to read inodes for garbage collection makes life a bit
+ * harder. GC may have to read inodes that are in I_FREEING state, when they
+ * are being written out - and waiting for GC to make progress, naturally.
+ *
+ * So we cannot just call iget() or some variant of it, but first have to check
+ * whether the inode in question might be in I_FREEING state. Therefore we
+ * maintain our own per-sb list of "almost deleted" inodes and check against
+ * that list first. Normally this should be at most 1-2 entries long.
+ *
+ * Also, inodes have logfs-specific reference counting on top of what the vfs
+ * does. When .destroy_inode is called, normally the reference count will drop
+ * to zero and the inode gets deleted. But if GC accessed the inode, its
+ * refcount will remain nonzero and final deletion will have to wait.
+ * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_iput to do the right thing later
+ */
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+	if (ino == LOGFS_INO_SEGFILE)
+		return super->s_segfile_inode;
+
+	spin_lock(&logfs_inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+		if (li->vfs_inode.i_ino == ino) {
+			li->li_refcount++;
+			spin_unlock(&logfs_inode_lock);
+			*is_cached = 1;
+			return &li->vfs_inode;
+		}
+	spin_unlock(&logfs_inode_lock);
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+static void __logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(li->li_block);
+	list_del(&li->li_freeing_list);
+	kmem_cache_free(logfs_inode_cache, li);
+}
+
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&logfs_inode_lock);
+	li->li_refcount--;
+	if (li->li_refcount == 0)
+		__logfs_destroy_inode(inode);
+	spin_unlock(&logfs_inode_lock);
+}
+
+void logfs_safe_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		return;
+	if (inode->i_ino == LOGFS_INO_SEGFILE)
+		return;
+
+	if (is_cached) {
+		logfs_destroy_inode(inode);
+		return;
+	}
+
+	iput(inode);
+}
+
+static void logfs_init_inode(struct super_block *sb, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	li->li_flags = 0;
+	li->li_height = 0;
+	li->li_used_bytes = 0;
+	li->li_block = NULL;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_size = 0;
+	inode->i_blocks = 0;
+	inode->i_ctime = CURRENT_TIME;
+	inode->i_mtime = CURRENT_TIME;
+	inode->i_nlink = 1;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li;
+
+	li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
+	if (!li)
+		return NULL;
+	logfs_init_inode(sb, &li->vfs_inode);
+	return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file. The inode file, like any
+ * other file, is managed with an inode. The inode file's inode, aka master
+ * inode, requires special handling in several respects. First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written. The ordering is important. Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty. Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed. Sadly, this method has another side-effect. The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or whether other inodes have leaked as well.
+ *
+ * Our attempt at solving this is with logfs_new_meta_inode() below. Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use. An ugly hack, no doubt. Suggestions for
+ * improvement are welcome.
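
For reference, the GC-safe accessors defined above are always used in pairs, threading the is_cached cookie through, just as logfs_cleanse_block() earlier in this patch does. A minimal usage sketch (error handling omitted; assumes the logfs context above):

static void gc_touch_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;
	int cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	/* ... rewrite blocks belonging to this inode ... */
	logfs_safe_iput(inode, cookie);
}
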
+ */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = logfs_alloc_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_sb = sb; + + /* This is a blatant copy of alloc_inode code. We'd need alloc_inode + * to be nonstatic, alas. */ + { + struct address_space * const mapping = &inode->i_data; + + mapping->a_ops = &logfs_reg_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + inode->i_mapping = mapping; + inode->i_nlink = 1; + } + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + destroy_meta_inode(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, int do_sync) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +void destroy_meta_inode(struct inode *inode) +{ + if (inode) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + logfs_clear_inode(inode); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); + } +} + +/* called with inode_lock held */ +static void logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + /* FIXME: write 
anchor */ + logfs_super(sb)->s_devops->sync(sb); + return 0; +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .clear_inode = logfs_clear_inode, + .delete_inode = logfs_delete_inode, + .destroy_inode = logfs_destroy_inode, + .drop_inode = logfs_drop_inode, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 000000000000..7a023dbba9f8 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,879 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. 
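
For the whole of logfs_calc_free(), including the reserve handling just below, the arithmetic reduces to the following standalone sketch (invented example numbers, not real device values):

#include <stdio.h>

int main(void)
{
	long long no_segs = 1024, journal_segs = 4, total_levels = 7;
	long long segsize = 128 * 1024, seg_reserve = 4096;
	long long used = 10 * 1024 * 1024, bad = 0, bad_reserve = 8;
	long long speed_reserve = 1024 * 1024, reserve, free_bytes;

	no_segs -= 2 + journal_segs + 2 * total_levels; /* sb, journal, open+GC */
	free_bytes = no_segs * (segsize - seg_reserve) - used;
	free_bytes -= total_levels * 4096;		/* "just a bit extra" */
	reserve = (bad + bad_reserve) * (segsize - seg_reserve);
	if (reserve < speed_reserve)
		reserve = speed_reserve;
	free_bytes -= reserve;
	if (free_bytes < 0)
		free_bytes = 0;
	printf("free bytes: %lld\n", free_bytes);
	return 0;
}
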
+ */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + logfs_buf_recover(area, ofs, NULL, 0); + return 0; +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; 
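
The three sanity checks a few lines below, condensed into one predicate (a sketch; the parameters mirror the locals and constants used there):

static int je_header_valid(unsigned int type, unsigned int len,
		unsigned int datalen, unsigned int blocksize,
		unsigned int bufsize, unsigned int je_first,
		unsigned int je_last)
{
	if (len > blocksize)			/* stored payload too large */
		return 0;
	if (type < je_first || type > je_last)	/* unknown entry type */
		return 0;
	if (datalen > bufsize)			/* would overflow scratch buffer */
		return 0;
	return 1;
}
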
+ u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 
0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_JOURNAL; + sh.level = 0; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + /* This causes a bug in segment.c. Not yet. 
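
Stepping back to mount-time replay: logfs_read_journal() above simply picks the journal segment whose header carries the highest global erase count (gec), since that segment was written most recently. As a standalone sketch:

/* Returns the index of the most recently written segment, or -1 if
 * none was readable - matching the -EIO path above. */
static int newest_journal_segment(const unsigned long long *gec, int n)
{
	unsigned long long max = 0;
	int i, best = -1;

	for (i = 0; i < n; i++) {
		if (gec[i] > max) {
			max = gec[i];
			best = i;
		}
	}
	return best;
}
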
*/ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = ALIGN(sizeof(sh), 16); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_version = cpu_to_be16(++super->s_last_version); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'A'; + jh->h_pad[2] = 'T'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. 
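
The pool movements in account_shadow() above, condensed: each shadow's new_len is consumed from free space and its old_len handed back, with the matching dirty counter decremented. A standalone sketch (the fields mirror the s_free_bytes, s_used_bytes and dirty counters):

struct space_pools {
	long long free, used, dirty_used, dirty_free;
};

static void account_shadow_sketch(struct space_pools *p,
		long long old_len, long long new_len)
{
	/* consume new space */
	p->free -= new_len;
	p->used += new_len;
	p->dirty_used -= new_len;
	/* free up old space */
	p->free += old_len;
	p->used -= old_len;
	p->dirty_free -= old_len;
}
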
+ */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + BUG_ON(len > sb->s_blocksize); + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, 
+ int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. 
+ */
+void logfs_write_anchor(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	int i, err;
+
+	BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	mutex_lock(&super->s_journal_mutex);
+
+	/* Do this first or suffer corruption */
+	logfs_sync_segments(sb);
+	account_shadows(sb);
+
+again:
+	super->s_no_je = 0;
+	for_each_area(i) {
+		if (!super->s_area[i]->a_is_open)
+			continue;
+		super->s_sum_index = i;
+		err = logfs_write_je(sb, logfs_write_area);
+		if (err)
+			goto again;
+	}
+	err = logfs_write_obj_aliases(sb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_erasecount);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, __logfs_write_anchor);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_dynsb);
+	if (err)
+		goto again;
+	/*
+	 * Order is imperative.  First we sync all writes, including the
+	 * non-committed journal writes.  Then we write the final commit and
+	 * sync the current journal segment.
+	 * There is a theoretical bug here.  Syncing the journal segment will
+	 * write a number of journal entries and the final commit.  All these
+	 * are written in a single operation.  If the device layer writes the
+	 * data back-to-front, the commit will precede the other journal
+	 * entries, leaving a race window.
+	 * Two fixes are possible.  Preferred is to fix the device layer to
+	 * ensure writes happen front-to-back.  Alternatively we can insert
+	 * another logfs_sync_area()/super->s_devops->sync() combo before
+	 * writing the commit.
+	 */
+	/*
+	 * On another subject, super->s_devops->sync is usually not necessary.
+	 * Unless called from sys_sync or friends, a barrier would suffice.
+	 */
+	super->s_devops->sync(sb);
+	err = logfs_write_je(sb, logfs_write_commit);
+	if (err)
+		goto again;
+	log_journal("Write commit to %llx\n",
+			be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
+	logfs_sync_area(area);
+	BUG_ON(area->a_used_bytes != area->a_written_bytes);
+	super->s_devops->sync(sb);
+
+	mutex_unlock(&super->s_journal_mutex);
+	return;
+}
+
+void do_logfs_journal_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	u32 segno, ec;
+	int i, err;
+
+	log_journal("Journal requires wear-leveling.\n");
+	/* Drop old segments */
+	journal_for_each(i)
+		if (super->s_journal_seg[i]) {
+			logfs_set_segment_unreserved(sb,
+					super->s_journal_seg[i],
+					super->s_journal_ec[i]);
+			super->s_journal_seg[i] = 0;
+			super->s_journal_ec[i] = 0;
+		}
+	/* Get new segments */
+	for (i = 0; i < super->s_no_journal_segs; i++) {
+		segno = get_best_cand(sb, &super->s_reserve_list, &ec);
+		super->s_journal_seg[i] = segno;
+		super->s_journal_ec[i] = ec;
+		logfs_set_segment_reserved(sb, segno);
+	}
+	/* Manually move journal_area */
+	area->a_segno = super->s_journal_seg[0];
+	area->a_is_open = 0;
+	area->a_used_bytes = 0;
+	/* Write journal */
+	logfs_write_anchor(super->s_master_inode);
+	/* Write superblocks */
+	err = logfs_write_sb(sb);
+	BUG_ON(err);
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+	.get_free_segment	= journal_get_free_segment,
+	.get_erase_count	= journal_get_erase_count,
+	.erase_segment		= journal_erase_segment,
+};
+
+int logfs_init_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+		+ MAX_JOURNAL_HEADER;
+	int ret = -ENOMEM;
+
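+	/*
+	 * Note on the sizing above: each scratch buffer must hold the
+	 * largest possible journal entry, i.e. one block (or one
+	 * write-size worth of wbuf data, whichever is larger) plus the
+	 * maximal journal entry header.
+	 */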
+	mutex_init(&super->s_journal_mutex);
+	btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
+
+	super->s_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_je)
+		return ret;
+
+	super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_compressed_je)
+		return ret;
+
+	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+	if (IS_ERR(super->s_master_inode))
+		return PTR_ERR(super->s_master_inode);
+
+	ret = logfs_read_journal(sb);
+	if (ret)
+		return -EIO;
+
+	reserve_sb_and_journal(sb);
+	logfs_calc_free(sb);
+
+	super->s_journal_area->a_ops = &journal_area_ops;
+	return 0;
+}
+
+void logfs_cleanup_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
+	destroy_meta_inode(super->s_master_inode);
+	super->s_master_inode = NULL;
+
+	kfree(super->s_compressed_je);
+	kfree(super->s_je);
+}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..e3082abe9e3b
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,722 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_H
+#define FS_LOGFS_LOGFS_H
+
+#undef __CHECK_ENDIAN__
+#define __CHECK_ENDIAN__
+
+#include <linux/btree.h>
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define LOGFS_DEBUG_SUPER	(0x0001)
+#define LOGFS_DEBUG_SEGMENT	(0x0002)
+#define LOGFS_DEBUG_JOURNAL	(0x0004)
+#define LOGFS_DEBUG_DIR		(0x0008)
+#define LOGFS_DEBUG_FILE	(0x0010)
+#define LOGFS_DEBUG_INODE	(0x0020)
+#define LOGFS_DEBUG_READWRITE	(0x0040)
+#define LOGFS_DEBUG_GC		(0x0080)
+#define LOGFS_DEBUG_GC_NOISY	(0x0100)
+#define LOGFS_DEBUG_ALIASES	(0x0200)
+#define LOGFS_DEBUG_BLOCKMOVE	(0x0400)
+#define LOGFS_DEBUG_ALL		(0xffffffff)
+
+/*
+ * To enable specific log messages, simply define LOGFS_DEBUG to match any
+ * or all of the above.  The fallback below keeps all messages disabled
+ * otherwise.
+ */
+/* #define LOGFS_DEBUG	(0x01) */
+#ifndef LOGFS_DEBUG
+#define LOGFS_DEBUG	(0)
+#endif
+
+#define log_cond(cond, fmt, arg...) do {	\
+	if (cond)				\
+		printk(KERN_DEBUG fmt, ##arg);	\
+} while (0)
+
+#define log_super(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
+#define log_segment(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
+#define log_journal(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
+#define log_dir(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
+#define log_file(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
+#define log_inode(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
+#define log_readwrite(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
+#define log_gc(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
+#define log_gc_noisy(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
+#define log_aliases(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
+#define log_blockmove(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
+
+#define PG_pre_locked		PG_owner_priv_1
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+
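+/*
+ * Illustrative sketch, not part of the original patch: PG_pre_locked
+ * reuses PG_owner_priv_1, so it may only be set on pages owned by
+ * logfs.  A typical guard for unlocking then looks like:
+ *
+ *	if (!PagePreLocked(page))
+ *		unlock_page(page);
+ *
+ * (see logfs_unlock_write_page() in readwrite.c).
+ */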
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX (1<<30)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	0x0001
+#define LOGFS_SB_FLAG_SEG_ALIAS	0x0002
+#define LOGFS_SB_FLAG_OBJ_ALIAS	0x0004
+#define LOGFS_SB_FLAG_SHUTDOWN	0x0008
+
+/* Write Control Flags */
+#define WF_LOCK		0x01 /* take write lock */
+#define WF_WRITE	0x02 /* write block */
+#define WF_DELETE	0x04 /* delete old block */
+
+typedef u8 __bitwise level_t;
+typedef u8 __bitwise gc_level_t;
+
+#define LEVEL(level) ((__force level_t)(level))
+#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
+
+#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)),	\
+		(__force level_t)((__force u8)(level) - 1) )
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:		the superblock this area belongs to
+ * @a_is_open:		1 if the area is currently open, else 0
+ * @a_segno:		segment number of area
+ * @a_written_bytes:	number of bytes already written back
+ * @a_used_bytes:	number of used bytes
+ * @a_ops:		area operations (either journal or ostore)
+ * @a_erase_count:	erase count
+ * @a_level:		GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_written_bytes;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	u32	a_erase_count;
+	gc_level_t a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:	fill area->a_segno with a free segment's number
+ * @get_erase_count:	fill area->a_erase_count (needs area->a_segno)
+ * @erase_segment:	erase and setup segment
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @find_first_sb:	locate the first superblock and return its page
+ * @find_last_sb:	locate the last superblock and return its page
+ * @write_sb:		write back a superblock page
+ * @readpage:		read one page (mm page)
+ * @writeseg:		write one segment.  may be a partial segment
+ * @erase:		erase part of the device
+ * @sync:		wait for outstanding writes to finish
+ * @put_device:		release the underlying device
+ */
+struct logfs_device_ops {
+	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
+	struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
+	int (*write_sb)(struct super_block *sb, struct page *page);
+	int (*readpage)(void *_sb, struct page *page);
+	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len);
+	void (*sync)(struct super_block *sb);
+	void (*put_device)(struct super_block *sb);
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct rb_root rb_tree;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:		list (either free or low)
+ * @segno:		segment number
+ * @valid:		number of valid bytes
+ * @erase_count:	erase count of segment
+ * @dist:		distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+	struct rb_node rb_node;
+	struct candidate_list *list;
+	u32	segno;
+	u32	valid;
+	u32	erase_count;
+	u8	dist;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:	set if this entry slot holds a valid journal entry
+ * @version:	normalized version
+ * @len:	length of compressed data
+ * @datalen:	length of uncompressed data
+ * @offset:	offset on the medium
+ */
+struct logfs_journal_entry {
+	int used;
+	s16 version;
+	u16 len;
+	u16 datalen;
+	u64 offset;
+};
+
+enum transaction_state {
+	CREATE_1 = 1,
+	CREATE_2,
+	UNLINK_1,
+	UNLINK_2,
+	CROSS_RENAME_1,
+	CROSS_RENAME_2,
+	TARGET_RENAME_1,
+	TARGET_RENAME_2,
+	TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino:	target inode
+ * @dir:	inode of directory containing dentry
+ * @pos:	pos of dentry in directory
+ */
+struct logfs_transaction {
+	enum transaction_state state;
+	u64	ino;
+	u64	dir;
+	u64	pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs:	offset of old block on medium
+ * @new_ofs:	offset of new block on medium
+ * @ino:	inode number
+ * @bix:	block index
+ * @old_len:	size of old block, including header
+ * @new_len:	size of new block, including header
+ * @gc_level:	GC level of the block
+ */
+struct logfs_shadow {
+	u64 old_ofs;
+	u64 new_ofs;
+	u64 ino;
+	u64 bix;
+	int old_len;
+	int new_len;
+	gc_level_t gc_level;
+};
+
+/**
+ * struct shadow_tree
+ * @new:	shadows where old_ofs==0, indexed by new_ofs
+ * @old:	shadows where old_ofs!=0, indexed by old_ofs
+ */
+struct shadow_tree {
+	struct btree_head64 new;
+	struct btree_head64 old;
+};
+
+struct object_alias_item {
+	struct list_head list;
+	__be64 val;
+	int child_no;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @full:	number of fully populated children
+ * @partial:	number of partially populated children
+ *
+ * Most blocks are directly represented by page cache pages.  But when a block
+ * becomes dirty, is part of a transaction, contains aliases or is otherwise
+ * special, a struct logfs_block is allocated to track the additional state.
+ * Inodes are very similar to indirect blocks, so they can also get one of
+ * these structures added when appropriate.
+ */
+#define BLOCK_INDIRECT	1	/* Indirect block */
+#define BLOCK_INODE	2	/* Inode */
+struct logfs_block_ops;
+struct logfs_block {
+	struct list_head alias_list;
+	struct list_head item_list;
+	struct super_block *sb;
+	u64 ino;
+	u64 bix;
+	level_t level;
+	struct page *page;
+	struct inode *inode;
+	struct logfs_transaction *ta;
+	unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
+	struct logfs_block_ops *ops;
+	int full;
+	int partial;
+	int reserved_bytes;
+};
+
+typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+struct logfs_block_ops {
+	void	(*write_block)(struct logfs_block *block);
+	gc_level_t	(*block_level)(struct logfs_block *block);
+	void	(*free_block)(struct super_block *sb, struct logfs_block *block);
+	int	(*write_alias)(struct super_block *sb,
+			struct logfs_block *block,
+			write_alias_t *write_one_alias);
+};
+
+struct logfs_super {
+	struct mtd_info	*s_mtd;			/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* inode file */
+	struct inode	*s_segfile_inode;	/* segment file */
+	struct inode	*s_mapping_inode;	/* device mapping */
+	atomic_t	s_pending_writes;	/* outstanding bios */
+	long		s_flags;
+	mempool_t	*s_btree_pool;		/* for btree nodes */
+	mempool_t	*s_alias_pool;		/* aliases in segment.c */
+	u64		s_feature_incompat;
+	u64		s_feature_ro_compat;
+	u64		s_feature_compat;
+	u64		s_feature_flags;
+	u64		s_sb_ofs[2];
+	/* alias.c fields */
+	struct btree_head32 s_segment_alias;	/* remapped segments */
+	int		s_no_object_aliases;
+	struct list_head s_object_alias;	/* remapped objects */
+	struct btree_head128 s_object_alias_tree; /* remapped objects */
+	struct mutex	s_object_alias_mutex;
+	/* dir.c fields */
+	struct mutex	s_dirop_mutex;		/* for creat/unlink/rename */
+	u64		s_victim_ino;		/* used for atomic dir-ops */
+	u64		s_rename_dir;		/* source directory ino */
+	u64		s_rename_pos;		/* position of source dd */
+	/* gc.c fields */
+	long		s_segsize;		/* size of a segment */
+	int		s_segshift;		/* log2 of segment size */
+	long		s_segmask;		/* (1 << s_segshift) - 1 */
+	long		s_no_segs;		/* segments on device */
+	long		s_no_journal_segs;	/* segments used for journal */
+	long		s_no_blocks;		/* blocks per segment */
+	long		s_writesize;		/* minimum write size */
+	int		s_writeshift;		/* log2 of write size */
+	u64		s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64		s_gec;			/* global erase count */
+	u64		s_wl_gec_ostore;	/* time of last wl event */
+	u64		s_wl_gec_journal;	/* time of last wl event */
+	u64		s_sweeper;		/* current sweeper pos */
+	u8		s_ifile_levels;		/* max level of ifile */
+	u8		s_iblock_levels;	/* max level of regular files */
+	u8		s_data_levels;		/* # of segments to leaf block*/
+	u8		s_total_levels;		/* sum of above three */
+	struct btree_head32 s_cand_tree;	/* all candidates */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_reserve_list;	/* Bad segment reserve */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc.
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[64]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. 
+	 */
+	u32	s_bad_seg_reserve;
+	u32	s_bad_segments;
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode:		struct inode
+ * @li_data:		data pointers
+ * @li_used_bytes:	number of used bytes
+ * @li_freeing_list:	used to track inodes currently being freed
+ * @li_block:		block state, allocated on demand
+ * @li_flags:		inode flags
+ * @li_height:		height of the inode's block tree
+ * @li_refcount:	number of internal (GC-induced) references
+ */
+struct logfs_inode {
+	struct inode vfs_inode;
+	u64	li_data[LOGFS_EMBEDDED_FIELDS];
+	u64	li_used_bytes;
+	struct list_head li_freeing_list;
+	struct logfs_block *li_block;
+	u32	li_flags;
+	u8	li_height;
+	int	li_refcount;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg);
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+
+/* gc.c */
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct super_block *sb);
+void logfs_cleanup_gc(struct super_block *sb);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino);
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_safe_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+void destroy_meta_inode(struct inode *inode);
+void logfs_set_blocks(struct inode *inode, u64 no);
+/* these logically belong in inode.c but actually reside in readwrite.c */
+int logfs_read_inode(struct inode *inode);
+int __logfs_write_inode(struct inode *inode, long flags);
+void logfs_delete_inode(struct inode *inode);
+void logfs_clear_inode(struct inode *inode);
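+/*
+ * Illustrative sketch, not part of the original patch: callers of
+ * logfs_safe_iget() must hand the returned cookie back to
+ * logfs_safe_iput() once they are done with the inode, roughly:
+ *
+ *	int cookie;
+ *	struct inode *inode = logfs_safe_iget(sb, ino, &cookie);
+ *
+ *	if (!IS_ERR(inode)) {
+ *		... use inode ...
+ *		logfs_safe_iput(inode, cookie);
+ *	}
+ */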
+
+/* journal.c */
+void logfs_write_anchor(struct inode *inode);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+void do_logfs_journal_wl_pass(struct super_block *sb);
+
+/* readwrite.c */
+pgoff_t logfs_pack_index(u64 bix, level_t level);
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_open_segfile(struct super_block *sb);
+int logfs_init_rw(struct super_block *sb);
+void logfs_cleanup_rw(struct super_block *sb);
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_write_block(struct logfs_block *block, long flags);
+int logfs_write_obj_aliases_pagecache(struct super_block *sb);
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+		struct logfs_segment_entry *se);
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level);
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level);
+void __free_block(struct super_block *sb, struct logfs_block *block);
+void btree_write_block(struct logfs_block *block);
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty);
+int logfs_exist_block(struct inode *inode, u64 bix);
+int get_page_reserve(struct inode *inode, struct page *page);
+extern struct logfs_block_ops indirect_block_ops;
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+		level_t level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+int logfs_load_object_aliases(struct super_block *sb,
+		struct logfs_obj_alias *oa, int count);
+void move_page_to_btree(struct page *page);
+int logfs_init_mapping(struct super_block *sb);
+void logfs_sync_area(struct logfs_area *area);
+void logfs_sync_segments(struct super_block *sb);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct super_block *sb);
+int logfs_open_area(struct logfs_area *area, size_t bytes);
+void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler);
+
+static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	__logfs_buf_write(area, ofs, buf, len, 0);
+}
+
+static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	__logfs_buf_write(area, ofs, buf, len, 1);
+}
+
+/* super.c */
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
+void emergency_read_end(struct page *page);
+void logfs_crash_dump(struct super_block *sb);
+void *memchr_inv(const void *s, int c, size_t n);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_info *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt);
+int logfs_check_ds(struct logfs_disk_super *ds);
+int logfs_write_sb(struct super_block *sb);
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do {					\
+	struct super_block *__sb = sb;				\
+	logfs_crash_dump(__sb);					\
+	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
+	BUG();							\
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+	return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+	return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline u32 seg_no(struct super_block *sb, u64 ofs)
+{
+	return ofs >> logfs_super(sb)->s_segshift;
+}
+
+static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
+{
+	return ofs & logfs_super(sb)->s_segmask;
+}
+
+static inline u64 seg_align(struct super_block *sb, u64 ofs)
+{
+	return ofs & ~logfs_super(sb)->s_segmask;
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+	return (void *)page->private;
+}
+
+static inline level_t shrink_level(gc_level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	return (__force level_t)level;
+}
+
+static inline gc_level_t expand_level(u64 ino, level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (ino == LOGFS_INO_MASTER) {
+		/* ifile has separate areas */
+		level += LOGFS_MAX_LEVELS;
+	}
+	return (__force gc_level_t)level;
+}
+
+static inline int logfs_block_shift(struct super_block *sb, level_t level)
+{
+	level = shrink_level((__force gc_level_t)level);
+	return (__force int)level * (sb->s_blocksize_bits - 3);
+}
+
+static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
+{
+	return ~0ull << logfs_block_shift(sb, level);
+}
+
+static inline struct logfs_area *get_area(struct super_block *sb,
+		gc_level_t gc_level)
+{
+	return logfs_super(sb)->s_area[(__force u8)gc_level];
+}
+
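+/*
+ * Illustrative sketch, not part of the original patch: a device offset
+ * is just the segment number shifted up by s_segshift plus the offset
+ * within the segment, so for any ofs smaller than the segment size the
+ * helpers above round-trip:
+ *
+ *	u64 dev = dev_ofs(sb, segno, ofs);
+ *
+ *	seg_no(sb, dev) == segno;
+ *	seg_ofs(sb, dev) == ofs;
+ *	seg_align(sb, dev) == dev_ofs(sb, segno, 0);
+ */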
+
+#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..5d3782ddecc8
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,627 @@
+/*
+ * fs/logfs/logfs_abi.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size)					\
+static inline void check_##type(void)				\
+{								\
+	BUILD_BUG_ON(sizeof(struct type) != (size));		\
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets.  To remove confusion, we strictly
+ * distinguish between a "position" - the logical position within a
+ * file - and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Blocks are allocated in one of several segments depending on their
+ * level.  The following levels are used:
+ *  0	- regular data block
+ *  1	- i1 indirect blocks
+ *  2	- i2 indirect blocks
+ *  3	- i3 indirect blocks
+ *  4	- i4 indirect blocks
+ *  5	- i5 indirect blocks
+ *  6	- ifile data blocks
+ *  7	- ifile i1 indirect blocks
+ *  8	- ifile i2 indirect blocks
+ *  9	- ifile i3 indirect blocks
+ * 10	- ifile i4 indirect blocks
+ * 11	- ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12	- gc recycled blocks, long-lived data
+ * 13	- replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data.  In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics.  Old data recycled during gc operation is expected to be
+ * long-lived.  New data is of uncertain life expectancy.  New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC		0xb21f205ac97e8168ull
+#define LOGFS_MAGIC_U32		0xc97e8168u
+
+/*
+ * Various blocksize-related macros.  Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent.  Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE	- self-explanatory
+ * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
+ * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE		(4096ull)
+#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS	(9)
+
+/*
+ * Number of blocks at various levels of indirection.  There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS		(16)
+#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX		I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection.  Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar to ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer, and so forth.
+ */
+#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set, if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED	(1ULL << 63)
+#define pure_ofs(ofs)		(ofs & ~LOGFS_FULLY_POPULATED)
+
+/*
+ * LogFS needs to separate data into levels.  Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee that garbage collection can always
+ * make progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT	(5)
+#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN	(255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS	(16)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS		(64)
+
+
+/*
+ * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
+ */
+#define LOGFS_OBJECT_HEADERSIZE	(0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE	(0x18)
+#define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE	\
+	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER	- segment used to store a superblock
+ * SEG_JOURNAL	- segment used by the journal
+ * SEG_OSTORE	- segment used by the object store
+ */
+enum {
+	SEG_SUPER	= 0x01,
+	SEG_JOURNAL	= 0x02,
+	SEG_OSTORE	= 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc:		crc32 of header (there is no data)
+ * @pad:		unused, must be 0
+ * @type:		segment type, see above
+ * @level:		GC level for all objects in this segment
+ * @segno:		segment number
+ * @ec:			erase count for this segment
+ * @gec:		global erase count at time of writing
+ */
+struct logfs_segment_header {
+	__be32	crc;
+	__be16	pad;
+	__u8	type;
+	__u8	level;
+	__be32	segno;
+	__be32	ec;
+	__be64	gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_sh:		segment header of the superblock segment
+ * @ds_magic:		magic number, must equal LOGFS_MAGIC
+ * @ds_crc:		crc32 of structure starting with the next field
+ * @ds_ifile_levels:	maximum number of levels for ifile
+ * @ds_iblock_levels:	maximum number of levels for regular files
+ * @ds_data_levels:	number of separate levels for data
+ * @ds_segment_shift:	log2 of segment size
+ * @ds_block_shift:	log2 of block size
+ * @ds_write_shift:	log2 of write size
+ * @pad0:		reserved, must be 0
+ * @ds_filesystem_size:	size of the filesystem in bytes
+ * @ds_segment_size:	size of a segment in bytes
+ * @ds_bad_seg_reserve:	number of segments reserved to handle bad blocks
+ * @ds_feature_incompat:	incompatible filesystem features
+ * @ds_feature_ro_compat:	read-only compatible filesystem features
+ * @ds_feature_compat:	compatible filesystem features
+ * @ds_feature_flags:	feature flags
+ * @ds_root_reserve:	bytes reserved for the superuser
+ * @ds_speed_reserve:	bytes reserved to speed up GC
+ * @ds_journal_seg:	segments used by the primary journal
+ * @ds_super_ofs:	device offsets of the two superblocks
+ * @pad3:		reserved, must be 0
+ *
+ * Contains only read-only fields.  Read-write fields like the amount of used
+ * space are tracked in the dynamic superblock, which is stored in the journal.
+ */
+struct logfs_disk_super {
+	struct logfs_segment_header ds_sh;
+	__be64	ds_magic;
+
+	__be32	ds_crc;
+	__u8	ds_ifile_levels;
+	__u8	ds_iblock_levels;
+	__u8	ds_data_levels;
+	__u8	ds_segment_shift;
+	__u8	ds_block_shift;
+	__u8	ds_write_shift;
+	__u8	pad0[6];
+
+	__be64	ds_filesystem_size;
+	__be32	ds_segment_size;
+	__be32	ds_bad_seg_reserve;
+
+	__be64	ds_feature_incompat;
+	__be64	ds_feature_ro_compat;
+
+	__be64	ds_feature_compat;
+	__be64	ds_feature_flags;
+
+	__be64	ds_root_reserve;
+	__be64	ds_speed_reserve;
+
+	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];
+
+	__be64	ds_super_ofs[2];
+	__be64	pad3[8];
+};
+
+SIZE_CHECK(logfs_disk_super, 256);
+
+/*
+ * Object types:
+ * OBJ_BLOCK	- Data or indirect block
+ * OBJ_INODE	- Inode
+ * OBJ_DENTRY	- Dentry
+ */
+enum {
+	OBJ_BLOCK	= 0x04,
+	OBJ_INODE	= 0x05,
+	OBJ_DENTRY	= 0x06,
+};
+
+/**
+ * struct logfs_object_header - per-object header in the ostore
+ *
+ * @crc:		crc32 of header, excluding data_crc
+ * @len:		length of data
+ * @type:		object type, see above
+ * @compr:		compression type
+ * @ino:		inode number
+ * @bix:		block index
+ * @data_crc:		crc32 of payload
+ */
+struct logfs_object_header {
+	__be32	crc;
+	__be16	len;
+	__u8	type;
+	__u8	compr;
+	__be64	ino;
+	__be64	bix;
+	__be32	data_crc;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
+
+/*
+ * Reserved inode numbers:
+ * LOGFS_INO_MAPPING	- page-cache mapping of the device itself
+ * LOGFS_INO_MASTER	- master inode (for inode file)
+ * LOGFS_INO_ROOT	- root directory
+ * LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
+ */
+enum {
+	LOGFS_INO_MAPPING	= 0x00,
+	LOGFS_INO_MASTER	= 0x01,
+	LOGFS_INO_ROOT		= 0x02,
+	LOGFS_INO_SEGFILE	= 0x03,
+	LOGFS_RESERVED_INOS	= 0x10,
+};
+
+/*
+ * Inode flags.  High bits should never be written to the medium.  They are
+ * reserved for in-memory usage.
+ * Low bits should either remain in sync with the corresponding FS_*_FL or
+ * reuse slots that obviously don't make sense for logfs.
+ *
+ * LOGFS_IF_DIRTY	Inode must be written back
+ * LOGFS_IF_ZOMBIE	Inode has been deleted
+ * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
+ */
+#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY		0x20000000
+#define LOGFS_IF_ZOMBIE		0x40000000
+#define LOGFS_IF_STILLBORN	0x80000000
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode:		file mode
+ * @di_height:		height of the inode's block tree
+ * @di_pad:		reserved, must be 0
+ * @di_flags:		inode flags, see above
+ * @di_uid:		user id
+ * @di_gid:		group id
+ * @di_ctime:		change time
+ * @di_mtime:		modify time
+ * @di_atime:		access time
+ * @di_refcount:	reference count (aka nlink or link count)
+ * @di_generation:	inode generation, for nfs
+ * @di_used_bytes:	number of bytes used
+ * @di_size:		file size
+ * @di_data:		data pointers
+ */
+struct logfs_disk_inode {
+	__be16	di_mode;
+	__u8	di_height;
+	__u8	di_pad;
+	__be32	di_flags;
+	__be32	di_uid;
+	__be32	di_gid;
+
+	__be64	di_ctime;
+	__be64	di_mtime;
+
+	__be64	di_atime;
+	__be32	di_refcount;
+	__be32	di_generation;
+
+	__be64	di_used_bytes;
+	__be64	di_size;
+
+	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_disk_inode, 200);
+
+#define INODE_POINTER_OFS \
+	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
+#define INODE_USED_OFS \
+	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
+#define INODE_SIZE_OFS \
+	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
+#define INODE_HEIGHT_OFS	(0)
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino:		inode number
+ * @namelen:		length of file name
+ * @type:		file type, identical to bits 12..15 of mode
+ * @name:		file name
+ */
+/* FIXME: add 6 bytes of padding to remove the __packed */
+struct logfs_disk_dentry {
+	__be64	ino;
+	__be16	namelen;
+	__u8	type;
+	__u8	name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_disk_dentry, 266);
+
+#define RESERVED		0xffffffff
+#define BADSEG			0xffffffff
+/**
+ * struct logfs_segment_entry - segment file entry
+ *
+ * @ec_level:		erase count and level
+ * @valid:		number of valid bytes
+ *
+ * Segment file contains one entry for every segment.  ec_level contains the
+ * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
+ * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
+ * of valid bytes or RESERVED (-1 again) if the segment is used for either the
+ * superblock or the journal, or when the segment is bad.
+ */
+struct logfs_segment_entry {
+	__be32	ec_level;
+	__be32	valid;
+};
+
+SIZE_CHECK(logfs_segment_entry, 8);
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc:		crc32 of journal entry
+ * @h_len:		length of compressed journal entry,
+ *			not including header
+ * @h_datalen:		length of uncompressed data
+ * @h_type:		JE type
+ * @h_version:		unnormalized version of journal entry
+ * @h_compr:		compression type
+ * @h_pad:		reserved
+ */
+struct logfs_journal_header {
+	__be32	h_crc;
+	__be16	h_len;
+	__be16	h_datalen;
+	__be16	h_type;
+	__be16	h_version;
+	__u8	h_compr;
+	__u8	h_pad[3];
+};
+
+SIZE_CHECK(logfs_journal_header, 16);
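+/*
+ * Illustrative sketch, not part of the original patch: h_crc covers the
+ * header itself (minus the crc field) plus h_len bytes of payload,
+ * mirroring __logfs_write_header() in journal.c.  Assuming logfs_crc32()
+ * from logfs.h is available, a reader could verify an entry like this:
+ */
+#if 0
+static int example_check_je(struct logfs_journal_header *jh)
+{
+	size_t len = be16_to_cpu(jh->h_len);
+
+	if (logfs_crc32(jh, len + sizeof(*jh), 4) != jh->h_crc)
+		return -EIO;	/* entry is corrupt or was never written */
+	return 0;
+}
+#endif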
+
+/*
+ * Life expectancy of data.
+ * VIM_DEFAULT	- default vim
+ * VIM_SEGFILE	- for segment file only - very short-living
+ * VIM_GC	- GC'd data - likely long-living (reserved; not yet
+ *		  part of the enum below)
+ */
+enum logfs_vim {
+	VIM_DEFAULT	= 0,
+	VIM_SEGFILE	= 1,
+};
+
+/**
+ * struct logfs_je_area - wbuf header
+ *
+ * @segno:		segment number of area
+ * @used_bytes:		number of bytes already used
+ * @gc_level:		GC level
+ * @vim:		life expectancy of data
+ *
+ * "Areas" are segments currently being used for writing.  There is at least
+ * one area per GC level.  Several may be used to separate long-living from
+ * short-living data.  If an area with unknown vim is encountered, it can
+ * simply be closed.
+ * The write buffer immediately follows this header.
+ */
+struct logfs_je_area {
+	__be32	segno;
+	__be32	used_bytes;
+	__u8	gc_level;
+	__u8	vim;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_je_area, 10);
+
+#define MAX_JOURNAL_HEADER \
+	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec:		global erase count
+ * @ds_sweeper:		current position of GC "sweeper"
+ * @ds_rename_dir:	source directory ino (see dir.c documentation)
+ * @ds_rename_pos:	position of source dd (see dir.c documentation)
+ * @ds_victim_ino:	victims of incomplete dir operation (see dir.c)
+ * @ds_victim_parent:	parent inode of victim (see dir.c)
+ * @ds_used_bytes:	number of used bytes
+ * @ds_generation:	i_generation for new files
+ */
+struct logfs_je_dynsb {
+	__be64	ds_gec;
+	__be64	ds_sweeper;
+
+	__be64	ds_rename_dir;
+	__be64	ds_rename_pos;
+
+	__be64	ds_victim_ino;
+	__be64	ds_victim_parent; /* XXX */
+
+	__be64	ds_used_bytes;
+	__be32	ds_generation;
+	__be32	pad;
+};
+
+SIZE_CHECK(logfs_je_dynsb, 64);
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size:		size of inode file
+ * @da_last_ino:	last created inode
+ * @da_used_bytes:	number of bytes used
+ * @da_height:		height of the ifile block tree
+ * @da_data:		data pointers
+ */
+struct logfs_je_anchor {
+	__be64	da_size;
+	__be64	da_last_ino;
+
+	__be64	da_used_bytes;
+	u8	da_height;
+	u8	pad[7];
+
+	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_je_anchor, 168);
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment:		segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+	__be64	so_segment[0];
+};
+
+SIZE_CHECK(logfs_je_spillout, 0);
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec:			erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+	__be32	ec[0];
+};
+
+SIZE_CHECK(logfs_je_journal_ec, 0);
+
+/**
+ * struct logfs_je_free_segments - list of free segments with erase count
+ */
+struct logfs_je_free_segments {
+	__be32	segno;
+	__be32	ec;
+};
+
+SIZE_CHECK(logfs_je_free_segments, 8);
+
+/**
+ * struct logfs_seg_alias - list of segment aliases
+ */
+struct logfs_seg_alias {
+	__be32	old_segno;
+	__be32	new_segno;
+};
+
+SIZE_CHECK(logfs_seg_alias, 8);
+
+/**
+ * struct logfs_obj_alias - list of object aliases
+ */
+struct logfs_obj_alias {
+	__be64	ino;
+	__be64	bix;
+	__be64	val;
+	u8	level;
+	u8	pad[5];
+	__be16	child_no;
+};
+
+SIZE_CHECK(logfs_obj_alias, 32);
+
+/**
+ * Compression types.
+ *
+ * COMPR_NONE	- uncompressed
+ * COMPR_ZLIB	- compressed with zlib
+ */
+enum {
+	COMPR_NONE	= 0,
+	COMPR_ZLIB	= 1,
+};
+
+/*
+ * Journal entries come in groups of 16.
+ * First group contains unique entries, next groups contain one entry
+ * per level.
+ *
+ * JE_FIRST	- smallest possible journal entry number
+ *
+ * JEG_BASE	- base group, containing unique entries
+ * JE_COMMIT	- commit entry, validates all previous entries
+ * JE_DYNSB	- dynamic superblock, anything that ought to be in the
+ *		  superblock but cannot because it is read-write data
+ * JE_ANCHOR	- anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT - erase counts for all journal segments
+ * JE_SPILLOUT	- unused
+ * JE_OBJ_ALIAS	- object aliases
+ * JE_AREA	- area description
+ *
+ * JE_LAST	- largest possible journal entry number
+ */
+enum {
+	JE_FIRST	= 0x01,
+
+	JEG_BASE	= 0x00,
+	JE_COMMIT	= 0x02,
+	JE_DYNSB	= 0x03,
+	JE_ANCHOR	= 0x04,
+	JE_ERASECOUNT	= 0x05,
+	JE_SPILLOUT	= 0x06,
+	JE_OBJ_ALIAS	= 0x0d,
+	JE_AREA		= 0x0e,
+
+	JE_LAST		= 0x0e,
+};
+
+#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..1dbe6e8cccec
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2246 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains several sets of very similar functions:
+ * read		read blocks from a file
+ * seek_hole	find next hole
+ * seek_data	find next data block
+ * valid	check whether a block still belongs to a file
+ * write	write blocks to a file
+ * delete	delete a block (for directories and ifile)
+ * rewrite	move existing blocks of a file to a new location (gc helper)
+ * truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+static u64 adjust_bix(u64 bix, level_t level)
+{
+	switch (level) {
+	case 0:
+		return bix;
+	case LEVEL(1):
+		return max_t(u64, bix, I0_BLOCKS);
+	case LEVEL(2):
+		return max_t(u64, bix, I1_BLOCKS);
+	case LEVEL(3):
+		return max_t(u64, bix, I2_BLOCKS);
+	case LEVEL(4):
+		return max_t(u64, bix, I3_BLOCKS);
+	case LEVEL(5):
+		return max_t(u64, bix, I4_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+static inline u64 maxbix(u8 height)
+{
+	return 1ULL << (LOGFS_BLOCK_BITS * height);
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)
+static inline pgoff_t first_indirect_block(void)
+{
+	return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
+}
+
+pgoff_t logfs_pack_index(u64 bix, level_t level)
+{
+	pgoff_t index;
+
+	BUG_ON(bix >= INDIRECT_BIT);
+	if (level == 0)
+		return bix;
+
+	index = INDIRECT_BIT;
+	index |= (__force long)level << LEVEL_SHIFT;
+	index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
+	return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
+{
+	u8 __level;
+
+	if (!(index & INDIRECT_BIT)) {
+		*bix = index;
+		*level = 0;
+		return;
+	}
+
+	__level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+	*level = LEVEL(__level);
+	*bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+	*bix = adjust_bix(*bix, *level);
+	return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/*
+ * Time is stored as nanoseconds since the epoch.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+	return ns_to_timespec(be64_to_cpu(betime));
+}
+
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+	return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
+}
+
+static void logfs_disk_to_inode(struct logfs_disk_inode *di,
+		struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	inode->i_mode = be16_to_cpu(di->di_mode);
+	li->li_height = di->di_height;
+	li->li_flags = be32_to_cpu(di->di_flags);
+	inode->i_uid = be32_to_cpu(di->di_uid);
+	inode->i_gid = be32_to_cpu(di->di_gid);
+	inode->i_size = be64_to_cpu(di->di_size);
+	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+	inode->i_atime = be64_to_timespec(di->di_atime);
+	inode->i_ctime = be64_to_timespec(di->di_ctime);
+	inode->i_mtime = be64_to_timespec(di->di_mtime);
+	inode->i_nlink = be32_to_cpu(di->di_refcount);
+	inode->i_generation = be32_to_cpu(di->di_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		inode->i_rdev = be64_to_cpu(di->di_data[0]);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			li->li_data[i] = be64_to_cpu(di->di_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void logfs_inode_to_disk(struct inode *inode,
+		struct logfs_disk_inode *di)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	di->di_mode = cpu_to_be16(inode->i_mode);
+	di->di_height = li->li_height;
+	di->di_pad = 0;
+	di->di_flags = cpu_to_be32(li->li_flags);
+	di->di_uid = cpu_to_be32(inode->i_uid);
+	di->di_gid = cpu_to_be32(inode->i_gid);
+	di->di_size = cpu_to_be64(i_size_read(inode));
+	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+	di->di_atime = timespec_to_be64(inode->i_atime);
+	di->di_ctime = timespec_to_be64(inode->i_ctime);
+	di->di_mtime = timespec_to_be64(inode->i_mtime);
+	di->di_refcount = cpu_to_be32(inode->i_nlink);
+	di->di_generation = cpu_to_be32(inode->i_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		di->di_data[0] = cpu_to_be64(inode->i_rdev);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			di->di_data[i] = cpu_to_be64(li->li_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
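+/*
+ * Illustrative note, not part of the original patch: since on-medium
+ * timestamps are plain nanosecond counts since the epoch, the two
+ * conversion directions above invert each other, e.g.:
+ *
+ *	struct logfs_disk_inode di;
+ *
+ *	logfs_inode_to_disk(inode, &di);
+ *	logfs_disk_to_inode(&di, inode);	// restores the same fields
+ */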
+static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
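+ * There is no waitqueue here; the task merely yields and retries, so + * this is effectively a polling loop. That is also why the counter + * above gives up and BUGs after 0x1000 fruitless attempts.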
+ */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteless.
+ */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode); + else { + ret = __logfs_write_inode(inode, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +static gc_level_t inode_block_level(struct logfs_block *block) +{ + BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER); + return GC_LEVEL(LOGFS_MAX_LEVELS); +} + +static gc_level_t indirect_block_level(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + u64 bix; + level_t level; + + page = block->page; + inode = page->mapping->host; + logfs_unpack_index(page->index, &bix, &level); + return expand_level(inode->i_ino, level); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino, bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes); + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ...
INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino, bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page, KM_USER0); + val = child[pos]; + kunmap_atomic(child, KM_USER0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + ClearPagePrivate(block->page); + block->page->private = 0; + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .block_level = inode_block_level, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .block_level = indirect_block_level, + .free_block = indirect_free_block, + .write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr =
be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = 
LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. 
+ */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. + */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. 
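+ * In that case 2 is returned below: the block is still reachable + * through the old tree and must be treated as live until the journal + * commit makes the new object the only valid copy.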
+ */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + int ret; + + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); + if (!ret) { + alloc_data_block(inode, page); + logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. 
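+ * This is also where the factor of 6 in get_page_reserve() above comes + * from: a single logical write can, in the worst case, dirty an object + * at every level of the indirect chain, so one LOGFS_MAX_OBJECTSIZE is + * apparently set aside per possible level (an editorial reading of the + * code, not a guarantee).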
+ */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. + * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Ultimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL; if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. Which is done by setting + * another two object aliases for the inode itself.
+ */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! 
is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
+ } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +/* Rewrite cannot mark the inode dirty but has to write it immediately. */ +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page?
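+ * Only the block straddling the new i_size needs work: it is read in, + * its tail from 'size' onwards is zeroed and it is rewritten. Blocks + * lying wholly before or behind the new size are handled by the + * callers.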
*/ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, 
child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + if (!err) + err = vmtruncate(inode, size); + + /* I don't trust error recovery yet. */ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = 
logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +/* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ +void logfs_clear_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? */ + err = logfs_write_buf(master_inode, page, 0); + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + 
se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_delete_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All caller of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. 
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + destroy_meta_inode(super->s_segfile_inode); + if (super->s_block_pool) + mempool_destroy(super->s_block_pool); + if (super->s_shadow_pool) + mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 000000000000..5f58b74516ca --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,924 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
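+ * + * Each object is stored as a struct logfs_object_header (length, type, + * compression flag, owning inode and block index, plus header and data + * crcs) immediately followed by its payload; see __logfs_segment_write() + * below.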
+ */ +#include "logfs.h" + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + BUG_ON(!page); /* FIXME: reserve a pool */ + SetPageUptodate(page); + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len == PAGE_SIZE) { + /* The math in this function can surely use some love */ + len = 0; + } + if (len) { + BUG_ON(area->a_used_bytes >= super->s_segsize); + + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } + + if (!final) + return; + + area->a_used_bytes += len; + for ( ; area->a_used_bytes < super->s_segsize; + area->a_used_bytes += PAGE_SIZE) { + /* Memset another page */ + index++; + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page), 0xff, PAGE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + } +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions.
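+ * The accessors below therefore key the 128-bit btree by + * (ino, logfs_pack_index(bix, level)), so all positions covered by the + * same indirect block collapse to a single key.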
+ */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static gc_level_t btree_block_level(struct logfs_block *block) +{ + return expand_level(block->ino, block->level); +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .block_level = btree_block_level, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int 
__logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. 
useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block 
index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_object_header h; + u16 len; + int err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +static void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + 
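+ /* Flush the open segment of every area (one per GC level) to the device. */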
+ for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? 
*/
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+ return 0;
+}
+
+int logfs_init_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i = -1;
+
+ super->s_alias_pool = mempool_create_kmalloc_pool(600,
+ sizeof(struct object_alias_item));
+ if (!super->s_alias_pool)
+ return -ENOMEM;
+
+ super->s_journal_area = alloc_area(sb);
+ if (!super->s_journal_area)
+ goto err;
+
+ for_each_area(i) {
+ super->s_area[i] = alloc_area(sb);
+ if (!super->s_area[i])
+ goto err;
+ super->s_area[i]->a_level = GC_LEVEL(i);
+ super->s_area[i]->a_ops = &ostore_area_ops;
+ }
+ btree_init_mempool128(&super->s_object_alias_tree,
+ super->s_btree_pool);
+ return 0;
+
+err:
+ for (i--; i >= 0; i--)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+ mempool_destroy(super->s_alias_pool);
+ return -ENOMEM;
+}
+
+void logfs_cleanup_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
+ for_each_area(i)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+ destroy_meta_inode(super->s_mapping_inode);
+}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..d128a2c1c8d1
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,634 @@
+/*
+ * fs/logfs/super.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel
+ *
+ * Generally contains mount/umount code and also serves as a dump area for
+ * any functions that don't fit elsewhere and neither justify a file of their
+ * own.
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/mtd/mtd.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+
+static DEFINE_MUTEX(emergency_mutex);
+static struct page *emergency_page;
+
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+ struct page *page;
+ int err;
+
+ page = read_cache_page(mapping, index, filler, NULL);
+ if (!IS_ERR(page))
+ return page;
+
+ /* No more pages available, switch to emergency page */
+ printk(KERN_INFO"Logfs: Using emergency page\n");
+ mutex_lock(&emergency_mutex);
+ err = filler(NULL, emergency_page);
+ if (err) {
+ mutex_unlock(&emergency_mutex);
+ printk(KERN_EMERG"Logfs: Error reading emergency page\n");
+ return ERR_PTR(err);
+ }
+ return emergency_page;
+}
+
+void emergency_read_end(struct page *page)
+{
+ if (page == emergency_page)
+ mutex_unlock(&emergency_mutex);
+ else
+ page_cache_release(page);
+}
+
+static void dump_segfile(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ struct logfs_segment_entry se;
+ u32 segno;
+
+ for (segno = 0; segno < super->s_no_segs; segno++) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ if (++segno < super->s_no_segs) {
+ logfs_get_segment_entry(sb, segno, &se);
+ printk(" %6x %8x", be32_to_cpu(se.ec_level),
+ be32_to_cpu(se.valid));
+ }
+ printk("\n");
+ }
+}
+
+/*
+ * logfs_crash_dump - dump debug information to device
+ *
+ * The LogFS superblock only occupies part of a segment. 
This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = 
logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? 
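+ * (find_super_block() only let the mount proceed because at least one
+ * copy passed logfs_check_ds(), so this should be unreachable.)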
*/ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +{ + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) + goto fail; + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) + goto fail2; + + log_super("LogFS: Finished mounting\n"); + simple_set_mnt(mnt, sb); + return 0; + +fail2: + iput(rootdir); +fail: + iput(logfs_super(sb)->s_master_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(first)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EIO; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + 
journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(super->s_master_inode); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + super->s_devops->put_device(sb); + mempool_destroy(super->s_btree_pool); + mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt) +{ + struct logfs_super *super; + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto err0; + + super->s_mtd = mtd; + super->s_bdev = bdev; + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) + goto err0; + + if (sb->s_root) { + /* Device is already in use */ + err = 0; + simple_set_mnt(mnt, sb); + goto err0; + } + + super->s_devops = devops; + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. 
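+ *
+ * Worked out (assuming 4KiB pages): a 32bit page cache index covers
+ * 2^32 pages * 2^12 bytes = 2^44 bytes (16TB); reserving the upper
+ * half for indirect blocks leaves 2^43 bytes (8TB) for file data,
+ * hence the (1ull << 43) - 1 below.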
+ */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb, mnt); + if (err) + goto err1; + return 0; + +err1: + up_write(&sb->s_umount); + deactivate_super(sb); + return err; +err0: + kfree(super); + //devops->put_device(sb); + return err; +} + +static int logfs_get_sb(struct file_system_type *type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + ulong mtdnr; + + if (!devname) + return logfs_get_sb_bdev(type, flags, devname, mnt); + if (strncmp(devname, "mtd", 3)) + return logfs_get_sb_bdev(type, flags, devname, mnt); + + { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + return -EINVAL; + } + + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .get_sb = logfs_get_sb, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/include/linux/btree-128.h b/include/linux/btree-128.h new file mode 100644 index 000000000000..0b3414c4c928 --- /dev/null +++ b/include/linux/btree-128.h @@ -0,0 +1,109 @@ +extern struct btree_geo btree_geo128; + +struct btree_head128 { struct btree_head h; }; + +static inline void btree_init_mempool128(struct btree_head128 *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int btree_init128(struct btree_head128 *head) +{ + return btree_init(&head->h); +} + +static inline void btree_destroy128(struct btree_head128 *head) +{ + btree_destroy(&head->h); +} + +static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_get_prev128(struct btree_head128 *head, + u64 *k1, u64 *k2) +{ + u64 key[2] = {*k1, *k2}; + void *val; + + val = btree_get_prev(&head->h, &btree_geo128, + (unsigned long *)&key); + *k1 = key[0]; + *k2 = key[1]; + return val; +} + +static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2, + void *val, gfp_t gfp) +{ + u64 key[2] = {k1, k2}; + return btree_insert(&head->h, &btree_geo128, + (unsigned long *)&key, val, gfp); +} + +static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2, + void *val) +{ + u64 key[2] = {k1, k2}; + return btree_update(&head->h, &btree_geo128, + (unsigned long *)&key, val); +} + +static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2) +{ 
+ u64 key[2]; + void *val; + + val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]); + if (val) { + *k1 = key[0]; + *k2 = key[1]; + } + + return val; +} + +static inline int btree_merge128(struct btree_head128 *target, + struct btree_head128 *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, &btree_geo128, gfp); +} + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func); + +typedef void (*visitor128_t)(void *elem, unsigned long opaque, + u64 key1, u64 key2, size_t index); + +static inline size_t btree_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +static inline size_t btree_grim_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_grim_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +#define btree_for_each_safe128(head, k1, k2, val) \ + for (val = btree_last128(head, &k1, &k2); \ + val; \ + val = btree_get_prev128(head, &k1, &k2)) + diff --git a/include/linux/btree-type.h b/include/linux/btree-type.h new file mode 100644 index 000000000000..9a1147ef8563 --- /dev/null +++ b/include/linux/btree-type.h @@ -0,0 +1,147 @@ +#define __BTREE_TP(pfx, type, sfx) pfx ## type ## sfx +#define _BTREE_TP(pfx, type, sfx) __BTREE_TP(pfx, type, sfx) +#define BTREE_TP(pfx) _BTREE_TP(pfx, BTREE_TYPE_SUFFIX,) +#define BTREE_FN(name) BTREE_TP(btree_ ## name) +#define BTREE_TYPE_HEAD BTREE_TP(struct btree_head) +#define VISITOR_FN BTREE_TP(visitor) +#define VISITOR_FN_T _BTREE_TP(visitor, BTREE_TYPE_SUFFIX, _t) + +BTREE_TYPE_HEAD { + struct btree_head h; +}; + +static inline void BTREE_FN(init_mempool)(BTREE_TYPE_HEAD *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int BTREE_FN(init)(BTREE_TYPE_HEAD *head) +{ + return btree_init(&head->h); +} + +static inline void BTREE_FN(destroy)(BTREE_TYPE_HEAD *head) +{ + btree_destroy(&head->h); +} + +static inline int BTREE_FN(merge)(BTREE_TYPE_HEAD *target, + BTREE_TYPE_HEAD *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, BTREE_TYPE_GEO, gfp); +} + +#if (BITS_PER_LONG > BTREE_TYPE_BITS) +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_lookup(&head->h, BTREE_TYPE_GEO, &_key); +} + +static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val, gfp_t gfp) +{ + unsigned long _key = key; + return btree_insert(&head->h, BTREE_TYPE_GEO, &_key, val, gfp); +} + +static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val) +{ + unsigned long _key = key; + return btree_update(&head->h, BTREE_TYPE_GEO, &_key, val); +} + +static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_remove(&head->h, BTREE_TYPE_GEO, &_key); +} + +static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key; + void *val = btree_last(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} + +static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key = *key; + void *val = btree_get_prev(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} +#else +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + return 
btree_lookup(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
+}
+
+static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
+ void *val, gfp_t gfp)
+{
+ return btree_insert(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key,
+ val, gfp);
+}
+
+static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
+ void *val)
+{
+ return btree_update(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, val);
+}
+
+static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
+{
+ return btree_remove(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
+}
+
+static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
+{
+ return btree_last(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
+}
+
+static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
+{
+ return btree_get_prev(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
+}
+#endif
+
+void VISITOR_FN(void *elem, unsigned long opaque, unsigned long *key,
+ size_t index, void *__func);
+
+typedef void (*VISITOR_FN_T)(void *elem, unsigned long opaque,
+ BTREE_KEYTYPE key, size_t index);
+
+static inline size_t BTREE_FN(visitor)(BTREE_TYPE_HEAD *head,
+ unsigned long opaque,
+ VISITOR_FN_T func2)
+{
+ return btree_visitor(&head->h, BTREE_TYPE_GEO, opaque,
+ visitorl, func2);
+}
+
+static inline size_t BTREE_FN(grim_visitor)(BTREE_TYPE_HEAD *head,
+ unsigned long opaque,
+ VISITOR_FN_T func2)
+{
+ return btree_grim_visitor(&head->h, BTREE_TYPE_GEO, opaque,
+ visitorl, func2);
+}
+
+#undef VISITOR_FN
+#undef VISITOR_FN_T
+#undef __BTREE_TP
+#undef _BTREE_TP
+#undef BTREE_TP
+#undef BTREE_FN
+#undef BTREE_TYPE_HEAD
+#undef BTREE_TYPE_SUFFIX
+#undef BTREE_TYPE_GEO
+#undef BTREE_KEYTYPE
+#undef BTREE_TYPE_BITS
diff --git a/include/linux/btree.h b/include/linux/btree.h
new file mode 100644
index 000000000000..65b5bb058324
--- /dev/null
+++ b/include/linux/btree.h
@@ -0,0 +1,243 @@
+#ifndef BTREE_H
+#define BTREE_H
+
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+
+/**
+ * DOC: B+Tree basics
+ *
+ * A B+Tree is a data structure for looking up arbitrary (currently allowing
+ * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure
+ * is described at http://en.wikipedia.org/wiki/B-tree; we currently do not
+ * use binary search to find the key on lookups.
+ *
+ * Each B+Tree consists of a head, which contains bookkeeping information, and
+ * a variable number (starting with zero) of nodes. Each node contains the keys
+ * and pointers to sub-nodes, or, for leaf nodes, the keys and values for the
+ * tree entries.
+ *
+ * Each node in this implementation has the following layout:
+ * [key1, key2, ..., keyN] [val1, val2, ..., valN]
+ *
+ * Each key here is an array of unsigned longs, geo->no_longs in total. The
+ * number of keys and values (N) is geo->no_pairs. 
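+ *
+ * A minimal usage sketch with the raw interface and a single-long key
+ * (return values unchecked for brevity; some_ptr stands for any
+ * non-NULL pointer, as NULL values cannot be stored):
+ *
+ *	struct btree_head head;
+ *	unsigned long key = 42;
+ *	void *val;
+ *
+ *	btree_init(&head);
+ *	btree_insert(&head, &btree_geo32, &key, some_ptr, GFP_KERNEL);
+ *	val = btree_lookup(&head, &btree_geo32, &key);	/* == some_ptr */
+ *	btree_destroy(&head);
+ *
+ * The typed wrappers generated below (btree_insert32(), btree_insert64()
+ * etc.) hide the geometry argument and the key casts.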
+ */ + +/** + * struct btree_head - btree head + * + * @node: the first node in the tree + * @mempool: mempool used for node allocations + * @height: current of the tree + */ +struct btree_head { + unsigned long *node; + mempool_t *mempool; + int height; +}; + +/* btree geometry */ +struct btree_geo; + +/** + * btree_alloc - allocate function for the mempool + * @gfp_mask: gfp mask for the allocation + * @pool_data: unused + */ +void *btree_alloc(gfp_t gfp_mask, void *pool_data); + +/** + * btree_free - free function for the mempool + * @element: the element to free + * @pool_data: unused + */ +void btree_free(void *element, void *pool_data); + +/** + * btree_init_mempool - initialise a btree with given mempool + * + * @head: the btree head to initialise + * @mempool: the mempool to use + * + * When this function is used, there is no need to destroy + * the mempool. + */ +void btree_init_mempool(struct btree_head *head, mempool_t *mempool); + +/** + * btree_init - initialise a btree + * + * @head: the btree head to initialise + * + * This function allocates the memory pool that the + * btree needs. Returns zero or a negative error code + * (-%ENOMEM) when memory allocation fails. + * + */ +int __must_check btree_init(struct btree_head *head); + +/** + * btree_destroy - destroy mempool + * + * @head: the btree head to destroy + * + * This function destroys the internal memory pool, use only + * when using btree_init(), not with btree_init_mempool(). + */ +void btree_destroy(struct btree_head *head); + +/** + * btree_lookup - look up a key in the btree + * + * @head: the btree to look in + * @geo: the btree geometry + * @key: the key to look up + * + * This function returns the value for the given key, or %NULL. + */ +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_insert - insert an entry into the btree + * + * @head: the btree to add to + * @geo: the btree geometry + * @key: the key to add (must not already be present) + * @val: the value to add (must not be %NULL) + * @gfp: allocation flags for node allocations + * + * This function returns 0 if the item could be added, or an + * error code if it failed (may fail due to memory pressure). + */ +int __must_check btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp); +/** + * btree_update - update an entry in the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to update + * @val: the value to change it to (must not be %NULL) + * + * This function returns 0 if the update was successful, or + * -%ENOENT if the key could not be found. + */ +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val); +/** + * btree_remove - remove an entry from the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to remove + * + * This function returns the removed entry, or %NULL if the key + * could not be found. + */ +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_merge - merge two btrees + * + * @target: the tree that gets all the entries + * @victim: the tree that gets merged into @target + * @geo: the btree geometry + * @gfp: allocation flags + * + * The two trees @target and @victim may not contain the same keys, + * that is a bug and triggers a BUG(). 
This function returns zero + * if the trees were merged successfully, and may return a failure + * when memory allocation fails, in which case both trees might have + * been partially merged, i.e. some entries have been moved from + * @victim to @target. + */ +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp); + +/** + * btree_last - get last entry in btree + * + * @head: btree head + * @geo: btree geometry + * @key: last key + * + * Returns the last entry in the btree, and sets @key to the key + * of that entry; returns NULL if the tree is empty, in that case + * key is not changed. + */ +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_get_prev - get previous entry + * + * @head: btree head + * @geo: btree geometry + * @key: pointer to key + * + * The function returns the next item right before the value pointed to by + * @key, and updates @key with its key, or returns %NULL when there is no + * entry with a key smaller than the given key. + */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + + +/* internal use, use btree_visitor{l,32,64,128} */ +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2); + +/* internal use, use btree_grim_visitor{l,32,64,128} */ +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2); + + +#include + +extern struct btree_geo btree_geo32; +#define BTREE_TYPE_SUFFIX l +#define BTREE_TYPE_BITS BITS_PER_LONG +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE unsigned long +#include + +#define btree_for_each_safel(head, key, val) \ + for (val = btree_lastl(head, &key); \ + val; \ + val = btree_get_prevl(head, &key)) + +#define BTREE_TYPE_SUFFIX 32 +#define BTREE_TYPE_BITS 32 +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE u32 +#include + +#define btree_for_each_safe32(head, key, val) \ + for (val = btree_last32(head, &key); \ + val; \ + val = btree_get_prev32(head, &key)) + +extern struct btree_geo btree_geo64; +#define BTREE_TYPE_SUFFIX 64 +#define BTREE_TYPE_BITS 64 +#define BTREE_TYPE_GEO &btree_geo64 +#define BTREE_KEYTYPE u64 +#include + +#define btree_for_each_safe64(head, key, val) \ + for (val = btree_last64(head, &key); \ + val; \ + val = btree_get_prev64(head, &key)) + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d3839c..277fbfb233b9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -156,6 +156,9 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +config BTREE + boolean + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index 2e78277eff9d..cff82612e98b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,6 +41,7 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/btree.c b/lib/btree.c new file mode 100644 index 000000000000..41859a820218 --- /dev/null +++ b/lib/btree.c @@ -0,0 +1,797 @@ +/* + 
* lib/btree.c - Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007-2008 Joern Engel
+ * Bits and pieces stolen from Peter Zijlstra's code, which is
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra
+ * GPLv2
+ *
+ * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
+ *
+ * A relatively simple B+Tree implementation. I have written it as a learning
+ * exercise to understand how B+Trees work. Turned out to be useful as well.
+ *
+ * B+Trees can be used similarly to Linux radix trees (which don't have
+ * anything in common with textbook radix trees, beware). A prerequisite for
+ * them to work well is that access to a random tree node is much faster than
+ * a large number of operations within each node.
+ *
+ * Disks have fulfilled the prerequisite for a long time. More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased. Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space. Between 25% and 50% of the memory is
+ * occupied with valid pointers. When densely populated, radix trees contain
+ * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal; lookup will return NULL when no entry
+ * was found.
+ *
+ * One trick used here is not commonly found in textbooks: the lowest
+ * values are to the right, not to the left. All used slots within a node
+ * are on the left, all unused slots contain NUL values. Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ */
+
+#include <linux/btree.h>
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define NODESIZE MAX(L1_CACHE_BYTES, 128) + +struct btree_geo { + int keylen; + int no_pairs; + int no_longs; +}; + +struct btree_geo btree_geo32 = { + .keylen = 1, + .no_pairs = NODESIZE / sizeof(long) / 2, + .no_longs = NODESIZE / sizeof(long) / 2, +}; +EXPORT_SYMBOL_GPL(btree_geo32); + +#define LONG_PER_U64 (64 / BITS_PER_LONG) +struct btree_geo btree_geo64 = { + .keylen = LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64), + .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo64); + +struct btree_geo btree_geo128 = { + .keylen = 2 * LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64), + .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo128); + +static struct kmem_cache *btree_cachep; + +void *btree_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmem_cache_alloc(btree_cachep, gfp_mask); +} +EXPORT_SYMBOL_GPL(btree_alloc); + +void btree_free(void *element, void *pool_data) +{ + kmem_cache_free(btree_cachep, element); +} +EXPORT_SYMBOL_GPL(btree_free); + +static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp) +{ + unsigned long *node; + + node = mempool_alloc(head->mempool, gfp); + memset(node, 0, NODESIZE); + return node; +} + +static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (l1[i] < l2[i]) + return -1; + if (l1[i] > l2[i]) + return 1; + } + return 0; +} + +static unsigned long *longcpy(unsigned long *dest, const unsigned long *src, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + dest[i] = src[i]; + return dest; +} + +static unsigned long *longset(unsigned long *s, unsigned long c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + s[i] = c; + return s; +} + +static void dec_key(struct btree_geo *geo, unsigned long *key) +{ + unsigned long val; + int i; + + for (i = geo->keylen - 1; i >= 0; i--) { + val = key[i]; + key[i] = val - 1; + if (val) + break; + } +} + +static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n) +{ + return &node[n * geo->keylen]; +} + +static void *bval(struct btree_geo *geo, unsigned long *node, int n) +{ + return (void *)node[geo->no_longs + n]; +} + +static void setkey(struct btree_geo *geo, unsigned long *node, int n, + unsigned long *key) +{ + longcpy(bkey(geo, node, n), key, geo->keylen); +} + +static void setval(struct btree_geo *geo, unsigned long *node, int n, + void *val) +{ + node[geo->no_longs + n] = (unsigned long) val; +} + +static void clearpair(struct btree_geo *geo, unsigned long *node, int n) +{ + longset(bkey(geo, node, n), 0, geo->keylen); + node[geo->no_longs + n] = 0; +} + +static inline void __btree_init(struct btree_head *head) +{ + head->node = NULL; + head->height = 0; +} + +void btree_init_mempool(struct btree_head *head, mempool_t *mempool) +{ + __btree_init(head); + head->mempool = mempool; +} +EXPORT_SYMBOL_GPL(btree_init_mempool); + +int btree_init(struct btree_head *head) +{ + __btree_init(head); + head->mempool = mempool_create(0, btree_alloc, btree_free, NULL); + if (!head->mempool) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(btree_init); + +void btree_destroy(struct btree_head *head) +{ + mempool_destroy(head->mempool); + head->mempool = NULL; +} +EXPORT_SYMBOL_GPL(btree_destroy); + +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int height = head->height; + 
unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) + node = bval(geo, node, 0); + + longcpy(key, bkey(geo, node, 0), geo->keylen); + return bval(geo, node, 0); +} +EXPORT_SYMBOL_GPL(btree_last); + +static int keycmp(struct btree_geo *geo, unsigned long *node, int pos, + unsigned long *key) +{ + return longcmp(bkey(geo, node, pos), key, geo->keylen); +} + +static int keyzero(struct btree_geo *geo, unsigned long *key) +{ + int i; + + for (i = 0; i < geo->keylen; i++) + if (key[i]) + return 0; + + return 1; +} + +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return NULL; + node = bval(geo, node, i); + if (!node) + return NULL; + } + + if (!node) + return NULL; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) + return bval(geo, node, i); + return NULL; +} +EXPORT_SYMBOL_GPL(btree_lookup); + +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return -ENOENT; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return -ENOENT; + node = bval(geo, node, i); + if (!node) + return -ENOENT; + } + + if (!node) + return -ENOENT; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) { + setval(geo, node, i, val); + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(btree_update); + +/* + * Usually this function is quite similar to normal lookup. But the key of + * a parent node may be smaller than the smallest key of all its siblings. + * In such a case we cannot just return NULL, as we have only proven that no + * key smaller than __key, but larger than this parent key exists. + * So we set __key to the parent key and retry. We have to use the smallest + * such parent key, which is the last parent key we encountered. 
+ */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *__key) +{ + int i, height; + unsigned long *node, *oldnode; + unsigned long *retry_key = NULL, key[geo->keylen]; + + if (keyzero(geo, __key)) + return NULL; + + if (head->height == 0) + return NULL; +retry: + longcpy(key, __key, geo->keylen); + dec_key(geo, key); + + node = head->node; + for (height = head->height ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + goto miss; + oldnode = node; + node = bval(geo, node, i); + if (!node) + goto miss; + retry_key = bkey(geo, oldnode, i); + } + + if (!node) + goto miss; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) { + if (bval(geo, node, i)) { + longcpy(__key, bkey(geo, node, i), geo->keylen); + return bval(geo, node, i); + } else + goto miss; + } + } +miss: + if (retry_key) { + __key = retry_key; + retry_key = NULL; + goto retry; + } + return NULL; +} + +static int getpos(struct btree_geo *geo, unsigned long *node, + unsigned long *key) +{ + int i; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) + break; + } + return i; +} + +static int getfill(struct btree_geo *geo, unsigned long *node, int start) +{ + int i; + + for (i = start; i < geo->no_pairs; i++) + if (!bval(geo, node, i)) + break; + return i; +} + +/* + * locate the correct leaf node in the btree + */ +static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node = head->node; + int i, height; + + for (height = head->height; height > level; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + + if ((i == geo->no_pairs) || !bval(geo, node, i)) { + /* right-most key is too large, update it */ + /* FIXME: If the right-most key on higher levels is + * always zero, this wouldn't be necessary. 
*/ + i--; + setkey(geo, node, i, key); + } + BUG_ON(i < 0); + node = bval(geo, node, i); + } + BUG_ON(!node); + return node; +} + +static int btree_grow(struct btree_head *head, struct btree_geo *geo, + gfp_t gfp) +{ + unsigned long *node; + int fill; + + node = btree_node_alloc(head, gfp); + if (!node) + return -ENOMEM; + if (head->node) { + fill = getfill(geo, head->node, 0); + setkey(geo, node, 0, bkey(geo, head->node, fill - 1)); + setval(geo, node, 0, head->node); + } + head->node = node; + head->height++; + return 0; +} + +static void btree_shrink(struct btree_head *head, struct btree_geo *geo) +{ + unsigned long *node; + int fill; + + if (head->height <= 1) + return; + + node = head->node; + fill = getfill(geo, node, 0); + BUG_ON(fill > 1); + head->node = bval(geo, node, 0); + head->height--; + mempool_free(node, head->mempool); +} + +static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, int level, + gfp_t gfp) +{ + unsigned long *node; + int i, pos, fill, err; + + BUG_ON(!val); + if (head->height < level) { + err = btree_grow(head, geo, gfp); + if (err) + return err; + } + +retry: + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + /* two identical keys are not allowed */ + BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0); + + if (fill == geo->no_pairs) { + /* need to split node */ + unsigned long *new; + + new = btree_node_alloc(head, gfp); + if (!new) + return -ENOMEM; + err = btree_insert_level(head, geo, + bkey(geo, node, fill / 2 - 1), + new, level + 1, gfp); + if (err) { + mempool_free(new, head->mempool); + return err; + } + for (i = 0; i < fill / 2; i++) { + setkey(geo, new, i, bkey(geo, node, i)); + setval(geo, new, i, bval(geo, node, i)); + setkey(geo, node, i, bkey(geo, node, i + fill / 2)); + setval(geo, node, i, bval(geo, node, i + fill / 2)); + clearpair(geo, node, i + fill / 2); + } + if (fill & 1) { + setkey(geo, node, i, bkey(geo, node, fill - 1)); + setval(geo, node, i, bval(geo, node, fill - 1)); + clearpair(geo, node, fill - 1); + } + goto retry; + } + BUG_ON(fill >= geo->no_pairs); + + /* shift and insert */ + for (i = fill; i > pos; i--) { + setkey(geo, node, i, bkey(geo, node, i - 1)); + setval(geo, node, i, bval(geo, node, i - 1)); + } + setkey(geo, node, pos, key); + setval(geo, node, pos, val); + + return 0; +} + +int btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp) +{ + return btree_insert_level(head, geo, key, val, 1, gfp); +} +EXPORT_SYMBOL_GPL(btree_insert); + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level); +static void merge(struct btree_head *head, struct btree_geo *geo, int level, + unsigned long *left, int lfill, + unsigned long *right, int rfill, + unsigned long *parent, int lpos) +{ + int i; + + for (i = 0; i < rfill; i++) { + /* Move all keys to the left */ + setkey(geo, left, lfill + i, bkey(geo, right, i)); + setval(geo, left, lfill + i, bval(geo, right, i)); + } + /* Exchange left and right child in parent */ + setval(geo, parent, lpos, right); + setval(geo, parent, lpos + 1, left); + /* Remove left (formerly right) child from parent */ + btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1); + mempool_free(right, head->mempool); +} + +static void rebalance(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level, unsigned long *child, int fill) +{ + unsigned long 
*parent, *left = NULL, *right = NULL;
+	int i, no_left, no_right;
+
+	if (fill == 0) {
+		/* Because we don't steal entries from a neighbour, this case
+		 * can happen.  Parent node contains a single child, this
+		 * node, so merging with a sibling never happens.
+		 */
+		btree_remove_level(head, geo, key, level + 1);
+		mempool_free(child, head->mempool);
+		return;
+	}
+
+	parent = find_level(head, geo, key, level + 1);
+	i = getpos(geo, parent, key);
+	BUG_ON(bval(geo, parent, i) != child);
+
+	if (i > 0) {
+		left = bval(geo, parent, i - 1);
+		no_left = getfill(geo, left, 0);
+		if (fill + no_left <= geo->no_pairs) {
+			merge(head, geo, level,
+					left, no_left,
+					child, fill,
+					parent, i - 1);
+			return;
+		}
+	}
+	if (i + 1 < getfill(geo, parent, i)) {
+		right = bval(geo, parent, i + 1);
+		no_right = getfill(geo, right, 0);
+		if (fill + no_right <= geo->no_pairs) {
+			merge(head, geo, level,
+					child, fill,
+					right, no_right,
+					parent, i);
+			return;
+		}
+	}
+	/*
+	 * We could also try to steal one entry from the left or right
+	 * neighbor.  By not doing so we changed the invariant from
+	 * "all nodes are at least half full" to "no two neighboring
+	 * nodes can be merged", which means that the average fill of
+	 * all nodes is still half or better.
+	 */
+}
+
+static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
+		unsigned long *key, int level)
+{
+	unsigned long *node;
+	int i, pos, fill;
+	void *ret;
+
+	if (level > head->height) {
+		/* we recursed all the way up */
+		head->height = 0;
+		head->node = NULL;
+		return NULL;
+	}
+
+	node = find_level(head, geo, key, level);
+	pos = getpos(geo, node, key);
+	fill = getfill(geo, node, pos);
+	if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
+		return NULL;
+	ret = bval(geo, node, pos);
+
+	/* remove and shift */
+	for (i = pos; i < fill - 1; i++) {
+		setkey(geo, node, i, bkey(geo, node, i + 1));
+		setval(geo, node, i, bval(geo, node, i + 1));
+	}
+	clearpair(geo, node, fill - 1);
+
+	if (fill - 1 < geo->no_pairs / 2) {
+		if (level < head->height)
+			rebalance(head, geo, key, level, node, fill - 1);
+		else if (fill - 1 == 1)
+			btree_shrink(head, geo);
+	}
+
+	return ret;
+}
+
+void *btree_remove(struct btree_head *head, struct btree_geo *geo,
+		unsigned long *key)
+{
+	if (head->height == 0)
+		return NULL;
+
+	return btree_remove_level(head, geo, key, 1);
+}
+EXPORT_SYMBOL_GPL(btree_remove);
+
+int btree_merge(struct btree_head *target, struct btree_head *victim,
+		struct btree_geo *geo, gfp_t gfp)
+{
+	unsigned long key[geo->keylen];
+	unsigned long dup[geo->keylen];
+	void *val;
+	int err;
+
+	BUG_ON(target == victim);
+
+	if (!(target->node)) {
+		/* target is empty, just copy fields over */
+		target->node = victim->node;
+		target->height = victim->height;
+		__btree_init(victim);
+		return 0;
+	}
+
+	/* TODO: This needs some optimizations. Currently we do three tree
+	 * walks to remove a single object from the victim.
+	 */
+	for (;;) {
+		if (!btree_last(victim, geo, key))
+			break;
+		val = btree_lookup(victim, geo, key);
+		err = btree_insert(target, geo, key, val, gfp);
+		if (err)
+			return err;
+		/* We must make a copy of the key, as the original will get
+		 * mangled inside btree_remove.
*/ + longcpy(dup, key, geo->keylen); + btree_remove(victim, geo, dup); + } + return 0; +} +EXPORT_SYMBOL_GPL(btree_merge); + +static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo, + unsigned long *node, unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2, int reap, int height, size_t count) +{ + int i; + unsigned long *child; + + for (i = 0; i < geo->no_pairs; i++) { + child = bval(geo, node, i); + if (!child) + break; + if (height > 1) + count = __btree_for_each(head, geo, child, opaque, + func, func2, reap, height - 1, count); + else + func(child, opaque, bkey(geo, node, i), count++, + func2); + } + if (reap) + mempool_free(node, head->mempool); + return count; +} + +static void empty(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *func2) +{ +} + +void visitorl(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func) +{ + visitorl_t func = __func; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitorl); + +void visitor32(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor32_t func = __func; + u32 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor32); + +void visitor64(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor64_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor64); + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor128_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, key[0], key[1], index); +} +EXPORT_SYMBOL_GPL(visitor128); + +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 0, head->height, 0); + return count; +} +EXPORT_SYMBOL_GPL(btree_visitor); + +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 1, head->height, 0); + __btree_init(head); + return count; +} +EXPORT_SYMBOL_GPL(btree_grim_visitor); + +static int __init btree_module_init(void) +{ + btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + return 0; +} + +static void __exit btree_module_exit(void) +{ + kmem_cache_destroy(btree_cachep); +} + +/* If core code starts using btree, initialization should happen even earlier */ +module_init(btree_module_init); +module_exit(btree_module_exit); + +MODULE_AUTHOR("Joern Engel "); +MODULE_AUTHOR("Johannes Berg "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e169cfbef46d62e042614ffafa8880eed1d894bb Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 14:53:09 -0700 Subject: of/flattree: merge find_flat_dt_string and initial_boot_params Merge common code between Microblaze and PowerPC. 
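For context, a minimal sketch (not part of the patch) of how the now-shared
helper gets used when decoding an OF_DT_PROP record; the two-word record
header matches the unflatten_dt_node() code consolidated later in this
series:

	/* Illustrative only: a property record carries its value size and
	 * an offset into the blob's strings block; the helper turns that
	 * offset into a pointer to the property name. */
	static char *prop_name_at(unsigned long *p)
	{
		u32 sz = *((u32 *)(*p));		/* value size (unused here) */
		u32 noff = *((u32 *)((*p) + 4));	/* offset into strings block */

		(void)sz;
		return find_flat_dt_string(noff);
	}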
Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/Kconfig | 1 + arch/microblaze/kernel/prom.c | 8 -------- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/prom.c | 12 ------------ drivers/of/Kconfig | 4 ++++ drivers/of/Makefile | 1 + drivers/of/fdt.c | 21 +++++++++++++++++++++ include/linux/of_fdt.h | 4 ++++ 8 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 drivers/of/fdt.c (limited to 'include/linux') diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index bbd8327f1890..f39c9275a29b 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -111,6 +111,7 @@ config CMDLINE_FORCE config OF def_bool y + select OF_FLATTREE config PROC_DEVICETREE bool "Support for device tree in /proc" diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index b817df172aa9..06d620ab4168 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -47,17 +47,9 @@ static int __initdata dt_root_size_cells; typedef u32 cell_t; -static struct boot_param_header *initial_boot_params; - /* export that to outside world */ struct device_node *of_chosen; -static inline char *find_flat_dt_string(u32 offset) -{ - return ((char *)initial_boot_params) + - initial_boot_params->off_dt_strings + offset; -} - /** * This function is used to scan the flattened device-tree, it is * used to extract the memory informations at boot before we can diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2ba14e77296c..2a75c6ae2a8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -163,6 +163,7 @@ config PPC_OF config OF def_bool y + select OF_FLATTREE config PPC_UDBG_16550 bool diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4ec300862466..fccf7e49bb28 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -73,12 +73,6 @@ unsigned long tce_alloc_start, tce_alloc_end; typedef u32 cell_t; -#if 0 -static struct boot_param_header *initial_boot_params __initdata; -#else -struct boot_param_header *initial_boot_params; -#endif - extern struct device_node *allnodes; /* temporary while merging */ extern rwlock_t devtree_lock; /* temporary while merging */ @@ -86,12 +80,6 @@ extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; -static inline char *find_flat_dt_string(u32 offset) -{ - return ((char *)initial_boot_params) + - initial_boot_params->off_dt_strings + offset; -} - /** * This function is used to scan the flattened device-tree, it is * used to extract the memory informations at boot before we can diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index d2fa27c5c1b2..462825e03123 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -1,3 +1,7 @@ +config OF_FLATTREE + bool + depends on OF + config OF_DEVICE def_bool y depends on OF && (SPARC || PPC_OF || MICROBLAZE) diff --git a/drivers/of/Makefile b/drivers/of/Makefile index bdfb5f5d4b06..f232cc98ce00 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,4 +1,5 @@ obj-y = base.o +obj-$(CONFIG_OF_FLATTREE) += fdt.o obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o obj-$(CONFIG_OF_I2C) += of_i2c.o diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c new file mode 100644 index 000000000000..9faa9a5cbdf0 --- /dev/null +++ b/drivers/of/fdt.c @@ -0,0 +1,21 @@ +/* + * Functions for working with the Flattened Device Tree data format + * + * Copyright 2009 Benjamin Herrenschmidt, IBM 
Corp + * benh@kernel.crashing.org + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include + +struct boot_param_header *initial_boot_params; + +char *find_flat_dt_string(u32 offset) +{ + return ((char *)initial_boot_params) + + initial_boot_params->off_dt_strings + offset; +} diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 41d432b13553..d1a79f3da789 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -57,7 +57,11 @@ struct boot_param_header { u32 dt_struct_size; /* size of the DT structure block */ }; +/* TBD: Temporary export of fdt globals - remove when code fully merged */ +extern struct boot_param_header *initial_boot_params; + /* For scanning the flat device-tree at boot time */ +extern char *find_flat_dt_string(u32 offset); extern int __init of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, int depth, void *data), -- cgit v1.2.3 From 31a6a87dfc34fbf02aef9a160adf558ec56d3ccd Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 19:49:38 -0700 Subject: of/flattree: remove __init annotations from the header file __init annotation belongs in the .c file, not the header. Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- include/linux/of_fdt.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d1a79f3da789..81231e04e8f3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -62,15 +62,13 @@ extern struct boot_param_header *initial_boot_params; /* For scanning the flat device-tree at boot time */ extern char *find_flat_dt_string(u32 offset); -extern int __init of_scan_flat_dt(int (*it)(unsigned long node, - const char *uname, int depth, - void *data), - void *data); -extern void __init *of_get_flat_dt_prop(unsigned long node, const char *name, - unsigned long *size); -extern int __init of_flat_dt_is_compatible(unsigned long node, - const char *name); -extern unsigned long __init of_get_flat_dt_root(void); +extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, + int depth, void *data), + void *data); +extern void *of_get_flat_dt_prop(unsigned long node, const char *name, + unsigned long *size); +extern int of_flat_dt_is_compatible(unsigned long node, const char *name); +extern unsigned long of_get_flat_dt_root(void); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From bbd33931a08362f78266a4016211a35947b91041 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:07:00 -0700 Subject: of/flattree: Merge unflatten_dt_node Merge common code between PowerPC and MicroBlaze Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/kernel/prom.c | 195 ---------------------------------------- arch/powerpc/kernel/prom.c | 194 ---------------------------------------- drivers/of/fdt.c | 200 ++++++++++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 4 + 4 files changed, 204 insertions(+), 389 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index eb27bd3a39b4..021770abfbd7 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -50,201 +50,6 @@ typedef u32 cell_t; /* export that to outside world */ struct device_node 
*of_chosen; -static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) -{ - void *res; - - *mem = _ALIGN(*mem, align); - res = (void *)*mem; - *mem += size; - - return res; -} - -static unsigned long __init unflatten_dt_node(unsigned long mem, - unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize) -{ - struct device_node *np; - struct property *pp, **prev_pp = NULL; - char *pathp; - u32 tag; - unsigned int l, allocl; - int has_name = 0; - int new_format = 0; - - tag = *((u32 *)(*p)); - if (tag != OF_DT_BEGIN_NODE) { - printk("Weird tag at start of node: %x\n", tag); - return mem; - } - *p += 4; - pathp = (char *)*p; - l = allocl = strlen(pathp) + 1; - *p = _ALIGN(*p + l, 4); - - /* version 0x10 has a more compact unit name here instead of the full - * path. we accumulate the full path size using "fpsize", we'll rebuild - * it later. We detect this because the first character of the name is - * not '/'. - */ - if ((*pathp) != '/') { - new_format = 1; - if (fpsize == 0) { - /* root node: special case. fpsize accounts for path - * plus terminating zero. root node only has '/', so - * fpsize should be 2, but we want to avoid the first - * level nodes to have two '/' so we use fpsize 1 here - */ - fpsize = 1; - allocl = 2; - } else { - /* account for '/' and path size minus terminal 0 - * already in 'l' - */ - fpsize += l; - allocl = fpsize; - } - } - - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, - __alignof__(struct device_node)); - if (allnextpp) { - memset(np, 0, sizeof(*np)); - np->full_name = ((char *)np) + sizeof(struct device_node); - if (new_format) { - char *p2 = np->full_name; - /* rebuild full path for new format */ - if (dad && dad->parent) { - strcpy(p2, dad->full_name); -#ifdef DEBUG - if ((strlen(p2) + l + 1) != allocl) { - pr_debug("%s: p: %d, l: %d, a: %d\n", - pathp, (int)strlen(p2), - l, allocl); - } -#endif - p2 += strlen(p2); - } - *(p2++) = '/'; - memcpy(p2, pathp, l); - } else - memcpy(np->full_name, pathp, l); - prev_pp = &np->properties; - **allnextpp = np; - *allnextpp = &np->allnext; - if (dad != NULL) { - np->parent = dad; - /* we temporarily use the next field as `last_child'*/ - if (dad->next == NULL) - dad->child = np; - else - dad->next->sibling = np; - dad->next = np; - } - kref_init(&np->kref); - } - while (1) { - u32 sz, noff; - char *pname; - - tag = *((u32 *)(*p)); - if (tag == OF_DT_NOP) { - *p += 4; - continue; - } - if (tag != OF_DT_PROP) - break; - *p += 4; - sz = *((u32 *)(*p)); - noff = *((u32 *)((*p) + 4)); - *p += 8; - if (initial_boot_params->version < 0x10) - *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); - - pname = find_flat_dt_string(noff); - if (pname == NULL) { - printk(KERN_INFO - "Can't find property name in list !\n"); - break; - } - if (strcmp(pname, "name") == 0) - has_name = 1; - l = strlen(pname) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property), - __alignof__(struct property)); - if (allnextpp) { - if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; - } - if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); - pp->name = pname; - pp->length = sz; - pp->value = (void *)*p; - *prev_pp = pp; - prev_pp = &pp->next; - } - *p = _ALIGN((*p) + sz, 4); - } - /* with version 0x10 we may not have the name property, recreate - * it here from the unit name if absent - */ - if (!has_name) { - char *p1 = pathp, *ps = pathp, *pa = NULL; - int sz; - - while (*p1) { - if ((*p1) == '@') - pa = p1; - if ((*p1) == '/') - ps = p1 + 1; - p1++; - } - if (pa < ps) - pa = p1; - sz = (pa - ps) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, - __alignof__(struct property)); - if (allnextpp) { - pp->name = "name"; - pp->length = sz; - pp->value = pp + 1; - *prev_pp = pp; - prev_pp = &pp->next; - memcpy(pp->value, ps, sz - 1); - ((char *)pp->value)[sz - 1] = 0; - pr_debug("fixed up name for %s -> %s\n", pathp, - (char *)pp->value); - } - } - if (allnextpp) { - *prev_pp = NULL; - np->name = of_get_property(np, "name", NULL); - np->type = of_get_property(np, "device_type", NULL); - - if (!np->name) - np->name = ""; - if (!np->type) - np->type = ""; - } - while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); - tag = *((u32 *)(*p)); - } - if (tag != OF_DT_END_NODE) { - printk(KERN_INFO "Weird tag at end of node: %x\n", tag); - return mem; - } - *p += 4; - return mem; -} - /** * unflattens the device-tree passed by the firmware, creating the * tree of struct device_node. It also fills the "name" and "type" diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 413e608863dd..a102a0a33ed1 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -80,200 +80,6 @@ extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; -static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) -{ - void *res; - - *mem = _ALIGN(*mem, align); - res = (void *)*mem; - *mem += size; - - return res; -} - -static unsigned long __init unflatten_dt_node(unsigned long mem, - unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize) -{ - struct device_node *np; - struct property *pp, **prev_pp = NULL; - char *pathp; - u32 tag; - unsigned int l, allocl; - int has_name = 0; - int new_format = 0; - - tag = *((u32 *)(*p)); - if (tag != OF_DT_BEGIN_NODE) { - printk("Weird tag at start of node: %x\n", tag); - return mem; - } - *p += 4; - pathp = (char *)*p; - l = allocl = strlen(pathp) + 1; - *p = _ALIGN(*p + l, 4); - - /* version 0x10 has a more compact unit name here instead of the full - * path. we accumulate the full path size using "fpsize", we'll rebuild - * it later. We detect this because the first character of the name is - * not '/'. - */ - if ((*pathp) != '/') { - new_format = 1; - if (fpsize == 0) { - /* root node: special case. fpsize accounts for path - * plus terminating zero. 
root node only has '/', so - * fpsize should be 2, but we want to avoid the first - * level nodes to have two '/' so we use fpsize 1 here - */ - fpsize = 1; - allocl = 2; - } else { - /* account for '/' and path size minus terminal 0 - * already in 'l' - */ - fpsize += l; - allocl = fpsize; - } - } - - - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, - __alignof__(struct device_node)); - if (allnextpp) { - memset(np, 0, sizeof(*np)); - np->full_name = ((char*)np) + sizeof(struct device_node); - if (new_format) { - char *p = np->full_name; - /* rebuild full path for new format */ - if (dad && dad->parent) { - strcpy(p, dad->full_name); -#ifdef DEBUG - if ((strlen(p) + l + 1) != allocl) { - DBG("%s: p: %d, l: %d, a: %d\n", - pathp, (int)strlen(p), l, allocl); - } -#endif - p += strlen(p); - } - *(p++) = '/'; - memcpy(p, pathp, l); - } else - memcpy(np->full_name, pathp, l); - prev_pp = &np->properties; - **allnextpp = np; - *allnextpp = &np->allnext; - if (dad != NULL) { - np->parent = dad; - /* we temporarily use the next field as `last_child'*/ - if (dad->next == 0) - dad->child = np; - else - dad->next->sibling = np; - dad->next = np; - } - kref_init(&np->kref); - } - while(1) { - u32 sz, noff; - char *pname; - - tag = *((u32 *)(*p)); - if (tag == OF_DT_NOP) { - *p += 4; - continue; - } - if (tag != OF_DT_PROP) - break; - *p += 4; - sz = *((u32 *)(*p)); - noff = *((u32 *)((*p) + 4)); - *p += 8; - if (initial_boot_params->version < 0x10) - *p = _ALIGN(*p, sz >= 8 ? 8 : 4); - - pname = find_flat_dt_string(noff); - if (pname == NULL) { - printk("Can't find property name in list !\n"); - break; - } - if (strcmp(pname, "name") == 0) - has_name = 1; - l = strlen(pname) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property), - __alignof__(struct property)); - if (allnextpp) { - if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; - } - if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); - pp->name = pname; - pp->length = sz; - pp->value = (void *)*p; - *prev_pp = pp; - prev_pp = &pp->next; - } - *p = _ALIGN((*p) + sz, 4); - } - /* with version 0x10 we may not have the name property, recreate - * it here from the unit name if absent - */ - if (!has_name) { - char *p = pathp, *ps = pathp, *pa = NULL; - int sz; - - while (*p) { - if ((*p) == '@') - pa = p; - if ((*p) == '/') - ps = p + 1; - p++; - } - if (pa < ps) - pa = p; - sz = (pa - ps) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, - __alignof__(struct property)); - if (allnextpp) { - pp->name = "name"; - pp->length = sz; - pp->value = pp + 1; - *prev_pp = pp; - prev_pp = &pp->next; - memcpy(pp->value, ps, sz - 1); - ((char *)pp->value)[sz - 1] = 0; - DBG("fixed up name for %s -> %s\n", pathp, - (char *)pp->value); - } - } - if (allnextpp) { - *prev_pp = NULL; - np->name = of_get_property(np, "name", NULL); - np->type = of_get_property(np, "device_type", NULL); - - if (!np->name) - np->name = ""; - if (!np->type) - np->type = ""; - } - while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); - tag = *((u32 *)(*p)); - } - if (tag != OF_DT_END_NODE) { - printk("Weird tag at end of node: %x\n", tag); - return mem; - } - *p += 4; - return mem; -} - static int __init early_parse_mem(char *p) { if (!p) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 5cdd958db9af..6852ecf6d1e1 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -166,3 +166,203 @@ int __init 
of_flat_dt_is_compatible(unsigned long node, const char *compat) return 0; } +static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, + unsigned long align) +{ + void *res; + + *mem = _ALIGN(*mem, align); + res = (void *)*mem; + *mem += size; + + return res; +} + +/** + * unflatten_dt_node - Alloc and populate a device_node from the flat tree + * @p: pointer to node in flat tree + * @dad: Parent struct device_node + * @allnextpp: pointer to ->allnext from last allocated device_node + * @fpsize: Size of the node path up at the current depth. + */ +unsigned long __init unflatten_dt_node(unsigned long mem, + unsigned long *p, + struct device_node *dad, + struct device_node ***allnextpp, + unsigned long fpsize) +{ + struct device_node *np; + struct property *pp, **prev_pp = NULL; + char *pathp; + u32 tag; + unsigned int l, allocl; + int has_name = 0; + int new_format = 0; + + tag = *((u32 *)(*p)); + if (tag != OF_DT_BEGIN_NODE) { + pr_err("Weird tag at start of node: %x\n", tag); + return mem; + } + *p += 4; + pathp = (char *)*p; + l = allocl = strlen(pathp) + 1; + *p = _ALIGN(*p + l, 4); + + /* version 0x10 has a more compact unit name here instead of the full + * path. we accumulate the full path size using "fpsize", we'll rebuild + * it later. We detect this because the first character of the name is + * not '/'. + */ + if ((*pathp) != '/') { + new_format = 1; + if (fpsize == 0) { + /* root node: special case. fpsize accounts for path + * plus terminating zero. root node only has '/', so + * fpsize should be 2, but we want to avoid the first + * level nodes to have two '/' so we use fpsize 1 here + */ + fpsize = 1; + allocl = 2; + } else { + /* account for '/' and path size minus terminal 0 + * already in 'l' + */ + fpsize += l; + allocl = fpsize; + } + } + + np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, + __alignof__(struct device_node)); + if (allnextpp) { + memset(np, 0, sizeof(*np)); + np->full_name = ((char *)np) + sizeof(struct device_node); + if (new_format) { + char *fn = np->full_name; + /* rebuild full path for new format */ + if (dad && dad->parent) { + strcpy(fn, dad->full_name); +#ifdef DEBUG + if ((strlen(fn) + l + 1) != allocl) { + pr_debug("%s: p: %d, l: %d, a: %d\n", + pathp, (int)strlen(fn), + l, allocl); + } +#endif + fn += strlen(fn); + } + *(fn++) = '/'; + memcpy(fn, pathp, l); + } else + memcpy(np->full_name, pathp, l); + prev_pp = &np->properties; + **allnextpp = np; + *allnextpp = &np->allnext; + if (dad != NULL) { + np->parent = dad; + /* we temporarily use the next field as `last_child'*/ + if (dad->next == NULL) + dad->child = np; + else + dad->next->sibling = np; + dad->next = np; + } + kref_init(&np->kref); + } + while (1) { + u32 sz, noff; + char *pname; + + tag = *((u32 *)(*p)); + if (tag == OF_DT_NOP) { + *p += 4; + continue; + } + if (tag != OF_DT_PROP) + break; + *p += 4; + sz = *((u32 *)(*p)); + noff = *((u32 *)((*p) + 4)); + *p += 8; + if (initial_boot_params->version < 0x10) + *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); + + pname = find_flat_dt_string(noff); + if (pname == NULL) { + pr_info("Can't find property name in list !\n"); + break; + } + if (strcmp(pname, "name") == 0) + has_name = 1; + l = strlen(pname) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property), + __alignof__(struct property)); + if (allnextpp) { + if (strcmp(pname, "linux,phandle") == 0) { + np->node = *((u32 *)*p); + if (np->linux_phandle == 0) + np->linux_phandle = np->node; + } + if (strcmp(pname, "ibm,phandle") == 0) + np->linux_phandle = *((u32 *)*p); + pp->name = pname; + pp->length = sz; + pp->value = (void *)*p; + *prev_pp = pp; + prev_pp = &pp->next; + } + *p = _ALIGN((*p) + sz, 4); + } + /* with version 0x10 we may not have the name property, recreate + * it here from the unit name if absent + */ + if (!has_name) { + char *p1 = pathp, *ps = pathp, *pa = NULL; + int sz; + + while (*p1) { + if ((*p1) == '@') + pa = p1; + if ((*p1) == '/') + ps = p1 + 1; + p1++; + } + if (pa < ps) + pa = p1; + sz = (pa - ps) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, + __alignof__(struct property)); + if (allnextpp) { + pp->name = "name"; + pp->length = sz; + pp->value = pp + 1; + *prev_pp = pp; + prev_pp = &pp->next; + memcpy(pp->value, ps, sz - 1); + ((char *)pp->value)[sz - 1] = 0; + pr_debug("fixed up name for %s -> %s\n", pathp, + (char *)pp->value); + } + } + if (allnextpp) { + *prev_pp = NULL; + np->name = of_get_property(np, "name", NULL); + np->type = of_get_property(np, "device_type", NULL); + + if (!np->name) + np->name = ""; + if (!np->type) + np->type = ""; + } + while (tag == OF_DT_BEGIN_NODE) { + mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); + tag = *((u32 *)(*p)); + } + if (tag != OF_DT_END_NODE) { + pr_err("Weird tag at end of node: %x\n", tag); + return mem; + } + *p += 4; + return mem; +} diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 81231e04e8f3..ace9068e07e8 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -69,6 +69,10 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern unsigned long unflatten_dt_node(unsigned long mem, unsigned long *p, + struct device_node *dad, + struct device_node ***allnextpp, + unsigned long fpsize); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From 41f880091c15b039ffcc8b3d831656b81517a6d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:07:01 -0700 Subject: of/flattree: Merge unflatten_device_tree Merge common code between PowerPC and MicroBlaze Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/include/asm/prom.h | 1 - arch/microblaze/kernel/prom.c | 49 ----------------------------------- arch/powerpc/kernel/prom.c | 50 ------------------------------------ drivers/of/fdt.c | 52 ++++++++++++++++++++++++++++++++++++++ include/linux/of.h | 3 +++ include/linux/of_fdt.h | 4 --- 6 files changed, 55 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index ef3ec1d6ceb3..07d1063f9aae 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -37,7 +37,6 @@ extern struct device_node *of_chosen; #define HAVE_ARCH_DEVTREE_FIXUPS -extern struct device_node *allnodes; /* temporary while merging */ extern rwlock_t devtree_lock; /* 
temporary while merging */ /* For updating the device tree at runtime */ diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 021770abfbd7..901d538c15ef 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -50,55 +50,6 @@ typedef u32 cell_t; /* export that to outside world */ struct device_node *of_chosen; -/** - * unflattens the device-tree passed by the firmware, creating the - * tree of struct device_node. It also fills the "name" and "type" - * pointers of the nodes so the normal device-tree walking functions - * can be used (this used to be done by finish_device_tree) - */ -void __init unflatten_device_tree(void) -{ - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - - pr_debug(" -> unflatten_device_tree()\n"); - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL, 0); - size = (size | 3) + 1; - - pr_debug(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = lmb_alloc(size + 4, __alignof__(struct device_node)); - mem = (unsigned long) __va(mem); - - ((u32 *)mem)[size / 4] = 0xdeadbeef; - - pr_debug(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp, 0); - if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %08x\n", - *((u32 *)start)); - if (((u32 *)mem)[size / 4] != 0xdeadbeef) - printk(KERN_WARNING "End of tree marker overwritten: %08x\n", - ((u32 *)mem)[size / 4]); - *allnextp = NULL; - - /* Get pointer to OF "/chosen" node for use everywhere */ - of_chosen = of_find_node_by_path("/chosen"); - if (of_chosen == NULL) - of_chosen = of_find_node_by_path("/chosen@0"); - - pr_debug(" <- unflatten_device_tree()\n"); -} - #define early_init_dt_scan_drconf_memory(node) 0 static int __init early_init_dt_scan_cpus(unsigned long node, diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index a102a0a33ed1..1280f3484ad3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -73,8 +73,6 @@ unsigned long tce_alloc_start, tce_alloc_end; typedef u32 cell_t; -extern struct device_node *allnodes; /* temporary while merging */ - extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ @@ -119,54 +117,6 @@ static void __init move_device_tree(void) DBG("<- move_device_tree\n"); } -/** - * unflattens the device-tree passed by the firmware, creating the - * tree of struct device_node. 
It also fills the "name" and "type" - * pointers of the nodes so the normal device-tree walking functions - * can be used (this used to be done by finish_device_tree) - */ -void __init unflatten_device_tree(void) -{ - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - - DBG(" -> unflatten_device_tree()\n"); - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL, 0); - size = (size | 3) + 1; - - DBG(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = lmb_alloc(size + 4, __alignof__(struct device_node)); - mem = (unsigned long) __va(mem); - - ((u32 *)mem)[size / 4] = 0xdeadbeef; - - DBG(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp, 0); - if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); - if (((u32 *)mem)[size / 4] != 0xdeadbeef) - printk(KERN_WARNING "End of tree marker overwritten: %08x\n", - ((u32 *)mem)[size / 4] ); - *allnextp = NULL; - - /* Get pointer to OF "/chosen" node for use everywhere */ - of_chosen = of_find_node_by_path("/chosen"); - if (of_chosen == NULL) - of_chosen = of_find_node_by_path("/chosen@0"); - - DBG(" <- unflatten_device_tree()\n"); -} - /* * ibm,pa-features is a per-cpu property that contains a string of * attribute descriptors, each of which has a 2 byte header plus up diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 6852ecf6d1e1..43d236cbc17b 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -9,6 +9,8 @@ * version 2 as published by the Free Software Foundation. */ +#include +#include #include #include @@ -366,3 +368,53 @@ unsigned long __init unflatten_dt_node(unsigned long mem, *p += 4; return mem; } + +/** + * unflatten_device_tree - create tree of device_nodes from flat blob + * + * unflattens the device-tree passed by the firmware, creating the + * tree of struct device_node. It also fills the "name" and "type" + * pointers of the nodes so the normal device-tree walking functions + * can be used. 
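+ *
+ * (Implementation note: two passes are made over the blob.  The first
+ * call to unflatten_dt_node(), with a NULL allnextpp, only computes
+ * the size of the expanded tree; the second pass fills in the nodes,
+ * and a 0xdeadbeef marker word past the allocation catches overruns.)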
+ */ +void __init unflatten_device_tree(void) +{ + unsigned long start, mem, size; + struct device_node **allnextp = &allnodes; + + pr_debug(" -> unflatten_device_tree()\n"); + + /* First pass, scan for size */ + start = ((unsigned long)initial_boot_params) + + initial_boot_params->off_dt_struct; + size = unflatten_dt_node(0, &start, NULL, NULL, 0); + size = (size | 3) + 1; + + pr_debug(" size is %lx, allocating...\n", size); + + /* Allocate memory for the expanded device tree */ + mem = lmb_alloc(size + 4, __alignof__(struct device_node)); + mem = (unsigned long) __va(mem); + + ((u32 *)mem)[size / 4] = 0xdeadbeef; + + pr_debug(" unflattening %lx...\n", mem); + + /* Second pass, do actual unflattening */ + start = ((unsigned long)initial_boot_params) + + initial_boot_params->off_dt_struct; + unflatten_dt_node(mem, &start, NULL, &allnextp, 0); + if (*((u32 *)start) != OF_DT_END) + pr_warning("Weird tag at end of tree: %08x\n", *((u32 *)start)); + if (((u32 *)mem)[size / 4] != 0xdeadbeef) + pr_warning("End of tree marker overwritten: %08x\n", + ((u32 *)mem)[size / 4]); + *allnextp = NULL; + + /* Get pointer to OF "/chosen" node for use everywhere */ + of_chosen = of_find_node_by_path("/chosen"); + if (of_chosen == NULL) + of_chosen = of_find_node_by_path("/chosen@0"); + + pr_debug(" <- unflatten_device_tree()\n"); +} diff --git a/include/linux/of.h b/include/linux/of.h index e7facd8fbce8..bec215792c4f 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -63,6 +63,9 @@ struct device_node { #endif }; +/* Pointer for first entry in chain of all nodes. */ +extern struct device_node *allnodes; + static inline int of_node_check_flag(struct device_node *n, unsigned long flag) { return test_bit(flag, &n->_flags); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index ace9068e07e8..81231e04e8f3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -69,10 +69,6 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); -extern unsigned long unflatten_dt_node(unsigned long mem, unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From 2be09cb993826b52c9fc1d44747c20dd43a50038 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:16:46 -0700 Subject: of: remove special case definition of of_read_ulong() Special case of of_read_ulong() was defined for PPC32 to toss away all but the last 32 bits when a large number value was read, and the 'normal' version for ppc64 just #defined of_read_ulong to of_read_number which causes compiler warnings on MicroBlaze and other 32 bit architectures because it returns a u64 instead of a ulong. This patch fixes the problem by defining a common implementation of of_read_ulong() that works everywhere. 
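A minimal sketch of the effect on a 32-bit platform (hypothetical values,
illustrative only):

	/* of_read_number() returns u64.  With the old
	 *   #define of_read_ulong(cell, size) of_read_number(cell, size)
	 * the u64 leaked into unsigned long contexts and triggered
	 * truncation warnings; the common inline now performs the
	 * narrowing conversion itself. */
	static unsigned long example_base(void)
	{
		u32 cells[2] = { 0x1, 0x2000 };

		return of_read_ulong(cells, 2);	/* 0x2000 once truncated on 32-bit */
	}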
Signed-off-by: Grant Likely
Reviewed-by: Wolfram Sang
Tested-by: Michal Simek
---
 include/linux/of.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index bec215792c4f..d4c014a35ea5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -113,14 +113,11 @@ static inline u64 of_read_number(const u32 *cell, int size)
 }
 
 /* Like of_read_number, but we want an unsigned long result */
-#ifdef CONFIG_PPC32
 static inline unsigned long of_read_ulong(const u32 *cell, int size)
 {
-	return cell[size-1];
+	/* toss away upper bits if unsigned long is smaller than u64 */
+	return of_read_number(cell, size);
 }
-#else
-#define of_read_ulong(cell, size)	of_read_number(cell, size)
-#endif
 
 #include
-- cgit v1.2.3 

From d2eecb03936878ec574ade5532fa83df7d75dde7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Mon, 7 Dec 2009 10:36:20 -0500
Subject: ext4: Use slab allocator for sub-page sized allocations

Now that the SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() for the allocation if the block size
of the buffer heads to be allocated is less than the page size.
Previously, we were using a 16k page on a Power system for each buffer,
even when the file system was using a 1k or 4k block size.

Signed-off-by: "Theodore Ts'o"
---
 fs/jbd2/journal.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h | 11 +----
 2 files changed, 134 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #define CREATE_TRACE_POINTS
 #include
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal)
 		return JBD2_TAG_SIZE32;
 }
 
+/*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need a mutex to protect access to
+ * jbd2_slab[] when allocating or releasing memory; one is only
+ * needed in jbd2_journal_create_slab().
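+ *
+ * (For example, a 1k block size maps to the "jbd2_1k" cache and a 64k
+ * block size to "jbd2_64k": order_base_2(size) - 10 selects the slot,
+ * the smallest supported slab being 2^10 bytes.)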
+ */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; +static DECLARE_MUTEX(jbd2_slab_create_sem); + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + down(&jbd2_slab_create_sem); + if (jbd2_slab[i]) { + up(&jbd2_slab_create_sem); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + up(&jbd2_slab_create_sem); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == 0); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + /* * Journal_head storage management */ @@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_revoke_caches(); jbd2_journal_destroy_jbd2_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 638ce4554c76..8ada2a129d08 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug; #define jbd_debug(f, a...) /**/ #endif -static inline void *jbd2_alloc(size_t size, gfp_t flags) -{ - return (void *)__get_free_pages(flags, get_order(size)); -} - -static inline void jbd2_free(void *ptr, size_t size) -{ - free_pages((unsigned long)ptr, get_order(size)); -}; +extern void *jbd2_alloc(size_t size, gfp_t flags); +extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -- cgit v1.2.3 From 85438592f179c126ad4cb9a280046d4f0a501e6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Nov 2009 17:53:21 +0900 Subject: percpu: remove compile warnings caused by __verify_pcpu_ptr() If percpu pointer is const, __verify_pcpu_ptr() triggers warnings like the following. 
drivers/net/loopback.c: In function 'loopback_get_stats': drivers/net/loopback.c:109: warning: initialization discards qualifiers from pointer target type Fix it by adding const to the verification target pointer used in __verify_pcpu_ptr(). Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 1fa36eb54b6a..68567c0b3a5d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -24,7 +24,7 @@ * input parameter is a percpu pointer. */ #define __verify_pcpu_ptr(ptr) do { \ - void __percpu *__vpp_verify = (typeof(ptr))NULL; \ + const void __percpu *__vpp_verify = (typeof(ptr))NULL; \ (void)__vpp_verify; \ } while (0) -- cgit v1.2.3 From f7b3a8355ba6cad251297844a0bdd08898ea36e0 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:26:58 -0700 Subject: of/flattree: Merge early_init_dt_check_for_initrd() Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/prom.c | 32 -------------------------------- arch/powerpc/kernel/prom.c | 30 ------------------------------ drivers/of/fdt.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 1 + 4 files changed, 38 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index a38e3733a09c..7959495b1d00 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -113,38 +113,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -#ifdef CONFIG_BLK_DEV_INITRD -static void __init early_init_dt_check_for_initrd(unsigned long node) -{ - unsigned long l; - u32 *prop; - - pr_debug("Looking for initrd properties... "); - - prop = of_get_flat_dt_prop(node, "linux,initrd-start", &l); - if (prop) { - initrd_start = (unsigned long) - __va((u32)of_read_ulong(prop, l/4)); - - prop = of_get_flat_dt_prop(node, "linux,initrd-end", &l); - if (prop) { - initrd_end = (unsigned long) - __va((u32)of_read_ulong(prop, 1/4)); - initrd_below_start_ok = 1; - } else { - initrd_start = 0; - } - } - - pr_debug("initrd_start=0x%lx initrd_end=0x%lx\n", - initrd_start, initrd_end); -} -#else -static inline void early_init_dt_check_for_initrd(unsigned long node) -{ -} -#endif /* CONFIG_BLK_DEV_INITRD */ - static int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data) { diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7f8856655144..1ecd6c6ecabd 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -373,36 +373,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -#ifdef CONFIG_BLK_DEV_INITRD -static void __init early_init_dt_check_for_initrd(unsigned long node) -{ - unsigned long l; - u32 *prop; - - DBG("Looking for initrd properties... 
"); - - prop = of_get_flat_dt_prop(node, "linux,initrd-start", &l); - if (prop) { - initrd_start = (unsigned long)__va(of_read_ulong(prop, l/4)); - - prop = of_get_flat_dt_prop(node, "linux,initrd-end", &l); - if (prop) { - initrd_end = (unsigned long) - __va(of_read_ulong(prop, l/4)); - initrd_below_start_ok = 1; - } else { - initrd_start = 0; - } - } - - DBG("initrd_start=0x%lx initrd_end=0x%lx\n", initrd_start, initrd_end); -} -#else -static inline void early_init_dt_check_for_initrd(unsigned long node) -{ -} -#endif /* CONFIG_BLK_DEV_INITRD */ - static int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data) { diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 43d236cbc17b..6ad98e85dc93 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -369,6 +370,42 @@ unsigned long __init unflatten_dt_node(unsigned long mem, return mem; } +#ifdef CONFIG_BLK_DEV_INITRD +/** + * early_init_dt_check_for_initrd - Decode initrd location from flat tree + * @node: reference to node containing initrd location ('chosen') + */ +void __init early_init_dt_check_for_initrd(unsigned long node) +{ + unsigned long len; + u32 *prop; + + pr_debug("Looking for initrd properties... "); + + prop = of_get_flat_dt_prop(node, "linux,initrd-start", &len); + if (prop) { + initrd_start = (unsigned long) + __va(of_read_ulong(prop, len/4)); + + prop = of_get_flat_dt_prop(node, "linux,initrd-end", &len); + if (prop) { + initrd_end = (unsigned long) + __va(of_read_ulong(prop, len/4)); + initrd_below_start_ok = 1; + } else { + initrd_start = 0; + } + } + + pr_debug("initrd_start=0x%lx initrd_end=0x%lx\n", + initrd_start, initrd_end); +} +#else +inline void early_init_dt_check_for_initrd(unsigned long node) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 81231e04e8f3..ec2db8278c3f 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -69,6 +69,7 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern void early_init_dt_check_for_initrd(unsigned long node); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From f00abd94918c9780f9d2d961fc0e419c11457922 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:27:10 -0700 Subject: of/flattree: Merge earlyinit_dt_scan_root() Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Tested-by: Wolfram Sang --- arch/microblaze/kernel/prom.c | 23 ----------------------- arch/powerpc/kernel/prom.c | 24 ------------------------ drivers/of/fdt.c | 26 ++++++++++++++++++++++++++ include/linux/of_fdt.h | 6 ++++++ 4 files changed, 32 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 7959495b1d00..189179a9b554 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -42,9 +42,6 @@ #include #include -static int __initdata dt_root_addr_cells; -static int __initdata dt_root_size_cells; - typedef u32 cell_t; /* export that to outside world */ @@ -158,26 +155,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static int __init 
early_init_dt_scan_root(unsigned long node, - const char *uname, int depth, void *data) -{ - u32 *prop; - - if (depth != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "#size-cells", NULL); - dt_root_size_cells = (prop == NULL) ? 1 : *prop; - pr_debug("dt_root_size_cells = %x\n", dt_root_size_cells); - - prop = of_get_flat_dt_prop(node, "#address-cells", NULL); - dt_root_addr_cells = (prop == NULL) ? 2 : *prop; - pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells); - - /* break now */ - return 1; -} - static u64 __init dt_mem_next_cell(int s, cell_t **cellp) { cell_t *p = *cellp; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1ecd6c6ecabd..78f65a4d8b03 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -61,10 +61,6 @@ #define DBG(fmt...) #endif - -static int __initdata dt_root_addr_cells; -static int __initdata dt_root_size_cells; - #ifdef CONFIG_PPC64 int __initdata iommu_is_off; int __initdata iommu_force_on; @@ -436,26 +432,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static int __init early_init_dt_scan_root(unsigned long node, - const char *uname, int depth, void *data) -{ - u32 *prop; - - if (depth != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "#size-cells", NULL); - dt_root_size_cells = (prop == NULL) ? 1 : *prop; - DBG("dt_root_size_cells = %x\n", dt_root_size_cells); - - prop = of_get_flat_dt_prop(node, "#address-cells", NULL); - dt_root_addr_cells = (prop == NULL) ? 2 : *prop; - DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); - - /* break now */ - return 1; -} - static u64 __init dt_mem_next_cell(int s, cell_t **cellp) { cell_t *p = *cellp; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 6ad98e85dc93..be200be47269 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -15,6 +15,9 @@ #include #include +int __initdata dt_root_addr_cells; +int __initdata dt_root_size_cells; + struct boot_param_header *initial_boot_params; char *find_flat_dt_string(u32 offset) @@ -406,6 +409,29 @@ inline void early_init_dt_check_for_initrd(unsigned long node) } #endif /* CONFIG_BLK_DEV_INITRD */ +/** + * early_init_dt_scan_root - fetch the top level address and size cells + */ +int __init early_init_dt_scan_root(unsigned long node, const char *uname, + int depth, void *data) +{ + u32 *prop; + + if (depth != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "#size-cells", NULL); + dt_root_size_cells = (prop == NULL) ? 1 : *prop; + pr_debug("dt_root_size_cells = %x\n", dt_root_size_cells); + + prop = of_get_flat_dt_prop(node, "#address-cells", NULL); + dt_root_addr_cells = (prop == NULL) ? 
2 : *prop; + pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells); + + /* break now */ + return 1; +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index ec2db8278c3f..828c3cdaea78 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,6 +58,8 @@ struct boot_param_header { }; /* TBD: Temporary export of fdt globals - remove when code fully merged */ +extern int __initdata dt_root_addr_cells; +extern int __initdata dt_root_size_cells; extern struct boot_param_header *initial_boot_params; /* For scanning the flat device-tree at boot time */ @@ -71,6 +73,10 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern void early_init_dt_check_for_initrd(unsigned long node); +/* Early flat tree scan hooks */ +extern int early_init_dt_scan_root(unsigned long node, const char *uname, + int depth, void *data); + /* Other Prototypes */ extern void finish_device_tree(void); extern void unflatten_device_tree(void); -- cgit v1.2.3 From 83f7a06eb479e2aeb83536e77a2cb14cc2285e32 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:37:56 -0700 Subject: of/flattree: merge dt_mem_next_cell Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Tested-by: Wolfram Sang --- arch/microblaze/kernel/prom.c | 8 -------- arch/powerpc/kernel/prom.c | 8 -------- drivers/of/fdt.c | 8 ++++++++ include/linux/of_fdt.h | 1 + 4 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 189179a9b554..e0f4c34ed0f2 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -155,14 +155,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static u64 __init dt_mem_next_cell(int s, cell_t **cellp) -{ - cell_t *p = *cellp; - - *cellp = p + s; - return of_read_number(p, s); -} - static int __init early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data) { diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 78f65a4d8b03..048e3a3e9876 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -432,14 +432,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static u64 __init dt_mem_next_cell(int s, cell_t **cellp) -{ - cell_t *p = *cellp; - - *cellp = p + s; - return of_read_number(p, s); -} - #ifdef CONFIG_PPC_PSERIES /* * Interpret the ibm,dynamic-memory property in the diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index be200be47269..ebce509b0886 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -432,6 +432,14 @@ int __init early_init_dt_scan_root(unsigned long node, const char *uname, return 1; } +u64 __init dt_mem_next_cell(int s, u32 **cellp) +{ + u32 *p = *cellp; + + *cellp = p + s; + return of_read_number(p, s); +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 828c3cdaea78..d1a37e56031e 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -72,6 +72,7 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern void early_init_dt_check_for_initrd(unsigned long node); 
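The merged dt_mem_next_cell() is the low-level cursor for all flat-tree address parsing: device-tree cells are big-endian 32-bit words, and a property such as "reg" packs an address of dt_root_addr_cells cells followed by a size of dt_root_size_cells cells. Below is a standalone userspace model of that walk, modelled on dt_mem_next_cell()/of_read_number(); the property bytes and cell counts are invented example values, not from a real tree.

/*
 * Standalone model of flat-device-tree cell parsing. FDT cells are
 * big-endian, so htonl()/ntohl() stand in for the kernel's helpers.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static uint64_t read_number(const uint32_t *cell, int size)
{
	uint64_t r = 0;

	while (size--)
		r = (r << 32) | ntohl(*(cell++));
	return r;
}

/* Equivalent of dt_mem_next_cell(): read a value, advance the cursor. */
static uint64_t mem_next_cell(int s, const uint32_t **cellp)
{
	const uint32_t *p = *cellp;

	*cellp = p + s;
	return read_number(p, s);
}

int main(void)
{
	/* invented "reg" for a memory node: 2 address cells, 1 size cell */
	const uint32_t reg[] = { htonl(0x1), htonl(0x80000000), htonl(0x40000000) };
	const uint32_t *p = reg;
	int dt_root_addr_cells = 2, dt_root_size_cells = 1;

	uint64_t base = mem_next_cell(dt_root_addr_cells, &p);
	uint64_t size = mem_next_cell(dt_root_size_cells, &p);

	printf("base=0x%llx size=0x%llx\n",
	       (unsigned long long)base, (unsigned long long)size);
	return 0;
}

Calling mem_next_cell() once per field, as the arch memory scanners do, yields base/size pairs regardless of whether the platform uses one or two cells for each.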
+extern u64 dt_mem_next_cell(int s, u32 **cellp); /* Early flat tree scan hooks */ extern int early_init_dt_scan_root(unsigned long node, const char *uname, -- cgit v1.2.3 From 86e032213424958b45564d0cc96b3316641a49d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 10 Dec 2009 23:42:21 -0700 Subject: of/flattree: merge early_init_dt_scan_chosen() Merge common code between PowerPC and Microblaze. This patch splits the arch-specific stuff out into a new function, early_init_dt_scan_chosen_arch(). Signed-off-by: Grant Likely Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/prom.c | 44 ++--------------------------------------- arch/powerpc/kernel/prom.c | 46 ++++++++++--------------------------------- drivers/of/fdt.c | 38 +++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 3 +++ 4 files changed, 53 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 50d8b09d5e3f..5505bcffd7dd 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -108,49 +108,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -static int __init early_init_dt_scan_chosen(unsigned long node, - const char *uname, int depth, void *data) +void __init early_init_dt_scan_chosen_arch(unsigned long node) { - unsigned long l; - char *p; - - pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - - if (depth != 1 || - (strcmp(uname, "chosen") != 0 && - strcmp(uname, "chosen@0") != 0)) - return 0; - -#ifdef CONFIG_KEXEC - lprop = (u64 *)of_get_flat_dt_prop(node, - "linux,crashkernel-base", NULL); - if (lprop) - crashk_res.start = *lprop; - - lprop = (u64 *)of_get_flat_dt_prop(node, - "linux,crashkernel-size", NULL); - if (lprop) - crashk_res.end = crashk_res.start + *lprop - 1; -#endif - - early_init_dt_check_for_initrd(node); - - /* Retreive command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); - -#ifdef CONFIG_CMDLINE -#ifndef CONFIG_CMDLINE_FORCE - if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) -#endif - strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif /* CONFIG_CMDLINE */ - - pr_debug("Command line is: %s\n", cmd_line); - - /* break now */ - return 1; + /* No Microblaze specific code here */ } static int __init early_init_dt_scan_memory(unsigned long node, diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4e3181cded44..877fad9b3745 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -367,18 +367,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -static int __init early_init_dt_scan_chosen(unsigned long node, - const char *uname, int depth, void *data) +void __init early_init_dt_scan_chosen_arch(unsigned long node) { unsigned long *lprop; - unsigned long l; - char *p; - - DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - - if (depth != 1 || - (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) - return 0; #ifdef CONFIG_PPC64 /* check if iommu is forced on or off */ @@ -389,17 +380,17 @@ static int __init early_init_dt_scan_chosen(unsigned long node, #endif /* mem=x on the command line is the preferred mechanism */ - lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); - if (lprop) - memory_limit = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); + if (lprop) + memory_limit = *lprop; 
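For context, these merged scanners are driven one pass at a time over the blob: a callback returns 1 to stop the walk as soon as it has found its node, which is why each merged function ends with "return 1". A minimal sketch of an architecture's early setup using them, assuming the existing of_scan_flat_dt() iterator and that early_init_dt_scan_memory() is still arch code at this point:

/* Sketch only: a hypothetical arch's early device-tree setup. */
void __init early_init_devtree(void *params)
{
	/* point the flat-tree accessors at the blob */
	initial_boot_params = params;

	/* each call is one walk; the callback returning 1 ends it early */
	of_scan_flat_dt(early_init_dt_scan_chosen, NULL);
	of_scan_flat_dt(early_init_dt_scan_root, NULL);
	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
}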
#ifdef CONFIG_PPC64 - lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); - if (lprop) - tce_alloc_start = *lprop; - lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); - if (lprop) - tce_alloc_end = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); + if (lprop) + tce_alloc_start = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); + if (lprop) + tce_alloc_end = *lprop; #endif #ifdef CONFIG_KEXEC @@ -411,23 +402,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, if (lprop) crashk_res.end = crashk_res.start + *lprop - 1; #endif - - early_init_dt_check_for_initrd(node); - - /* Retreive command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); - -#ifdef CONFIG_CMDLINE - if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) - strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif /* CONFIG_CMDLINE */ - - DBG("Command line is: %s\n", cmd_line); - - /* break now */ - return 1; } #ifdef CONFIG_PPC_PSERIES diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index ebce509b0886..616a4767a950 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -15,6 +15,10 @@ #include #include +#ifdef CONFIG_PPC +#include +#endif /* CONFIG_PPC */ + int __initdata dt_root_addr_cells; int __initdata dt_root_size_cells; @@ -440,6 +444,40 @@ u64 __init dt_mem_next_cell(int s, u32 **cellp) return of_read_number(p, s); } +int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, + int depth, void *data) +{ + unsigned long l; + char *p; + + pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); + + if (depth != 1 || + (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) + return 0; + + early_init_dt_check_for_initrd(node); + + /* Retreive command line */ + p = of_get_flat_dt_prop(node, "bootargs", &l); + if (p != NULL && l > 0) + strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); + +#ifdef CONFIG_CMDLINE +#ifndef CONFIG_CMDLINE_FORCE + if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) +#endif + strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); +#endif /* CONFIG_CMDLINE */ + + early_init_dt_scan_chosen_arch(node); + + pr_debug("Command line is: %s\n", cmd_line); + + /* break now */ + return 1; +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d1a37e56031e..8118d4559dd5 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -71,6 +71,9 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern void early_init_dt_scan_chosen_arch(unsigned long node); +extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, + int depth, void *data); extern void early_init_dt_check_for_initrd(unsigned long node); extern u64 dt_mem_next_cell(int s, u32 **cellp); -- cgit v1.2.3 From 9dfc6e68bfe6ee452efb1a4e9ca26a9007f2b864 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:20 -0600 Subject: SLUB: Use this_cpu operations in slub Using per cpu allocations removes the needs for the per cpu arrays in the kmem_cache struct. These could get quite big if we have to support systems with thousands of cpus. The use of this_cpu_xx operations results in: 1. 
The size of kmem_cache for SMP configuration shrinks since we will only need 1 pointer instead of NR_CPUS. The same pointer can be used by all processors. Reduces cache footprint of the allocator. 2. We can dynamically size kmem_cache according to the actual nodes in the system meaning less memory overhead for configurations that may potentially support up to 1k NUMA nodes / 4k cpus. 3. We can remove the diddle widdle with allocating and releasing of kmem_cache_cpu structures when bringing up and shutting down cpus. The cpu alloc logic will do it all for us. Removes some portions of the cpu hotplug functionality. 4. Fastpath performance increases since per cpu pointer lookups and address calculations are avoided. V7-V8 - Convert missed get_cpu_slab() under CONFIG_SLUB_STATS Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 6 +- mm/slub.c | 202 +++++++++++------------------------------------ 2 files changed, 49 insertions(+), 159 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1e14beb23f9b..17ebe0f89bf3 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -69,6 +69,7 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { + struct kmem_cache_cpu *cpu_slab; /* Used for retriving partial slabs etc */ unsigned long flags; int size; /* The size of an object including meta data */ @@ -104,11 +105,6 @@ struct kmem_cache { int remote_node_defrag_ratio; struct kmem_cache_node *node[MAX_NUMNODES]; #endif -#ifdef CONFIG_SMP - struct kmem_cache_cpu *cpu_slab[NR_CPUS]; -#else - struct kmem_cache_cpu cpu_slab; -#endif }; /* diff --git a/mm/slub.c b/mm/slub.c index 8d71aaf888d7..d6c9ecf629d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -242,15 +242,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -1124,7 +1115,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; - stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + stat(this_cpu_ptr(s->cpu_slab), ORDER_FALLBACK); } if (kmemcheck_enabled @@ -1422,7 +1413,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); __ClearPageSlubFrozen(page); if (page->inuse) { @@ -1454,7 +1445,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + stat(__this_cpu_ptr(s->cpu_slab), FREE_SLAB); discard_slab(s, page); } } @@ -1507,7 +1498,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (likely(c && c->page)) flush_slab(s, c); @@ -1673,7 
+1664,7 @@ new_slab: local_irq_disable(); if (new) { - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); stat(c, ALLOC_SLAB); if (c->page) flush_slab(s, c); @@ -1711,7 +1702,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned int objsize; + unsigned long objsize; gfpflags &= gfp_allowed_mask; @@ -1722,14 +1713,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, return NULL; local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); + object = c->freelist; objsize = c->objsize; - if (unlikely(!c->freelist || !node_match(c, node))) + if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = c->freelist; c->freelist = object[c->offset]; stat(c, ALLOC_FASTPATH); } @@ -1800,7 +1791,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void **object = (void *)x; struct kmem_cache_cpu *c; - c = get_cpu_slab(s, raw_smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); stat(c, FREE_SLOWPATH); slab_lock(page); @@ -1872,7 +1863,7 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) @@ -2095,130 +2086,28 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. - * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. 
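The hand-rolled array being deleted here existed only because there was no generic way to allocate percpu memory at runtime; with the new allocator the whole scheme collapses into one alloc_percpu() per cache, dereferenced with this_cpu_ptr()/per_cpu_ptr(). A kernel-style sketch of the resulting pattern, with illustrative struct and function names rather than the SLUB code itself:

struct counter {
	unsigned long hits;
};

struct thing {
	struct counter __percpu *pcp;	/* one pointer, shared by all CPUs */
};

static int thing_init(struct thing *t)
{
	t->pcp = alloc_percpu(struct counter);
	return t->pcp ? 0 : -ENOMEM;
}

static void thing_hit(struct thing *t)
{
	/* preemption-safe increment of this CPU's instance */
	this_cpu_inc(t->pcp->hits);
}

static unsigned long thing_total(struct thing *t)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(t->pcp, cpu)->hits;
	return sum;
}

Teardown is a single free_percpu(t->pcp), which is what kmem_cache_close() now does in place of free_kmem_cache_cpus().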
- */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], - kmem_cache_cpu); - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) -{ - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); - - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } - - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; -} - -static void free_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} - -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); +static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[SLUB_PAGE_SHIFT]); - if (c) - continue; - - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } - return 1; -} - -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) - return; - - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); -} - -static void __init init_alloc_cpu(void) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { int cpu; - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } + if (s < kmalloc_caches + SLUB_PAGE_SHIFT && s >= kmalloc_caches) + /* + * Boot time creation of the kmalloc array. Use static per cpu data + * since the per cpu allocator is not available yet. 
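The comment above is the one subtlety of the conversion: the boot-time kmalloc caches are created before the dynamic percpu allocator works, so they are handed slots in a static DEFINE_PER_CPU() array instead, while everything created later uses alloc_percpu(). A sketch of the idea with made-up names (early_state, boot_caches, NR_BOOT); it relies on static and dynamic percpu addresses being interchangeable in the same cpu_slab field, which is the premise of this series:

static DEFINE_PER_CPU(struct kmem_cache_cpu, early_state[NR_BOOT]);

static int setup_cpu_state(struct kmem_cache *s)
{
	if (s >= boot_caches && s < boot_caches + NR_BOOT)
		/* boot-time cache: use its reserved static percpu slot */
		s->cpu_slab = &early_state[s - boot_caches];
	else
		s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);

	return s->cpu_slab ? 0 : -ENOMEM;
}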
+ */ + s->cpu_slab = per_cpu_var(kmalloc_percpu) + (s - kmalloc_caches); + else + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} + if (!s->cpu_slab) + return 0; -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); + for_each_possible_cpu(cpu) + init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); return 1; } -#endif #ifdef CONFIG_NUMA /* @@ -2609,9 +2498,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - + free_percpu(s->cpu_slab); /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2760,7 +2648,19 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) realsize = kmalloc_caches[index].objsize; text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (flags & __GFP_WAIT) + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + else { + int i; + + s = NULL; + for (i = 0; i < SLUB_PAGE_SHIFT; i++) + if (kmalloc_caches[i].size) { + s = kmalloc_caches + i; + break; + } + } /* * Must defer sysfs creation to a workqueue because we don't know @@ -3176,8 +3076,6 @@ void __init kmem_cache_init(void) int i; int caches = 0; - init_alloc_cpu(); - #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -3261,8 +3159,10 @@ void __init kmem_cache_init(void) #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#endif +#ifdef CONFIG_NUMA + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); #else kmem_size = sizeof(struct kmem_cache); #endif @@ -3365,7 +3265,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, * per cpu structures */ for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; + per_cpu_ptr(s->cpu_slab, cpu)->objsize = s->objsize; s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); @@ -3422,11 +3322,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); + init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); up_read(&slub_lock); break; @@ -3436,13 +3334,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3928,7 +3822,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (!c || c->node < 0) continue; @@ -4353,7 +4247,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, cpu)->stat[si]; + unsigned x = 
per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -4376,7 +4270,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ -- cgit v1.2.3 From 756dee75872a2a764b478e18076360b8a4ec9045 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:21 -0600 Subject: SLUB: Get rid of dynamic DMA kmalloc cache allocation Dynamic DMA kmalloc cache allocation is troublesome since the new percpu allocator does not support allocations in atomic contexts. Reserve some statically allocated kmalloc_cpu structures instead. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 +++++++++++-------- mm/slub.c | 24 ++++++++++-------------- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 17ebe0f89bf3..a78fb4ac2015 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -131,11 +131,21 @@ struct kmem_cache { #define SLUB_PAGE_SHIFT (PAGE_SHIFT + 2) +#ifdef CONFIG_ZONE_DMA +#define SLUB_DMA __GFP_DMA +/* Reserve extra caches for potential DMA use */ +#define KMALLOC_CACHES (2 * SLUB_PAGE_SHIFT - 6) +#else +/* Disable DMA functionality */ +#define SLUB_DMA (__force gfp_t)0 +#define KMALLOC_CACHES SLUB_PAGE_SHIFT +#endif + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; +extern struct kmem_cache kmalloc_caches[KMALLOC_CACHES]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -203,13 +213,6 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) return &kmalloc_caches[index]; } -#ifdef CONFIG_ZONE_DMA -#define SLUB_DMA __GFP_DMA -#else -/* Disable DMA functionality */ -#define SLUB_DMA (__force gfp_t)0 -#endif - void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); diff --git a/mm/slub.c b/mm/slub.c index d6c9ecf629d5..cdb7f0214af0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2092,7 +2092,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { int cpu; - if (s < kmalloc_caches + SLUB_PAGE_SHIFT && s >= kmalloc_caches) + if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) /* * Boot time creation of the kmalloc array. Use static per cpu data * since the per cpu allocator is not available yet. 
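The extra KMALLOC_CACHES slots reserved above are claimed lazily: dma_kmalloc_cache() can run in atomic context, where neither kmalloc(GFP_KERNEL) of a struct kmem_cache nor alloc_percpu() is usable, so a free entry is taken from the tail of the fixed array, recognized by size == 0. A sketch of that claim (the real code BUG()s on exhaustion rather than returning NULL):

static struct kmem_cache *claim_static_slot(void)
{
	int i;

	for (i = 0; i < KMALLOC_CACHES; i++)
		if (!kmalloc_caches[i].size)	/* size == 0: never used */
			return kmalloc_caches + i;

	return NULL;
}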
@@ -2539,7 +2539,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2629,6 +2629,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) char *text; size_t realsize; unsigned long slabflags; + int i; s = kmalloc_caches_dma[index]; if (s) @@ -2649,18 +2650,13 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - if (flags & __GFP_WAIT) - s = kmalloc(kmem_size, flags & ~SLUB_DMA); - else { - int i; + s = NULL; + for (i = 0; i < KMALLOC_CACHES; i++) + if (!kmalloc_caches[i].size) + break; - s = NULL; - for (i = 0; i < SLUB_PAGE_SHIFT; i++) - if (kmalloc_caches[i].size) { - s = kmalloc_caches + i; - break; - } - } + BUG_ON(i >= KMALLOC_CACHES); + s = kmalloc_caches + i; /* * Must defer sysfs creation to a workqueue because we don't know @@ -2674,7 +2670,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (!s || !text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - kfree(s); + s->size = 0; kfree(text); goto unlock_out; } -- cgit v1.2.3 From ff12059ed14b0773d7bbef86f98218ada6c20770 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:22 -0600 Subject: SLUB: this_cpu: Remove slub kmem_cache fields Remove the fields in struct kmem_cache_cpu that were used to cache data from struct kmem_cache when they were in different cachelines. The cacheline that holds the per cpu array pointer now also holds these values. We can cut down the struct kmem_cache_cpu size to almost half. The get_freepointer() and set_freepointer() functions that used to be only intended for the slow path now are also useful for the hot path since access to the size field does not require accessing an additional cacheline anymore. This results in consistent use of functions for setting the freepointer of objects throughout SLUB. Also we initialize all possible kmem_cache_cpu structures when a slab is created. No need to initialize them when a processor or node comes online. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 -- mm/slub.c | 76 +++++++++++------------------------------------- 2 files changed, 17 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a78fb4ac2015..0249d4175bac 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -38,8 +38,6 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to first free per cpu object */ struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ - unsigned int offset; /* Freepointer offset (in word units) */ - unsigned int objsize; /* Size of an object (from kmem_cache) */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif diff --git a/mm/slub.c b/mm/slub.c index cdb7f0214af0..30d2dde27563 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -260,13 +260,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. 
- * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. - */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -1473,10 +1466,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* Retrieve object from cpu_freelist */ object = c->freelist; - c->freelist = c->freelist[c->offset]; + c->freelist = get_freepointer(s, c->freelist); /* And put onto the regular freelist */ - object[c->offset] = page->freelist; + set_freepointer(s, object, page->freelist); page->freelist = object; page->inuse--; } @@ -1635,7 +1628,7 @@ load_freelist: if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); @@ -1681,7 +1674,7 @@ debug: goto another_slab; c->page->inuse++; - c->page->freelist = object[c->offset]; + c->page->freelist = get_freepointer(s, object); c->node = -1; goto unlock_out; } @@ -1702,7 +1695,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned long objsize; gfpflags &= gfp_allowed_mask; @@ -1715,22 +1707,21 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); object = c->freelist; - objsize = c->objsize; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); stat(c, ALLOC_FASTPATH); } local_irq_restore(flags); if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, objsize); + memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); - kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); return object; } @@ -1785,7 +1776,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); * handling required then we can return immediately. 
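With the cached offset field gone from kmem_cache_cpu, fast and slow paths now share the same two helpers; the trick they encode is that a free object stores the pointer to the next free object inside itself, at a cache-dependent offset. A self-contained userspace model of that freelist, illustrative rather than kernel code:

#include <stdio.h>

struct cache { unsigned long offset; };

static void *get_freepointer(struct cache *s, void *object)
{
	return *(void **)((char *)object + s->offset);
}

static void set_freepointer(struct cache *s, void *object, void *fp)
{
	*(void **)((char *)object + s->offset) = fp;
}

int main(void)
{
	struct cache s = { .offset = 0 };
	char objs[3][64];
	void *freelist = NULL;
	int i;

	/* push all objects onto the freelist, link stored in-object */
	for (i = 0; i < 3; i++) {
		set_freepointer(&s, objs[i], freelist);
		freelist = objs[i];
	}

	/* pop one: exactly what the allocation fast path does */
	void *obj = freelist;
	freelist = get_freepointer(&s, obj);
	printf("allocated %p, next free %p\n", obj, freelist);
	return 0;
}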
*/ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; @@ -1799,7 +1790,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, goto debug; checks_ok: - prior = object[offset] = page->freelist; + prior = page->freelist; + set_freepointer(s, object, prior); page->freelist = object; page->inuse--; @@ -1864,16 +1856,16 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - kmemcheck_slab_free(s, object, c->objsize); - debug_check_no_locks_freed(object, c->objsize); + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, c->objsize); + debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; + set_freepointer(s, object, c->freelist); c->freelist = object; stat(c, FREE_FASTPATH); } else - __slab_free(s, page, x, addr, c->offset); + __slab_free(s, page, x, addr); local_irq_restore(flags); } @@ -2060,19 +2052,6 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -#ifdef CONFIG_SLUB_STATS - memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); -#endif -} - static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { @@ -2090,8 +2069,6 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[SLUB_PAGE_SHIFT]); static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { - int cpu; - if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) /* * Boot time creation of the kmalloc array. Use static per cpu data @@ -2104,8 +2081,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) if (!s->cpu_slab) return 0; - for_each_possible_cpu(cpu) - init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); return 1; } @@ -2391,6 +2366,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) @@ -3247,22 +3223,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; - s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->objsize = s->objsize; - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); @@ -3316,14 +3282,6 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: -- cgit v1.2.3 From 0f78231bffb868a30e8533aace142213266bb811 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 1 Dec 2009 13:37:02 +0100 Subject: mac80211: enable spatial multiplexing powersave Enable spatial multiplexing in mac80211 by telling the driver what to do and, where necessary, sending action frames to the AP to update the requested SMPS mode. Also includes a trivial implementation for hwsim that just logs the requested mode. For now, the userspace interface is in debugfs only, and let you toggle the requested mode at any time. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/mac80211_hwsim.c | 24 ++++++-- include/linux/ieee80211.h | 25 +++++++- include/net/mac80211.h | 59 ++++++++++++++++++ net/mac80211/cfg.c | 49 +++++++++++++++ net/mac80211/debugfs_netdev.c | 111 +++++++++++++++++++++++++++++++++- net/mac80211/driver-trace.h | 2 + net/mac80211/ht.c | 47 ++++++++++++++ net/mac80211/ieee80211_i.h | 14 +++++ net/mac80211/main.c | 24 ++++++++ net/mac80211/mlme.c | 63 +++++++++++++++++-- net/mac80211/status.c | 38 ++++++++++++ net/mac80211/util.c | 74 +++++++++++++++++++++++ 12 files changed, 518 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 88e41176e7fd..92c669ebb358 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -618,12 +618,26 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; - - printk(KERN_DEBUG "%s:%s (freq=%d idle=%d ps=%d)\n", + static const char *chantypes[4] = { + [NL80211_CHAN_NO_HT] = "noht", + [NL80211_CHAN_HT20] = "ht20", + [NL80211_CHAN_HT40MINUS] = "ht40-", + [NL80211_CHAN_HT40PLUS] = "ht40+", + }; + static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { + [IEEE80211_SMPS_AUTOMATIC] = "auto", + [IEEE80211_SMPS_OFF] = "off", + [IEEE80211_SMPS_STATIC] = "static", + [IEEE80211_SMPS_DYNAMIC] = "dynamic", + }; + + printk(KERN_DEBUG "%s:%s (freq=%d/%s idle=%d ps=%d smps=%s)\n", wiphy_name(hw->wiphy), __func__, conf->channel->center_freq, + chantypes[conf->channel_type], !!(conf->flags & IEEE80211_CONF_IDLE), - !!(conf->flags & IEEE80211_CONF_PS)); + !!(conf->flags & IEEE80211_CONF_PS), + smps_modes[conf->smps_mode]); data->idle = !!(conf->flags & IEEE80211_CONF_IDLE); @@ -1082,7 +1096,9 @@ static int __init init_mac80211_hwsim(void) BIT(NL80211_IFTYPE_MESH_POINT); hw->flags = IEEE80211_HW_MFP_CAPABLE | - IEEE80211_HW_SIGNAL_DBM; + IEEE80211_HW_SIGNAL_DBM | + IEEE80211_HW_SUPPORTS_STATIC_SMPS | + IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); diff --git 
a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d9724a28c0c2..e8d43d0ff2c3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -707,6 +707,10 @@ struct ieee80211_mgmt { u8 action; u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; } __attribute__ ((packed)) sa_query; + struct { + u8 action; + u8 smps_control; + } __attribute__ ((packed)) ht_smps; } u; } __attribute__ ((packed)) action; } u; @@ -824,6 +828,7 @@ struct ieee80211_ht_cap { #define IEEE80211_HT_CAP_LDPC_CODING 0x0001 #define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 #define IEEE80211_HT_CAP_SM_PS 0x000C +#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 #define IEEE80211_HT_CAP_GRN_FLD 0x0010 #define IEEE80211_HT_CAP_SGI_20 0x0020 #define IEEE80211_HT_CAP_SGI_40 0x0040 @@ -839,6 +844,7 @@ struct ieee80211_ht_cap { /* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ #define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 #define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C +#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 /* * Maximum length of AMPDU that the STA can receive. @@ -922,12 +928,17 @@ struct ieee80211_ht_info { #define IEEE80211_MAX_AMPDU_BUF 0x40 -/* Spatial Multiplexing Power Save Modes */ +/* Spatial Multiplexing Power Save Modes (for capability) */ #define WLAN_HT_CAP_SM_PS_STATIC 0 #define WLAN_HT_CAP_SM_PS_DYNAMIC 1 #define WLAN_HT_CAP_SM_PS_INVALID 2 #define WLAN_HT_CAP_SM_PS_DISABLED 3 +/* for SM power control field lower two bits */ +#define WLAN_HT_SMPS_CONTROL_DISABLED 0 +#define WLAN_HT_SMPS_CONTROL_STATIC 1 +#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -1150,6 +1161,18 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; +/* HT action codes */ +enum ieee80211_ht_actioncode { + WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, + WLAN_HT_ACTION_SMPS = 1, + WLAN_HT_ACTION_PSMP = 2, + WLAN_HT_ACTION_PCO_PHASE = 3, + WLAN_HT_ACTION_CSI = 4, + WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, + WLAN_HT_ACTION_COMPRESSED_BF = 6, + WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, +}; + /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e94cc526b0f6..e6b6bf81d5b9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -597,8 +597,10 @@ enum ieee80211_conf_flags { * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed + * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed */ enum ieee80211_conf_changed { + IEEE80211_CONF_CHANGE_SMPS = BIT(1), IEEE80211_CONF_CHANGE_LISTEN_INTERVAL = BIT(2), IEEE80211_CONF_CHANGE_MONITOR = BIT(3), IEEE80211_CONF_CHANGE_PS = BIT(4), @@ -608,6 +610,21 @@ enum ieee80211_conf_changed { IEEE80211_CONF_CHANGE_IDLE = BIT(8), }; +/** + * enum ieee80211_smps_mode - spatial multiplexing power save mode + * + * @ + */ +enum ieee80211_smps_mode { + IEEE80211_SMPS_AUTOMATIC, + IEEE80211_SMPS_OFF, + IEEE80211_SMPS_STATIC, + IEEE80211_SMPS_DYNAMIC, + + /* keep last */ + IEEE80211_SMPS_NUM_MODES, +}; + /** * struct ieee80211_conf - configuration of the device * @@ -636,6 +653,10 @@ enum ieee80211_conf_changed { * @short_frame_max_tx_count: Maximum number of transmissions for a "short" * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the * number of transmissions not the number of retries + * + * @smps_mode: spatial multiplexing powersave mode; note that + 
* %IEEE80211_SMPS_STATIC is used when the device is not + * configured for an HT channel */ struct ieee80211_conf { u32 flags; @@ -648,6 +669,7 @@ struct ieee80211_conf { struct ieee80211_channel *channel; enum nl80211_channel_type channel_type; + enum ieee80211_smps_mode smps_mode; }; /** @@ -930,6 +952,16 @@ enum ieee80211_tkip_key_type { * @IEEE80211_HW_BEACON_FILTER: * Hardware supports dropping of irrelevant beacon frames to * avoid waking up cpu. + * + * @IEEE80211_HW_SUPPORTS_STATIC_SMPS: + * Hardware supports static spatial multiplexing powersave, + * ie. can turn off all but one chain even on HT connections + * that should be using more chains. + * + * @IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS: + * Hardware supports dynamic spatial multiplexing powersave, + * ie. can turn off all but one chain and then wake the rest + * up as required after, for example, rts/cts handshake. */ enum ieee80211_hw_flags { IEEE80211_HW_HAS_RATE_CONTROL = 1<<0, @@ -947,6 +979,8 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS = 1<<12, IEEE80211_HW_MFP_CAPABLE = 1<<13, IEEE80211_HW_BEACON_FILTER = 1<<14, + IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, + IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, }; /** @@ -1214,6 +1248,31 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, * signal strength threshold checking. */ +/** + * DOC: Spatial multiplexing power save + * + * SMPS (Spatial multiplexing power save) is a mechanism to conserve + * power in an 802.11n implementation. For details on the mechanism + * and rationale, please refer to 802.11 (as amended by 802.11n-2009) + * "11.2.3 SM power save". + * + * The mac80211 implementation is capable of sending action frames + * to update the AP about the station's SMPS mode, and will instruct + * the driver to enter the specific mode. It will also announce the + * requested SMPS mode during the association handshake. Hardware + * support for this feature is required, and can be indicated by + * hardware flags. + * + * The default mode will be "automatic", which nl80211/cfg80211 + * defines to be dynamic SMPS in (regular) powersave, and SMPS + * turned off otherwise. + * + * To support this feature, the driver must set the appropriate + * hardware support flags, and handle the SMPS flag to the config() + * operation. It will then with this mechanism be instructed to + * enter the requested SMPS mode while associated to an HT AP. + */ + /** * DOC: Frame filtering * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fcfa1bf776a7..8c35418d1c96 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1318,6 +1318,50 @@ static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len) } #endif +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + const u8 *ap; + enum ieee80211_smps_mode old_req; + int err; + + old_req = sdata->u.mgd.req_smps; + sdata->u.mgd.req_smps = smps_mode; + + if (old_req == smps_mode && + smps_mode != IEEE80211_SMPS_AUTOMATIC) + return 0; + + /* + * If not associated, or current association is not an HT + * association, there's no need to send an action frame. 
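The enum above is mac80211's internal request state; on the air it has to be expressed twice, once in the SM power save field of the HT capability and once in the SM PS action frame. A sketch of the capability-field mapping, using the constants from the ieee80211.h hunk above; IEEE80211_SMPS_AUTOMATIC is assumed to have been resolved to OFF or DYNAMIC before this point, as the cfg.c hunk below does:

static u16 smps_to_ht_cap(enum ieee80211_smps_mode smps)
{
	switch (smps) {
	case IEEE80211_SMPS_STATIC:
		return WLAN_HT_CAP_SM_PS_STATIC << IEEE80211_HT_CAP_SM_PS_SHIFT;
	case IEEE80211_SMPS_DYNAMIC:
		return WLAN_HT_CAP_SM_PS_DYNAMIC << IEEE80211_HT_CAP_SM_PS_SHIFT;
	case IEEE80211_SMPS_OFF:
	default:
		/* SM PS off is signalled as "disabled" in the capability */
		return WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT;
	}
}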
+ */ + if (!sdata->u.mgd.associated || + sdata->local->oper_channel_type == NL80211_CHAN_NO_HT) { + mutex_lock(&sdata->local->iflist_mtx); + ieee80211_recalc_smps(sdata->local, sdata); + mutex_unlock(&sdata->local->iflist_mtx); + return 0; + } + + ap = sdata->u.mgd.associated->cbss.bssid; + + if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (sdata->u.mgd.powersave) + smps_mode = IEEE80211_SMPS_DYNAMIC; + else + smps_mode = IEEE80211_SMPS_OFF; + } + + /* send SM PS frame to AP */ + err = ieee80211_send_smps_action(sdata, smps_mode, + ap, ap); + if (err) + sdata->u.mgd.req_smps = old_req; + + return err; +} + static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout) { @@ -1335,6 +1379,11 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, sdata->u.mgd.powersave = enabled; conf->dynamic_ps_timeout = timeout; + /* no change, but if automatic follow powersave */ + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.req_smps); + mutex_unlock(&sdata->u.mgd.mtx); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 5d9c797635a9..355983503885 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -41,6 +41,30 @@ static ssize_t ieee80211_if_read( return ret; } +static ssize_t ieee80211_if_write( + struct ieee80211_sub_if_data *sdata, + const char __user *userbuf, + size_t count, loff_t *ppos, + ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int)) +{ + u8 *buf; + ssize_t ret = -ENODEV; + + buf = kzalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + rtnl_lock(); + if (sdata->dev->reg_state == NETREG_REGISTERED) + ret = (*write)(sdata, buf, count); + rtnl_unlock(); + + return ret; +} + #define IEEE80211_IF_FMT(name, field, format_string) \ static ssize_t ieee80211_if_fmt_##name( \ const struct ieee80211_sub_if_data *sdata, char *buf, \ @@ -71,7 +95,7 @@ static ssize_t ieee80211_if_fmt_##name( \ return scnprintf(buf, buflen, "%pM\n", sdata->field); \ } -#define __IEEE80211_IF_FILE(name) \ +#define __IEEE80211_IF_FILE(name, _write) \ static ssize_t ieee80211_if_read_##name(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ @@ -82,12 +106,24 @@ static ssize_t ieee80211_if_read_##name(struct file *file, \ } \ static const struct file_operations name##_ops = { \ .read = ieee80211_if_read_##name, \ + .write = (_write), \ .open = mac80211_open_file_generic, \ } +#define __IEEE80211_IF_FILE_W(name) \ +static ssize_t ieee80211_if_write_##name(struct file *file, \ + const char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + return ieee80211_if_write(file->private_data, userbuf, count, \ + ppos, ieee80211_if_parse_##name); \ +} \ +__IEEE80211_IF_FILE(name, ieee80211_if_write_##name) + + #define IEEE80211_IF_FILE(name, field, format) \ IEEE80211_IF_FMT_##format(name, field) \ - __IEEE80211_IF_FILE(name) + __IEEE80211_IF_FILE(name, NULL) /* common attributes */ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); @@ -99,6 +135,70 @@ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); IEEE80211_IF_FILE(capab, u.mgd.capab, HEX); +static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_local *local = sdata->local; + int 
err; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + smps_mode == IEEE80211_SMPS_STATIC) + return -EINVAL; + + /* auto should be dynamic if in PS mode */ + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + (smps_mode == IEEE80211_SMPS_DYNAMIC || + smps_mode == IEEE80211_SMPS_AUTOMATIC)) + return -EINVAL; + + /* supported only on managed interfaces for now */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + mutex_lock(&local->iflist_mtx); + err = __ieee80211_request_smps(sdata, smps_mode); + mutex_unlock(&local->iflist_mtx); + + return err; +} + +static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { + [IEEE80211_SMPS_AUTOMATIC] = "auto", + [IEEE80211_SMPS_OFF] = "off", + [IEEE80211_SMPS_STATIC] = "static", + [IEEE80211_SMPS_DYNAMIC] = "dynamic", +}; + +static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + return snprintf(buf, buflen, "request: %s\nused: %s\n", + smps_modes[sdata->u.mgd.req_smps], + smps_modes[sdata->u.mgd.ap_smps]); +} + +static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata, + const char *buf, int buflen) +{ + enum ieee80211_smps_mode mode; + + for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) { + if (strncmp(buf, smps_modes[mode], buflen) == 0) { + int err = ieee80211_set_smps(sdata, mode); + if (!err) + return buflen; + return err; + } + } + + return -EINVAL; +} + +__IEEE80211_IF_FILE_W(smps); + /* AP attributes */ IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC); IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC); @@ -109,7 +209,7 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( return scnprintf(buf, buflen, "%u\n", skb_queue_len(&sdata->u.ap.ps_bc_buf)); } -__IEEE80211_IF_FILE(num_buffered_multicast); +__IEEE80211_IF_FILE(num_buffered_multicast, NULL); /* WDS attributes */ IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); @@ -158,6 +258,10 @@ IEEE80211_IF_FILE(dot11MeshHWMPRootMode, debugfs_create_file(#name, 0400, sdata->debugfs.dir, \ sdata, &name##_ops); +#define DEBUGFS_ADD_MODE(name, mode) \ + debugfs_create_file(#name, mode, sdata->debugfs.dir, \ + sdata, &name##_ops); + static void add_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(drop_unencrypted, sta); @@ -167,6 +271,7 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(bssid, sta); DEBUGFS_ADD(aid, sta); DEBUGFS_ADD(capab, sta); + DEBUGFS_ADD_MODE(smps, 0600); } static void add_ap_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index ee2d19a25ce1..7a849b920165 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -140,6 +140,7 @@ TRACE_EVENT(drv_config, __field(u8, short_frame_max_tx_count) __field(int, center_freq) __field(int, channel_type) + __field(int, smps) ), TP_fast_assign( @@ -155,6 +156,7 @@ TRACE_EVENT(drv_config, __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; __entry->center_freq = local->hw.conf.channel->center_freq; __entry->channel_type = local->hw.conf.channel_type; + __entry->smps = local->hw.conf.smps_mode; ), TP_printk( diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 45ebd062a2fb..63b8f86b7f16 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -166,3 +166,50 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, spin_unlock_bh(&sta->lock); } } + +int 
ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *action_frame; + + /* 27 = header + category + action + smps mode */ + skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom); + action_frame = (void *)skb_put(skb, 27); + memcpy(action_frame->da, da, ETH_ALEN); + memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(action_frame->bssid, bssid, ETH_ALEN); + action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + action_frame->u.action.category = WLAN_CATEGORY_HT; + action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + switch (smps) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DISABLED; + break; + case IEEE80211_SMPS_STATIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_STATIC; + break; + case IEEE80211_SMPS_DYNAMIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DYNAMIC; + break; + } + + /* we'll do more on status of this frame */ + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); + + return 0; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 178e329f9257..e63aecbddfbe 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -297,6 +297,8 @@ struct ieee80211_if_managed { unsigned long timers_running; /* used for quiesce/restart */ bool powersave; /* powersave requested for this iface */ + enum ieee80211_smps_mode req_smps, /* requested smps mode */ + ap_smps; /* smps mode AP thinks we're in */ unsigned long request; @@ -587,6 +589,9 @@ struct ieee80211_local { /* used for uploading changed mc list */ struct work_struct reconfig_filter; + /* used to reconfigure hardware SM PS */ + struct work_struct recalc_smps; + /* aggregated multicast list */ struct dev_addr_list *mc_list; int mc_count; @@ -760,6 +765,8 @@ struct ieee80211_local { int user_power_level; /* in dBm */ int power_constr_level; /* in dBm */ + enum ieee80211_smps_mode smps_mode; + struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS @@ -978,6 +985,9 @@ void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u1 void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); +int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid); void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u16 initiator, u16 reason); @@ -1088,6 +1098,10 @@ void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_local *local, struct ieee802_11_elems *elems, enum ieee80211_band band); +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode); +void ieee80211_recalc_smps(struct ieee80211_local *local, + struct ieee80211_sub_if_data *forsdata); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 98320a94c270..e1293e8ed83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ 
-113,6 +113,18 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) changed |= IEEE80211_CONF_CHANGE_CHANNEL; } + if (!conf_is_ht(&local->hw.conf)) { + /* + * mac80211.h documents that this is only valid + * when the channel is set to an HT type, and + * that otherwise STATIC is used. + */ + local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC; + } else if (local->hw.conf.smps_mode != local->smps_mode) { + local->hw.conf.smps_mode = local->smps_mode; + changed |= IEEE80211_CONF_CHANGE_SMPS; + } + if (scan_chan) power = chan->max_power; else @@ -297,6 +309,16 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_restart_hw); +static void ieee80211_recalc_smps_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, recalc_smps); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_smps(local, NULL); + mutex_unlock(&local->iflist_mtx); +} + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -370,6 +392,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, INIT_WORK(&local->restart_work, ieee80211_restart_work); INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter); + INIT_WORK(&local->recalc_smps, ieee80211_recalc_smps_work); + local->smps_mode = IEEE80211_SMPS_OFF; INIT_WORK(&local->dynamic_ps_enable_work, ieee80211_dynamic_ps_enable_work); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cd5dcc3d8c2b..0a762a9ba4df 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -398,6 +398,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, __le16 tmp; u32 flags = local->hw.conf.channel->flags; + /* determine capability flags */ + switch (ht_info->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: if (flags & IEEE80211_CHAN_NO_HT40PLUS) { @@ -413,17 +415,64 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, break; } - tmp = cpu_to_le16(cap); - pos = skb_put(skb, sizeof(struct ieee80211_ht_cap)+2); + /* set SM PS mode properly */ + cap &= ~IEEE80211_HT_CAP_SM_PS; + /* new association always uses requested smps mode */ + if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + ifmgd->ap_smps = IEEE80211_SMPS_DYNAMIC; + else + ifmgd->ap_smps = IEEE80211_SMPS_OFF; + } else + ifmgd->ap_smps = ifmgd->req_smps; + + switch (ifmgd->ap_smps) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_STATIC: + cap |= WLAN_HT_CAP_SM_PS_STATIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + } + + /* reserve and fill IE */ + + pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); + + /* capability flags */ + tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); - /* TODO: needs a define here for << 2 */ + + /* AMPDU parameters */ *pos++ = sband->ht_cap.ampdu_factor | - (sband->ht_cap.ampdu_density << 2); + (sband->ht_cap.ampdu_density << + IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); + + /* MCS set */ memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); + pos += sizeof(sband->ht_cap.mcs); + + /* extended capabilities */ + pos 
+= sizeof(__le16); + + /* BF capabilities */ + pos += sizeof(__le32); + + /* antenna selection */ + pos += sizeof(u8); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; @@ -932,6 +981,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->iflist_mtx); ieee80211_recalc_ps(local, -1); + ieee80211_recalc_smps(local, sdata); mutex_unlock(&local->iflist_mtx); netif_start_queue(sdata->dev); @@ -2327,6 +2377,11 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->flags |= IEEE80211_STA_WMM_ENABLED; mutex_init(&ifmgd->mtx); + + if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC; + else + ifmgd->req_smps = IEEE80211_SMPS_OFF; } /* scan finished notification */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index b4608f11a40f..0c0850d37dda 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -134,6 +134,40 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, dev_kfree_skb(skb); } +static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *) skb->data; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + + if (ieee80211_is_action(mgmt->frame_control) && + sdata->vif.type == NL80211_IFTYPE_STATION && + mgmt->u.action.category == WLAN_CATEGORY_HT && + mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS) { + /* + * This update looks racy, but isn't -- if we come + * here we've definitely got a station that we're + * talking to, and on a managed interface that can + * only be the AP. And the only other place updating + * this variable is before we're associated. + */ + switch (mgmt->u.action.u.ht_smps.smps_control) { + case WLAN_HT_SMPS_CONTROL_DYNAMIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_SMPS_CONTROL_STATIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_SMPS_CONTROL_DISABLED: + default: /* shouldn't happen since we don't send that */ + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_OFF; + break; + } + + ieee80211_queue_work(&local->hw, &local->recalc_smps); + } +} + void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct sk_buff *skb2; @@ -210,6 +244,10 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) rate_control_tx_status(local, sband, sta, skb); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, skb); + + if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && + (info->flags & IEEE80211_TX_STAT_ACK)) + ieee80211_frame_acked(sta, skb); } rcu_read_unlock(); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d54dbe8e09ba..086ef6257b4b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1170,3 +1170,77 @@ int ieee80211_reconfig(struct ieee80211_local *local) return 0; } +static int check_mgd_smps(struct ieee80211_if_managed *ifmgd, + enum ieee80211_smps_mode *smps_mode) +{ + if (ifmgd->associated) { + *smps_mode = ifmgd->ap_smps; + + if (*smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + *smps_mode = IEEE80211_SMPS_DYNAMIC; + else + *smps_mode = IEEE80211_SMPS_OFF; + } + + return 1; + } + + return 0; +} + +/* must hold iflist_mtx */ +void ieee80211_recalc_smps(struct ieee80211_local *local, + struct ieee80211_sub_if_data *forsdata) +{ + struct ieee80211_sub_if_data *sdata; + enum ieee80211_smps_mode smps_mode = 
IEEE80211_SMPS_OFF; + int count = 0; + + if (forsdata) + WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx)); + + WARN_ON(!mutex_is_locked(&local->iflist_mtx)); + + /* + * This function could be improved to handle multiple + * interfaces better, but right now it makes any + * non-station interfaces force SM PS to be turned + * off. If there are multiple station interfaces it + * could also use the best possible mode, e.g. if + * one is in static and the other in dynamic then + * dynamic is ok. + */ + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!netif_running(sdata->dev)) + continue; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + goto set; + if (sdata != forsdata) { + /* + * This nested is ok -- we are holding the iflist_mtx + * so can't get here twice or so. But it's required + * since normally we acquire it first and then the + * iflist_mtx. + */ + mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING); + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); + } else + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + + if (count > 1) { + smps_mode = IEEE80211_SMPS_OFF; + break; + } + } + + if (smps_mode == local->smps_mode) + return; + + set: + local->smps_mode = smps_mode; + /* changed flag is auto-detected for this */ + ieee80211_hw_config(local, 0); +} -- cgit v1.2.3 From 9da3e068142ec7856b2f13261dcf0660fad32b61 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 7 Dec 2009 15:57:50 -0500 Subject: mac80211: only bother printing highest data rate on debugfs if it is set The IEEE 802.11n spec says the RX highest data rate field does not specify the highest supported RX data rate if it is not set. Ignore it in that case. Refer to section 7.3.56.4. Cc: johannes@sipsolutions.net Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 5 ++++- net/mac80211/debugfs_sta.c | 12 ++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e8d43d0ff2c3..098bedcde9bb 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -775,7 +775,10 @@ struct ieee80211_bar { /** * struct ieee80211_mcs_info - MCS information * @rx_mask: RX mask - * @rx_highest: highest supported RX rate + * @rx_highest: highest supported RX rate. If set represents + * the highest supported RX data rate in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported.
* @tx_params: TX parameters */ struct ieee80211_mcs_info { diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c833b6ce9902..0d4a759ba72c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -218,11 +218,19 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n", htc->ampdu_factor, htc->ampdu_density); p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:"); + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) p += scnprintf(p, sizeof(buf)+buf-p, " %.2x", htc->mcs.rx_mask[i]); - p += scnprintf(p, sizeof(buf)+buf-p, "\nMCS rx highest: %d\n", - le16_to_cpu(htc->mcs.rx_highest)); + p += scnprintf(p, sizeof(buf)+buf-p, "\n"); + + /* If not set this is meaningless */ + if (le16_to_cpu(htc->mcs.rx_highest)) { + p += scnprintf(p, sizeof(buf)+buf-p, + "MCS rx highest: %d Mbps\n", + le16_to_cpu(htc->mcs.rx_highest)); + } + p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n", htc->mcs.tx_params); } -- cgit v1.2.3 From 31d12926e37291970dd4f6e9940df3897766a81d Mon Sep 17 00:00:00 2001 From: laurent chavey Date: Tue, 15 Dec 2009 11:15:28 +0000 Subject: net: Add rtnetlink init_rcvwnd to set the TCP initial receive window Add rtnetlink init_rcvwnd to set the TCP initial receive window size advertised by passive and active TCP connections. The current Linux TCP implementation limits the advertised TCP initial receive window to the one prescribed by slow start. For short-lived TCP connections used for transaction-type traffic (e.g. HTTP requests), bounding the advertised TCP initial receive window results in increased latency to complete the transaction. Setting the initial congestion window is already supported via rtnetlink init_cwnd, but that feature is useless without the ability to set a larger TCP initial receive window. The rtnetlink init_rcvwnd allows increasing the TCP initial receive window, letting TCP connections advertise a larger TCP receive window than the one bounded by slow start. Signed-off-by: Laurent Chavey Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 ++ include/net/dst.h | 2 -- include/net/tcp.h | 3 ++- net/ipv4/syncookies.c | 3 ++- net/ipv4/tcp_output.c | 17 +++++++++++++---- net/ipv6/syncookies.c | 3 ++- 6 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 05330fc5b436..9590364fe8b5 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -362,6 +362,8 @@ enum { #define RTAX_FEATURES RTAX_FEATURES RTAX_RTO_MIN, #define RTAX_RTO_MIN RTAX_RTO_MIN + RTAX_INITRWND, +#define RTAX_INITRWND RTAX_INITRWND __RTAX_MAX }; diff --git a/include/net/dst.h b/include/net/dst.h index 39c4a5963e12..ce078cda6b74 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -83,8 +83,6 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; -#else long __pad_to_align_refcnt[1]; #endif /* diff --git a/include/net/tcp.h b/include/net/tcp.h index 185e22baecb1..788c99f98597 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -965,7 +965,8 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) /* Determine a window scaling and initial window to offer.
*/ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale); + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd); static inline int tcp_win_from_space(int space) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 66fd80ef2473..5c24db4a3c91 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -358,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->u.dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 12b2af36eab8..4a1605d3f909 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -183,7 +183,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale) + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); @@ -232,7 +233,13 @@ void tcp_select_initial_window(int __space, __u32 mss, init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd * mss) + /* when initializing use the value from init_rcv_wnd + * rather than the default from above + */ + if (init_rcv_wnd && + (*rcv_wnd > init_rcv_wnd * mss)) + *rcv_wnd = init_rcv_wnd * mss; + else if (*rcv_wnd > init_cwnd * mss) *rcv_wnd = init_cwnd * mss; } @@ -2417,7 +2424,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; } @@ -2544,7 +2552,8 @@ static void tcp_connect_init(struct sock *sk) &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7208a06576c6..34d1f0690d7e 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -269,7 +269,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; -- cgit v1.2.3 From e5cd6fe391aa8c93560bb7ffdfe334cf4d0a02e4 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sat, 26 Dec 2009 11:51:00 +0000 Subject: llc: add support for LLC_OPT_PKTINFO Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- include/linux/llc.h | 7 +++++++ include/net/llc_conn.h | 1 + net/llc/af_llc.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/llc.h b/include/linux/llc.h index 7733585603f1..ad7074ba81af 100644 --- a/include/linux/llc.h +++ b/include/linux/llc.h @@ -36,6 +36,7 @@ enum llc_sockopts { LLC_OPT_BUSY_TMR_EXP, /* busy state expire time (secs). */ LLC_OPT_TX_WIN, /* tx window size. */ LLC_OPT_RX_WIN, /* rx window size. */ + LLC_OPT_PKTINFO, /* ancillary packet information. 
*/ LLC_OPT_MAX }; @@ -70,6 +71,12 @@ enum llc_sockopts { #define LLC_SAP_RM 0xD4 /* Resource Management */ #define LLC_SAP_GLOBAL 0xFF /* Global SAP. */ +struct llc_pktinfo { + int lpi_ifindex; + unsigned char lpi_sap; + unsigned char lpi_mac[IFHWADDRLEN]; +}; + #ifdef __KERNEL__ #define LLC_SAP_DYN_START 0xC0 #define LLC_SAP_DYN_STOP 0xDE diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index e2374e34989f..fe982fd94c4a 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -76,6 +76,7 @@ struct llc_sock { u32 rx_pdu_hdr; /* used for saving header of last pdu received and caused sending FRMR. Used for resending FRMR */ + u32 cmsg_flags; }; static inline struct llc_sock *llc_sk(const struct sock *sk) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 3a66546cad06..ac691fe08076 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -47,6 +47,10 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #define dprintk(args...) #endif +/* Maybe we'll add some more in the future. */ +#define LLC_CMSG_PKTINFO 1 + + /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. @@ -591,6 +595,20 @@ static int llc_wait_data(struct sock *sk, long timeo) return rc; } +static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(skb->sk); + + if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { + struct llc_pktinfo info; + + info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; + llc_pdu_decode_dsap(skb, &info.lpi_sap); + llc_pdu_decode_da(skb, info.lpi_mac); + put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); + } +} + /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. @@ -812,6 +830,8 @@ copy_uaddr: memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } + if (llc_sk(sk)->cmsg_flags) + llc_cmsg_rcv(msg, skb); goto out; } @@ -1030,6 +1050,12 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, goto out; llc->rw = opt; break; + case LLC_OPT_PKTINFO: + if (opt) + llc->cmsg_flags |= LLC_CMSG_PKTINFO; + else + llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; + break; default: rc = -ENOPROTOOPT; goto out; @@ -1083,6 +1109,9 @@ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; + case LLC_OPT_PKTINFO: + val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; + break; default: rc = -ENOPROTOOPT; goto out; -- cgit v1.2.3 From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Dec 2009 11:51:52 +0100 Subject: perf events: Remove arg from perf sched hooks Since we only ever schedule the local cpu, there is no need to pass the cpu number to the perf sched hooks. This micro-optimizes things a bit. 
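To illustrate the resulting call pattern, here is a simplified sketch based on the hunks below (the hook body is abbreviated, not the full implementation):

	/* before: every caller had to pass the cpu it was running on */
	perf_event_task_tick(curr, cpu);

	/* after: the hook resolves the cpu context from the local cpu itself */
	void perf_event_task_tick(struct task_struct *curr)
	{
		struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
		/* ... adjust frequency, rotate contexts, reschedule events ... */
	}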
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 12 ++++++------ kernel/perf_event.c | 27 ++++++++++++++------------- kernel/sched.c | 6 +++--- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c66b34f75eea..a494e7501292 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,10 +746,10 @@ extern int perf_max_events; extern const struct pmu *hw_perf_event_init(struct perf_event *event); -extern void perf_event_task_sched_in(struct task_struct *task, int cpu); +extern void perf_event_task_sched_in(struct task_struct *task); extern void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu); -extern void perf_event_task_tick(struct task_struct *task, int cpu); + struct task_struct *next); +extern void perf_event_task_tick(struct task_struct *task); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); @@ -870,12 +870,12 @@ extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); #else static inline void -perf_event_task_sched_in(struct task_struct *task, int cpu) { } +perf_event_task_sched_in(struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) { } + struct task_struct *next) { } static inline void -perf_event_task_tick(struct task_struct *task, int cpu) { } +perf_event_task_tick(struct task_struct *task) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..099bd662daa6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) static void __perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { + int cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; @@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. 
*/ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); } #define MAX_INTERRUPTS (~0ULL) @@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); @@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx); if (ctx) - perf_event_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr); } /* @@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 18cceeecce35..d6527ac0f6e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2752,7 +2752,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5266,7 +5266,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5480,7 +5480,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:35 +0800 Subject: perf events: Remove CONFIG_EVENT_PROFILE Quoted from Ingo: | This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE - | it's an unnecessary Kconfig complication. If both PERF_EVENTS and | EVENT_TRACING is enabled we should expose generic tracepoints. | | Nor is it limited to event 'profiling', so it has become a misnomer as | well. 
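As a minimal sketch of the conversion (taken from the hunks below), guarded declarations now key off the surviving symbols; which one a site uses depends on whether it needs perf support, event tracing, or both:

	-#ifdef CONFIG_EVENT_PROFILE
	+#ifdef CONFIG_PERF_EVENTS
	 struct perf_event;
	 extern int ftrace_profile_enable(int event_id);

and the tracepoint profiling code itself now builds whenever both remaining options are enabled:

	ifeq ($(CONFIG_PERF_EVENTS),y)
	obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
	endif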
Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/syscalls.h | 4 ++-- include/trace/ftrace.h | 12 ++++++------ include/trace/syscall.h | 4 ++-- init/Kconfig | 13 ------------- kernel/perf_event.c | 4 ++-- kernel/trace/Makefile | 4 +++- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 14 +++++++------- kernel/trace/trace_syscalls.c | 5 ++--- 11 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2233c98d80df..0a09e758c7d3 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -188,7 +188,7 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS struct perf_event; extern int ftrace_profile_enable(int event_id); extern void ftrace_profile_disable(int event_id); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a494e7501292..9a1d276db754 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -658,7 +658,7 @@ struct perf_event { perf_overflow_handler_t overflow_handler; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING struct event_filter *filter; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 65793e90d6f6..b7c7fcf7790b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -99,7 +99,7 @@ struct perf_event_attr; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #define TRACE_SYS_ENTER_PROFILE_INIT(sname) \ .profile_enable = prof_sysenter_enable, \ @@ -113,7 +113,7 @@ struct perf_event_attr; #define TRACE_SYS_ENTER_PROFILE_INIT(sname) #define TRACE_SYS_EXIT_PROFILE(sname) #define TRACE_SYS_EXIT_PROFILE_INIT(sname) -#endif +#endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_FTRACE_SYSCALLS #define __SC_STR_ADECL1(t, a) #a diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 73523151a731..2fdd36df41f6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -498,7 +498,7 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* * Generate the functions needed for tracepoint perf_event support. @@ -541,7 +541,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#endif +#endif /* CONFIG_PERF_EVENTS */ /* * Stage 4 of the trace events. 
@@ -626,7 +626,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * */ -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #define _TRACE_PROFILE_INIT(call) \ .profile_enable = ftrace_profile_enable_##call, \ @@ -634,7 +634,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #else #define _TRACE_PROFILE_INIT(call) -#endif +#endif /* CONFIG_PERF_EVENTS */ #undef __entry #define __entry entry @@ -834,7 +834,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * } */ -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #undef __perf_addr #define __perf_addr(a) __addr = (a) @@ -926,7 +926,7 @@ static void ftrace_profile_##call(proto) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ #undef _TRACE_PROFILE_INIT diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 961fda3556bb..3d463dcef298 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -49,12 +49,12 @@ ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif -#ifdef CONFIG_EVENT_PROFILE + +#ifdef CONFIG_PERF_EVENTS int prof_sysenter_enable(struct ftrace_event_call *call); void prof_sysenter_disable(struct ftrace_event_call *call); int prof_sysexit_enable(struct ftrace_event_call *call); void prof_sysexit_disable(struct ftrace_event_call *call); - #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..06dab27c18d9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -966,19 +966,6 @@ config PERF_EVENTS Say Y if unsure. -config EVENT_PROFILE - bool "Tracepoint profiling sources" - depends on PERF_EVENTS && EVENT_TRACING - default y - help - Allow the use of tracepoints as software performance events. - - When this is enabled, you can create perf events based on - tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID - found in debugfs://tracing/events/*/*/id. (The -e/--events - option to the perf tool can parse and interpret symbolic - tracepoints, in the subsystem:tracepoint_name format.) 
- config PERF_COUNTERS bool "Kernel performance counters (old config option)" depends on HAVE_PERF_EVENTS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 099bd662daa6..5b987b4a98a8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) @@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec77..d00c6fe23f54 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -51,7 +51,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..74563d7e102e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1360,7 +1360,7 @@ out_unlock: return err; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { @@ -1428,5 +1428,5 @@ out_unlock: return err; } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 375f81a568dc..75d75dec226a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1249,7 +1249,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes int kprobe_profile_func(struct kprobe *kp, @@ -1407,7 +1407,7 @@ static void probe_profile_disable(struct ftrace_event_call *call) disable_kprobe(&tp->rp.kp); } } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ static __kprobes @@ -1417,10 +1417,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kprobe_profile_func(kp, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1431,10 +1431,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kretprobe_profile_func(ri, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1463,7 +1463,7 @@ static int register_probe_event(struct trace_probe *tp) call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS call->profile_enable = probe_profile_enable; call->profile_disable = 
probe_profile_disable; #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..f694f66d75b0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void) } core_initcall(init_ftrace_syscalls); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); @@ -626,6 +626,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -#endif - +#endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From d894837f23f491aa7ed167aae767fc07cfe6e6e6 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Wed, 23 Dec 2009 11:08:18 +0100 Subject: sched: might_sleep(): Make file parameter const char * Fixes a warning when building with g++: warning: deprecated conversion from string constant to 'char*' And the file parameter use is constant, so mark it as such. Signed-off-by: Simon Kagstrom Cc: peterz@infradead.org LKML-Reference: <20091223110818.442d848e@marrow.netinsight.se> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 5 +++-- kernel/sched.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3fc9f5aab5f8..785d7d1099d4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -124,7 +124,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line, int preempt_offset); + void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -138,7 +138,8 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line, int preempt_offset) { } + static inline void __might_sleep(const char *file, int line, + int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..64298a52eaa6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9694,7 +9694,7 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -void __might_sleep(char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- cgit v1.2.3 From 8e664fb3fd2b04e3ac5fad7f046000ba54e0e275 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Dec 2009 13:15:38 +0100 Subject: mac80211: split up and insert custom IEs correctly Currently, we insert all user-specified IEs before the HT IE for association, and after the HT IE for probe requests. For association, that's correct only if the user-specified IEs are RSN only, incorrect in all other cases including WPA. Change this to split apart the user-specified IEs in two places for association: before the HT IE (e.g. RSN), after the HT IE (generally empty right now I think?) and after WMM (all other vendor-specific IEs). For probes, split the IEs in different places to be correct according to the spec. Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 39 ++++++++++--- net/mac80211/ieee80211_i.h | 4 ++ net/mac80211/util.c | 134 ++++++++++++++++++++++++++++++++++++++------- net/mac80211/work.c | 43 ++++++++++++--- 4 files changed, 184 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d62edc7df3ae..aeea282bd2fe 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1085,12 +1085,12 @@ enum ieee80211_eid { WLAN_EID_TIM = 5, WLAN_EID_IBSS_PARAMS = 6, WLAN_EID_CHALLENGE = 16, - /* 802.11d */ + WLAN_EID_COUNTRY = 7, WLAN_EID_HP_PARAMS = 8, WLAN_EID_HP_TABLE = 9, WLAN_EID_REQUEST = 10, - /* 802.11e */ + WLAN_EID_QBSS_LOAD = 11, WLAN_EID_EDCA_PARAM_SET = 12, WLAN_EID_TSPEC = 13, @@ -1113,7 +1113,7 @@ enum ieee80211_eid { WLAN_EID_PREP = 69, WLAN_EID_PERR = 70, WLAN_EID_RANN = 49, /* compatible with FreeBSD */ - /* 802.11h */ + WLAN_EID_PWR_CONSTRAINT = 32, WLAN_EID_PWR_CAPABILITY = 33, WLAN_EID_TPC_REQUEST = 34, @@ -1124,20 +1124,41 @@ enum ieee80211_eid { WLAN_EID_MEASURE_REPORT = 39, WLAN_EID_QUIET = 40, WLAN_EID_IBSS_DFS = 41, - /* 802.11g */ + WLAN_EID_ERP_INFO = 42, WLAN_EID_EXT_SUPP_RATES = 50, - /* 802.11n */ + WLAN_EID_HT_CAPABILITY = 45, WLAN_EID_HT_INFORMATION = 61, - /* 802.11i */ + WLAN_EID_RSN = 48, - WLAN_EID_TIMEOUT_INTERVAL = 56, - WLAN_EID_MMIE = 76 /* 802.11w */, + WLAN_EID_MMIE = 76, WLAN_EID_WPA = 221, WLAN_EID_GENERIC = 221, WLAN_EID_VENDOR_SPECIFIC = 221, - WLAN_EID_QOS_PARAMETER = 222 + WLAN_EID_QOS_PARAMETER = 222, + + WLAN_EID_AP_CHAN_REPORT = 51, + WLAN_EID_NEIGHBOR_REPORT = 52, + WLAN_EID_RCPI = 53, + WLAN_EID_BSS_AVG_ACCESS_DELAY = 63, + WLAN_EID_ANTENNA_INFO = 64, + WLAN_EID_RSNI = 65, + WLAN_EID_MEASUREMENT_PILOT_TX_INFO = 66, + WLAN_EID_BSS_AVAILABLE_CAPACITY = 67, + WLAN_EID_BSS_AC_ACCESS_DELAY = 68, + WLAN_EID_RRM_ENABLED_CAPABILITIES = 70, + WLAN_EID_MULTIPLE_BSSID = 71, + + WLAN_EID_MOBILITY_DOMAIN = 54, + WLAN_EID_FAST_BSS_TRANSITION = 55, + WLAN_EID_TIMEOUT_INTERVAL = 56, + WLAN_EID_RIC_DATA = 57, + WLAN_EID_RIC_DESCRIPTOR = 75, + + WLAN_EID_DSE_REGISTERED_LOCATION = 58, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES = 59, + WLAN_EID_EXT_CHANSWITCH_ANN = 60, }; /* Action category code */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 97b6076b492e..6ea4ffbf84d8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1148,6 +1148,10 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, void ieee80211_recalc_smps(struct ieee80211_local *local, struct ieee80211_sub_if_data *forsdata); +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset); +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); + /* internal work items */ void ieee80211_work_init(struct ieee80211_local *local); void ieee80211_add_work(struct ieee80211_work *wk); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5ffe9e831b66..1fdb80ff9241 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -881,30 +881,66 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, enum ieee80211_band band) { struct ieee80211_supported_band *sband; - u8 *pos, *supp_rates_len, *esupp_rates_len = NULL; - int i; + u8 *pos; + size_t offset = 0, noffset; + int supp_rates_len, i; sband = local->hw.wiphy->bands[band]; pos = buffer; + supp_rates_len = min_t(int, sband->n_bitrates, 8); + *pos++ = WLAN_EID_SUPP_RATES; - supp_rates_len = pos; - *pos++ = 0; - - for (i = 0; i < 
sband->n_bitrates; i++) { - struct ieee80211_rate *rate = &sband->bitrates[i]; - - if (esupp_rates_len) { - *esupp_rates_len += 1; - } else if (*supp_rates_len == 8) { - *pos++ = WLAN_EID_EXT_SUPP_RATES; - esupp_rates_len = pos; - *pos++ = 1; - } else - *supp_rates_len += 1; + *pos++ = supp_rates_len; - *pos++ = rate->bitrate / 5; + for (i = 0; i < supp_rates_len; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + + /* insert "request information" if in custom IEs */ + if (ie && ie_len) { + static const u8 before_extrates[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_extrates, + ARRAY_SIZE(before_extrates), + offset); + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + offset = noffset; + } + + if (sband->n_bitrates > i) { + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = sband->n_bitrates - i; + + for (; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + } + + /* insert custom IEs that go before HT */ + if (ie && ie_len) { + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_DS_PARAMS, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + offset = noffset; } if (sband->ht_cap.ht_supported) { @@ -936,9 +972,11 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, * that calculates local->scan_ies_len. */ - if (ie) { - memcpy(pos, ie, ie_len); - pos += ie_len; + /* add any remaining custom IEs */ + if (ie && ie_len) { + noffset = ie_len; + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; } return pos - buffer; @@ -1252,3 +1290,59 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, /* changed flag is auto-detected for this */ ieee80211_hw_config(local, 0); } + +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +{ + int i; + + for (i = 0; i < n_ids; i++) + if (ids[i] == id) + return true; + return false; +} + +/** + * ieee80211_ie_split - split an IE buffer according to ordering + * + * @ies: the IE buffer + * @ielen: the length of the IE buffer + * @ids: an array with element IDs that are allowed before + * the split + * @n_ids: the size of the element ID array + * @offset: offset where to start splitting in the buffer + * + * This function splits an IE buffer by updating the @offset + * variable to point to the location where the buffer should be + * split. + * + * It assumes that the given IE buffer is well-formed, this + * has to be guaranteed by the caller! + * + * It also assumes that the IEs in the buffer are ordered + * correctly, if not the result of using this function will not + * be ordered correctly either, i.e. it does no reordering. + * + * The function returns the offset where the next part of the + * buffer starts, which may be @ielen if the entire (remainder) + * of the buffer should be used. 
+ */ +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) + pos += 2 + ies[pos + 1]; + + return pos; +} + +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) + pos += 2 + ies[pos + 1]; + + return pos; +} diff --git a/net/mac80211/work.c b/net/mac80211/work.c index c03c22d5bca3..affdd10b67ad 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -204,6 +204,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; const u8 *ies; + size_t offset = 0, noffset; int i, len, count, rates_len, supp_rates_len; u16 capab; struct ieee80211_supported_band *sband; @@ -337,14 +338,26 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } } - /* - * XXX: These IEs could contain (vendor-specified) - * IEs that belong after HT -- the buffer may - * need to be split up. - */ + /* if present, add any custom IEs that go before HT */ if (wk->ie_len && wk->ie) { - pos = skb_put(skb, wk->ie_len); - memcpy(pos, wk->ie, wk->ie_len); + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_PWR_CAPABILITY, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_RRM_ENABLED_CAPABILITIES, + WLAN_EID_MOBILITY_DOMAIN, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(wk->ie, wk->ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; } if (wk->assoc.use_11n && wk->assoc.wmm_used && @@ -352,6 +365,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, ieee80211_add_ht_ie(skb, wk->assoc.ht_information_ie, sband, wk->chan, wk->assoc.smps); + /* if present, add any custom non-vendor IEs that go after HT */ + if (wk->ie_len && wk->ie) { + noffset = ieee80211_ie_split_vendor(wk->ie, wk->ie_len, + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; + } + if (wk->assoc.wmm_used && local->hw.queues >= 4) { pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; @@ -365,6 +387,13 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, *pos++ = 0; } + /* add any remaining custom (i.e. vendor specific here) IEs */ + if (wk->ie_len && wk->ie) { + noffset = wk->ie_len; + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + } + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } -- cgit v1.2.3 From 9588bbd5529461a3dacd435bf239c84c3508f569 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 23 Dec 2009 13:15:41 +0100 Subject: cfg80211: add remain-on-channel command Add new commands for requesting the driver to remain awake on a specified channel for the specified amount of time (and another command to cancel such an operation). This can be used to implement userspace-controlled off-channel operations, like Public Action frame exchange on another channel than the operation channel. The off-channel operation should behave similarly to scan, i.e. 
the local station (if associated) moves into power save mode to request the AP to buffer frames for it and then moves to the other channel to allow the off-channel operation to be completed. The duration parameter can be used to request enough time to receive a response from the target station. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 36 ++++++++ include/net/cfg80211.h | 47 +++++++++++ net/wireless/chan.c | 41 +++++---- net/wireless/core.h | 3 + net/wireless/mlme.c | 27 ++++++ net/wireless/nl80211.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++- net/wireless/nl80211.h | 11 +++ 7 files changed, 368 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index da8ea2e19273..2bfbe88837ef 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -270,6 +270,31 @@ * @NL80211_CMD_SET_WIPHY_NETNS: Set a wiphy's netns. Note that all devices * associated with this wiphy must be down and will follow. * + * @NL80211_CMD_REMAIN_ON_CHANNEL: Request to remain awake on the specified + * channel for the specified amount of time. This can be used to do + * off-channel operations like transmit a Public Action frame and wait for + * a response while being associated to an AP on another channel. + * %NL80211_ATTR_WIPHY or %NL80211_ATTR_IFINDEX is used to specify which + * radio is used. %NL80211_ATTR_WIPHY_FREQ is used to specify the + * frequency for the operation and %NL80211_ATTR_WIPHY_CHANNEL_TYPE may be + * optionally used to specify additional channel parameters. + * %NL80211_ATTR_DURATION is used to specify the duration in milliseconds + * to remain on the channel. This command is also used as an event to + * notify when the requested duration starts (it may take a while for the + * driver to schedule this time due to other concurrent needs for the + * radio). + * When called, this operation returns a cookie (%NL80211_ATTR_COOKIE) + * that will be included with any events pertaining to this request; + * the cookie is also used to cancel the request. + * @NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL: This command can be used to cancel a + * pending remain-on-channel duration if the desired operation has been + * completed prior to expiration of the originally requested duration. + * %NL80211_ATTR_WIPHY or %NL80211_ATTR_IFINDEX is used to specify the + * radio. The %NL80211_ATTR_COOKIE attribute must be given as well to + * uniquely identify the request. + * This command is also used as an event to notify when a requested + * remain-on-channel duration has expired. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -353,6 +378,9 @@ enum nl80211_commands { NL80211_CMD_DEL_PMKSA, NL80211_CMD_FLUSH_PMKSA, + NL80211_CMD_REMAIN_ON_CHANNEL, + NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -606,6 +634,10 @@ enum nl80211_commands { * @NL80211_ATTR_MAX_NUM_PMKIDS: maximum number of PMKIDs a firmware can * cache, a wiphy attribute. * + * @NL80211_ATTR_DURATION: Duration of an operation in milliseconds, u32. + * + * @NL80211_ATTR_COOKIE: Generic 64-bit cookie to identify objects. 
+ * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -743,6 +775,10 @@ enum nl80211_attrs { NL80211_ATTR_PMKID, NL80211_ATTR_MAX_NUM_PMKIDS, + NL80211_ATTR_DURATION, + + NL80211_ATTR_COOKIE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 542a477a94da..b66beb052054 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -988,6 +988,15 @@ struct cfg80211_pmksa { * * @dump_survey: get site survey information. * + * @remain_on_channel: Request the driver to remain awake on the specified + * channel for the specified duration to complete an off-channel + * operation (e.g., public action frame exchange). When the driver is + * ready on the requested channel, it must indicate this with an event + * notification by calling cfg80211_ready_on_channel(). + * @cancel_remain_on_channel: Cancel an on-going remain-on-channel operation. + * This allows the operation to be terminated prior to timeout based on + * the duration value. + * * @testmode_cmd: run a test mode command * * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac @@ -1123,6 +1132,16 @@ struct cfg80211_ops { struct cfg80211_pmksa *pmksa); int (*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev); + int (*remain_on_channel)(struct wiphy *wiphy, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, + u64 *cookie); + int (*cancel_remain_on_channel)(struct wiphy *wiphy, + struct net_device *dev, + u64 cookie); + /* some temporary stuff to finish wext */ int (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout); @@ -2147,5 +2166,33 @@ void cfg80211_roamed(struct net_device *dev, const u8 *bssid, void cfg80211_disconnected(struct net_device *dev, u16 reason, u8 *ie, size_t ie_len, gfp_t gfp); +/** + * cfg80211_ready_on_channel - notification of remain_on_channel start + * @dev: network device + * @cookie: the request cookie + * @chan: The current channel (from remain_on_channel request) + * @channel_type: Channel type + * @duration: Duration in milliseconds that the driver intents to remain on the + * channel + * @gfp: allocation flags + */ +void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp); + +/** + * cfg80211_remain_on_channel_expired - remain_on_channel duration expired + * @dev: network device + * @cookie: the request cookie + * @chan: The current channel (from remain_on_channel request) + * @channel_type: Channel type + * @gfp: allocation flags + */ +void cfg80211_remain_on_channel_expired(struct net_device *dev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + gfp_t gfp); #endif /* __NET_CFG80211_H */ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index a46ac6c9b365..bf1737fc9a7e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -41,44 +41,57 @@ rdev_fixed_channel(struct cfg80211_registered_device *rdev, return result; } -int rdev_set_freq(struct cfg80211_registered_device *rdev, - struct wireless_dev *for_wdev, +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, int freq, enum nl80211_channel_type channel_type) { struct ieee80211_channel *chan; struct ieee80211_sta_ht_cap *ht_cap; - int result; - - 
if (rdev_fixed_channel(rdev, for_wdev)) - return -EBUSY; - - if (!rdev->ops->set_channel) - return -EOPNOTSUPP; chan = ieee80211_get_channel(&rdev->wiphy, freq); /* Primary channel not allowed */ if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) - return -EINVAL; + return NULL; if (channel_type == NL80211_CHAN_HT40MINUS && chan->flags & IEEE80211_CHAN_NO_HT40MINUS) - return -EINVAL; + return NULL; else if (channel_type == NL80211_CHAN_HT40PLUS && chan->flags & IEEE80211_CHAN_NO_HT40PLUS) - return -EINVAL; + return NULL; ht_cap = &rdev->wiphy.bands[chan->band]->ht_cap; if (channel_type != NL80211_CHAN_NO_HT) { if (!ht_cap->ht_supported) - return -EINVAL; + return NULL; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT) - return -EINVAL; + return NULL; } + return chan; +} + +int rdev_set_freq(struct cfg80211_registered_device *rdev, + struct wireless_dev *for_wdev, + int freq, enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *chan; + int result; + + if (rdev_fixed_channel(rdev, for_wdev)) + return -EBUSY; + + if (!rdev->ops->set_channel) + return -EOPNOTSUPP; + + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (!chan) + return -EINVAL; + result = rdev->ops->set_channel(&rdev->wiphy, chan, channel_type); if (result) return result; diff --git a/net/wireless/core.h b/net/wireless/core.h index 35b712127143..30ec95f05b52 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -374,6 +374,9 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); struct ieee80211_channel * rdev_fixed_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *for_wdev); +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, + int freq, enum nl80211_channel_type channel_type); int rdev_set_freq(struct cfg80211_registered_device *rdev, struct wireless_dev *for_wdev, int freq, enum nl80211_channel_type channel_type); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index acaeaa784d68..11f6469b3f98 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -680,3 +680,30 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, } } } + +void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel(rdev, dev, cookie, chan, channel_type, + duration, gfp); +} +EXPORT_SYMBOL(cfg80211_ready_on_channel); + +void cfg80211_remain_on_channel_expired(struct net_device *dev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel_cancel(rdev, dev, cookie, chan, + channel_type, gfp); +} +EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 60f854377f90..ff857f10cb85 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -141,6 +141,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, [NL80211_ATTR_PMKID] = { .type = NLA_BINARY, .len = WLAN_PMKID_LEN }, + [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, + [NL80211_ATTR_COOKIE] = { .type = NLA_U64 
}, }; /* policy for the attributes */ @@ -569,6 +571,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(set_pmksa, SET_PMKSA); CMD(del_pmksa, DEL_PMKSA); CMD(flush_pmksa, FLUSH_PMKSA); + CMD(remain_on_channel, REMAIN_ON_CHANNEL); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -4283,6 +4286,143 @@ static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) } +static int nl80211_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + struct ieee80211_channel *chan; + struct sk_buff *msg; + void *hdr; + u64 cookie; + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + u32 freq, duration; + int err; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_DURATION]) + return -EINVAL; + + duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]); + + /* + * We should be on that channel for at least one jiffie, + * and more than 5 seconds seems excessive. + */ + if (!duration || !msecs_to_jiffies(duration) || duration > 5000) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->remain_on_channel) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + channel_type = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + if (channel_type != NL80211_CHAN_NO_HT && + channel_type != NL80211_CHAN_HT20 && + channel_type != NL80211_CHAN_HT40PLUS && + channel_type != NL80211_CHAN_HT40MINUS) + err = -EINVAL; + goto out; + } + + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (chan == NULL) { + err = -EINVAL; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_REMAIN_ON_CHANNEL); + + if (IS_ERR(hdr)) { + err = PTR_ERR(hdr); + goto free_msg; + } + + err = rdev->ops->remain_on_channel(&rdev->wiphy, dev, chan, + channel_type, duration, &cookie); + + if (err) + goto free_msg; + + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + genlmsg_end(msg, hdr); + err = genlmsg_reply(msg, info); + goto out; + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + out: + cfg80211_unlock_rdev(rdev); + dev_put(dev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + u64 cookie; + int err; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->cancel_remain_on_channel) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); + + out: + cfg80211_unlock_rdev(rdev); + dev_put(dev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -4545,8 +4685,20 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = 
GENL_ADMIN_PERM, }, - + { + .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, + .doit = nl80211_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + .doit = nl80211_cancel_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; + static struct genl_multicast_group nl80211_mlme_mcgrp = { .name = "mlme", }; @@ -5134,6 +5286,70 @@ nla_put_failure: nlmsg_free(msg); } +static void nl80211_send_remain_on_chan_event( + int cmd, struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE, channel_type); + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL) + NLA_PUT_U32(msg, NL80211_ATTR_DURATION, duration); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, duration, gfp); +} + +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, 0, gfp); +} + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 44cc2a76a1b0..a5e2de419b7a 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -59,4 +59,15 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp); +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp); +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp); + #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3 From 1f8fef7b3388b5a976e80839679b5bae581a1091 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 24 Dec 2009 11:59:57 +0100 Subject: firewire: add fw_csr_string() helper function The core (sysfs attributes), the firedtv driver, and possible future drivers all read strings from some configuration ROM directory. 
Factor out the generic code from show_text_leaf() into a new helper function, modified slightly to handle arbitrary buffer sizes. Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 110 ++++++++++++++++++++++---------- drivers/media/dvb/firewire/firedtv-fw.c | 39 ++--------- include/linux/firewire.h | 2 + 3 files changed, 84 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 9d0dfcbe2c1c..a39e4344cd58 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -59,6 +59,67 @@ int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value) } EXPORT_SYMBOL(fw_csr_iterator_next); +static u32 *search_leaf(u32 *directory, int search_key) +{ + struct fw_csr_iterator ci; + int last_key = 0, key, value; + + fw_csr_iterator_init(&ci, directory); + while (fw_csr_iterator_next(&ci, &key, &value)) { + if (last_key == search_key && + key == (CSR_DESCRIPTOR | CSR_LEAF)) + return ci.p - 1 + value; + last_key = key; + } + return NULL; +} + +static int textual_leaf_to_string(u32 *block, char *buf, size_t size) +{ + unsigned int quadlets, length; + + if (!size || !buf) + return -EINVAL; + + quadlets = min(block[0] >> 16, 256u); + if (quadlets < 2) + return -ENODATA; + + if (block[1] != 0 || block[2] != 0) + /* unknown language/character set */ + return -ENODATA; + + block += 3; + quadlets -= 2; + for (length = 0; length < quadlets * 4 && length + 1 < size; length++) { + char c = block[length / 4] >> (24 - 8 * (length % 4)); + if (c == '\0') + break; + buf[length] = c; + } + buf[length] = '\0'; + return length; +} + +/** + * fw_csr_string - reads a string from the configuration ROM + * @directory: device or unit directory; + * fw_device->config_rom+5 or fw_unit->directory + * @key: the key of the preceding directory entry + * @buf: where to put the string + * @size: size of @buf, in bytes + * + * Returns string length (>= 0) or error code (< 0). + */ +int fw_csr_string(u32 *directory, int key, char *buf, size_t size) +{ + u32 *leaf = search_leaf(directory, key); + if (!leaf) + return -ENOENT; + return textual_leaf_to_string(leaf, buf, size); +} +EXPORT_SYMBOL(fw_csr_string); + static bool is_fw_unit(struct device *dev); static int match_unit_directory(u32 *directory, u32 match_flags, @@ -226,10 +287,10 @@ static ssize_t show_text_leaf(struct device *dev, { struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); - struct fw_csr_iterator ci; - u32 *dir, *block = NULL, *p, *end; - int length, key, value, last_key = 0, ret = -ENOENT; - char *b; + u32 *dir; + size_t bufsize; + char dummy_buf[2]; + int ret; down_read(&fw_device_rwsem); @@ -238,40 +299,23 @@ static ssize_t show_text_leaf(struct device *dev, else dir = fw_device(dev)->config_rom + 5; - fw_csr_iterator_init(&ci, dir); - while (fw_csr_iterator_next(&ci, &key, &value)) { - if (attr->key == last_key && - key == (CSR_DESCRIPTOR | CSR_LEAF)) - block = ci.p - 1 + value; - last_key = key; + if (buf) { + bufsize = PAGE_SIZE - 1; + } else { + buf = dummy_buf; + bufsize = 1; } - if (block == NULL) - goto out; - - length = min(block[0] >> 16, 256U); - if (length < 3) - goto out; - - if (block[1] != 0 || block[2] != 0) - /* Unknown encoding. */ - goto out; + ret = fw_csr_string(dir, attr->key, buf, bufsize); - if (buf == NULL) { - ret = length * 4; - goto out; + if (ret >= 0) { + /* Strip trailing whitespace and add newline. 
*/
+		while (ret > 0 && isspace(buf[ret - 1]))
+			ret--;
+		strcpy(buf + ret, "\n");
+		ret++;
 	}
 
-	b = buf;
-	end = &block[length + 1];
-	for (p = &block[3]; p < end; p++, b += 4)
-		* (u32 *) b = (__force u32) __cpu_to_be32(*p);
-
-	/* Strip trailing whitespace and add newline. */
-	while (b--, (isspace(*b) || *b == '\0') && b > buf);
-	strcpy(b + 1, "\n");
-	ret = b + 2 - buf;
- out:
 	up_read(&fw_device_rwsem);
 
 	return ret;
diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c
index 6223bf01efe9..4253b7ab0097 100644
--- a/drivers/media/dvb/firewire/firedtv-fw.c
+++ b/drivers/media/dvb/firewire/firedtv-fw.c
@@ -239,47 +239,18 @@ static const struct fw_address_region fcp_region = {
 };
 
 /* Adjust the template string if models with longer names appear. */
-#define MAX_MODEL_NAME_LEN ((int)DIV_ROUND_UP(sizeof("FireDTV ????"), 4))
-
-static size_t model_name(u32 *directory, __be32 *buffer)
-{
-	struct fw_csr_iterator ci;
-	int i, length, key, value, last_key = 0;
-	u32 *block = NULL;
-
-	fw_csr_iterator_init(&ci, directory);
-	while (fw_csr_iterator_next(&ci, &key, &value)) {
-		if (last_key == CSR_MODEL &&
-		    key == (CSR_DESCRIPTOR | CSR_LEAF))
-			block = ci.p - 1 + value;
-		last_key = key;
-	}
-
-	if (block == NULL)
-		return 0;
-
-	length = min((int)(block[0] >> 16) - 2, MAX_MODEL_NAME_LEN);
-	if (length <= 0)
-		return 0;
-
-	/* fast-forward to text string */
-	block += 3;
-
-	for (i = 0; i < length; i++)
-		buffer[i] = cpu_to_be32(block[i]);
-
-	return length * 4;
-}
+#define MAX_MODEL_NAME_LEN	sizeof("FireDTV ????")
 
 static int node_probe(struct device *dev)
 {
 	struct firedtv *fdtv;
-	__be32 name[MAX_MODEL_NAME_LEN];
+	char name[MAX_MODEL_NAME_LEN];
 	int name_len, err;
 
-	name_len = model_name(fw_unit(dev)->directory, name);
+	name_len = fw_csr_string(fw_unit(dev)->directory, CSR_MODEL,
+				 name, sizeof(name));
 
-	fdtv = fdtv_alloc(dev, &backend, (char *)name, name_len);
+	fdtv = fdtv_alloc(dev, &backend, name, name_len >= 0 ? name_len : 0);
 	if (!fdtv)
 		return -ENOMEM;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index a0e67150a729..5246869d8083 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -72,6 +72,8 @@ struct fw_csr_iterator {
 void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p);
 int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value);
 
+int fw_csr_string(u32 *directory, int key, char *buf, size_t size);
+
 extern struct bus_type fw_bus_type;
 
 struct fw_card_driver;
-- cgit v1.2.3 


From 3c2c58cb33b3b15a2c4871babeec8fe1456e1db6 Mon Sep 17 00:00:00 2001
From: Stefan Richter
Date: Sat, 26 Dec 2009 01:43:21 +0100
Subject: firewire: core: fw_csr_string addendum

Whitespace and comment changes, and a different way to say i + 1 < end.
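For reference, a minimal caller sketch of the helper as it now stands;
the buffer size and the call site are hypothetical, not taken from any
in-tree user:

	char name[32];	/* hypothetical buffer size */
	int len;

	len = fw_csr_string(unit->directory, CSR_MODEL, name, sizeof(name));
	if (len < 0)
		return len;	/* -ENOENT, -ENODATA or -EINVAL */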
Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 26 ++++++++++++++++---------- include/linux/firewire.h | 1 - 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index a39e4344cd58..5d5c6a689837 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -69,19 +69,22 @@ static u32 *search_leaf(u32 *directory, int search_key) if (last_key == search_key && key == (CSR_DESCRIPTOR | CSR_LEAF)) return ci.p - 1 + value; + last_key = key; } + return NULL; } static int textual_leaf_to_string(u32 *block, char *buf, size_t size) { - unsigned int quadlets, length; + unsigned int quadlets, i; + char c; if (!size || !buf) return -EINVAL; - quadlets = min(block[0] >> 16, 256u); + quadlets = min(block[0] >> 16, 256U); if (quadlets < 2) return -ENODATA; @@ -91,31 +94,34 @@ static int textual_leaf_to_string(u32 *block, char *buf, size_t size) block += 3; quadlets -= 2; - for (length = 0; length < quadlets * 4 && length + 1 < size; length++) { - char c = block[length / 4] >> (24 - 8 * (length % 4)); + for (i = 0; i < quadlets * 4 && i < size - 1; i++) { + c = block[i / 4] >> (24 - 8 * (i % 4)); if (c == '\0') break; - buf[length] = c; + buf[i] = c; } - buf[length] = '\0'; - return length; + buf[i] = '\0'; + + return i; } /** * fw_csr_string - reads a string from the configuration ROM - * @directory: device or unit directory; - * fw_device->config_rom+5 or fw_unit->directory + * @directory: e.g. root directory or unit directory * @key: the key of the preceding directory entry * @buf: where to put the string * @size: size of @buf, in bytes * - * Returns string length (>= 0) or error code (< 0). + * The string is taken from a minimal ASCII text descriptor leaf after + * the immediate entry with @key. The string is zero-terminated. + * Returns strlen(buf) or a negative error code. */ int fw_csr_string(u32 *directory, int key, char *buf, size_t size) { u32 *leaf = search_leaf(directory, key); if (!leaf) return -ENOENT; + return textual_leaf_to_string(leaf, buf, size); } EXPORT_SYMBOL(fw_csr_string); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5246869d8083..df680216e7b6 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -71,7 +71,6 @@ struct fw_csr_iterator { void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); - int fw_csr_string(u32 *directory, int key, char *buf, size_t size); extern struct bus_type fw_bus_type; -- cgit v1.2.3 From 13b302d0a217580c0129b0641b0ca8b592e437b0 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 26 Dec 2009 01:44:10 +0100 Subject: firewire: qualify config ROM cache pointers as const pointers Several config ROM related functions only peek at the ROM cache; mark their arguments as const pointers. Ditto fw_device.config_rom and fw_unit.directory, as the memory behind them is meant to be write-once. 
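To illustrate what the qualifier buys, a hypothetical caller that tries
to scribble on the cache now fails to build (sketch, not part of the
patch):

	const u32 *rom = device->config_rom;

	rom[0] = 0;	/* error: assignment of read-only location */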
Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 21 +++++++++++---------- drivers/firewire/sbp2.c | 5 +++-- include/linux/firewire.h | 12 ++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 5d5c6a689837..eecd52dc8e98 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -43,7 +43,7 @@ #include "core.h" -void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 * p) +void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p) { ci->p = p + 1; ci->end = ci->p + (p[0] >> 16); @@ -59,7 +59,7 @@ int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value) } EXPORT_SYMBOL(fw_csr_iterator_next); -static u32 *search_leaf(u32 *directory, int search_key) +static const u32 *search_leaf(const u32 *directory, int search_key) { struct fw_csr_iterator ci; int last_key = 0, key, value; @@ -76,7 +76,7 @@ static u32 *search_leaf(u32 *directory, int search_key) return NULL; } -static int textual_leaf_to_string(u32 *block, char *buf, size_t size) +static int textual_leaf_to_string(const u32 *block, char *buf, size_t size) { unsigned int quadlets, i; char c; @@ -116,9 +116,9 @@ static int textual_leaf_to_string(u32 *block, char *buf, size_t size) * the immediate entry with @key. The string is zero-terminated. * Returns strlen(buf) or a negative error code. */ -int fw_csr_string(u32 *directory, int key, char *buf, size_t size) +int fw_csr_string(const u32 *directory, int key, char *buf, size_t size) { - u32 *leaf = search_leaf(directory, key); + const u32 *leaf = search_leaf(directory, key); if (!leaf) return -ENOENT; @@ -128,7 +128,7 @@ EXPORT_SYMBOL(fw_csr_string); static bool is_fw_unit(struct device *dev); -static int match_unit_directory(u32 *directory, u32 match_flags, +static int match_unit_directory(const u32 *directory, u32 match_flags, const struct ieee1394_device_id *id) { struct fw_csr_iterator ci; @@ -262,7 +262,7 @@ static ssize_t show_immediate(struct device *dev, struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); struct fw_csr_iterator ci; - u32 *dir; + const u32 *dir; int key, value, ret = -ENOENT; down_read(&fw_device_rwsem); @@ -293,7 +293,7 @@ static ssize_t show_text_leaf(struct device *dev, { struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); - u32 *dir; + const u32 *dir; size_t bufsize; char dummy_buf[2]; int ret; @@ -421,7 +421,7 @@ static ssize_t guid_show(struct device *dev, return ret; } -static int units_sprintf(char *buf, u32 *directory) +static int units_sprintf(char *buf, const u32 *directory) { struct fw_csr_iterator ci; int key, value; @@ -503,7 +503,8 @@ static int read_rom(struct fw_device *device, */ static int read_bus_info_block(struct fw_device *device, int generation) { - u32 *rom, *stack, *old_rom, *new_rom; + const u32 *old_rom, *new_rom; + u32 *rom, *stack; u32 sp, key; int i, end, length, ret = -1; diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index d485cdd8cbac..7e33b0b1704c 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -1014,7 +1014,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) return 0; } -static int sbp2_scan_logical_unit_dir(struct sbp2_target *tgt, u32 *directory) +static int sbp2_scan_logical_unit_dir(struct sbp2_target *tgt, + const u32 *directory) { struct fw_csr_iterator ci; int key, value; @@ -1027,7 +1028,7 @@ 
static int sbp2_scan_logical_unit_dir(struct sbp2_target *tgt, u32 *directory)
 	return 0;
 }
 
-static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory,
+static int sbp2_scan_unit_dir(struct sbp2_target *tgt, const u32 *directory,
 			      u32 *model, u32 *firmware_revision)
 {
 	struct fw_csr_iterator ci;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index df680216e7b6..4bd94bf5e739 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -65,13 +65,13 @@
 #define CSR_DIRECTORY_ID	0x20
 
 struct fw_csr_iterator {
-	u32 *p;
-	u32 *end;
+	const u32 *p;
+	const u32 *end;
 };
 
-void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p);
+void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p);
 int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value);
-int fw_csr_string(u32 *directory, int key, char *buf, size_t size);
+int fw_csr_string(const u32 *directory, int key, char *buf, size_t size);
 
 extern struct bus_type fw_bus_type;
 
@@ -163,7 +163,7 @@ struct fw_device {
 	struct mutex client_list_mutex;
 	struct list_head client_list;
 
-	u32 *config_rom;
+	const u32 *config_rom;
 	size_t config_rom_length;
 	int config_rom_retries;
 	unsigned is_local:1;
@@ -205,7 +205,7 @@ int fw_device_enable_phys_dma(struct fw_device *device);
  */
 struct fw_unit {
 	struct device device;
-	u32 *directory;
+	const u32 *directory;
 	struct fw_attribute_group attribute_group;
 };
-- cgit v1.2.3 


From c1c5523dd1517250cac8b15a4acbc237c24a67d4 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde
Date: Wed, 23 Dec 2009 01:27:48 +0000
Subject: can/netlink: add CAN_CTRLMODE_ONE_SHOT

This patch adds the flag CAN_CTRLMODE_ONE_SHOT. It is used as mask or
flag in the "struct can_ctrlmode". It allows userspace via netlink to
set a CAN controller into the special "one-shot" mode. In this mode, if
supported by the CAN controller, it tries only once to deliver a CAN
frame and aborts it if an error (e.g.: arbitration lost) happens.

Signed-off-by: Marc Kleine-Budde
Acked-by: Wolfgang Grandegger
Signed-off-by: David S. Miller
---
 include/linux/can/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/can/netlink.h b/include/linux/can/netlink.h
index 9ecbb7871c0e..c818335fbb13 100644
--- a/include/linux/can/netlink.h
+++ b/include/linux/can/netlink.h
@@ -80,6 +80,7 @@ struct can_ctrlmode {
 #define CAN_CTRLMODE_LOOPBACK	0x1	/* Loopback mode */
 #define CAN_CTRLMODE_LISTENONLY	0x2	/* Listen-only mode */
 #define CAN_CTRLMODE_3_SAMPLES	0x4	/* Triple sampling mode */
+#define CAN_CTRLMODE_ONE_SHOT	0x8	/* One-Shot mode */
 
 /*
  * CAN device statistics
-- cgit v1.2.3 


From cf2f765f1896064e34c6f0f2ef896ff058dd5c06 Mon Sep 17 00:00:00 2001
From: Jiri Kosina
Date: Mon, 4 Jan 2010 12:20:56 +0100
Subject: HID: handle joysticks with a large number of buttons

Current HID code doesn't properly handle HID joysticks that have a
larger number of buttons than fits into the current range reserved for
BTN_JOYSTICK. One such joystick reported not to work properly is the
Saitek X52 Pro Flight System.

We can't extend the range to fit more buttons in, because of backwards
compatibility reasons. Therefore this patch introduces a new
BTN_TRIGGER_HAPPY range, and uses these codes to map the buttons which
are over the BTN_JOYSTICK limit.
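To make the new scheme concrete, a worked mapping for a hypothetical
20-button joystick (indices are per-application button offsets):

	/*
	 * button 0x00..0x0f -> BTN_JOYSTICK      + index = 0x120..0x12f
	 *                      (unchanged, preserves the existing ABI)
	 * button 0x10..0x13 -> BTN_TRIGGER_HAPPY + index = 0x2d0..0x2d3
	 */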
Acked-by: Dmitry Torokhov [for the input.h part] Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 7 ++++++- include/linux/input.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 5862b0f3b55d..dad7aae9c975 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -198,7 +198,12 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel switch (field->application) { case HID_GD_MOUSE: case HID_GD_POINTER: code += 0x110; break; - case HID_GD_JOYSTICK: code += 0x120; break; + case HID_GD_JOYSTICK: + if (code <= 0xf) + code += BTN_JOYSTICK; + else + code += BTN_TRIGGER_HAPPY; + break; case HID_GD_GAMEPAD: code += 0x130; break; default: switch (field->physical) { diff --git a/include/linux/input.h b/include/linux/input.h index 7be8a6537b57..97f98ca9b040 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -597,6 +597,48 @@ struct input_absinfo { #define KEY_CAMERA_FOCUS 0x210 +#define BTN_TRIGGER_HAPPY 0x2c0 +#define BTN_TRIGGER_HAPPY1 0x2c0 +#define BTN_TRIGGER_HAPPY2 0x2c1 +#define BTN_TRIGGER_HAPPY3 0x2c2 +#define BTN_TRIGGER_HAPPY4 0x2c3 +#define BTN_TRIGGER_HAPPY5 0x2c4 +#define BTN_TRIGGER_HAPPY6 0x2c5 +#define BTN_TRIGGER_HAPPY7 0x2c6 +#define BTN_TRIGGER_HAPPY8 0x2c7 +#define BTN_TRIGGER_HAPPY9 0x2c8 +#define BTN_TRIGGER_HAPPY10 0x2c9 +#define BTN_TRIGGER_HAPPY11 0x2ca +#define BTN_TRIGGER_HAPPY12 0x2cb +#define BTN_TRIGGER_HAPPY13 0x2cc +#define BTN_TRIGGER_HAPPY14 0x2cd +#define BTN_TRIGGER_HAPPY15 0x2ce +#define BTN_TRIGGER_HAPPY16 0x2cf +#define BTN_TRIGGER_HAPPY17 0x2d0 +#define BTN_TRIGGER_HAPPY18 0x2d1 +#define BTN_TRIGGER_HAPPY19 0x2d2 +#define BTN_TRIGGER_HAPPY20 0x2d3 +#define BTN_TRIGGER_HAPPY21 0x2d4 +#define BTN_TRIGGER_HAPPY22 0x2d5 +#define BTN_TRIGGER_HAPPY23 0x2d6 +#define BTN_TRIGGER_HAPPY24 0x2d7 +#define BTN_TRIGGER_HAPPY25 0x2d8 +#define BTN_TRIGGER_HAPPY26 0x2d9 +#define BTN_TRIGGER_HAPPY27 0x2da +#define BTN_TRIGGER_HAPPY28 0x2db +#define BTN_TRIGGER_HAPPY29 0x2dc +#define BTN_TRIGGER_HAPPY30 0x2dd +#define BTN_TRIGGER_HAPPY31 0x2de +#define BTN_TRIGGER_HAPPY32 0x2df +#define BTN_TRIGGER_HAPPY33 0x2e0 +#define BTN_TRIGGER_HAPPY34 0x2e1 +#define BTN_TRIGGER_HAPPY35 0x2e2 +#define BTN_TRIGGER_HAPPY36 0x2e3 +#define BTN_TRIGGER_HAPPY37 0x2e4 +#define BTN_TRIGGER_HAPPY38 0x2e5 +#define BTN_TRIGGER_HAPPY39 0x2e6 +#define BTN_TRIGGER_HAPPY40 0x2e7 + /* We avoid low common keys in module aliases so they don't get huge. */ #define KEY_MIN_INTERESTING KEY_MUTE #define KEY_MAX 0x2ff -- cgit v1.2.3 From e1783a240f491fb233f04edc042e16b18a7a79ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: module: Use this_cpu_xx to dynamically allocate counters Use cpu ops to deal with the per cpu data instead of a local_t. Reduces memory requirements, cache footprint and decreases cycle counts. The this_cpu_xx operations are also used for !SMP mode. Otherwise we could not drop the use of __module_ref_addr() which would make per cpu data handling complicated. this_cpu_xx operations have their own fallback for !SMP. V8-V9: - Leave include asm/module.h since ringbuffer.c depends on it. Nothing else does though. Another patch will deal with that. - Remove spurious free. 
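Condensed from the diff below, the resulting pattern is (sketch only;
preemption stays off so both accesses hit the same cpu):

	preempt_disable();
	__this_cpu_inc(module->refptr->count);
	preempt_enable();

	/* a total is still the sum over all cpus, as in module_refcount() */
	for_each_possible_cpu(cpu)
		total += per_cpu_ptr(mod->refptr, cpu)->count;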
Signed-off-by: Christoph Lameter Acked-by: Rusty Russell Signed-off-by: Tejun Heo --- include/linux/module.h | 36 ++++++++++++++---------------------- kernel/module.c | 29 +++++++++++++++-------------- 2 files changed, 29 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 6cb1a3cab5d3..2302f09ea2d9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -363,11 +364,9 @@ struct module /* Destruction function. */ void (*exit)(void); -#ifdef CONFIG_SMP - char *refptr; -#else - local_t ref; -#endif + struct module_ref { + int count; + } *refptr; #endif #ifdef CONFIG_CONSTRUCTORS @@ -454,25 +453,16 @@ void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) void symbol_put_addr(void *addr); -static inline local_t *__module_ref_addr(struct module *mod, int cpu) -{ -#ifdef CONFIG_SMP - return (local_t *) (mod->refptr + per_cpu_offset(cpu)); -#else - return &mod->ref; -#endif -} - /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ static inline void __module_get(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_inc(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_inc(module->refptr->count); trace_module_get(module, _THIS_IP_, - local_read(__module_ref_addr(module, cpu))); - put_cpu(); + __this_cpu_read(module->refptr->count)); + preempt_enable(); } } @@ -481,15 +471,17 @@ static inline int try_module_get(struct module *module) int ret = 1; if (module) { - unsigned int cpu = get_cpu(); + preempt_disable(); + if (likely(module_is_live(module))) { - local_inc(__module_ref_addr(module, cpu)); + __this_cpu_inc(module->refptr->count); trace_module_get(module, _THIS_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); } else ret = 0; - put_cpu(); + + preempt_enable(); } return ret; } diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..9bf228052ec5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->modules_which_use_me); for_each_possible_cpu(cpu) - local_set(__module_ref_addr(mod, cpu), 0); + per_cpu_ptr(mod->refptr, cpu)->count = 0; + /* Hold reference count during initialization. */ - local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); + __this_cpu_write(mod->refptr->count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) int cpu; for_each_possible_cpu(cpu) - total += local_read(__module_ref_addr(mod, cpu)); + total += per_cpu_ptr(mod->refptr, cpu)->count; return total; } EXPORT_SYMBOL(module_refcount); @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { void module_put(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_dec(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_dec(module->refptr->count); + trace_module_put(module, _RET_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); /* Maybe they're waiting for us to drop reference? 
*/ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); - put_cpu(); + preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -1394,9 +1396,9 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +#if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) - percpu_modfree(mod->refptr); + free_percpu(mod->refptr); #endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2159,9 +2161,8 @@ static noinline struct module *load_module(void __user *umod, mod = (void *)sechdrs[modindex].sh_addr; kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); +#if defined(CONFIG_MODULE_UNLOAD) + mod->refptr = alloc_percpu(struct module_ref); if (!mod->refptr) { err = -ENOMEM; goto free_init; @@ -2393,8 +2394,8 @@ static noinline struct module *load_module(void __user *umod, kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); +#if defined(CONFIG_MODULE_UNLOAD) + free_percpu(mod->refptr); free_init: #endif module_free(mod, mod->module_init); -- cgit v1.2.3 From 79615760f380ec86cd58204744e774c33fab9211 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: local_t: Move local.h include to ringbuffer.c and ring_buffer_benchmark.c ringbuffer*.c are the last users of local.h. Remove the include from modules.h and add it to ringbuffer files. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/module.h | 1 - kernel/trace/ring_buffer.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 2302f09ea2d9..7e74ae0051cc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..eb6c8988c31a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -20,6 +20,7 @@ #include #include +#include #include "trace.h" /* diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include #include #include +#include struct rb_page { u64 ts; -- cgit v1.2.3 From 99dcc3e5a94ed491fbef402831d8c0bbb267f995 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:51 +0900 Subject: this_cpu: Page allocator conversion Use the per cpu allocator functionality to avoid per cpu arrays in struct zone. This drastically reduces the size of struct zone for systems with large amounts of processors and allows placement of critical variables of struct zone in one cacheline even on very large systems. Another effect is that the pagesets of one processor are placed near one another. If multiple pagesets from different zones fit into one cacheline then additional cacheline fetches can be avoided on the hot paths when allocating memory from multiple zones. Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs are reduced and we can drop the zone_pcp macro. 
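The conversion itself is mechanical; a representative sketch of the
access-pattern change (both forms appear in the diff below):

	-	pset = zone_pcp(zone, cpu);
	+	pset = per_cpu_ptr(zone->pageset, cpu);		/* named cpu */
	+	pcp = &this_cpu_ptr(zone->pageset)->pcp;	/* local cpu */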
Hotplug handling is also simplified since cpu alloc can bring up and
shut down cpu areas for a specific cpu as a whole. So there is no need
to allocate or free individual pagesets.

V7-V8:
- Explain chicken egg dilemma with percpu allocator.

V4-V5:
- Fix up cases where per_cpu_ptr is called before irq disable
- Integrate the bootstrap logic that was separate before.

tj: Build failure in pageset_cpuup_callback() due to missing ret
    variable fixed.

Reviewed-by: Mel Gorman
Signed-off-by: Christoph Lameter
Signed-off-by: Tejun Heo
---
 include/linux/mm.h     |   4 -
 include/linux/mmzone.h |  12 +--
 mm/page_alloc.c        | 202 +++++++++++++++++--------------------------------
 mm/vmstat.c            |  14 ++--
 4 files changed, 81 insertions(+), 151 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28eb47a..554fa395aac9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1079,11 +1079,7 @@ extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern int after_bootmem;
 
-#ifdef CONFIG_NUMA
 extern void setup_per_cpu_pageset(void);
-#else
-static inline void setup_per_cpu_pageset(void) {}
-#endif
 
 extern void zone_pcp_update(struct zone *zone);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 30fe668c2542..7874201a3556 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -184,13 +184,7 @@ struct per_cpu_pageset {
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
 #endif
-} ____cacheline_aligned_in_smp;
-
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
-#else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
-#endif
+};
 
 #endif /* !__GENERATING_BOUNDS.H */
 
@@ -306,10 +300,8 @@ struct zone {
 	 */
 	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
-	struct per_cpu_pageset	*pageset[NR_CPUS];
-#else
-	struct per_cpu_pageset	pageset[NR_CPUS];
 #endif
+	struct per_cpu_pageset	*pageset;
 	/*
 	 * free areas of different sizes
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb59..6849e870de54 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1008,10 +1008,10 @@ static void drain_pages(unsigned int cpu)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		local_irq_save(flags);
+		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		local_irq_save(flags);
 		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
 		local_irq_restore(flags);
@@ -1095,7 +1095,6 @@ static void free_hot_cold_page(struct page *page, int cold)
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
 	local_irq_save(flags);
@@ -1118,6 +1117,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (cold)
 		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
@@ -1130,7 +1130,6 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 out:
 	local_irq_restore(flags);
-	put_cpu();
 }
 
 void free_hot_page(struct page *page)
@@ -1180,17 +1179,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 
 again:
-	cpu = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 		struct list_head *list;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
-		list = &pcp->lists[migratetype];
 		local_irq_save(flags);
+		pcp =
&this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1231,7 +1228,6 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1240,7 +1236,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -2179,7 +2174,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2744,10 +2739,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); + /* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; + int cpu; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -2758,6 +2772,23 @@ static int __build_all_zonelists(void *dummy) build_zonelists(pgdat); build_zonelist_cache(pgdat); } + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + return 0; } @@ -3095,121 +3126,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. 
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
 /*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used by this processor
+ * after setup_per_cpu_pageset().
  */
-static int __cpuinit process_zones(int cpu)
+void __init setup_per_cpu_pageset(void)
 {
-	struct zone *zone, *dzone;
-	int node = cpu_to_node(cpu);
-
-	node_set_state(node, N_CPU);	/* this node has a cpu */
+	struct zone *zone;
+	int cpu;
 
 	for_each_populated_zone(zone) {
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, node);
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-				(zone->present_pages / percpu_pagelist_fraction));
-	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (!populated_zone(dzone))
-			continue;
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = &boot_pageset[cpu];
-	}
-	return -ENOMEM;
-}
+		zone->pageset = alloc_percpu(struct per_cpu_pageset);
 
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+		for_each_possible_cpu(cpu) {
+			struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
 
-		/* Free per_cpu_pageset if it is slab allocated */
-		if (pset != &boot_pageset[cpu])
-			kfree(pset);
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-	}
-}
+			setup_pageset(pcp, zone_batchsize(zone));
 
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		if (process_zones(cpu))
-			ret = NOTIFY_BAD;
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		free_zone_pagesets(cpu);
-		break;
-	default:
-		break;
+			if (percpu_pagelist_fraction)
+				setup_pagelist_highmark(pcp,
+					(zone->present_pages /
+						percpu_pagelist_fraction));
+		}
 	}
-	return ret;
 }
 
-static struct notifier_block __cpuinitdata pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
 static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
@@ -3263,7 +3206,7 @@ static int __zone_pcp_update(void *data)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
 		local_irq_save(flags);
@@ -3281,21 +3224,17 @@ void zone_pcp_update(struct zone *zone)
 
 static __meminit void zone_pcp_init(struct zone *zone)
 {
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
+	/*
+	 * per cpu subsystem is not up at this point. The following code
+	 * relies on the ability of the linker to provide the
+	 * offset of a (static) per cpu variable into the per cpu area.
+	 */
+	zone->pageset = &boot_pageset;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot.
Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -4809,10 +4748,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fbab67ba..1ba0bb7ad043 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) threshold = calculate_threshold(zone); for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; } } @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); + s8 *p = pcp->vm_stat_diff + item; long x; @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" -- cgit v1.2.3 From ddf1ffbd40c92ff1e58c45fa96d309788f7beb60 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 5 Jan 2010 17:56:04 -0800 Subject: Input: serio - let device core tell us if device was registered No need to keep track of it by ourselves. 
Signed-off-by: Dmitry Torokhov --- drivers/input/serio/serio.c | 8 ++------ include/linux/serio.h | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index 0a278f9f1d3a..d89880450f77 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -577,8 +577,6 @@ static void serio_add_port(struct serio *serio) printk(KERN_ERR "serio: device_add() failed for %s (%s), error: %d\n", serio->phys, serio->name, error); - else - serio->registered = true; } /* @@ -605,10 +603,8 @@ static void serio_destroy_port(struct serio *serio) serio->parent = NULL; } - if (serio->registered) { + if (device_is_registered(&serio->dev)) device_del(&serio->dev); - serio->registered = false; - } list_del_init(&serio->node); serio_remove_pending_events(serio); @@ -995,7 +991,7 @@ irqreturn_t serio_interrupt(struct serio *serio, if (likely(serio->drv)) { ret = serio->drv->interrupt(serio, data, dfl); - } else if (!dfl && serio->registered) { + } else if (!dfl && device_is_registered(&serio->dev)) { serio_rescan(serio); ret = IRQ_HANDLED; } diff --git a/include/linux/serio.h b/include/linux/serio.h index e2f3044d4a4a..d0fb702059cd 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -30,7 +30,6 @@ struct serio { char phys[32]; bool manual_bind; - bool registered; /* port has been fully registered with driver core */ struct serio_device_id id; -- cgit v1.2.3 From 361b7b5b032338361ea88412f1fc45479fdd5859 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 5 Jan 2010 17:56:03 -0800 Subject: Input: gameport - let device core tell us if device was registered No need to keep track of it by ourselves. Signed-off-by: Dmitry Torokhov --- drivers/input/gameport/gameport.c | 6 +----- include/linux/gameport.h | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index ac11be08585e..f9e5f8e1690b 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -561,8 +561,6 @@ static void gameport_add_port(struct gameport *gameport) printk(KERN_ERR "gameport: device_add() failed for %s (%s), error: %d\n", gameport->phys, gameport->name, error); - else - gameport->registered = 1; } /* @@ -584,10 +582,8 @@ static void gameport_destroy_port(struct gameport *gameport) gameport->parent = NULL; } - if (gameport->registered) { + if (device_is_registered(&gameport->dev)) device_del(&gameport->dev); - gameport->registered = 0; - } list_del_init(&gameport->node); diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 1bc08541c2b9..48e68da097f6 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -46,7 +46,6 @@ struct gameport { struct mutex drv_mutex; /* protects serio->drv so attributes can pin driver */ struct device dev; - unsigned int registered; /* port has been fully registered with driver core */ struct list_head node; }; -- cgit v1.2.3 From 16295bec6398a3eedc9377e1af6ff4c71b98c300 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 6 Jan 2010 19:47:10 +1100 Subject: padata: Generic parallelization/serialization interface This patch introduces an interface to process data objects in parallel. The parallelized objects return after serialization in the same order as they were before the parallelization. 
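A minimal, hypothetical user of the interface, built only from the
callbacks and entry points added below; the request structure and all
"my_" names are made up for illustration:

	struct my_request {
		struct padata_priv padata;
		/* private payload would go here */
	};

	static void my_parallel(struct padata_priv *padata)
	{
		struct my_request *req =
			container_of(padata, struct my_request, padata);

		/* expensive work runs here, with BHs off, on some cpu */
		padata_do_serial(&req->padata);	/* hand back for reordering */
	}

	static void my_serial(struct padata_priv *padata)
	{
		/* runs in submission order on the cb_cpu asked for below */
	}

	static int my_submit(struct padata_instance *pinst,
			     struct my_request *req, int cb_cpu)
	{
		req->padata.parallel = my_parallel;
		req->padata.serial = my_serial;

		/* -EINPROGRESS means the request was queued successfully */
		return padata_do_parallel(pinst, &req->padata, cb_cpu);
	}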
Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- include/linux/padata.h | 88 +++++++ init/Kconfig | 4 + kernel/Makefile | 1 + kernel/padata.c | 690 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 783 insertions(+) create mode 100644 include/linux/padata.h create mode 100644 kernel/padata.c (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h new file mode 100644 index 000000000000..51611da9c498 --- /dev/null +++ b/include/linux/padata.h @@ -0,0 +1,88 @@ +/* + * padata.h - header for the padata parallelization interface + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef PADATA_H +#define PADATA_H + +#include +#include +#include + +struct padata_priv { + struct list_head list; + struct parallel_data *pd; + int cb_cpu; + int seq_nr; + int info; + void (*parallel)(struct padata_priv *padata); + void (*serial)(struct padata_priv *padata); +}; + +struct padata_list { + struct list_head list; + spinlock_t lock; +}; + +struct padata_queue { + struct padata_list parallel; + struct padata_list reorder; + struct padata_list serial; + struct work_struct pwork; + struct work_struct swork; + struct parallel_data *pd; + atomic_t num_obj; + int cpu_index; +}; + +struct parallel_data { + struct padata_instance *pinst; + struct padata_queue *queue; + atomic_t seq_nr; + atomic_t reorder_objects; + atomic_t refcnt; + unsigned int max_seq_nr; + cpumask_var_t cpumask; + spinlock_t lock; +}; + +struct padata_instance { + struct notifier_block cpu_notifier; + struct workqueue_struct *wq; + struct parallel_data *pd; + cpumask_var_t cpumask; + struct mutex lock; + u8 flags; +#define PADATA_INIT 1 +#define PADATA_RESET 2 +}; + +extern struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq); +extern void padata_free(struct padata_instance *pinst); +extern int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu); +extern void padata_do_serial(struct padata_priv *padata); +extern int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask); +extern int padata_add_cpu(struct padata_instance *pinst, int cpu); +extern int padata_remove_cpu(struct padata_instance *pinst, int cpu); +extern void padata_start(struct padata_instance *pinst); +extern void padata_stop(struct padata_instance *pinst); +#endif diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..9fd23bcc1709 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1252,4 +1252,8 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool +config PADATA + depends on SMP + bool + source "kernel/Kconfig.locks" diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..6aebdeb2aa34 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ 
obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..6f9bcb8313d6 --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,690 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_OBJ_NUM 10000 * NR_CPUS + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, pwork); + pd = queue->pd; + pinst = pd->pinst; + + spin_lock(&queue->parallel.lock); + list_replace_init(&queue->parallel.list, &local_list); + spin_unlock(&queue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/* + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the cpumask of padata. + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. 
+ */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = 0; + if (!(pinst->flags & PADATA_INIT)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = -EINVAL; + if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) + goto out; + + err = -EINPROGRESS; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->queue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->pwork); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus, empty, calc_seq_nr; + int seq_nr, next_nr, overrun, next_overrun; + struct padata_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + empty = 0; + next_nr = -1; + next_overrun = 0; + next_queue = NULL; + + num_cpus = cpumask_weight(pd->cpumask); + + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + reorder = &queue->reorder; + + /* + * Calculate the seq_nr of the object that should be + * next in this queue. + */ + overrun = 0; + calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) + + queue->cpu_index; + + if (unlikely(calc_seq_nr > pd->max_seq_nr)) { + calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; + overrun = 1; + } + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + seq_nr = padata->seq_nr; + BUG_ON(calc_seq_nr != seq_nr); + } else { + seq_nr = calc_seq_nr; + empty++; + } + + if (next_nr < 0 || seq_nr < next_nr + || (next_overrun && !overrun)) { + next_nr = seq_nr; + next_overrun = overrun; + next_queue = queue; + } + } + + padata = NULL; + + if (empty == num_cpus) + goto out; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + if (unlikely(next_overrun)) { + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + atomic_set(&queue->num_obj, 0); + } + } + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + atomic_inc(&next_queue->num_obj); + + goto out; + } + + if (next_nr % num_cpus == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_queue *queue; + struct padata_instance *pinst = pd->pinst; + +try_again: + if (!spin_trylock_bh(&pd->lock)) + goto out; + + while (1) { + padata = padata_get_next(pd); + + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + if (PTR_ERR(padata) == -ENODATA) { + spin_unlock_bh(&pd->lock); + goto out; + } + + queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + + spin_lock(&queue->serial.lock); + list_add_tail(&padata->list, &queue->serial.list); + 
spin_unlock(&queue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + } + + spin_unlock_bh(&pd->lock); + + if (atomic_read(&pd->reorder_objects)) + goto try_again; + +out: + return; +} + +static void padata_serial_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, swork); + pd = queue->pd; + + spin_lock(&queue->serial.lock); + list_replace_init(&queue->serial.list, &local_list); + spin_unlock(&queue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/* + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. + */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_queue *queue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + queue = per_cpu_ptr(pd->queue, cpu); + + spin_lock(&queue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &queue->reorder.list); + spin_unlock(&queue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + int cpu, cpu_index, num_cpus; + struct padata_queue *queue; + struct parallel_data *pd; + + cpu_index = 0; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->queue = alloc_percpu(struct padata_queue); + if (!pd->queue) + goto err_free_pd; + + if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) + goto err_free_queue; + + for_each_possible_cpu(cpu) { + queue = per_cpu_ptr(pd->queue, cpu); + + queue->pd = pd; + + if (cpumask_test_cpu(cpu, cpumask) + && cpumask_test_cpu(cpu, cpu_active_mask)) { + queue->cpu_index = cpu_index; + cpu_index++; + } else + queue->cpu_index = -1; + + INIT_LIST_HEAD(&queue->reorder.list); + INIT_LIST_HEAD(&queue->parallel.list); + INIT_LIST_HEAD(&queue->serial.list); + spin_lock_init(&queue->reorder.lock); + spin_lock_init(&queue->parallel.lock); + spin_lock_init(&queue->serial.lock); + + INIT_WORK(&queue->pwork, padata_parallel_worker); + INIT_WORK(&queue->swork, padata_serial_worker); + atomic_set(&queue->num_obj, 0); + } + + cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + + num_cpus = cpumask_weight(pd->cpumask); + pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_queue: + free_percpu(pd->queue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask); + free_percpu(pd->queue); + kfree(pd); +} + +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + while (atomic_read(&pd_old->refcnt) != 0) + yield(); + + flush_workqueue(pinst->wq); + + 
padata_free_pd(pd_old); + + pinst->flags &= ~PADATA_RESET; +} + +/* + * padata_set_cpumask - set the cpumask that padata should use + * + * @pinst: padata instance + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask) +{ + struct parallel_data *pd; + int err = 0; + + might_sleep(); + + mutex_lock(&pinst->lock); + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) { + err = -ENOMEM; + goto out; + } + + cpumask_copy(pinst->cpumask, cpumask); + + padata_replace(pinst, pd); + +out: + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_add_cpu - add a cpu to the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to add + */ +int padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_set_cpu(cpu, pinst->cpumask); + err = __padata_add_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_remove_cpu - remove a cpu from the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to remove + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_clear_cpu(cpu, pinst->cpumask); + err = __padata_remove_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/* + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +void padata_start(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags |= PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_start); + +/* + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags &= ~PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); 
+ mutex_unlock(&pinst->lock); + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} + +/* + * padata_alloc - allocate and initialize a padata instance + * + * @cpumask: cpumask that padata uses for parallelization + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq) +{ + int err; + struct padata_instance *pinst; + struct parallel_data *pd; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) + goto err_free_inst; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq = wq; + + cpumask_copy(pinst->cpumask, cpumask); + + pinst->flags = 0; + + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + err = register_hotcpu_notifier(&pinst->cpu_notifier); + if (err) + goto err_free_pd; + + mutex_init(&pinst->lock); + + return pinst; + +err_free_pd: + padata_free_pd(pd); +err_free_inst: + kfree(pinst); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/* + * padata_free - free a padata instance + * + * @ padata_inst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + padata_stop(pinst); + + synchronize_rcu(); + + while (atomic_read(&pinst->pd->refcnt) != 0) + yield(); + + unregister_hotcpu_notifier(&pinst->cpu_notifier); + padata_free_pd(pinst->pd); + kfree(pinst); +} +EXPORT_SYMBOL(padata_free); -- cgit v1.2.3 From 509e760cd91c831983097ae174cb6c0b8c6c8e6b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:42 +0800 Subject: tracing: Add print_fmt field This is part of a patch set that removes the show_format method in the ftrace event macros. The print_fmt field is added to hold the string that shows the print_fmt in the event format files. This patch only adds the field but it is currently not used. Later patches will use this field to enable us to remove the show_format field and function. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3E.2000704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + include/trace/ftrace.h | 28 +++++++++++++++++++++++++++- kernel/trace/trace_export.c | 7 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2233c98d80df..bd23d8e52f02 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -121,6 +121,7 @@ struct ftrace_event_call { int (*regfunc)(struct ftrace_event_call *); void (*unregfunc)(struct ftrace_event_call *); int id; + const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); int (*show_format)(struct ftrace_event_call *, struct trace_seq *); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6fe03e902ca..3351d85c83a3 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -722,8 +722,20 @@ static struct trace_event ftrace_event_type_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __entry +#define __entry REC + +#undef __print_flags +#undef __print_symbolic +#undef __get_dynamic_array +#undef __get_str + +#undef TP_printk +#define TP_printk(fmt, args...) 
"\"" fmt "\", " __stringify(args) + #undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static const char print_fmt_##call[] = print; #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ @@ -737,6 +749,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##template, \ .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -745,6 +758,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ \ +static const char print_fmt_##call[] = print; \ + \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ @@ -754,6 +769,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##call, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -837,6 +853,16 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #ifdef CONFIG_EVENT_PROFILE +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_str +#define __get_str(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) __addr = (a) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9978a4f40090..95d14b640a66 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -203,6 +203,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -218,6 +221,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -228,6 +234,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ + .print_fmt = print, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -- cgit v1.2.3 From c7ef3a9004201bca90626db246a19dadd2c29c9b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Dec 2009 21:13:59 -0500 Subject: tracing: Have syscall tracing call its own init function In the clean up of having all events call one specific function, the syscall event init was changed to call this helper function. With the new print_fmt updates, the syscalls need to do special initializations. This patch converts the syscall events to call its own init function again. 
Cc: Lai Jiangshan Cc: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 4 ++-- kernel/trace/trace_syscalls.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 207466a49f3d..ed353d274a77 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -143,7 +143,7 @@ struct perf_event_attr; .name = "sys_enter"#sname, \ .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ @@ -165,7 +165,7 @@ struct perf_event_attr; .name = "sys_exit"#sname, \ .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1352b0a36fac..a78e86349ecb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -450,14 +450,14 @@ int init_syscall_trace(struct ftrace_event_call *call) if (set_syscall_print_fmt(call) < 0) return -ENOMEM; - id = register_ftrace_event(call->event); - if (!id) { + id = trace_event_raw_init(call); + + if (id < 0) { free_syscall_print_fmt(call); - return -ENODEV; + return id; } - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + + return id; } int __init init_ftrace_syscalls(void) -- cgit v1.2.3 From 0fa0edaf32b9a78b9854f1da98d4511a501089b0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:57 +0800 Subject: tracing: Remove show_format and related macros from TRACE_EVENT The previous patches added the use of print_fmt string and changes the trace_define_field() function to also create the fields and format output for the event format files. text data bss dec hex filename 5857201 1355780 9336808 16549789 fc879d vmlinux 5884589 1351684 9337896 16574169 fce6d9 vmlinux-orig The above shows the size of the vmlinux after this patch set compared to the vmlinux-orig which is before the patch set. This saves us 27k on text, 1k on bss and adds just 4k of data. The total savings of 24k in size. 
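With show_format gone, each event's format file is assembled generically from
the fields registered through ->define_fields()/trace_define_field() plus the
precomputed print_fmt string. A rough sketch of the idea (illustrative, not
the literal kernel code):

	struct ftrace_event_field *field;

	/* one "field:..." line per field registered on call->fields */
	list_for_each_entry_reverse(field, &call->fields, link)
		trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;\tsize:%u;"
				 "\tsigned:%d;\n", field->type, field->name,
				 field->offset, field->size, field->is_signed);

	/* followed by the precomputed print fmt */
	trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);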
Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D4D.40604@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 - include/linux/syscalls.h | 2 - include/trace/ftrace.h | 133 ++---------------------------------------- include/trace/syscall.h | 4 -- kernel/trace/trace_events.c | 12 ---- kernel/trace/trace_export.c | 73 ----------------------- kernel/trace/trace_kprobe.c | 78 ------------------------- kernel/trace/trace_syscalls.c | 66 --------------------- 8 files changed, 6 insertions(+), 364 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bd23d8e52f02..84a5629adfd8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -123,8 +123,6 @@ struct ftrace_event_call { int id; const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); - int (*show_format)(struct ftrace_event_call *, - struct trace_seq *); int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ed353d274a77..7b219696ad24 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -144,7 +144,6 @@ struct perf_event_attr; .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ @@ -166,7 +165,6 @@ struct perf_event_attr; .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3351d85c83a3..df65b99880b1 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,130 +130,6 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) 
args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_ext -#define __field_ext(type, item, filter_type) __field(type, item) - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), \ - __data_loc_##item), \ - (unsigned int)sizeof(field.__data_loc_##item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __string -#define __string(item, src) __dynamic_array(char, item, -1) - -#undef __entry -#define __entry REC - -#undef __print_symbolic -#undef __get_dynamic_array -#undef __get_str - -#undef TP_printk -#define TP_printk(fmt, args...) "\"%s\", %s\n", fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_setup_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - return ret; \ -} \ - \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##call(unused, s); \ - if (!ret) \ - return ret; \ - \ - ret = trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, name, proto, args) - -#undef DEFINE_EVENT_PRINT -#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##template(unused, s); \ - if (!ret) \ - return ret; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - /* * Stage 3 of the trace events. * @@ -622,7 +498,6 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * .raw_init = trace_event_raw_init, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, - * .show_format = ftrace_format_, * } * */ @@ -657,6 +532,12 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #define __assign_str(dst, src) \ strcpy(__get_str(dst), src); +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) 
+ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ \ @@ -750,7 +631,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##template, \ - .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } @@ -770,7 +650,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##call, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 961fda3556bb..8cd410254456 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -34,10 +34,6 @@ struct syscall_metadata { extern unsigned long arch_syscall_addr(int nr); extern int init_syscall_trace(struct ftrace_event_call *call); -extern int syscall_enter_format(struct ftrace_event_call *call, - struct trace_seq *s); -extern int syscall_exit_format(struct ftrace_event_call *call, - struct trace_seq *s); extern int syscall_enter_define_fields(struct ftrace_event_call *call); extern int syscall_exit_define_fields(struct ftrace_event_call *call); extern int reg_event_syscall_enter(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 250ec865d5f5..c2a3077b7353 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -520,14 +520,6 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? 
__bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -965,10 +957,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 95d14b640a66..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) 
"%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -235,7 +163,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ .print_fmt = print, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 147491dccead..c99029916c76 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,82 +1174,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) -{ - int i; - - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; - - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; - - return trace_seq_puts(s, "\n"); -} - -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) - -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); -} - -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; @@ -1504,12 +1428,10 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = 
kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a78e86349ecb..49cea70fbf6d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -143,54 +143,6 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); - } - - trace_seq_puts(s, "\nprint fmt: \""); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return 0; - } - trace_seq_putc(s, '"'); - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } - - return trace_seq_putc(s, '\n'); -} - static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { @@ -252,24 +204,6 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int ret; - struct syscall_trace_exit trace; - - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) - return 0; - - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); -} - int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; -- cgit v1.2.3 From 65324144b50bc7022cc9b6ca8f4a536a957019e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jan 2010 05:50:47 +0000 Subject: net: RFC3069, private VLAN proxy arp support This is to be used together with switch technologies, like RFC3069, that where the individual ports are not allowed to communicate with each other, but they are allowed to talk to the upstream router. As described in RFC 3069, it is possible to allow these hosts to communicate through the upstream router by proxy_arp'ing. This patch basically allow proxy arp replies back to the same interface (from which the ARP request/solicitation was received). Tunable per device via proc "proxy_arp_pvlan": /proc/sys/net/ipv4/conf/*/proxy_arp_pvlan This switch technology is known by different vendor names: - In RFC 3069 it is called VLAN Aggregation. - Cisco and Allied Telesyn call it Private VLAN. - Hewlett-Packard call it Source-Port filtering or port-isolation. - Ericsson call it MAC-Forced Forwarding (RFC Draft). 
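Usage example (interface name assumed): enabling the knob on the router's
downstream interface is simply

	echo 1 > /proc/sys/net/ipv4/conf/eth0/proxy_arp_pvlan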
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 19 +++++++++++++ include/linux/inetdevice.h | 1 + include/linux/sysctl.h | 1 + net/ipv4/arp.c | 52 +++++++++++++++++++++++++++++++--- net/ipv4/devinet.c | 1 + net/ipv4/route.c | 7 ++++- 6 files changed, 76 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 006b39dec87d..c532884f4fec 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -692,6 +692,25 @@ proxy_arp - BOOLEAN conf/{all,interface}/proxy_arp is set to TRUE, it will be disabled otherwise +proxy_arp_pvlan - BOOLEAN + Private VLAN proxy arp. + Basically allow proxy arp replies back to the same interface + (from which the ARP request/solicitation was received). + + This is done to support (ethernet) switch features, like RFC + 3069, where the individual ports are NOT allowed to + communicate with each other, but they are allowed to talk to + the upstream router. As described in RFC 3069, it is possible + to allow these hosts to communicate through the upstream + router by proxy_arp'ing. Don't need to be used together with + proxy_arp. + + This technology is known by different names: + In RFC 3069 it is called VLAN Aggregation. + Cisco and Allied Telesyn call it Private VLAN. + Hewlett-Packard call it Source-Port filtering or port-isolation. + Ericsson call it MAC-Forced Forwarding (RFC Draft). + shared_media - BOOLEAN Send(router) or accept(host) RFC1620 shared media redirects. Overrides ip_secure_redirects. diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 699e85c01a4d..9a8c57467d3d 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -88,6 +88,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) +#define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_CONF_GET(in_dev, PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 877ba039e6a4..24ff7e3a0d59 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -482,6 +482,7 @@ enum NET_IPV4_CONF_ARP_ACCEPT=21, NET_IPV4_CONF_ARP_NOTIFY=22, NET_IPV4_CONF_ACCEPT_LOCAL=23, + NET_IPV4_CONF_PROXY_ARP_PVLAN=24, __NET_IPV4_CONF_MAX }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c95cd93acf29..078709233bc4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -70,6 +70,7 @@ * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 
*/ #include @@ -524,12 +525,15 @@ int arp_bind_neighbour(struct dst_entry *dst) /* * Check if we can use proxy ARP for this path */ - -static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; + if (rt->u.dst.dev == dev) + return 0; + if (!IN_DEV_PROXY_ARP(in_dev)) return 0; @@ -547,6 +551,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) return (omi != imi && omi != -1); } +/* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->u.dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + /* * Interface to link layer: send routine and receive handler. */ @@ -833,8 +874,11 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) + { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5cdbc102a418..0715f4cac391 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1407,6 +1407,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e446496f564f..1cc339441e7d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1988,8 +1988,13 @@ static int __mkroute_input(struct sk_buff *skb, if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. 
*/ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } -- cgit v1.2.3 From 3c9732c06879d85f2fdf7ec69198c1d78da42a98 Mon Sep 17 00:00:00 2001 From: Giuseppe CAVALLARO Date: Wed, 6 Jan 2010 23:07:13 +0000 Subject: stmmac: add the new Header file for stmmac platform data Signed-off-by: Giuseppe Cavallaro Signed-off-by: David S. Miller --- include/linux/stmmac.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 include/linux/stmmac.h (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h new file mode 100644 index 000000000000..32bfd1a8a48d --- /dev/null +++ b/include/linux/stmmac.h @@ -0,0 +1,53 @@ +/******************************************************************************* + + Header file for stmmac platform data + + Copyright (C) 2009 STMicroelectronics Ltd + + This program is free software; you can redistribute it and/or modify it + under the terms and conditions of the GNU General Public License, + version 2, as published by the Free Software Foundation. + + This program is distributed in the hope it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + + The full GNU General Public License is included in this distribution in + the file called "COPYING". + + Author: Giuseppe Cavallaro +*******************************************************************************/ + +#ifndef __STMMAC_PLATFORM_DATA +#define __STMMAC_PLATFORM_DATA + +/* platfrom data for platfrom device structure's platfrom_data field */ + +/* Private data for the STM on-board ethernet driver */ +struct plat_stmmacenet_data { + int bus_id; + int pbl; + int has_gmac; + void (*fix_mac_speed)(void *priv, unsigned int speed); + void (*bus_setup)(unsigned long ioaddr); +#ifdef CONFIG_STM_DRIVERS + struct stm_pad_config *pad_config; +#endif + void *bsp_priv; +}; + +struct plat_stmmacphy_data { + int bus_id; + int phy_addr; + unsigned int phy_mask; + int interface; + int (*phy_reset)(void *priv); + void *priv; +}; +#endif + -- cgit v1.2.3 From 0ed731859e24cd6e3ec058cf2b49b2a0df80e86b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 6 Jan 2010 09:23:54 +0900 Subject: LSM: Update comment on security_sock_rcv_skb It is not permitted to do sleeping operation inside security_sock_rcv_skb(). Signed-off-by: Tetsuo Handa Acked-by: Serge Hallyn -- Signed-off-by: James Morris --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 466cbadbd1ef..3696ca345745 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -978,6 +978,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check permissions on incoming network packets. This hook is distinct * from Netfilter's IP input hooks since it is the first time that the * incoming sk_buff @skb has been associated with a particular socket, @sk. + * Must not sleep inside this hook because some callers hold spinlocks. * @sk contains the sock (not socket) associated with the incoming sk_buff. * @skb contains the incoming network data. 
* @socket_getpeersec_stream: -- cgit v1.2.3 From e03a72e13648ac6277bf2bab6b8324d51f89c0fa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:51 -0500 Subject: block: Stop using byte offsets All callers of the stacking functions use 512-byte sector units rather than byte offsets. Simplify the code so the stacking functions take sectors when specifying data offsets. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 +++++++++----------------- fs/partitions/check.c | 7 ++++--- include/linux/blkdev.h | 17 +++++------------ 3 files changed, 18 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..78549c723783 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -507,7 +507,7 @@ static unsigned int lcm(unsigned int a, unsigned int b) * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +525,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -548,7 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. @@ -611,11 +610,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +652,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +663,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. 
*/ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +672,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 64bc8998ac9a..e8865c11777f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); - p->discard_alignment = queue_sector_discard_alignment(disk->queue, - start); + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c8018977efa..ffb13ad35716 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1112,18 +1112,13 @@ static inline int queue_alignment_offset(struct request_queue *q) return q->limits.alignment_offset; } -static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = (sector << 9) & (granularity - 1); - offset &= granularity - 1; - return (granularity + lim->alignment_offset - offset) & (granularity - 1); -} - -static inline int queue_sector_alignment_offset(struct request_queue *q, - sector_t sector) -{ - return queue_limit_alignment_offset(&q->limits, sector << 9); + return (granularity + lim->alignment_offset - alignment) + & (granularity - 1); } static inline int bdev_alignment_offset(struct block_device *bdev) @@ -1147,10 +1142,8 @@ static inline int queue_discard_alignment(struct request_queue *q) return q->limits.discard_alignment; } -static inline int queue_sector_discard_alignment(struct request_queue *q, - sector_t sector) +static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { - struct queue_limits *lim = &q->limits; unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); return (lim->discard_granularity + lim->discard_alignment - alignment) -- cgit v1.2.3 From d218d11133d888f9745802146a50255a4781d37a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2010 16:28:01 -0800 Subject: tcp: Generalized TTL Security Mechanism This patch adds the kernel portions needed to implement RFC 5082 Generalized TTL Security Mechanism (GTSM). It is a lightweight security measure against forged packets causing DoS attacks (for BGP). This is already implemented the same way in BSD kernels. 
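From userspace the new option pairs with IP_TTL; a minimal sketch (error
handling trimmed, and IP_MINTTL may need defining locally until glibc
catches up, as noted below):

	#include <netinet/in.h>
	#include <sys/socket.h>

	#ifndef IP_MINTTL
	#define IP_MINTTL 21	/* value added by this patch */
	#endif

	static int enable_gtsm(int fd)
	{
		int ttl = 255;	/* RFC 5082: both peers send with TTL 255 */

		if (setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
			return -1;
		/* drop anything arriving with TTL < 255, i.e. >1 hop away */
		return setsockopt(fd, IPPROTO_IP, IP_MINTTL, &ttl, sizeof(ttl));
	}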
For the necessary Quagga patch http://www.gossamer-threads.com/lists/quagga/dev/17389 Description from Cisco http://www.cisco.com/en/US/docs/ios/12_3t/12_3t7/feature/guide/gt_btsh.html It does add one byte to each socket structure, but I did a little rearrangement to reuse a hole (on 64 bit), but it does grow the structure on 32 bit This should be documented on ip(4) man page and the Glibc in.h file also needs update. IPV6_MINHOPLIMIT should also be added (although BSD doesn't support that). Only TCP is supported, but could also be added to UDP, DCCP, SCTP if desired. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/in.h | 2 ++ include/net/inet_sock.h | 4 +++- net/ipv4/ip_sockglue.c | 14 +++++++++++++- net/ipv4/tcp_ipv4.c | 3 +++ 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/in.h b/include/linux/in.h index b615649db129..583c76f9c30f 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -84,6 +84,8 @@ struct in_addr { #define IP_ORIGDSTADDR 20 #define IP_RECVORIGDSTADDR IP_ORIGDSTADDR +#define IP_MINTTL 21 + /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ #define IP_PMTUDISC_WANT 1 /* Use per route hints */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index bd4c53f75ac0..83fd34437cf1 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -122,10 +122,12 @@ struct inet_sock { __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; - struct ip_options *opt; __be16 inet_sport; __u16 inet_id; + + struct ip_options *opt; __u8 tos; + __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 recverr:1, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cafad9baff03..644dc43a55de 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -451,7 +451,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<transparent = !!val; break; + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + default: err = -ENOPROTOOPT; break; @@ -1198,6 +1207,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet->transparent; break; + case IP_MINTTL: + val = inet->min_ttl; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65b8ebfd078a..382f667238ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,9 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; + if (iph->ttl < inet_sk(sk)->min_ttl) + goto discard_and_relse; + process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; -- cgit v1.2.3 From 3ccd4c6167d3b39d52631767ebbf8b5677c5855d Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 12 Jan 2010 02:00:46 -0800 Subject: can: Unify droping of invalid tx skbs and netdev stats To prevent the CAN drivers to operate on invalid socketbuffers the skbs are now checked and silently dropped at the xmit-function consistently. Also the netdev stats are consistently using the CAN data length code (dlc) for [rx|tx]_bytes now. Signed-off-by: Oliver Hartkopp Acked-by: Wolfgang Grandegger Signed-off-by: David S. 
Miller --- drivers/net/can/at91_can.c | 3 +++ drivers/net/can/bfin_can.c | 3 +++ drivers/net/can/mcp251x.c | 6 +----- drivers/net/can/mscan/mscan.c | 5 +---- drivers/net/can/sja1000/sja1000.c | 3 +++ drivers/net/can/ti_hecc.c | 4 +++- drivers/net/can/usb/ems_usb.c | 3 +++ drivers/net/can/vcan.c | 12 +++++++++--- include/linux/can/dev.h | 15 +++++++++++++++ 9 files changed, 41 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 166cc7e579c0..f7287497ba6e 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -342,6 +342,9 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int mb, prio; u32 reg_mid, reg_mcr; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + mb = get_tx_next_mb(priv); prio = get_tx_next_prio(priv); diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c index 0ec1524523cc..7e1926e79e98 100644 --- a/drivers/net/can/bfin_can.c +++ b/drivers/net/can/bfin_can.c @@ -318,6 +318,9 @@ static int bfin_can_start_xmit(struct sk_buff *skb, struct net_device *dev) u16 val; int i; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + netif_stop_queue(dev); /* fill id */ diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 1a72ca066a17..afa2fa45fed9 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -494,12 +494,8 @@ static netdev_tx_t mcp251x_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (skb->len != sizeof(struct can_frame)) { - dev_err(&spi->dev, "dropping packet - bad length\n"); - dev_kfree_skb(skb); - net->stats.tx_dropped++; + if (can_dropped_invalid_skb(net, skb)) return NETDEV_TX_OK; - } netif_stop_queue(net); priv->tx_skb = skb; diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 500d18918bd5..40827c128b65 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -204,11 +204,8 @@ static netdev_tx_t mscan_start_xmit(struct sk_buff *skb, struct net_device *dev) int i, rtr, buf_id; u32 can_id; - if (skb->len != sizeof(*frame) || frame->can_dlc > 8) { - kfree_skb(skb); - dev->stats.tx_dropped++; + if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; - } out_8(®s->cantier, 0); diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 542a4f7255b4..345304d779b9 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -249,6 +249,9 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb, uint8_t dreg; int i; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + netif_stop_queue(dev); fi = dlc = cf->can_dlc; diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 5c993c2da528..7d370e32a7a8 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -477,6 +477,9 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) u32 mbxno, mbx_mask, data; unsigned long flags; + if (can_dropped_invalid_skb(ndev, skb)) + return NETDEV_TX_OK; + mbxno = get_tx_head_mb(priv); mbx_mask = BIT(mbxno); spin_lock_irqsave(&priv->mbx_lock, flags); @@ -491,7 +494,6 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) spin_unlock_irqrestore(&priv->mbx_lock, flags); /* Prepare mailbox for transmission */ - data = min_t(u8, cf->can_dlc, 8); if (cf->can_id & CAN_RTR_FLAG) /* Remote transmission request */ data |= HECC_CANMCF_RTR; data |= 
get_tx_head_prio(priv) << 8; diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index efbb05c71bf4..ddb17e256656 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -767,6 +767,9 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne size_t size = CPC_HEADER_SIZE + CPC_MSG_HEADER_LEN + sizeof(struct cpc_can_msg); + if (can_dropped_invalid_skb(netdev, skb)) + return NETDEV_TX_OK; + /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 80ac56313981..d124d837ae58 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -47,6 +47,7 @@ #include #include #include +#include #include static __initdata const char banner[] = @@ -70,10 +71,11 @@ MODULE_PARM_DESC(echo, "Echo sent frames (for testing). Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { + struct can_frame *cf = (struct can_frame *)skb->data; struct net_device_stats *stats = &dev->stats; stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += cf->can_dlc; skb->protocol = htons(ETH_P_CAN); skb->pkt_type = PACKET_BROADCAST; @@ -85,11 +87,15 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { + struct can_frame *cf = (struct can_frame *)skb->data; struct net_device_stats *stats = &dev->stats; int loop; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + stats->tx_packets++; - stats->tx_bytes += skb->len; + stats->tx_bytes += cf->can_dlc; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; @@ -103,7 +109,7 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) * CAN core already did the echo for us */ stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += cf->can_dlc; } kfree_skb(skb); return NETDEV_TX_OK; diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 3db7767d2a17..7e7c98a3e908 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -60,6 +60,21 @@ struct can_priv { */ #define get_can_dlc(i) (min_t(__u8, (i), 8)) +/* Drop a given socketbuffer if it does not contain a valid CAN frame. */ +static inline int can_dropped_invalid_skb(struct net_device *dev, + struct sk_buff *skb) +{ + const struct can_frame *cf = (struct can_frame *)skb->data; + + if (unlikely(skb->len != sizeof(*cf) || cf->can_dlc > 8)) { + kfree_skb(skb); + dev->stats.tx_dropped++; + return 1; + } + + return 0; +} + struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max); void free_candev(struct net_device *dev); -- cgit v1.2.3 From 81077e82c3f591578625805dd6464a27a9ff56ec Mon Sep 17 00:00:00 2001 From: Lukáš Turek <8an@praha12.net> Date: Mon, 21 Dec 2009 22:50:47 +0100 Subject: nl80211: Add new WIPHY attribute COVERAGE_CLASS The new attribute NL80211_ATTR_WIPHY_COVERAGE_CLASS sets IEEE 802.11 Coverage Class, which depends on maximum distance of nodes in a wireless network. It's required for long distance links (more than a few hundred meters). The attribute is now ignored by two non-mac80211 drivers, rndis and iwmc3200wifi, together with WIPHY_PARAM_RETRY_SHORT and WIPHY_PARAM_RETRY_LONG. If it turns out to be a problem, we could split set_wiphy_params callback or add new capability bits. Signed-off-by: Lukas Turek <8an@praha12.net> Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 4 ++++ include/net/cfg80211.h | 2 ++ net/wireless/core.c | 1 + net/wireless/nl80211.c | 15 +++++++++++++++ 4 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 2bfbe88837ef..d4c556de7170 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -430,6 +430,8 @@ enum nl80211_commands { * @NL80211_ATTR_WIPHY_RTS_THRESHOLD: RTS threshold (TX frames with length * larger than or equal to this use RTS/CTS handshake); allowed range: * 0..65536, disable with (u32)-1; dot11RTSThreshold; u32 + * @NL80211_ATTR_WIPHY_COVERAGE_CLASS: Coverage Class as defined by IEEE 802.11 + * section 7.3.2.9; dot11CoverageClass; u8 * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -779,6 +781,8 @@ enum nl80211_attrs { NL80211_ATTR_COOKIE, + NL80211_ATTR_WIPHY_COVERAGE_CLASS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index add79930f47d..a8d5d04314b9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -837,6 +837,7 @@ enum wiphy_params_flags { WIPHY_PARAM_RETRY_LONG = 1 << 1, WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, + WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, }; /** @@ -1236,6 +1237,7 @@ struct wiphy { u8 retry_long; u32 frag_threshold; u32 rts_threshold; + u8 coverage_class; char fw_version[ETHTOOL_BUSINFO_LEN]; u32 hw_version; diff --git a/net/wireless/core.c b/net/wireless/core.c index c2a2c563d21a..0a545bb6ed05 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -402,6 +402,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) rdev->wiphy.retry_long = 4; rdev->wiphy.frag_threshold = (u32) -1; rdev->wiphy.rts_threshold = (u32) -1; + rdev->wiphy.coverage_class = 0; return &rdev->wiphy; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e3bee3cecdfa..c09fbcd278fb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -69,6 +69,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -444,6 +445,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, dev->wiphy.frag_threshold); NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, dev->wiphy.rts_threshold); + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, + dev->wiphy.coverage_class); NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, dev->wiphy.max_scan_ssids); @@ -684,6 +687,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 changed; u8 retry_short = 0, retry_long = 0; u32 frag_threshold = 0, rts_threshold = 0; + u8 coverage_class = 0; rtnl_lock(); @@ -806,9 +810,16 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) changed |= WIPHY_PARAM_RTS_THRESHOLD; } + if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + coverage_class = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); + changed |= WIPHY_PARAM_COVERAGE_CLASS; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, 
old_rts_threshold; + u8 old_coverage_class; if (!rdev->ops->set_wiphy_params) { result = -EOPNOTSUPP; @@ -819,6 +830,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) old_retry_long = rdev->wiphy.retry_long; old_frag_threshold = rdev->wiphy.frag_threshold; old_rts_threshold = rdev->wiphy.rts_threshold; + old_coverage_class = rdev->wiphy.coverage_class; if (changed & WIPHY_PARAM_RETRY_SHORT) rdev->wiphy.retry_short = retry_short; @@ -828,6 +840,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.frag_threshold = frag_threshold; if (changed & WIPHY_PARAM_RTS_THRESHOLD) rdev->wiphy.rts_threshold = rts_threshold; + if (changed & WIPHY_PARAM_COVERAGE_CLASS) + rdev->wiphy.coverage_class = coverage_class; result = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); if (result) { @@ -835,6 +849,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.retry_long = old_retry_long; rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; + rdev->wiphy.coverage_class = old_coverage_class; } } -- cgit v1.2.3 From 13ae75b103e07304a34ab40c9136e9f53e06475c Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 29 Dec 2009 12:59:45 +0200 Subject: nl80211: New command for setting TX rate mask for rate control Add a new NL80211_CMD_SET_TX_BITRATE_MASK command and related attributes to provide support for setting TX rate mask for rate control. This uses the existing cfg80211 set_bitrate_mask operation that was previously used only with WEXT compat code (SIOCSIWRATE). The nl80211 command allows more generic configuration of allowed rates as a mask instead of fixed/max rate. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 44 +++++++++++++++++++ include/net/cfg80211.h | 4 +- net/wireless/nl80211.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index d4c556de7170..7a1c8c145b22 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -295,6 +295,10 @@ * This command is also used as an event to notify when a requested * remain-on-channel duration has expired. * + * @NL80211_CMD_SET_TX_BITRATE_MASK: Set the mask of rates to be used in TX + * rate selection. %NL80211_ATTR_IFINDEX is used to specify the interface + * and @NL80211_ATTR_TX_RATES the set of allowed rates. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -381,6 +385,8 @@ enum nl80211_commands { NL80211_CMD_REMAIN_ON_CHANNEL, NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + NL80211_CMD_SET_TX_BITRATE_MASK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -640,6 +646,13 @@ enum nl80211_commands { * * @NL80211_ATTR_COOKIE: Generic 64-bit cookie to identify objects. * + * @NL80211_ATTR_TX_RATES: Nested set of attributes + * (enum nl80211_tx_rate_attributes) describing TX rates per band. The + * enum nl80211_band value is used as the index (nla_type() of the nested + * data. If a band is not included, it will be configured to allow all + * rates based on negotiated supported rates information. This attribute + * is used with %NL80211_CMD_SET_TX_BITRATE_MASK. 
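To make the nesting scheme just described concrete, a hypothetical libnl-based userspace snippet that restricts the 2.4 GHz band to the 802.11b rate set could build the request as follows (family and interface resolution, sending, and error handling are omitted):

    #include <stdint.h>
    #include <netlink/genl/genl.h>
    #include <linux/nl80211.h>

    /* sketch: build NL80211_CMD_SET_TX_BITRATE_MASK with a nested
     * NL80211_ATTR_TX_RATES attribute; nl80211_id and ifindex are
     * assumed to have been resolved by the caller */
    static struct nl_msg *build_bitrate_msg(int nl80211_id, int ifindex)
    {
            /* rate values in units of 500 kb/s: 1, 2, 5.5 and 11 Mb/s */
            static const uint8_t legacy[] = { 2, 4, 11, 22 };
            struct nl_msg *msg = nlmsg_alloc();
            struct nlattr *rates, *band;

            genlmsg_put(msg, 0, 0, nl80211_id, 0, 0,
                        NL80211_CMD_SET_TX_BITRATE_MASK, 0);
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);

            rates = nla_nest_start(msg, NL80211_ATTR_TX_RATES);
            /* the band index itself is the nla_type of the inner nest */
            band = nla_nest_start(msg, NL80211_BAND_2GHZ);
            nla_put(msg, NL80211_TXRATE_LEGACY, sizeof(legacy), legacy);
            nla_nest_end(msg, band);
            nla_nest_end(msg, rates);

            return msg;
    }

Any band left out of the nest keeps all of its negotiated rates enabled, as the kernel-side default in nl80211_set_tx_bitrate_mask() below shows.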
+ * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -783,6 +796,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_COVERAGE_CLASS, + NL80211_ATTR_TX_RATES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -1482,4 +1497,33 @@ enum nl80211_key_attributes { NL80211_KEY_MAX = __NL80211_KEY_AFTER_LAST - 1 }; +/** + * enum nl80211_tx_rate_attributes - TX rate set attributes + * @__NL80211_TXRATE_INVALID: invalid + * @NL80211_TXRATE_LEGACY: Legacy (non-MCS) rates allowed for TX rate selection + * in an array of rates as defined in IEEE 802.11 7.3.2.2 (u8 values with + * 1 = 500 kbps) but without the IE length restriction (at most + * %NL80211_MAX_SUPP_RATES in a single array). + * @__NL80211_TXRATE_AFTER_LAST: internal + * @NL80211_TXRATE_MAX: highest TX rate attribute + */ +enum nl80211_tx_rate_attributes { + __NL80211_TXRATE_INVALID, + NL80211_TXRATE_LEGACY, + + /* keep last */ + __NL80211_TXRATE_AFTER_LAST, + NL80211_TXRATE_MAX = __NL80211_TXRATE_AFTER_LAST - 1 +}; + +/** + * enum nl80211_band - Frequency band + * @NL80211_BAND_2GHZ - 2.4 GHz ISM band + * @NL80211_BAND_5GHZ - around 5 GHz band (4.9 - 5.7 GHz) + */ +enum nl80211_band { + NL80211_BAND_2GHZ, + NL80211_BAND_5GHZ, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 22e062afb5a1..0d734413b5fb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -39,8 +39,8 @@ * @IEEE80211_BAND_5GHZ: around 5GHz band (4.9-5.7) */ enum ieee80211_band { - IEEE80211_BAND_2GHZ, - IEEE80211_BAND_5GHZ, + IEEE80211_BAND_2GHZ = NL80211_BAND_2GHZ, + IEEE80211_BAND_5GHZ = NL80211_BAND_5GHZ, /* keep last */ IEEE80211_NUM_BANDS diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c09fbcd278fb..b804062e0179 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -144,6 +144,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { .len = WLAN_PMKID_LEN }, [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, + [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, }; /* policy for the attributes */ @@ -575,6 +576,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(del_pmksa, DEL_PMKSA); CMD(flush_pmksa, FLUSH_PMKSA); CMD(remain_on_channel, REMAIN_ON_CHANNEL); + CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -4438,6 +4440,109 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, return err; } +static u32 rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len) +{ + u8 i; + u32 mask = 0; + + for (i = 0; i < rates_len; i++) { + int rate = (rates[i] & 0x7f) * 5; + int ridx; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = + &sband->bitrates[ridx]; + if (rate == srate->bitrate) { + mask |= 1 << ridx; + break; + } + } + if (ridx == sband->n_bitrates) + return 0; /* rate not found */ + } + + return mask; +} + +static struct nla_policy +nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] __read_mostly = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, +}; + +static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[NL80211_TXRATE_MAX + 1]; + struct cfg80211_registered_device *rdev; + struct cfg80211_bitrate_mask mask; + int err, 
rem, i; + struct net_device *dev; + struct nlattr *tx_rates; + struct ieee80211_supported_band *sband; + + if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->set_bitrate_mask) { + err = -EOPNOTSUPP; + goto unlock; + } + + memset(&mask, 0, sizeof(mask)); + /* Default to all rates enabled */ + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = rdev->wiphy.bands[i]; + mask.control[i].legacy = + sband ? (1 << sband->n_bitrates) - 1 : 0; + } + + /* + * The nested attribute uses enum nl80211_band as the index. This maps + * directly to the enum ieee80211_band values used in cfg80211. + */ + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) + { + enum ieee80211_band band = nla_type(tx_rates); + if (band < 0 || band >= IEEE80211_NUM_BANDS) { + err = -EINVAL; + goto unlock; + } + sband = rdev->wiphy.bands[band]; + if (sband == NULL) { + err = -EINVAL; + goto unlock; + } + nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (tb[NL80211_TXRATE_LEGACY]) { + mask.control[band].legacy = rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_LEGACY]), + nla_len(tb[NL80211_TXRATE_LEGACY])); + if (mask.control[band].legacy == 0) { + err = -EINVAL; + goto unlock; + } + } + } + + err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); + + unlock: + dev_put(dev); + cfg80211_unlock_rdev(rdev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -4712,6 +4817,12 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, + .doit = nl80211_set_tx_bitrate_mask, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { -- cgit v1.2.3 From 7044cc565b45a898c140fb185174a66f2d68a163 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 5 Jan 2010 20:16:19 +0200 Subject: mac80211: add functions to create PS Poll and Nullfunc templates Some hardware, for example wl1251 and wl1271, handle the transmission of power save related frames in hardware, but the driver is responsible for creating the templates. It's better to create the templates in mac80211, that way all drivers can benefit from this. Add two new functions, ieee80211_pspoll_get() and ieee80211_nullfunc_get() which drivers need to call to get the frame. Drivers are also responsible for updating the templates after each association. Also new struct ieee80211_hdr_3addr is added to ieee80211.h to make it easy to calculate length of the Nullfunc frame. Signed-off-by: Kalle Valo Signed-off-by: John W. 
Linville
---
 include/linux/ieee80211.h |  9 ++++++
 include/net/mac80211.h    | 30 ++++++++++++++++++
 net/mac80211/tx.c         | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index aeea282bd2fe..602c0692c3fc 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -130,6 +130,15 @@ struct ieee80211_hdr {
 	u8 addr4[6];
 } __attribute__ ((packed));
 
+struct ieee80211_hdr_3addr {
+	__le16 frame_control;
+	__le16 duration_id;
+	u8 addr1[6];
+	u8 addr2[6];
+	u8 addr3[6];
+	__le16 seq_ctrl;
+} __attribute__ ((packed));
+
 /**
  * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set
  * @fc: frame control bytes in little-endian byteorder
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7e5af6d90b93..75f46e26ad60 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1874,6 +1874,36 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 	return ieee80211_beacon_get_tim(hw, vif, NULL, NULL);
 }
 
+/**
+ * ieee80211_pspoll_get - retrieve a PS Poll template
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * Creates a PS Poll template which can, for example, be uploaded to
+ * hardware. The template must be updated after association so that the
+ * correct AID, BSSID and MAC address are used.
+ *
+ * Note: Caller (or hardware) is responsible for setting the
+ * &IEEE80211_FCTL_PM bit.
+ */
+struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
+				     struct ieee80211_vif *vif);
+
+/**
+ * ieee80211_nullfunc_get - retrieve a nullfunc template
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * Creates a Nullfunc template which can, for example, be uploaded to
+ * hardware. The template must be updated after association so that the
+ * correct BSSID and address are used.
+ *
+ * Note: Caller (or hardware) is responsible for setting the
+ * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields.
+ */
+struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif);
+
 /**
  * ieee80211_rts_get - RTS frame generation function
  * @hw: pointer obtained from ieee80211_alloc_hw().
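As a usage sketch, a driver that offloads power-save signalling might fetch both templates once association completes and hand them to its firmware; foo_fw_load_template() and the FOO_TMPL_* identifiers are invented for illustration:

    static int foo_upload_ps_templates(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif)
    {
            struct sk_buff *skb;
            int ret;

            skb = ieee80211_pspoll_get(hw, vif);
            if (!skb)
                    return -ENOMEM;
            ret = foo_fw_load_template(hw->priv, FOO_TMPL_PSPOLL,
                                       skb->data, skb->len);
            dev_kfree_skb(skb);
            if (ret)
                    return ret;

            skb = ieee80211_nullfunc_get(hw, vif);
            if (!skb)
                    return -ENOMEM;
            ret = foo_fw_load_template(hw->priv, FOO_TMPL_NULLFUNC,
                                       skb->data, skb->len);
            dev_kfree_skb(skb);
            return ret;
    }

Re-running this after every association keeps the AID and BSSID in the templates current, as the kernel-doc above requires.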
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d3a44812f8bf..055b45b146d9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2200,6 +2200,84 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_beacon_get_tim); +struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_pspoll *pspoll; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for " + "pspoll template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + pspoll = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*pspoll)); + memset(pspoll, 0, sizeof(*pspoll)); + pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | + IEEE80211_STYPE_PSPOLL); + pspoll->aid = cpu_to_le16(ifmgd->aid); + + /* aid in PS-Poll has its two MSBs each set to 1 */ + pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); + + memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); + memcpy(pspoll->ta, vif->addr, ETH_ALEN); + + return skb; +} +EXPORT_SYMBOL(ieee80211_pspoll_get); + +struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_hdr_3addr *nullfunc; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " + "template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr_3addr *) skb_put(skb, + sizeof(*nullfunc)); + memset(nullfunc, 0, sizeof(*nullfunc)); + nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | + IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_TODS); + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); + memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); + memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); + + return skb; +} +EXPORT_SYMBOL(ieee80211_nullfunc_get); + void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, -- cgit v1.2.3 From 34a6eddbabd704b3c7dae9362234552267573be2 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 6 Jan 2010 16:19:24 +0200 Subject: cfg80211: Store IEs from both Beacon and Probe Response frames Store information elements from Beacon and Probe Response frames in separate buffers to allow both sets to be made available through nl80211. This allows user space applications to get access to IEs from Beacon frames even if we have received Probe Response frames from the BSS. Previously, the IEs from Probe Response frames would have overridden the IEs from Beacon frames. This feature is of somewhat limited use since most protocols include the same (or extended) information in Probe Response frames. 
However, there are a couple of exceptions where the IEs from Beacon frames
could be of some use: TIM IE is only included in Beacon frames (and it would
be needed to figure out the DTIM period used in the BSS) and at least some
implementations of Wireless Provisioning Services seem to include the full
IE only in Beacon frames.

The new BSS attribute for scan results is added to allow both the IE sets
to be delivered. This is done in a way that maintains the previously used
behavior for applications that are not aware of the new
NL80211_BSS_BEACON_IES attribute.

Signed-off-by: Jouni Malinen
Acked-by: Johannes Berg
Signed-off-by: John W. Linville
---
 include/linux/nl80211.h |  10 +++-
 include/net/cfg80211.h  |  12 ++++-
 net/wireless/core.h     |   3 +-
 net/wireless/nl80211.c  |   4 ++
 net/wireless/scan.c     | 120 ++++++++++++++++++++++++++++++++++++------------
 5 files changed, 116 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 7a1c8c145b22..127a73015760 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -1378,13 +1378,20 @@ enum nl80211_channel_type {
 * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16)
 * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16)
 * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the
- *	raw information elements from the probe response/beacon (bin)
+ *	raw information elements from the probe response/beacon (bin);
+ *	if the %NL80211_BSS_BEACON_IES attribute is present, the IEs here are
+ *	from a Probe Response frame; otherwise they are from a Beacon frame.
+ *	However, if the driver does not indicate the source of the IEs, these
+ *	IEs may be from either frame subtype.
 * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon
 *	in mBm (100 * dBm) (s32)
 * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon
 *	in unspecified units, scaled to 0..100 (u8)
 * @NL80211_BSS_STATUS: status, if this BSS is "used"
 * @NL80211_BSS_SEEN_MS_AGO: age of this BSS entry in ms
+ * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information
+ *	elements from a Beacon frame (bin); not present if no Beacon frame has
+ *	yet been received
 * @__NL80211_BSS_AFTER_LAST: internal
 * @NL80211_BSS_MAX: highest BSS attribute
 */
@@ -1400,6 +1407,7 @@ enum nl80211_bss {
 	NL80211_BSS_SIGNAL_UNSPEC,
 	NL80211_BSS_STATUS,
 	NL80211_BSS_SEEN_MS_AGO,
+	NL80211_BSS_BEACON_IES,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0d734413b5fb..2af52704e670 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -626,8 +626,14 @@ enum cfg80211_signal_type {
 * @beacon_interval: the beacon interval as from the frame
 * @capability: the capability field in host byte order
 * @information_elements: the information elements (Note that there
+ * is no guarantee that these are well-formed!); this is a pointer to + * either the beacon_ies or proberesp_ies depending on whether Probe + * Response frame has been received * @len_information_elements: total length of the information elements + * @beacon_ies: the information elements from the last Beacon frame + * @len_beacon_ies: total length of the beacon_ies + * @proberesp_ies: the information elements from the last Probe Response frame + * @len_proberesp_ies: total length of the proberesp_ies * @signal: signal strength value (type depends on the wiphy's signal_type) * @free_priv: function pointer to free private data * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes @@ -641,6 +647,10 @@ struct cfg80211_bss { u16 capability; u8 *information_elements; size_t len_information_elements; + u8 *beacon_ies; + size_t len_beacon_ies; + u8 *proberesp_ies; + size_t len_proberesp_ies; s32 signal; diff --git a/net/wireless/core.h b/net/wireless/core.h index 30ec95f05b52..2d6a6b9c0c43 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -111,7 +111,8 @@ struct cfg80211_internal_bss { unsigned long ts; struct kref ref; atomic_t hold; - bool ies_allocated; + bool beacon_ies_allocated; + bool proberesp_ies_allocated; /* must be last because of priv member */ struct cfg80211_bss pub; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b804062e0179..4af7991a9ec8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3163,6 +3163,10 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS, res->len_information_elements, res->information_elements); + if (res->beacon_ies && res->len_beacon_ies && + res->beacon_ies != res->information_elements) + NLA_PUT(msg, NL80211_BSS_BEACON_IES, + res->len_beacon_ies, res->beacon_ies); if (res->tsf) NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf); if (res->beacon_interval) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0c2cbbebca95..06b0231ee5e3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -100,8 +100,10 @@ static void bss_release(struct kref *ref) if (bss->pub.free_priv) bss->pub.free_priv(&bss->pub); - if (bss->ies_allocated) - kfree(bss->pub.information_elements); + if (bss->beacon_ies_allocated) + kfree(bss->pub.beacon_ies); + if (bss->proberesp_ies_allocated) + kfree(bss->pub.proberesp_ies); BUG_ON(atomic_read(&bss->hold)); @@ -375,8 +377,7 @@ rb_find_bss(struct cfg80211_registered_device *dev, static struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *dev, - struct cfg80211_internal_bss *res, - bool overwrite) + struct cfg80211_internal_bss *res) { struct cfg80211_internal_bss *found = NULL; const u8 *meshid, *meshcfg; @@ -418,28 +419,64 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, found->pub.capability = res->pub.capability; found->ts = res->ts; - /* overwrite IEs */ - if (overwrite) { + /* Update IEs */ + if (res->pub.proberesp_ies) { size_t used = dev->wiphy.bss_priv_size + sizeof(*res); - size_t ielen = res->pub.len_information_elements; + size_t ielen = res->pub.len_proberesp_ies; + + if (found->pub.proberesp_ies && + !found->proberesp_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.proberesp_ies, + res->pub.proberesp_ies, ielen); + found->pub.len_proberesp_ies = ielen; + } else { + u8 *ies = found->pub.proberesp_ies; + + if (found->proberesp_ies_allocated) + ies = krealloc(ies, ielen, GFP_ATOMIC); + else + ies = 
kmalloc(ielen, GFP_ATOMIC); + + if (ies) { + memcpy(ies, res->pub.proberesp_ies, + ielen); + found->proberesp_ies_allocated = true; + found->pub.proberesp_ies = ies; + found->pub.len_proberesp_ies = ielen; + } + } - if (!found->ies_allocated && ksize(found) >= used + ielen) { - memcpy(found->pub.information_elements, - res->pub.information_elements, ielen); - found->pub.len_information_elements = ielen; + /* Override possible earlier Beacon frame IEs */ + found->pub.information_elements = + found->pub.proberesp_ies; + found->pub.len_information_elements = + found->pub.len_proberesp_ies; + } + if (res->pub.beacon_ies) { + size_t used = dev->wiphy.bss_priv_size + sizeof(*res); + size_t ielen = res->pub.len_beacon_ies; + + if (found->pub.beacon_ies && + !found->beacon_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.beacon_ies, + res->pub.beacon_ies, ielen); + found->pub.len_beacon_ies = ielen; } else { - u8 *ies = found->pub.information_elements; + u8 *ies = found->pub.beacon_ies; - if (found->ies_allocated) + if (found->beacon_ies_allocated) ies = krealloc(ies, ielen, GFP_ATOMIC); else ies = kmalloc(ielen, GFP_ATOMIC); if (ies) { - memcpy(ies, res->pub.information_elements, ielen); - found->ies_allocated = true; - found->pub.information_elements = ies; - found->pub.len_information_elements = ielen; + memcpy(ies, res->pub.beacon_ies, + ielen); + found->beacon_ies_allocated = true; + found->pub.beacon_ies = ies; + found->pub.len_beacon_ies = ielen; } } } @@ -489,14 +526,26 @@ cfg80211_inform_bss(struct wiphy *wiphy, res->pub.tsf = timestamp; res->pub.beacon_interval = beacon_interval; res->pub.capability = capability; - /* point to after the private area */ - res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; - memcpy(res->pub.information_elements, ie, ielen); - res->pub.len_information_elements = ielen; + /* + * Since we do not know here whether the IEs are from a Beacon or Probe + * Response frame, we need to pick one of the options and only use it + * with the driver that does not provide the full Beacon/Probe Response + * frame. Use Beacon frame pointer to avoid indicating that this should + * override the information_elements pointer should we have received an + * earlier indication of Probe Response data. + * + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. 
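The practical payoff of keeping the Beacon IEs is that userspace can recover elements that never appear in Probe Responses, such as the TIM. A simplified, hypothetical walk over an NL80211_BSS_BEACON_IES buffer that extracts the DTIM period:

    #include <stdint.h>
    #include <stddef.h>

    /* TIM element layout: id (5), length, DTIM count, DTIM period, ... */
    static int dtim_period_from_ies(const uint8_t *ies, size_t len)
    {
            while (len >= 2 && len >= 2 + (size_t)ies[1]) {
                    if (ies[0] == 5 && ies[1] >= 2)
                            return ies[3];
                    len -= 2 + ies[1];
                    ies += 2 + ies[1];
            }
            return -1; /* no TIM element found */
    }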
+ */ + res->pub.beacon_ies = (u8 *)res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, ie, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; kref_init(&res->ref); - res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, 0); + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); if (!res) return NULL; @@ -517,7 +566,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, struct cfg80211_internal_bss *res; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); - bool overwrite; size_t privsz = wiphy->bss_priv_size; if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC && @@ -538,16 +586,28 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); - /* point to after the private area */ - res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; - memcpy(res->pub.information_elements, mgmt->u.probe_resp.variable, ielen); - res->pub.len_information_elements = ielen; + /* + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. + */ + if (ieee80211_is_probe_resp(mgmt->frame_control)) { + res->pub.proberesp_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.proberesp_ies, mgmt->u.probe_resp.variable, + ielen); + res->pub.len_proberesp_ies = ielen; + res->pub.information_elements = res->pub.proberesp_ies; + res->pub.len_information_elements = res->pub.len_proberesp_ies; + } else { + res->pub.beacon_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, mgmt->u.beacon.variable, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; + } kref_init(&res->ref); - overwrite = ieee80211_is_probe_resp(mgmt->frame_control); - - res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, overwrite); + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); if (!res) return NULL; -- cgit v1.2.3 From ab13315af97919fae0e014748105fdc2e30afb2d Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 12 Jan 2010 10:42:31 +0200 Subject: mac80211: add U-APSD client support Add Unscheduled Automatic Power-Save Delivery (U-APSD) client support. The idea is that the data frames from the client trigger AP to send the buffered frames with ACs which have U-APSD enabled. This decreases latency and makes it possible to save even more power. Driver needs to use IEEE80211_HW_UAPSD to enable the feature. The current implementation assumes that firmware takes care of the wakeup and hardware needing IEEE80211_HW_PS_NULLFUNC_STACK is not yet supported. Tested with wl1251 on a Nokia N900 and Cisco Aironet 1231G AP and running various test traffic with ping. Signed-off-by: Kalle Valo Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 18 ++++++++++++++++++ include/net/mac80211.h | 7 +++++++ net/mac80211/cfg.c | 7 +++++++ net/mac80211/ieee80211_i.h | 13 ++++++++++++- net/mac80211/main.c | 4 ++++ net/mac80211/mlme.c | 31 ++++++++++++++++++++++++++++--- net/mac80211/scan.c | 18 ++++++++++++++++++ net/mac80211/util.c | 2 ++ net/mac80211/work.c | 12 ++++++++++-- 9 files changed, 106 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 602c0692c3fc..a8c6069a0d9f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -120,6 +120,24 @@ #define IEEE80211_QOS_CTL_TID_MASK 0x000F #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 +/* U-APSD queue for WMM IEs sent by AP */ +#define IEEE80211_WMM_IE_AP_QOSINFO_UAPSD (1<<7) + +/* U-APSD queues for WMM IEs sent by STA */ +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VO (1<<0) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VI (1<<1) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BK (1<<2) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BE (1<<3) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK 0x0f + +/* U-APSD max SP length for WMM IEs sent by STA */ +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL 0x00 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_2 0x01 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_4 0x02 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_6 0x03 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 + struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e1e73c6abeff..f313a3cbabda 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -113,6 +113,7 @@ struct ieee80211_tx_queue_params { u16 cw_min; u16 cw_max; u8 aifs; + bool uapsd; }; /** @@ -929,6 +930,11 @@ enum ieee80211_tkip_key_type { * Hardware supports dynamic spatial multiplexing powersave, * ie. can turn off all but one chain and then wake the rest * up as required after, for example, rts/cts handshake. + * + * @IEEE80211_HW_SUPPORTS_UAPSD: + * Hardware supports Unscheduled Automatic Power Save Delivery + * (U-APSD) in managed mode. The mode is configured with + * conf_tx() operation. */ enum ieee80211_hw_flags { IEEE80211_HW_HAS_RATE_CONTROL = 1<<0, @@ -948,6 +954,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_BEACON_FILTER = 1<<14, IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, + IEEE80211_HW_SUPPORTS_UAPSD = 1<<17, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index dc12e9466ffd..8286df5822d5 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1128,6 +1128,13 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, p.cw_max = params->cwmax; p.cw_min = params->cwmin; p.txop = params->txop; + + /* + * Setting tx queue params disables u-apsd because it's only + * called in master mode. 
+ */ + p.uapsd = false; + if (drv_conf_tx(local, params->queue, &p)) { printk(KERN_DEBUG "%s: failed to set TX queue " "parameters for queue %d\n", diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3e4ac3f30857..3468e378509a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -58,6 +58,15 @@ struct ieee80211_local; #define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) +#define IEEE80211_DEFAULT_UAPSD_QUEUES \ + (IEEE80211_WMM_IE_STA_QOSINFO_AC_BK | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_BE | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VI | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + +#define IEEE80211_DEFAULT_MAX_SP_LEN \ + IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL + struct ieee80211_fragment_entry { unsigned long first_frag_time; unsigned int seq; @@ -78,6 +87,7 @@ struct ieee80211_bss { u8 dtim_period; bool wmm_used; + bool uapsd_supported; unsigned long last_probe_resp; @@ -285,7 +295,7 @@ struct ieee80211_work { u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len; u8 supp_rates_len; - bool wmm_used, use_11n; + bool wmm_used, use_11n, uapsd_used; } assoc; struct { u32 duration; @@ -306,6 +316,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_11N = BIT(4), IEEE80211_STA_CSA_RECEIVED = BIT(5), IEEE80211_STA_MFP_ENABLED = BIT(6), + IEEE80211_STA_UAPSD_ENABLED = BIT(7), }; struct ieee80211_if_managed { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 468829143991..0054bba08ce1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -491,6 +491,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + WARN((local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) + && (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + "U-APSD not supported with HW_PS_NULLFUNC_STACK\n"); + /* * Calculate scan IE length -- we need this to alloc * memory and to subtract from the driver limit. 
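On the driver side, U-APSD support boils down to advertising the capability before registration and honouring the new per-queue bit in the conf_tx() callback; a hypothetical sketch with invented foo_hw_* helpers:

    static int foo_start(struct foo_priv *priv)
    {
            /* advertise the capability before ieee80211_register_hw() */
            priv->hw->flags |= IEEE80211_HW_SUPPORTS_UAPSD;
            return ieee80211_register_hw(priv->hw);
    }

    static int foo_conf_tx(struct ieee80211_hw *hw, u16 queue,
                           const struct ieee80211_tx_queue_params *params)
    {
            struct foo_priv *priv = hw->priv;

            /* program per-AC U-APSD delivery/trigger enable in firmware */
            foo_hw_set_uapsd(priv, queue, params->uapsd);
            return foo_hw_set_ac_params(priv, queue, params->aifs,
                                        params->cw_min, params->cw_max,
                                        params->txop);
    }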
It diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 86f025bc9456..39c27d83a4f2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -569,7 +569,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_tx_queue_params params; size_t left; int count; - u8 *pos; + u8 *pos, uapsd_queues = 0; if (local->hw.queues < 4) return; @@ -579,6 +579,10 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return; + + if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) + uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; + count = wmm_param[6] & 0x0f; if (count == ifmgd->wmm_last_param_set) return; @@ -593,6 +597,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; + bool uapsd = false; int queue; switch (aci) { @@ -600,22 +605,30 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, queue = 3; if (acm) local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) + uapsd = true; break; case 2: /* AC_VI */ queue = 1; if (acm) local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) + uapsd = true; break; case 3: /* AC_VO */ queue = 0; if (acm) local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + uapsd = true; break; case 0: /* AC_BE */ default: queue = 2; if (acm) local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) + uapsd = true; break; } @@ -623,11 +636,14 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params.cw_min = ecw2cw(pos[1] & 0x0f); params.txop = get_unaligned_le16(pos + 2); + params.uapsd = uapsd; + #ifdef CONFIG_MAC80211_VERBOSE_DEBUG printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d " - "cWmin=%d cWmax=%d txop=%d\n", + "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", wiphy_name(local->hw.wiphy), queue, aci, acm, - params.aifs, params.cw_min, params.cw_max, params.txop); + params.aifs, params.cw_min, params.cw_max, params.txop, + params.uapsd); #endif if (drv_conf_tx(local, queue, ¶ms) && local->ops->conf_tx) printk(KERN_DEBUG "%s: failed to set TX queue " @@ -1906,6 +1922,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, wk->assoc.ht_information_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_INFORMATION); + if (bss->wmm_used && bss->uapsd_supported && + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { + wk->assoc.uapsd_used = true; + ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; + } else { + wk->assoc.uapsd_used = false; + ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED; + } + ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); memcpy(wk->assoc.ssid, ssid + 2, ssid[1]); wk->assoc.ssid_len = ssid[1]; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 30cb62bb45b3..9afe2f9885dc 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -54,6 +54,23 @@ void ieee80211_rx_bss_put(struct ieee80211_local *local, cfg80211_put_bss(container_of((void *)bss, struct cfg80211_bss, priv)); } +static bool is_uapsd_supported(struct ieee802_11_elems *elems) +{ + u8 qos_info; + + if (elems->wmm_info && elems->wmm_info_len == 7 + && elems->wmm_info[5] == 1) + qos_info = elems->wmm_info[6]; + else if (elems->wmm_param && elems->wmm_param_len == 24 + && 
elems->wmm_param[5] == 1) + qos_info = elems->wmm_param[6]; + else + /* no valid wmm information or parameter element found */ + return false; + + return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; +} + struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, @@ -117,6 +134,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, } bss->wmm_used = elems->wmm_param || elems->wmm_info; + bss->uapsd_supported = is_uapsd_supported(elems); if (!beacon) bss->last_probe_resp = jiffies; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index a2ba6e29bd9a..e278f97c8305 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -792,6 +792,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata) break; } + qparam.uapsd = false; + drv_conf_tx(local, queue, &qparam); } } diff --git a/net/mac80211/work.c b/net/mac80211/work.c index 7c5d95b1bc04..a74fd6ee0083 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -202,7 +202,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *pos; + u8 *pos, qos_info; const u8 *ies; size_t offset = 0, noffset; int i, len, count, rates_len, supp_rates_len; @@ -375,6 +375,14 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } if (wk->assoc.wmm_used && local->hw.queues >= 4) { + if (wk->assoc.uapsd_used) { + qos_info = IEEE80211_DEFAULT_UAPSD_QUEUES; + qos_info |= (IEEE80211_DEFAULT_MAX_SP_LEN << + IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); + } else { + qos_info = 0; + } + pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; *pos++ = 7; /* len */ @@ -384,7 +392,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, *pos++ = 2; /* WME */ *pos++ = 0; /* WME info */ *pos++ = 1; /* WME ver */ - *pos++ = 0; + *pos++ = qos_info; } /* add any remaining custom (i.e. vendor specific here) IEs */ -- cgit v1.2.3 From 558a6669d7cb407fbb0b5aec184b5c3b9a893d30 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 12 Jan 2010 10:43:00 +0200 Subject: ieee80211: add struct ieee80211_hdr_qos The header can be used to create qos nullfunc frames, for example. Signed-off-by: Kalle Valo Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a8c6069a0d9f..842701906ae9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -157,6 +157,16 @@ struct ieee80211_hdr_3addr { __le16 seq_ctrl; } __attribute__ ((packed)); +struct ieee80211_qos_hdr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[6]; + u8 addr2[6]; + u8 addr3[6]; + __le16 seq_ctrl; + __le16 qos_ctrl; +} __attribute__ ((packed)); + /** * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set * @fc: frame control bytes in little-endian byteorder -- cgit v1.2.3 From bf66f18e79e34c421bbd8f6511e2c556b779df2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Jan 2010 15:09:10 -0800 Subject: rcu: Add force_quiescent_state() testing to rcutorture Add force_quiescent_state() testing to rcutorture, with a separate thread that repeatedly invokes force_quiescent_state() in bursts. This can greatly increase the probability of encountering certain types of race conditions. Suggested-by: Josh Triplett Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1262646551116-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutiny.h | 12 ++++++++ include/linux/rcutree.h | 3 ++ kernel/rcutorture.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcutree.c | 18 +++++++++++ kernel/rcutree_plugin.h | 19 ++++++++++++ 5 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 96cc307ed9f4..2b70d4e37383 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -62,6 +62,18 @@ static inline long rcu_batches_completed_bh(void) extern int rcu_expedited_torture_stats(char *page); +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + #define synchronize_rcu synchronize_sched static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 8044b1b94333..704a010f686c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -99,6 +99,9 @@ extern void rcu_check_callbacks(int cpu, int user); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); +extern void rcu_force_quiescent_state(void); +extern void rcu_bh_force_quiescent_state(void); +extern void rcu_sched_force_quiescent_state(void); #ifdef CONFIG_NO_HZ void rcu_enter_nohz(void); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..adda92bfafac 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_stutter = 3; /* Wait time between bursts (s). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ module_param(nreaders, int, 0444); @@ -79,6 +82,12 @@ module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(irqreader, int, 0444); MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; +static struct task_struct *fqs_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -263,6 +273,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); + void (*fqs)(void); int (*stats)(char *page); int irq_capable; char *name; @@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu" @@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_sync" @@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_expedited" @@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .deferred_free = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh" @@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh_sync" @@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "sched" @@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .name = "sched_sync" }; @@ -650,11 +668,44 @@ static struct rcu_torture_ops sched_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = rcu_expedited_torture_stats, .irq_capable = 1, .name = "sched_expedited" }; +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. 
+ */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (jiffies - fqs_resume_time > LONG_MAX) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -1030,10 +1081,11 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); } static struct notifier_block rcutorture_nb = { @@ -1109,6 +1161,12 @@ rcu_torture_cleanup(void) } stats_task = NULL; + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + /* Wait for all RCU callbacks to fire. */ if (cur_ops->cb_barrier != NULL) @@ -1154,6 +1212,11 @@ rcu_torture_init(void) mutex_unlock(&fullstop_mutex); return -EINVAL; } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " + "fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1282,6 +1345,19 @@ rcu_torture_init(void) goto unwind; } } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } register_reboot_notifier(&rcutorture_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 55e8f6ef8195..0a4c32879398 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -156,6 +156,24 @@ long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Force a quiescent state for RCU BH. + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + /* * Does the CPU have callbacks ready to be invoked? 
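With these wrappers exported, the new test mode is driven entirely by the module parameters above; a hypothetical invocation such as modprobe rcutorture fqs_duration=100 fqs_holdoff=10 fqs_stutter=3 produces 100-microsecond bursts of force_quiescent_state() calls, one call every 10 microseconds within a burst, with three seconds of rest between bursts.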
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..f11ebd44b454 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -61,6 +61,15 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); +/* + * Force a quiescent state for preemptible RCU. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Record a preemptable-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -712,6 +721,16 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); +/* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From 9ca94d7c016130f9ed77f142424ace9c19742809 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 11 Jan 2010 21:21:06 +0100 Subject: plist: Fix grammar mistake, and c-style mistake Signed-off-by: John Kacur Acked-by: Peter Zijlstra LKML-Reference: <1263241267-25204-2-git-send-email-jkacur@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/plist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/plist.h b/include/linux/plist.h index 8227f717c70f..6898985e7b38 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -45,7 +45,7 @@ * the insertion of new nodes. There are no nodes with duplicate * priorites on the list. * - * The nodes on the node_list is ordered by priority and can contain + * The nodes on the node_list are ordered by priority and can contain * entries which have the same priority. Those entries are ordered * FIFO * @@ -265,7 +265,7 @@ static inline int plist_node_empty(const struct plist_node *node) * * Assumes the plist is _not_ empty. */ -static inline struct plist_node* plist_first(const struct plist_head *head) +static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, struct plist_node, plist.node_list); -- cgit v1.2.3 From 599faa0e264fe2e7f563f87b4aad8c83e9dc46d1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 5 Jan 2010 13:29:58 +0000 Subject: genirq: Fix documentation of default chip disable() The documentation says that by default disable() will be chip->mask but in fact default_disable() is a noop. 
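In practice this means an interrupt controller that must actually be masked on disable has to provide the callback itself; a hypothetical chip, with the register accessor and the companion ack/mask/unmask handlers invented for the sketch:

    static void foo_irq_disable(unsigned int irq)
    {
            /* really mask the source; the core's default disable is a noop */
            foo_write(FOO_IRQ_MASK_SET, BIT(irq));
    }

    static struct irq_chip foo_irq_chip = {
            .name    = "foo",
            .ack     = foo_irq_ack,
            .mask    = foo_irq_mask,
            .unmask  = foo_irq_unmask,
            .disable = foo_irq_disable,
    };

Chips that are happy with lazy disabling, where the line is only masked once another interrupt actually arrives, can keep omitting the callback.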
Signed-off-by: Mark Brown LKML-Reference: <1262698198-30392-1-git-send-email-broonie@opensource.wolfsonmicro.com> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 451481c082b5..d13492df57a1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -90,7 +90,7 @@ struct msi_desc; * @startup: start up the interrupt (defaults to ->enable if NULL) * @shutdown: shut down the interrupt (defaults to ->disable if NULL) * @enable: enable the interrupt (defaults to chip->unmask if NULL) - * @disable: disable the interrupt (defaults to chip->mask if NULL) + * @disable: disable the interrupt * @ack: start of a new interrupt * @mask: mask an interrupt source * @mask_ack: ack and mask an interrupt source -- cgit v1.2.3 From cd8c20b650f49354722b8cc1f03320b004815a0a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 13 Jan 2010 16:02:14 +0100 Subject: netfilter: nfnetlink: netns support Make nfnl socket per-petns. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink.h | 8 ++--- include/net/net_namespace.h | 2 ++ net/netfilter/nf_conntrack_netlink.c | 13 ++++---- net/netfilter/nfnetlink.c | 65 +++++++++++++++++++++++------------- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 6 files changed, 58 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 49d321f3ccd2..53923868c9bd 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -73,11 +73,11 @@ struct nfnetlink_subsystem { extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); -extern int nfnetlink_has_listeners(unsigned int group); -extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, +extern int nfnetlink_has_listeners(struct net *net, unsigned int group); +extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo, gfp_t flags); -extern void nfnetlink_set_err(u32 pid, u32 group, int error); -extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); +extern void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); +extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); extern void nfnl_lock(void); extern void nfnl_unlock(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f307e133d14c..82b7be4db89a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -81,6 +81,8 @@ struct net { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif + struct sock *nfnl; + struct sock *nfnl_stash; #endif #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59d8064eb522..d4c5d06677f9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -482,7 +482,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) } else return 0; - if (!item->report && !nfnetlink_has_listeners(group)) + if (!item->report && !nfnetlink_has_listeners(&init_net, group)) return 0; skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); @@ -559,7 +559,8 @@ 
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC); + err = nfnetlink_send(skb, &init_net, item->pid, group, item->report, + GFP_ATOMIC); if (err == -ENOBUFS || err == -EAGAIN) return -ENOBUFS; @@ -571,7 +572,7 @@ nla_put_failure: nlmsg_failure: kfree_skb(skb); errout: - nfnetlink_set_err(0, group, -ENOBUFS); + nfnetlink_set_err(&init_net, 0, group, -ENOBUFS); return 0; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ @@ -1539,7 +1540,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) return 0; if (!item->report && - !nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) + !nfnetlink_has_listeners(&init_net, NFNLGRP_CONNTRACK_EXP_NEW)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -1562,7 +1563,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, + nfnetlink_send(skb, &init_net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, item->report, GFP_ATOMIC); return 0; @@ -1572,7 +1573,7 @@ nla_put_failure: nlmsg_failure: kfree_skb(skb); errout: - nfnetlink_set_err(0, 0, -ENOBUFS); + nfnetlink_set_err(&init_net, 0, 0, -ENOBUFS); return 0; } #endif diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index eedc0c1ac7a4..8eb0cc23ada3 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -40,7 +40,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -static struct sock *nfnl = NULL; static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; static DEFINE_MUTEX(nfnl_mutex); @@ -101,34 +100,35 @@ nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) return &ss->cb[cb_id]; } -int nfnetlink_has_listeners(unsigned int group) +int nfnetlink_has_listeners(struct net *net, unsigned int group) { - return netlink_has_listeners(nfnl, group); + return netlink_has_listeners(net->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, u32 pid, +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo, gfp_t flags) { - return nlmsg_notify(nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -void nfnetlink_set_err(u32 pid, u32 group, int error) +void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) { - netlink_set_err(nfnl, pid, group, error); + netlink_set_err(net->nfnl, pid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) { - return netlink_unicast(nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, pid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. 
*/ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; const struct nfnetlink_subsystem *ss; int type, err; @@ -170,7 +170,7 @@ replay: if (err < 0) return err; - err = nc->call(nfnl, skb, nlh, (const struct nlattr **)cda); + err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda); if (err == -EAGAIN) goto replay; return err; @@ -184,26 +184,45 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfnl_unlock(); } -static void __exit nfnetlink_exit(void) +static int __net_init nfnetlink_net_init(struct net *net) { - printk("Removing netfilter NETLINK layer.\n"); - netlink_kernel_release(nfnl); - return; + struct sock *nfnl; + + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, NULL, THIS_MODULE); + if (!nfnl) + return -ENOMEM; + net->nfnl_stash = nfnl; + rcu_assign_pointer(net->nfnl, nfnl); + return 0; } -static int __init nfnetlink_init(void) +static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { - printk("Netfilter messages via NETLINK v%s.\n", nfversion); + struct net *net; - nfnl = netlink_kernel_create(&init_net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); - if (!nfnl) { - printk(KERN_ERR "cannot initialize nfnetlink!\n"); - return -ENOMEM; - } + list_for_each_entry(net, net_exit_list, exit_list) + rcu_assign_pointer(net->nfnl, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->nfnl_stash); +} - return 0; +static struct pernet_operations nfnetlink_net_ops = { + .init = nfnetlink_net_init, + .exit_batch = nfnetlink_net_exit_batch, +}; + +static int __init nfnetlink_init(void) +{ + printk("Netfilter messages via NETLINK v%s.\n", nfversion); + return register_pernet_subsys(&nfnetlink_net_ops); } +static void __exit nfnetlink_exit(void) +{ + printk("Removing netfilter NETLINK layer.\n"); + unregister_pernet_subsys(&nfnetlink_net_ops); +} module_init(nfnetlink_init); module_exit(nfnetlink_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 9de0470d557e..285e9029a9ff 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -323,7 +323,8 @@ __nfulnl_send(struct nfulnl_instance *inst) NLMSG_DONE, sizeof(struct nfgenmsg)); - status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); + status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, + MSG_DONTWAIT); inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7e3fa410641e..5c589b27d6eb 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -420,7 +420,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) } /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; -- cgit v1.2.3 From 508e14b4a4fb1a824a14f2c5b8d7df67b313f8e4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2010 14:27:30 +0000 Subject: netpoll: allow execution of multiple rx_hooks per interface Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 11 +++- net/core/netpoll.c | 169 ++++++++++++++++++++++++++++++------------------ 2 files changed, 114 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2524267210d3..a765ea898549 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -21,15 +21,20 @@ struct netpoll { __be32 local_ip, remote_ip; u16 local_port, remote_port; u8 remote_mac[ETH_ALEN]; + + struct list_head rx; /* rx_np list element */ }; struct netpoll_info { atomic_t refcnt; + int rx_flags; spinlock_t rx_lock; - struct netpoll *rx_np; /* netpoll that registered an rx_hook */ + struct list_head rx_np; /* netpolls that registered an rx_hook */ + struct sk_buff_head arp_tx; /* list of arp requests to reply to */ struct sk_buff_head txq; + struct delayed_work tx_work; }; @@ -51,7 +56,7 @@ static inline int netpoll_rx(struct sk_buff *skb) unsigned long flags; int ret = 0; - if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) + if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags)) return 0; spin_lock_irqsave(&npinfo->rx_lock, flags); @@ -67,7 +72,7 @@ static inline int netpoll_rx_on(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; - return npinfo && (npinfo->rx_np || npinfo->rx_flags); + return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); } static inline int netpoll_receive_skb(struct sk_buff *skb) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 0b4d0d35ef40..7aa697253765 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -407,11 +407,24 @@ static void arp_reply(struct sk_buff *skb) __be32 sip, tip; unsigned char *sha; struct sk_buff *send_skb; - struct netpoll *np = NULL; + struct netpoll *np, *tmp; + unsigned long flags; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this is interesting at all */ + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); - if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) - np = npinfo->rx_np; - if (!np) + /* No netpoll struct is using this dev */ + if (!hits) return; /* No arp on this interface */ @@ -437,77 +450,91 @@ static void arp_reply(struct sk_buff *skb) arp_ptr += skb->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - /* if we actually cared about dst hw addr, it would get copied here */ + /* If we actually cared about dst hw addr, + it would get copied here */ arp_ptr += skb->dev->addr_len; memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? 
*/ - if (tip != np->local_ip || - ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; size = arp_hdr_len(skb->dev); - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); - if (!send_skb) - return; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - return; - } + send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + if (!send_skb) + continue; - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should always equal 1 (Ethernet). - */ + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } - arp_ptr=(unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). 
+ */ - netpoll_send_skb(np, send_skb); + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); } int __netpoll_rx(struct sk_buff *skb) { int proto, len, ulen; + int hits = 0; struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npi = skb->dev->npinfo; - struct netpoll *np = npi->rx_np; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; - if (!np) + if (list_empty(&npinfo->rx_np)) goto out; + if (skb->dev->type != ARPHRD_ETHER) goto out; /* check if netpoll clients need ARP */ if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npi->arp_tx, skb); + skb_queue_tail(&npinfo->arp_tx, skb); return 1; } @@ -551,16 +578,23 @@ int __netpoll_rx(struct sk_buff *skb) goto out; if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; - if (np->local_ip && np->local_ip != iph->daddr) - goto out; - if (np->remote_ip && np->remote_ip != iph->saddr) - goto out; - if (np->local_port && np->local_port != ntohs(uh->dest)) - goto out; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; kfree_skb(skb); return 1; @@ -684,6 +718,7 @@ int netpoll_setup(struct netpoll *np) struct net_device *ndev = NULL; struct in_device *in_dev; struct netpoll_info *npinfo; + struct netpoll *npe, *tmp; unsigned long flags; int err; @@ -704,7 +739,7 @@ int netpoll_setup(struct netpoll *np) } npinfo->rx_flags = 0; - npinfo->rx_np = NULL; + INIT_LIST_HEAD(&npinfo->rx_np); spin_lock_init(&npinfo->rx_lock); skb_queue_head_init(&npinfo->arp_tx); @@ -785,7 +820,7 @@ int netpoll_setup(struct netpoll *np) if (np->rx_hook) { spin_lock_irqsave(&npinfo->rx_lock, flags); npinfo->rx_flags |= NETPOLL_RX_ENABLED; - npinfo->rx_np = np; + list_add_tail(&np->rx, &npinfo->rx_np); spin_unlock_irqrestore(&npinfo->rx_lock, flags); } @@ -801,9 +836,16 @@ int netpoll_setup(struct netpoll *np) return 0; release: - if (!ndev->npinfo) + if (!ndev->npinfo) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) { + npe->dev = NULL; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + kfree(npinfo); - np->dev = NULL; + } + dev_put(ndev); return err; } @@ -823,10 +865,11 @@ void netpoll_cleanup(struct netpoll *np) if (np->dev) { npinfo = np->dev->npinfo; if (npinfo) { - if (npinfo->rx_np == np) { + if (!list_empty(&npinfo->rx_np)) { spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_np = NULL; - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; 
spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -- cgit v1.2.3 From 9a58a80a701bdb2d220cdab4914218df5b48d781 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 14 Jan 2010 03:10:54 -0800 Subject: proc_fops: convert drivers/isdn/ to seq_file Convert code away from the ->read_proc/->write_proc interfaces. Switch to proc_create()/proc_create_data(), which make the addition of proc entries reliable with respect to NULL ->proc_fops, NULL ->data, and so on. The problem with ->read_proc et al is described in commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba ("Fix rmmod/read/write races in /proc entries"). [akpm@linux-foundation.org: CONFIG_PROC_FS=n build fix] Signed-off-by: Alexey Dobriyan Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- Documentation/isdn/INTERFACE.CAPI | 9 +- drivers/isdn/capi/capi.c | 99 ++++++---------- drivers/isdn/capi/capidrv.c | 55 +++------ drivers/isdn/capi/kcapi.c | 8 +- drivers/isdn/gigaset/capi.c | 75 ++++++------ drivers/isdn/hardware/avm/avmcard.h | 6 +- drivers/isdn/hardware/avm/b1.c | 54 +++++---- drivers/isdn/hardware/avm/b1dma.c | 71 ++++++------ drivers/isdn/hardware/avm/b1isa.c | 2 +- drivers/isdn/hardware/avm/b1pci.c | 4 +- drivers/isdn/hardware/avm/b1pcmcia.c | 2 +- drivers/isdn/hardware/avm/c4.c | 53 +++++---- drivers/isdn/hardware/avm/t1isa.c | 2 +- drivers/isdn/hardware/avm/t1pci.c | 2 +- drivers/isdn/hardware/eicon/capimain.c | 40 ++++--- drivers/isdn/hardware/eicon/diva_didd.c | 45 ++++---- drivers/isdn/hardware/eicon/divasi.c | 48 ++++---- drivers/isdn/hardware/eicon/divasproc.c | 198 ++++++++++++++------------ drivers/isdn/hysdn/hycapi.c | 56 ++++----- include/linux/isdn/capilli.h | 3 +- net/bluetooth/cmtp/capi.c | 37 +++--- 21 files changed, 411 insertions(+), 458 deletions(-) (limited to 'include/linux') diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 5fe8de5cc727..f172091fb7cd 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI @@ -149,10 +149,11 @@ char *(*procinfo)(struct capi_ctr *ctrlr) pointer to a callback function returning the entry for the device in the CAPI controller info table, /proc/capi/controller -read_proc_t *ctr_read_proc - pointer to the read_proc callback function for the device's proc file - system entry, /proc/capi/controllers/; will be called with a - pointer to the device's capi_ctr structure as the last (data) argument +const struct file_operations *proc_fops + pointers to callback functions for the device's proc file + system entry, /proc/capi/controllers/; pointer to the device's + capi_ctr structure is available from struct proc_dir_entry::data + which is available from struct inode. Note: Callback functions except send_message() are never called in interrupt context.
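Every driver touched below repeats one mechanical transformation, so it helps to see it once in isolation. A condensed sketch of the target pattern, using a hypothetical "foo" entry rather than code from any of the drivers:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* seq_file renders the whole entry; no manual off/count/*eof arithmetic */
static int foo_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "state : %d\n", 42);
	return 0;
}

static int foo_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, foo_proc_show, NULL);
}

static const struct file_operations foo_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= foo_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init foo_proc_init(void)
{
	/* proc_create() publishes the entry with ->proc_fops already set */
	return proc_create("foo", 0, NULL, &foo_proc_fops) ? 0 : -ENOMEM;
}

Because the seq_file core owns the buffering, the fragile off/count/*eof bookkeeping that each read_proc implementation open-coded simply disappears; that is where most of the deleted lines in the hunks below come from.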
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 65bf91e16a42..79f9364aded6 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -33,6 +33,7 @@ #endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */ #include #include +#include #include #include #include @@ -1407,114 +1408,84 @@ static void capinc_tty_exit(void) * /proc/capi/capi20: * minor applid nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt */ -static int proc_capidev_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capi20_proc_show(struct seq_file *m, void *v) { struct capidev *cdev; struct list_head *l; - int len = 0; read_lock(&capidev_list_lock); list_for_each(l, &capidev_list) { cdev = list_entry(l, struct capidev, list); - len += sprintf(page+len, "0 %d %lu %lu %lu %lu\n", + seq_printf(m, "0 %d %lu %lu %lu %lu\n", cdev->ap.applid, cdev->ap.nrecvctlpkt, cdev->ap.nrecvdatapkt, cdev->ap.nsentctlpkt, cdev->ap.nsentdatapkt); - if (len <= off) { - off -= len; - len = 0; - } else { - if (len-off > count) - goto endloop; - } } - -endloop: read_unlock(&capidev_list_lock); - if (len < count) - *eof = 1; - if (len > count) len = count; - if (len < 0) len = 0; - return len; + return 0; } +static int capi20_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capi20_proc_show, NULL); +} + +static const struct file_operations capi20_proc_fops = { + .owner = THIS_MODULE, + .open = capi20_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * /proc/capi/capi20ncci: * applid ncci */ -static int proc_capincci_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capi20ncci_proc_show(struct seq_file *m, void *v) { struct capidev *cdev; struct capincci *np; struct list_head *l; - int len = 0; read_lock(&capidev_list_lock); list_for_each(l, &capidev_list) { cdev = list_entry(l, struct capidev, list); for (np=cdev->nccis; np; np = np->next) { - len += sprintf(page+len, "%d 0x%x\n", + seq_printf(m, "%d 0x%x\n", cdev->ap.applid, np->ncci); - if (len <= off) { - off -= len; - len = 0; - } else { - if (len-off > count) - goto endloop; - } } } -endloop: read_unlock(&capidev_list_lock); - *start = page+off; - if (len < count) - *eof = 1; - if (len>count) len = count; - if (len<0) len = 0; - return len; + return 0; } -static struct procfsentries { - char *name; - mode_t mode; - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); - struct proc_dir_entry *procent; -} procfsentries[] = { - /* { "capi", S_IFDIR, 0 }, */ - { "capi/capi20", 0 , proc_capidev_read_proc }, - { "capi/capi20ncci", 0 , proc_capincci_read_proc }, +static int capi20ncci_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capi20ncci_proc_show, NULL); +} + +static const struct file_operations capi20ncci_proc_fops = { + .owner = THIS_MODULE, + .open = capi20ncci_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static void __init proc_init(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=0; i < nelem; i++) { - struct procfsentries *p = procfsentries + i; - p->procent = create_proc_entry(p->name, p->mode, NULL); - if (p->procent) p->procent->read_proc = p->read_proc; - } + proc_create("capi/capi20", 0, NULL, &capi20_proc_fops); + proc_create("capi/capi20ncci", 0, NULL, &capi20ncci_proc_fops); } static void __exit proc_exit(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for 
(i=nelem-1; i >= 0; i--) { - struct procfsentries *p = procfsentries + i; - if (p->procent) { - remove_proc_entry(p->name, NULL); - p->procent = NULL; - } - } + remove_proc_entry("capi/capi20", NULL); + remove_proc_entry("capi/capi20ncci", NULL); } /* -------- init function and module interface ---------------------- */ diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c index 66b7d7a86474..bb450152fb74 100644 --- a/drivers/isdn/capi/capidrv.c +++ b/drivers/isdn/capi/capidrv.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -2229,59 +2230,37 @@ static void lower_callback(unsigned int cmd, u32 contr, void *data) * /proc/capi/capidrv: * nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt */ -static int proc_capidrv_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capidrv_proc_show(struct seq_file *m, void *v) { - int len = 0; - - len += sprintf(page+len, "%lu %lu %lu %lu\n", + seq_printf(m, "%lu %lu %lu %lu\n", global.ap.nrecvctlpkt, global.ap.nrecvdatapkt, global.ap.nsentctlpkt, global.ap.nsentdatapkt); - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? count : len-off); + return 0; +} + +static int capidrv_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capidrv_proc_show, NULL); } -static struct procfsentries { - char *name; - mode_t mode; - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); - struct proc_dir_entry *procent; -} procfsentries[] = { - /* { "capi", S_IFDIR, 0 }, */ - { "capi/capidrv", 0 , proc_capidrv_read_proc }, +static const struct file_operations capidrv_proc_fops = { + .owner = THIS_MODULE, + .open = capidrv_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static void __init proc_init(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=0; i < nelem; i++) { - struct procfsentries *p = procfsentries + i; - p->procent = create_proc_entry(p->name, p->mode, NULL); - if (p->procent) p->procent->read_proc = p->read_proc; - } + proc_create("capi/capidrv", 0, NULL, &capidrv_proc_fops); } static void __exit proc_exit(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=nelem-1; i >= 0; i--) { - struct procfsentries *p = procfsentries + i; - if (p->procent) { - remove_proc_entry(p->name, NULL); - p->procent = NULL; - } - } + remove_proc_entry("capi/capidrv", NULL); } static int __init capidrv_init(void) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index dc506ab99cac..b0bacf377c18 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -490,13 +490,7 @@ attach_capi_ctr(struct capi_ctr *card) card->traceflag = showcapimsgs; sprintf(card->procfn, "capi/controllers/%d", card->cnr); - card->procent = create_proc_entry(card->procfn, 0, NULL); - if (card->procent) { - card->procent->read_proc = - (int (*)(char *,char **,off_t,int,int *,void *)) - card->ctr_read_proc; - card->procent->data = card; - } + card->procent = proc_create_data(card->procfn, 0, NULL, card->proc_fops, card); ncards++; printk(KERN_NOTICE "kcapi: Controller [%03d]: %s attached\n", diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c index 3f5cd06af104..6f0ae32906bf 100644 --- a/drivers/isdn/gigaset/capi.c +++ b/drivers/isdn/gigaset/capi.c @@ -13,6 +13,8 @@ #include "gigaset.h" #include +#include +#include #include #include #include @@ -2106,35 +2108,22 @@ 
static char *gigaset_procinfo(struct capi_ctr *ctr) return ctr->name; /* ToDo: more? */ } -/** - * gigaset_ctr_read_proc() - build controller proc file entry - * @page: buffer of PAGE_SIZE bytes for receiving the entry. - * @start: unused. - * @off: unused. - * @count: unused. - * @eof: unused. - * @ctr: controller descriptor structure. - * - * Return value: length of generated entry - */ -static int gigaset_ctr_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctr) +static int gigaset_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctr = m->private; struct cardstate *cs = ctr->driverdata; char *s; int i; - int len = 0; - len += sprintf(page+len, "%-16s %s\n", "name", ctr->name); - len += sprintf(page+len, "%-16s %s %s\n", "dev", + + seq_printf(m, "%-16s %s\n", "name", ctr->name); + seq_printf(m, "%-16s %s %s\n", "dev", dev_driver_string(cs->dev), dev_name(cs->dev)); - len += sprintf(page+len, "%-16s %d\n", "id", cs->myid); + seq_printf(m, "%-16s %d\n", "id", cs->myid); if (cs->gotfwver) - len += sprintf(page+len, "%-16s %d.%d.%d.%d\n", "firmware", + seq_printf(m, "%-16s %d.%d.%d.%d\n", "firmware", cs->fwver[0], cs->fwver[1], cs->fwver[2], cs->fwver[3]); - len += sprintf(page+len, "%-16s %d\n", "channels", - cs->channels); - len += sprintf(page+len, "%-16s %s\n", "onechannel", - cs->onechannel ? "yes" : "no"); + seq_printf(m, "%-16s %d\n", "channels", cs->channels); + seq_printf(m, "%-16s %s\n", "onechannel", cs->onechannel ? "yes" : "no"); switch (cs->mode) { case M_UNKNOWN: @@ -2152,7 +2141,7 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "%-16s %s\n", "mode", s); + seq_printf(m, "%-16s %s\n", "mode", s); switch (cs->mstate) { case MS_UNINITIALIZED: @@ -2176,25 +2165,21 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "%-16s %s\n", "mstate", s); + seq_printf(m, "%-16s %s\n", "mstate", s); - len += sprintf(page+len, "%-16s %s\n", "running", - cs->running ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "connected", - cs->connected ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "isdn_up", - cs->isdn_up ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "cidmode", - cs->cidmode ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "running", cs->running ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "connected", cs->connected ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "isdn_up", cs->isdn_up ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "cidmode", cs->cidmode ? 
"yes" : "no"); for (i = 0; i < cs->channels; i++) { - len += sprintf(page+len, "[%d]%-13s %d\n", i, "corrupted", + seq_printf(m, "[%d]%-13s %d\n", i, "corrupted", cs->bcs[i].corrupted); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "trans_down", + seq_printf(m, "[%d]%-13s %d\n", i, "trans_down", cs->bcs[i].trans_down); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "trans_up", + seq_printf(m, "[%d]%-13s %d\n", i, "trans_up", cs->bcs[i].trans_up); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "chstate", + seq_printf(m, "[%d]%-13s %d\n", i, "chstate", cs->bcs[i].chstate); switch (cs->bcs[i].proto2) { case L2_BITSYNC: @@ -2209,11 +2194,23 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "[%d]%-13s %s\n", i, "proto2", s); + seq_printf(m, "[%d]%-13s %s\n", i, "proto2", s); } - return len; + return 0; } +static int gigaset_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, gigaset_proc_show, PDE(inode)->data); +} + +static const struct file_operations gigaset_proc_fops = { + .owner = THIS_MODULE, + .open = gigaset_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static struct capi_driver capi_driver_gigaset = { .name = "gigaset", @@ -2256,7 +2253,7 @@ int gigaset_isdn_register(struct cardstate *cs, const char *isdnid) iif->ctr.release_appl = gigaset_release_appl; iif->ctr.send_message = gigaset_send_message; iif->ctr.procinfo = gigaset_procinfo; - iif->ctr.ctr_read_proc = gigaset_ctr_read_proc; + iif->ctr.proc_fops = &gigaset_proc_fops; INIT_LIST_HEAD(&iif->appls); skb_queue_head_init(&iif->sendqueue); atomic_set(&iif->sendqlen, 0); diff --git a/drivers/isdn/hardware/avm/avmcard.h b/drivers/isdn/hardware/avm/avmcard.h index d964f07e4a56..a70e8854461d 100644 --- a/drivers/isdn/hardware/avm/avmcard.h +++ b/drivers/isdn/hardware/avm/avmcard.h @@ -556,8 +556,7 @@ u16 b1_send_message(struct capi_ctr *ctrl, struct sk_buff *skb); void b1_parse_version(avmctrl_info *card); irqreturn_t b1_interrupt(int interrupt, void *devptr); -int b1ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl); +extern const struct file_operations b1ctl_proc_fops; avmcard_dmainfo *avmcard_dma_alloc(char *name, struct pci_dev *, long rsize, long ssize); @@ -577,7 +576,6 @@ void b1dma_register_appl(struct capi_ctr *ctrl, capi_register_params *rp); void b1dma_release_appl(struct capi_ctr *ctrl, u16 appl); u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb); -int b1dmactl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl); +extern const struct file_operations b1dmactl_proc_fops; #endif /* _AVMCARD_H_ */ diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c index a7c0083e78a7..c38fa0f4c729 100644 --- a/drivers/isdn/hardware/avm/b1.c +++ b/drivers/isdn/hardware/avm/b1.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -634,18 +636,17 @@ irqreturn_t b1_interrupt(int interrupt, void *devptr) } /* ------------------------------------------------------------- */ -int b1ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int b1ctl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; - len += sprintf(page+len, "%-16s %s\n", "name", 
card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s %s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -658,20 +659,20 @@ int b1ctl_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if (card->cardtype == avm_t1isa) - len += sprintf(page+len, "%-16s %d\n", "cardnr", card->cardnr); + seq_printf(m, "%-16s %d\n", "cardnr", card->cardnr); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -685,7 +686,7 @@ int b1ctl_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -693,16 +694,25 @@ int b1ctl_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? " leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? 
count : len-off); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); + + return 0; +} + +static int b1ctl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, b1ctl_proc_show, PDE(inode)->data); } +const struct file_operations b1ctl_proc_fops = { + .owner = THIS_MODULE, + .open = b1ctl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(b1ctl_proc_fops); + /* ------------------------------------------------------------- */ #ifdef CONFIG_PCI @@ -781,8 +791,6 @@ EXPORT_SYMBOL(b1_send_message); EXPORT_SYMBOL(b1_parse_version); EXPORT_SYMBOL(b1_interrupt); -EXPORT_SYMBOL(b1ctl_read_proc); - static int __init b1_init(void) { char *p; diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 0e84aaae43fd..124550d0dbf3 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -855,21 +857,20 @@ u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) /* ------------------------------------------------------------- */ -int b1dmactl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int b1dmactl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; u32 txoff, txlen, rxoff, rxlen, csr; unsigned long flags; - len += sprintf(page+len, "%-16s %s\n", "name", card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); - len += sprintf(page+len, "%-16s 0x%lx\n", "membase", card->membase); + seq_printf(m, "%-16s %s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s 0x%lx\n", "membase", card->membase); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -882,18 +883,18 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -907,7 +908,7 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -915,7 +916,7 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? 
" leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); spin_lock_irqsave(&card->lock, flags); @@ -930,27 +931,30 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, spin_unlock_irqrestore(&card->lock, flags); - len += sprintf(page+len, "%-16s 0x%lx\n", - "csr (cached)", (unsigned long)card->csr); - len += sprintf(page+len, "%-16s 0x%lx\n", - "csr", (unsigned long)csr); - len += sprintf(page+len, "%-16s %lu\n", - "txoff", (unsigned long)txoff); - len += sprintf(page+len, "%-16s %lu\n", - "txlen", (unsigned long)txlen); - len += sprintf(page+len, "%-16s %lu\n", - "rxoff", (unsigned long)rxoff); - len += sprintf(page+len, "%-16s %lu\n", - "rxlen", (unsigned long)rxlen); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? count : len-off); + seq_printf(m, "%-16s 0x%lx\n", "csr (cached)", (unsigned long)card->csr); + seq_printf(m, "%-16s 0x%lx\n", "csr", (unsigned long)csr); + seq_printf(m, "%-16s %lu\n", "txoff", (unsigned long)txoff); + seq_printf(m, "%-16s %lu\n", "txlen", (unsigned long)txlen); + seq_printf(m, "%-16s %lu\n", "rxoff", (unsigned long)rxoff); + seq_printf(m, "%-16s %lu\n", "rxlen", (unsigned long)rxlen); + + return 0; +} + +static int b1dmactl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, b1dmactl_proc_show, PDE(inode)->data); } +const struct file_operations b1dmactl_proc_fops = { + .owner = THIS_MODULE, + .open = b1dmactl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(b1dmactl_proc_fops); + /* ------------------------------------------------------------- */ EXPORT_SYMBOL(b1dma_reset); @@ -963,7 +967,6 @@ EXPORT_SYMBOL(b1dma_reset_ctr); EXPORT_SYMBOL(b1dma_register_appl); EXPORT_SYMBOL(b1dma_release_appl); EXPORT_SYMBOL(b1dma_send_message); -EXPORT_SYMBOL(b1dmactl_read_proc); static int __init b1dma_init(void) { diff --git a/drivers/isdn/hardware/avm/b1isa.c b/drivers/isdn/hardware/avm/b1isa.c index 6461a32bc838..ff5390546f92 100644 --- a/drivers/isdn/hardware/avm/b1isa.c +++ b/drivers/isdn/hardware/avm/b1isa.c @@ -121,7 +121,7 @@ static int b1isa_probe(struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1isa_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/b1pci.c b/drivers/isdn/hardware/avm/b1pci.c index 5b314a2c4049..c97e4315079d 100644 --- a/drivers/isdn/hardware/avm/b1pci.c +++ b/drivers/isdn/hardware/avm/b1pci.c @@ -112,7 +112,7 @@ static int b1pci_probe(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1pci_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); cinfo->capi_ctrl.owner = THIS_MODULE; @@ -251,7 +251,7 @@ static int b1pciv4_probe(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1dma_load_firmware; cinfo->capi_ctrl.reset_ctr = b1dma_reset_ctr; cinfo->capi_ctrl.procinfo = b1pciv4_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1dmactl_read_proc; + 
cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/b1pcmcia.c b/drivers/isdn/hardware/avm/b1pcmcia.c index 7740403b40e1..d6391e0afeea 100644 --- a/drivers/isdn/hardware/avm/b1pcmcia.c +++ b/drivers/isdn/hardware/avm/b1pcmcia.c @@ -108,7 +108,7 @@ static int b1pcmcia_add_card(unsigned int port, unsigned irq, cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1pcmcia_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 6833301a45fc..de6e6b311819 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -1062,19 +1064,18 @@ static char *c4_procinfo(struct capi_ctr *ctrl) return cinfo->infobuf; } -static int c4_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int c4_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; - len += sprintf(page+len, "%-16s %s\n", "name", card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); - len += sprintf(page+len, "%-16s 0x%lx\n", "membase", card->membase); + seq_printf(m, "%-16s %s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s 0x%lx\n", "membase", card->membase); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -1087,18 +1088,18 @@ static int c4_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -1112,7 +1113,7 @@ static int c4_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -1120,16 +1121,24 @@ static int c4_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? 
" leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? count : len-off); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); + + return 0; } +static int c4_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, c4_proc_show, PDE(inode)->data); +} + +static const struct file_operations c4_proc_fops = { + .owner = THIS_MODULE, + .open = c4_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* ------------------------------------------------------------- */ static int c4_add_card(struct capicardparams *p, struct pci_dev *dev, @@ -1201,7 +1210,7 @@ static int c4_add_card(struct capicardparams *p, struct pci_dev *dev, cinfo->capi_ctrl.load_firmware = c4_load_firmware; cinfo->capi_ctrl.reset_ctr = c4_reset_ctr; cinfo->capi_ctrl.procinfo = c4_procinfo; - cinfo->capi_ctrl.ctr_read_proc = c4_read_proc; + cinfo->capi_ctrl.proc_fops = &c4_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c index 1c53fd49adb6..baeeb3c2a3ee 100644 --- a/drivers/isdn/hardware/avm/t1isa.c +++ b/drivers/isdn/hardware/avm/t1isa.c @@ -429,7 +429,7 @@ static int t1isa_probe(struct pci_dev *pdev, int cardnr) cinfo->capi_ctrl.load_firmware = t1isa_load_firmware; cinfo->capi_ctrl.reset_ctr = t1isa_reset_ctr; cinfo->capi_ctrl.procinfo = t1isa_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/t1pci.c b/drivers/isdn/hardware/avm/t1pci.c index e6d298d75146..5a3f83098018 100644 --- a/drivers/isdn/hardware/avm/t1pci.c +++ b/drivers/isdn/hardware/avm/t1pci.c @@ -119,7 +119,7 @@ static int t1pci_add_card(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1dma_load_firmware; cinfo->capi_ctrl.reset_ctr = b1dma_reset_ctr; cinfo->capi_ctrl.procinfo = t1pci_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1dmactl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/eicon/capimain.c b/drivers/isdn/hardware/eicon/capimain.c index 98fcdfc7ca55..0f073cd73763 100644 --- a/drivers/isdn/hardware/eicon/capimain.c +++ b/drivers/isdn/hardware/eicon/capimain.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "os_capi.h" @@ -75,25 +76,32 @@ void diva_os_free_message_buffer(diva_os_message_buffer_s * dmb) /* * proc function for controller info */ -static int diva_ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int diva_ctl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; diva_card *card = (diva_card *) ctrl->driverdata; - int len = 0; - - len += sprintf(page + len, "%s\n", ctrl->name); - len += sprintf(page + len, "Serial No. : %s\n", ctrl->serial); - len += sprintf(page + len, "Id : %d\n", card->Id); - len += sprintf(page + len, "Channels : %d\n", card->d.channels); - - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); + + seq_printf(m, "%s\n", ctrl->name); + seq_printf(m, "Serial No. : %s\n", ctrl->serial); + seq_printf(m, "Id : %d\n", card->Id); + seq_printf(m, "Channels : %d\n", card->d.channels); + + return 0; +} + +static int diva_ctl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, diva_ctl_proc_show, NULL); } +static const struct file_operations diva_ctl_proc_fops = { + .owner = THIS_MODULE, + .open = diva_ctl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * set additional os settings in capi_ctr struct */ @@ -102,7 +110,7 @@ void diva_os_set_controller_struct(struct capi_ctr *ctrl) ctrl->driver_name = DRIVERLNAME; ctrl->load_firmware = NULL; ctrl->reset_ctr = NULL; - ctrl->ctr_read_proc = diva_ctl_read_proc; + ctrl->proc_fops = &diva_ctl_proc_fops; ctrl->owner = THIS_MODULE; } diff --git a/drivers/isdn/hardware/eicon/diva_didd.c b/drivers/isdn/hardware/eicon/diva_didd.c index 993b14cf1778..5d06a7437824 100644 --- a/drivers/isdn/hardware/eicon/diva_didd.c +++ b/drivers/isdn/hardware/eicon/diva_didd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "platform.h" @@ -62,39 +63,41 @@ static char *getrev(const char *revision) return rev; } -static int -proc_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int divadidd_proc_show(struct seq_file *m, void *v) { - int len = 0; char tmprev[32]; strcpy(tmprev, main_revision); - len += sprintf(page + len, "%s\n", DRIVERNAME); - len += sprintf(page + len, "name : %s\n", DRIVERLNAME); - len += sprintf(page + len, "release : %s\n", DRIVERRELEASE_DIDD); - len += sprintf(page + len, "build : %s(%s)\n", + seq_printf(m, "%s\n", DRIVERNAME); + seq_printf(m, "name : %s\n", DRIVERLNAME); + seq_printf(m, "release : %s\n", DRIVERRELEASE_DIDD); + seq_printf(m, "build : %s(%s)\n", diva_didd_common_code_build, DIVA_BUILD); - len += sprintf(page + len, "revision : %s\n", getrev(tmprev)); - - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); + seq_printf(m, "revision : %s\n", getrev(tmprev)); + + return 0; } +static int divadidd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, divadidd_proc_show, NULL); +} + +static const struct file_operations divadidd_proc_fops = { + .owner = THIS_MODULE, + .open = divadidd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int DIVA_INIT_FUNCTION create_proc(void) { proc_net_eicon = proc_mkdir("eicon", init_net.proc_net); if (proc_net_eicon) { - if ((proc_didd = - create_proc_entry(DRIVERLNAME, S_IFREG | S_IRUGO, - proc_net_eicon))) { - proc_didd->read_proc = proc_read; - } + proc_didd = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon, + &divadidd_proc_fops); return (1); } return (0); diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index 69e71ebe7841..f577719ab3fa 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -86,39 +87,40 @@ static void diva_um_timer_function(unsigned long data); extern struct proc_dir_entry *proc_net_eicon; static struct proc_dir_entry *um_idi_proc_entry = NULL; -static int -um_idi_proc_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int um_idi_proc_show(struct seq_file *m, void *v) { - int len = 0; char tmprev[32]; - len += sprintf(page + len, "%s\n", DRIVERNAME); - len += sprintf(page + len, "name : %s\n", DRIVERLNAME); - len += sprintf(page + len, "release : %s\n", DRIVERRELEASE_IDI); + seq_printf(m, "%s\n", DRIVERNAME); + seq_printf(m, "name : %s\n", DRIVERLNAME); + seq_printf(m, "release : %s\n", DRIVERRELEASE_IDI); strcpy(tmprev, main_revision); - len += sprintf(page + len, "revision : %s\n", getrev(tmprev)); - len += sprintf(page + len, "build : %s\n", DIVA_BUILD); - len += sprintf(page + len, "major : %d\n", major); - - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); + seq_printf(m, "revision : %s\n", getrev(tmprev)); + seq_printf(m, "build : %s\n", DIVA_BUILD); + seq_printf(m, "major : %d\n", major); + + return 0; +} + +static int um_idi_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, um_idi_proc_show, NULL); } +static const struct file_operations um_idi_proc_fops = { + .owner = THIS_MODULE, + .open = um_idi_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int DIVA_INIT_FUNCTION create_um_idi_proc(void) { - um_idi_proc_entry = create_proc_entry(DRIVERLNAME, - S_IFREG | S_IRUGO | S_IWUSR, - proc_net_eicon); + um_idi_proc_entry = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon, + &um_idi_proc_fops); if (!um_idi_proc_entry) return (0); - - um_idi_proc_entry->read_proc = um_idi_proc_read; - return (1); } diff --git a/drivers/isdn/hardware/eicon/divasproc.c b/drivers/isdn/hardware/eicon/divasproc.c index 040827288ec9..46d44a942624 100644 --- a/drivers/isdn/hardware/eicon/divasproc.c +++ b/drivers/isdn/hardware/eicon/divasproc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -141,14 +142,10 @@ void remove_divas_proc(void) } } -/* -** write group_optimization -*/ -static int -write_grp_opt(struct file *file, const char __user *buffer, unsigned long count, - void *data) +static ssize_t grp_opt_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; if ((count == 1) || (count == 2)) { @@ -172,14 +169,10 @@ write_grp_opt(struct file *file, const char __user *buffer, unsigned long count, return (-EINVAL); } -/* -** write dynamic_l1_down -*/ -static int -write_d_l1_down(struct file *file, const char __user *buffer, unsigned long count, - void *data) +static ssize_t d_l1_down_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; if ((count == 1) || (count == 2)) { @@ -203,63 +196,62 @@ write_d_l1_down(struct file *file, const char __user *buffer, unsigned long coun return (-EINVAL); } - -/* -** read dynamic_l1_down -*/ -static int -read_d_l1_down(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int d_l1_down_proc_show(struct seq_file *m, void *v) { - int len = 0; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += sprintf(page + len, "%s\n", + seq_printf(m, "%s\n", (IoAdapter->capi_cfg. cfg_1 & DIVA_XDI_CAPI_CFG_1_DYNAMIC_L1_ON) ? "1" : "0"); + return 0; +} - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); +static int d_l1_down_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, d_l1_down_proc_show, PDE(inode)->data); } -/* -** read group_optimization -*/ -static int -read_grp_opt(char *page, char **start, off_t off, int count, int *eof, - void *data) +static const struct file_operations d_l1_down_proc_fops = { + .owner = THIS_MODULE, + .open = d_l1_down_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = d_l1_down_proc_write, +}; + +static int grp_opt_proc_show(struct seq_file *m, void *v) { - int len = 0; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += sprintf(page + len, "%s\n", + seq_printf(m, "%s\n", (IoAdapter->capi_cfg. cfg_1 & DIVA_XDI_CAPI_CFG_1_GROUP_POPTIMIZATION_ON) ? "1" : "0"); + return 0; +} - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? count : len - off); +static int grp_opt_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, grp_opt_proc_show, PDE(inode)->data); } -/* -** info write -*/ -static int -info_write(struct file *file, const char __user *buffer, unsigned long count, - void *data) +static const struct file_operations grp_opt_proc_fops = { + .owner = THIS_MODULE, + .open = grp_opt_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = grp_opt_proc_write, +}; + +static ssize_t info_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; char c[4]; @@ -277,63 +269,46 @@ info_write(struct file *file, const char __user *buffer, unsigned long count, return (-EINVAL); } -/* -** info read -*/ -static int -info_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int info_proc_show(struct seq_file *m, void *v) { int i = 0; - int len = 0; char *p; char tmpser[16]; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += - sprintf(page + len, "Name : %s\n", - IoAdapter->Properties.Name); - len += sprintf(page + len, "DSP state : %08x\n", a->dsp_mask); - len += sprintf(page + len, "Channels : %02d\n", - IoAdapter->Properties.Channels); - len += sprintf(page + len, "E. max/used : %03d/%03d\n", + seq_printf(m, "Name : %s\n", IoAdapter->Properties.Name); + seq_printf(m, "DSP state : %08x\n", a->dsp_mask); + seq_printf(m, "Channels : %02d\n", IoAdapter->Properties.Channels); + seq_printf(m, "E. 
max/used : %03d/%03d\n", IoAdapter->e_max, IoAdapter->e_count); diva_get_vserial_number(IoAdapter, tmpser); - len += sprintf(page + len, "Serial : %s\n", tmpser); - len += - sprintf(page + len, "IRQ : %d\n", - IoAdapter->irq_info.irq_nr); - len += sprintf(page + len, "CardIndex : %d\n", a->CardIndex); - len += sprintf(page + len, "CardOrdinal : %d\n", a->CardOrdinal); - len += sprintf(page + len, "Controller : %d\n", a->controller); - len += sprintf(page + len, "Bus-Type : %s\n", + seq_printf(m, "Serial : %s\n", tmpser); + seq_printf(m, "IRQ : %d\n", IoAdapter->irq_info.irq_nr); + seq_printf(m, "CardIndex : %d\n", a->CardIndex); + seq_printf(m, "CardOrdinal : %d\n", a->CardOrdinal); + seq_printf(m, "Controller : %d\n", a->controller); + seq_printf(m, "Bus-Type : %s\n", (a->Bus == DIVAS_XDI_ADAPTER_BUS_ISA) ? "ISA" : "PCI"); - len += sprintf(page + len, "Port-Name : %s\n", a->port_name); + seq_printf(m, "Port-Name : %s\n", a->port_name); if (a->Bus == DIVAS_XDI_ADAPTER_BUS_PCI) { - len += - sprintf(page + len, "PCI-bus : %d\n", - a->resources.pci.bus); - len += - sprintf(page + len, "PCI-func : %d\n", - a->resources.pci.func); + seq_printf(m, "PCI-bus : %d\n", a->resources.pci.bus); + seq_printf(m, "PCI-func : %d\n", a->resources.pci.func); for (i = 0; i < 8; i++) { if (a->resources.pci.bar[i]) { - len += - sprintf(page + len, + seq_printf(m, "Mem / I/O %d : 0x%x / mapped : 0x%lx", i, a->resources.pci.bar[i], (unsigned long) a->resources. pci.addr[i]); if (a->resources.pci.length[i]) { - len += - sprintf(page + len, + seq_printf(m, " / length : %d", a->resources.pci. length[i]); } - len += sprintf(page + len, "\n"); + seq_putc(m, '\n'); } } } @@ -353,16 +328,25 @@ info_read(char *page, char **start, off_t off, int count, int *eof, } else { p = "ready"; } - len += sprintf(page + len, "State : %s\n", p); + seq_printf(m, "State : %s\n", p); - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); + return 0; +} + +static int info_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_proc_show, PDE(inode)->data); } +static const struct file_operations info_proc_fops = { + .owner = THIS_MODULE, + .open = info_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = info_proc_write, +}; + /* ** adapter proc init/de-init */ @@ -380,28 +364,20 @@ int create_adapter_proc(diva_os_xdi_adapter_t * a) return (0); a->proc_adapter_dir = (void *) de; - if (!(pe = - create_proc_entry(info_proc_name, S_IFREG | S_IRUGO | S_IWUSR, de))) + pe = proc_create_data(info_proc_name, S_IRUGO | S_IWUSR, de, + &info_proc_fops, a); + if (!pe) return (0); a->proc_info = (void *) pe; - pe->write_proc = info_write; - pe->read_proc = info_read; - pe->data = a; - if ((pe = create_proc_entry(grp_opt_proc_name, - S_IFREG | S_IRUGO | S_IWUSR, de))) { + pe = proc_create_data(grp_opt_proc_name, S_IRUGO | S_IWUSR, de, + &grp_opt_proc_fops, a); + if (pe) a->proc_grp_opt = (void *) pe; - pe->write_proc = write_grp_opt; - pe->read_proc = read_grp_opt; - pe->data = a; - } - if ((pe = create_proc_entry(d_l1_down_proc_name, - S_IFREG | S_IRUGO | S_IWUSR, de))) { + pe = proc_create_data(d_l1_down_proc_name, S_IRUGO | S_IWUSR, de, + &d_l1_down_proc_fops, a); + if (pe) a->proc_d_l1_down = (void *) pe; - pe->write_proc = write_d_l1_down; - pe->read_proc = read_d_l1_down; - pe->data = a; - } DBG_TRC(("proc entry %s created", tmp)); diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c index 4ffaa14b9fc4..fe874afa4f81 100644 --- a/drivers/isdn/hysdn/hycapi.c +++ b/drivers/isdn/hysdn/hycapi.c @@ -11,6 +11,8 @@ */ #include +#include +#include #include #include #include @@ -432,26 +434,16 @@ static u16 hycapi_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) return retval; } -/********************************************************************* -hycapi_read_proc - -Informations provided in the /proc/capi-entries. 
- -*********************************************************************/ - -static int hycapi_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int hycapi_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; hycapictrl_info *cinfo = (hycapictrl_info *)(ctrl->driverdata); hysdn_card *card = cinfo->card; - int len = 0; char *s; -#ifdef HYCAPI_PRINTFNAMES - printk(KERN_NOTICE "hycapi_read_proc\n"); -#endif - len += sprintf(page+len, "%-16s %s\n", "name", cinfo->cardname); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->iobase); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); + + seq_printf(m, "%-16s %s\n", "name", cinfo->cardname); + seq_printf(m, "%-16s 0x%x\n", "io", card->iobase); + seq_printf(m, "%-16s %d\n", "irq", card->irq); switch (card->brdtype) { case BD_PCCARD: s = "HYSDN Hycard"; break; @@ -461,24 +453,32 @@ static int hycapi_read_proc(char *page, char **start, off_t off, case BD_PLEXUS: s = "HYSDN Plexus30"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? 
count : len-off); + return 0; +} + +static int hycapi_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, hycapi_proc_show, PDE(inode)->data); } +static const struct file_operations hycapi_proc_fops = { + .owner = THIS_MODULE, + .open = hycapi_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /************************************************************** hycapi_load_firmware @@ -774,7 +774,7 @@ hycapi_capi_create(hysdn_card *card) ctrl->load_firmware = hycapi_load_firmware; ctrl->reset_ctr = hycapi_reset_ctr; ctrl->procinfo = hycapi_procinfo; - ctrl->ctr_read_proc = hycapi_read_proc; + ctrl->proc_fops = &hycapi_proc_fops; strcpy(ctrl->name, cinfo->cardname); ctrl->owner = THIS_MODULE; diff --git a/include/linux/isdn/capilli.h b/include/linux/isdn/capilli.h index 7acb87a44872..d3e5e9da0c82 100644 --- a/include/linux/isdn/capilli.h +++ b/include/linux/isdn/capilli.h @@ -50,8 +50,7 @@ struct capi_ctr { u16 (*send_message)(struct capi_ctr *, struct sk_buff *skb); char *(*procinfo)(struct capi_ctr *); - int (*ctr_read_proc)(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *card); + const struct file_operations *proc_fops; /* filled in before calling ready callback */ u8 manu[CAPI_MANUFACTURER_LEN]; /* CAPI_GET_MANUFACTURER */ diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 97f8d68d574d..3487cfe74aec 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -21,7 +21,8 @@ */ #include - +#include +#include #include #include #include @@ -516,33 +517,37 @@ static char *cmtp_procinfo(struct capi_ctr *ctrl) return "CAPI Message Transport Protocol"; } -static int cmtp_ctr_read_proc(char *page, char **start, off_t off, int count, int *eof, struct capi_ctr *ctrl) +static int cmtp_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *app; struct list_head *p, *n; - int len = 0; - len += sprintf(page + len, "%s\n\n", cmtp_procinfo(ctrl)); - len += sprintf(page + len, "addr %s\n", session->name); - len += sprintf(page + len, "ctrl %d\n", session->num); + seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); + seq_printf(m, "addr %s\n", session->name); + seq_printf(m, "ctrl %d\n", session->num); list_for_each_safe(p, n, &session->applications) { app = list_entry(p, struct cmtp_application, list); - len += sprintf(page + len, "appl %d -> %d\n", app->appl, app->mapping); + seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); } - if (off + count >= len) - *eof = 1; - - if (len < off) - return 0; - - *start = page + off; + return 0; +} - return ((count < len - off) ? 
count : len - off); +static int cmtp_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmtp_proc_show, PDE(inode)->data); } +static const struct file_operations cmtp_proc_fops = { + .owner = THIS_MODULE, + .open = cmtp_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; int cmtp_attach_device(struct cmtp_session *session) { @@ -582,7 +587,7 @@ int cmtp_attach_device(struct cmtp_session *session) session->ctrl.send_message = cmtp_send_message; session->ctrl.procinfo = cmtp_procinfo; - session->ctrl.ctr_read_proc = cmtp_ctr_read_proc; + session->ctrl.proc_fops = &cmtp_proc_fops; if (attach_capi_ctr(&session->ctrl) < 0) { BT_ERR("Can't attach new controller"); -- cgit v1.2.3 From ad72c347e56bf3a0231b9d686e17764157d2961c Mon Sep 17 00:00:00 2001 From: Christian Pellegrin Date: Thu, 14 Jan 2010 07:08:34 +0000 Subject: can: Proper ctrlmode handling for CAN devices This patch adds error checking of ctrlmode values for CAN devices. As an example, all available bits are implemented in the mcp251x driver. Signed-off-by: Christian Pellegrin Acked-by: Wolfgang Grandegger Signed-off-by: David S. Miller --- drivers/net/can/at91_can.c | 1 + drivers/net/can/bfin_can.c | 1 + drivers/net/can/dev.c | 2 ++ drivers/net/can/mcp251x.c | 11 ++++++++++- drivers/net/can/mscan/mscan.c | 1 + drivers/net/can/sja1000/sja1000.c | 1 + drivers/net/can/ti_hecc.c | 1 + drivers/net/can/usb/ems_usb.c | 1 + include/linux/can/dev.h | 1 + 9 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index f7287497ba6e..a2f29a38798a 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -1073,6 +1073,7 @@ static int __init at91_can_probe(struct platform_device *pdev) priv->can.bittiming_const = &at91_bittiming_const; priv->can.do_set_bittiming = at91_set_bittiming; priv->can.do_set_mode = at91_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; priv->reg_base = addr; priv->dev = dev; priv->clk = clk; diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c index 7e1926e79e98..bf7f9ba2d903 100644 --- a/drivers/net/can/bfin_can.c +++ b/drivers/net/can/bfin_can.c @@ -603,6 +603,7 @@ struct net_device *alloc_bfin_candev(void) priv->can.bittiming_const = &bfin_can_bittiming_const; priv->can.do_set_bittiming = bfin_can_set_bittiming; priv->can.do_set_mode = bfin_can_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; return dev; } diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index c1bb29f0322b..f08f1202ff00 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -592,6 +592,8 @@ static int can_changelink(struct net_device *dev, if (dev->flags & IFF_UP) return -EBUSY; cm = nla_data(data[IFLA_CAN_CTRLMODE]); + if (cm->flags & ~priv->ctrlmode_supported) + return -EOPNOTSUPP; priv->ctrlmode &= ~cm->mask; priv->ctrlmode |= cm->flags; } diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index afa2fa45fed9..bbe186b5a0ed 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -539,9 +539,14 @@ static void mcp251x_set_normal_mode(struct spi_device *spi) if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) { /* Put device into loopback mode */ mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_LOOPBACK); + } else if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) { + /* Put device into listen-only mode */ + mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_LISTEN_ONLY); } else { /* Put device
into normal mode */ - mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_NORMAL); + mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_NORMAL | + (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT ? + CANCTRL_OSM : 0)); /* Wait for the device to enter normal mode */ timeout = jiffies + HZ; @@ -948,6 +953,10 @@ static int __devinit mcp251x_can_probe(struct spi_device *spi) priv->can.bittiming_const = &mcp251x_bittiming_const; priv->can.do_set_mode = mcp251x_do_set_mode; priv->can.clock.freq = pdata->oscillator_frequency / 2; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES | + CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY; + if (pdata->model == CAN_MCP251X_MCP2515) + priv->can.ctrlmode_supported |= CAN_CTRLMODE_ONE_SHOT; priv->net = net; dev_set_drvdata(&spi->dev, priv); diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 40827c128b65..6b7dd578d417 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -686,6 +686,7 @@ struct net_device *alloc_mscandev(void) priv->can.bittiming_const = &mscan_bittiming_const; priv->can.do_set_bittiming = mscan_do_set_bittiming; priv->can.do_set_mode = mscan_do_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; for (i = 0; i < TX_QUEUE_SIZE; i++) { priv->tx_queue[i].id = i; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 345304d779b9..ace103a44833 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -567,6 +567,7 @@ struct net_device *alloc_sja1000dev(int sizeof_priv) priv->can.bittiming_const = &sja1000_bittiming_const; priv->can.do_set_bittiming = sja1000_set_bittiming; priv->can.do_set_mode = sja1000_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; if (sizeof_priv) priv->priv = (void *)priv + sizeof(struct sja1000_priv); diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 7d370e32a7a8..8332e242b0be 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -909,6 +909,7 @@ static int ti_hecc_probe(struct platform_device *pdev) priv->can.bittiming_const = &ti_hecc_bittiming_const; priv->can.do_set_mode = ti_hecc_do_set_mode; priv->can.do_get_state = ti_hecc_get_state; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; ndev->irq = irq->start; ndev->flags |= IFF_ECHO; diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index ddb17e256656..bfab283ba9b1 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -1022,6 +1022,7 @@ static int ems_usb_probe(struct usb_interface *intf, dev->can.bittiming_const = &ems_usb_bittiming_const; dev->can.do_set_bittiming = ems_usb_set_bittiming; dev->can.do_set_mode = ems_usb_set_mode; + dev->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; netdev->flags |= IFF_ECHO; /* we support local echo */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 7e7c98a3e908..c8c660a79f90 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,7 @@ struct can_priv { enum can_state state; u32 ctrlmode; + u32 ctrlmode_supported; int restart_ms; struct timer_list restart_timer; -- cgit v1.2.3 From 05c2828c72c4eabf62376adfe27bd24797621f62 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 Jan 2010 06:17:09 +0000 Subject: tun: export underlying socket Tun device looks similar to a packet socket in that both pass complete frames from/to userspace. 
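To make the packet-socket analogy concrete, here is a minimal sketch of how a kernel module might drive the exported socket once this patch is applied (hypothetical caller; tun_get_socket() and the msghdr-based send op are the interfaces this patch actually provides, while iov, len and all error handling are illustrative and elided):

	struct socket *sock = tun_get_socket(file);	/* file: an attached tun fd */
	struct msghdr msg = {
		.msg_iov    = iov,		/* complete frame to inject */
		.msg_iovlen = 1,
		.msg_flags  = MSG_DONTWAIT,
	};
	if (!IS_ERR(sock))
		sock->ops->sendmsg(NULL, sock, &msg, len);	/* one frame in */
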
This patch fills in enough fields in the socket underlying tun driver to support sendmsg/recvmsg operations, and message flags MSG_TRUNC and MSG_DONTWAIT, and exports access to this socket to modules. Regular read/write behaviour is unchanged. This way, code using raw sockets to inject packets into a physical device can support injecting packets into the host network stack almost without modification. The first user of this interface will be the vhost virtualization accelerator. Signed-off-by: Michael S. Tsirkin Acked-by: Herbert Xu Acked-by: David S. Miller Signed-off-by: David S. Miller --- drivers/net/tun.c | 101 +++++++++++++++++++++++++++++++++++++++---------- include/linux/if_tun.h | 14 +++++++ 2 files changed, 96 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2834a01bae24..5adb3d150552 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -144,6 +144,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; tfile->tun = tun; tun->tfile = tfile; + tun->socket.file = file; dev_hold(tun->dev); sock_hold(tun->socket.sk); atomic_inc(&tfile->count); @@ -158,6 +159,7 @@ static void __tun_detach(struct tun_struct *tun) /* Detach from net device */ netif_tx_lock_bh(tun->dev); tun->tfile = NULL; + tun->socket.file = NULL; netif_tx_unlock_bh(tun->dev); /* Drop read queue */ @@ -387,7 +389,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Notify and wake up reader process */ if (tun->flags & TUN_FASYNC) kill_fasync(&tun->fasync, SIGIO, POLL_IN); - wake_up_interruptible(&tun->socket.wait); + wake_up_interruptible_poll(&tun->socket.wait, POLLIN | + POLLRDNORM | POLLRDBAND); return NETDEV_TX_OK; drop: @@ -743,7 +746,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun, len = min_t(int, skb->len, len); skb_copy_datagram_const_iovec(skb, 0, iv, total, len); - total += len; + total += skb->len; tun->dev->stats.tx_packets++; tun->dev->stats.tx_bytes += len; @@ -751,34 +754,23 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun, return total; } -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_do_read(struct tun_struct *tun, + struct kiocb *iocb, const struct iovec *iv, + ssize_t len, int noblock) { - struct file *file = iocb->ki_filp; - struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); DECLARE_WAITQUEUE(wait, current); struct sk_buff *skb; - ssize_t len, ret = 0; - - if (!tun) - return -EBADFD; + ssize_t ret = 0; DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name); - len = iov_length(iv, count); - if (len < 0) { - ret = -EINVAL; - goto out; - } - add_wait_queue(&tun->socket.wait, &wait); while (len) { current->state = TASK_INTERRUPTIBLE; /* Read frames from the queue */ if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) { - if (file->f_flags & O_NONBLOCK) { + if (noblock) { ret = -EAGAIN; break; } @@ -805,6 +797,27 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, current->state = TASK_RUNNING; remove_wait_queue(&tun->socket.wait, &wait); + return ret; +} + +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct tun_file *tfile = file->private_data; + struct tun_struct *tun = __tun_get(tfile); + ssize_t len, ret; + + if (!tun) + return -EBADFD; + len = iov_length(iv, count); + if (len < 0) { + ret =
-EINVAL; + goto out; + } + + ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK); + ret = min_t(ssize_t, ret, len); out: tun_put(tun); return ret; @@ -847,7 +860,8 @@ static void tun_sock_write_space(struct sock *sk) return; if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | + POLLWRNORM | POLLWRBAND); tun = tun_sk(sk)->tun; kill_fasync(&tun->fasync, SIGIO, POLL_OUT); @@ -858,6 +872,37 @@ static void tun_sock_destruct(struct sock *sk) free_netdev(tun_sk(sk)->tun->dev); } +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct tun_struct *tun = container_of(sock, struct tun_struct, socket); + return tun_get_user(tun, m->msg_iov, total_len, + m->msg_flags & MSG_DONTWAIT); +} + +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len, + int flags) +{ + struct tun_struct *tun = container_of(sock, struct tun_struct, socket); + int ret; + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) + return -EINVAL; + ret = tun_do_read(tun, iocb, m->msg_iov, total_len, + flags & MSG_DONTWAIT); + if (ret > total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; + } + return ret; +} + +/* Ops structure to mimic raw sockets with tun */ +static const struct proto_ops tun_socket_ops = { + .sendmsg = tun_sendmsg, + .recvmsg = tun_recvmsg, +}; + static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, @@ -986,6 +1031,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) goto err_free_dev; init_waitqueue_head(&tun->socket.wait); + tun->socket.ops = &tun_socket_ops; sock_init_data(&tun->socket, sk); sk->sk_write_space = tun_sock_write_space; sk->sk_sndbuf = INT_MAX; @@ -1525,6 +1571,23 @@ static void tun_cleanup(void) rtnl_link_unregister(&tun_link_ops); } +/* Get an underlying socket object from tun file. Returns error unless file is + * attached to a device. The returned object works like a packet socket, it + * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for + * holding a reference to the file for as long as the socket is in use. */ +struct socket *tun_get_socket(struct file *file) +{ + struct tun_struct *tun; + if (file->f_op != &tun_fops) + return ERR_PTR(-EINVAL); + tun = tun_get(file); + if (!tun) + return ERR_PTR(-EBADFD); + tun_put(tun); + return &tun->socket; +} +EXPORT_SYMBOL_GPL(tun_get_socket); + module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 3f5fd523b49d..404abe00162c 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -86,4 +86,18 @@ struct tun_filter { __u8 addr[0][ETH_ALEN]; }; +#ifdef __KERNEL__ +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) +struct socket *tun_get_socket(struct file *); +#else +#include +#include +struct file; +struct socket; +static inline struct socket *tun_get_socket(struct file *f) +{ + return ERR_PTR(-EINVAL); +} +#endif /* CONFIG_TUN */ +#endif /* __KERNEL__ */ #endif /* __IF_TUN_H */ -- cgit v1.2.3 From 3a4d5c94e959359ece6d6b55045c3f046677f55c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 Jan 2010 06:17:27 +0000 Subject: vhost_net: a kernel-level virtio server What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. 
Existing virtio net code is used in the guest without modification.

There's similarity with vringfd, with some differences and reduced scope:
- uses eventfd for signalling
- structures can be moved around in memory at any time (good for migration, bug work-arounds in userspace)
- write logging is supported (good for migration)
- supports a memory table and not just an offset (needed for kvm)

Common virtio-related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part: this supplied me with witty comments I wouldn't be able to write myself.

What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how the guest performs hypercalls. Userspace hypervisors are supported as well as kvm.

How it works: Basically, we connect the virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tap device. The backend is also configured by userspace, including vlan/mac etc.

Status: This works for me, and I haven't seen any crashes. Compared to userspace, people reported improved latency (as I save up to 4 system calls per packet), as well as better bandwidth and CPU utilization.

Features that I plan to look at in the future:
- mergeable buffers
- zero copy
- scalability tuning: figure out the best threading model to use

Note on RCU usage (this is also documented in vhost.h, near private_pointer which is the value protected by this variant of RCU): what is happening is that the rcu_dereference() is being used in a workqueue item. The role of rcu_read_lock() is taken on by the start of execution of the workqueue item, of rcu_read_unlock() by the end of execution of the workqueue item, and of synchronize_rcu() by flush_workqueue()/flush_work(). In the future we might need to apply some gcc attribute or sparse annotation to the function passed to INIT_WORK(). Paul's ack below is for this RCU usage.

(Includes fixes by Alan Cox , David L Stevens , Chris Wright ) Acked-by: Rusty Russell Acked-by: Arnd Bergmann Acked-by: "Paul E. McKenney" Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- MAINTAINERS | 9 + arch/ia64/kvm/Kconfig | 1 + arch/powerpc/kvm/Kconfig | 1 + arch/s390/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + drivers/Makefile | 1 + drivers/vhost/Kconfig | 11 + drivers/vhost/Makefile | 2 + drivers/vhost/net.c | 661 ++++++++++++++++++++++++++ drivers/vhost/vhost.c | 1098 ++++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 161 +++++++ include/linux/Kbuild | 1 + include/linux/miscdevice.h | 1 + include/linux/vhost.h | 130 ++++++ 14 files changed, 2079 insertions(+) create mode 100644 drivers/vhost/Kconfig create mode 100644 drivers/vhost/Makefile create mode 100644 drivers/vhost/net.c create mode 100644 drivers/vhost/vhost.c create mode 100644 drivers/vhost/vhost.h create mode 100644 include/linux/vhost.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 745643b8c344..337dffbe9a47 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5803,6 +5803,15 @@ S: Maintained F: Documentation/filesystems/vfat.txt F: fs/fat/ +VIRTIO HOST (VHOST) +M: "Michael S.
Tsirkin" +L: kvm@vger.kernel.org +L: virtualization@lists.osdl.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/vhost/ +F: include/linux/vhost.h + VIA RHINE NETWORK DRIVER M: Roger Luethi S: Maintained diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index ef3e7be29caf..01c75797119c 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -47,6 +47,7 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 07703f72330e..e28841fbfb8d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -75,6 +75,7 @@ config KVM_E500 If unsure, say N. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 6ee55ae84ce2..a7251580891c 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -35,6 +35,7 @@ config KVM # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..3c4d0109ad20 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -65,6 +65,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/drivers/Makefile b/drivers/Makefile index 6ee53c7a57a1..81e36596b1e9 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_VHOST_NET) += vhost/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig new file mode 100644 index 000000000000..9f409f447aea --- /dev/null +++ b/drivers/vhost/Kconfig @@ -0,0 +1,11 @@ +config VHOST_NET + tristate "Host kernel accelerator for virtio net (EXPERIMENTAL)" + depends on NET && EVENTFD && EXPERIMENTAL + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest networking with virtio_net. Not to be confused with virtio_net + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_net. + diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile new file mode 100644 index 000000000000..72dd02050bb9 --- /dev/null +++ b/drivers/vhost/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_VHOST_NET) += vhost_net.o +vhost_net-y := vhost.o net.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 000000000000..4c8928319e1d --- /dev/null +++ b/drivers/vhost/net.c @@ -0,0 +1,661 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-net server in host kernel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "vhost.h" + +/* Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. 
*/ +#define VHOST_NET_WEIGHT 0x80000 + +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + +enum vhost_net_poll_state { + VHOST_NET_POLL_DISABLED = 0, + VHOST_NET_POLL_STARTED = 1, + VHOST_NET_POLL_STOPPED = 2, +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + struct vhost_poll poll[VHOST_NET_VQ_MAX]; + /* Tells us whether we are polling a socket for TX. + * We only do this when socket buffer fills up. + * Protected by tx vq lock. */ + enum vhost_net_poll_state tx_poll_state; +}; + +/* Pop first len bytes from iovec. Return number of segments used. */ +static int move_iovec_hdr(struct iovec *from, struct iovec *to, + size_t len, int iov_count) +{ + int seg = 0; + size_t size; + while (len && seg < iov_count) { + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + from->iov_len -= size; + from->iov_base += size; + len -= size; + ++from; + ++to; + ++seg; + } + return seg; +} + +/* Caller must have TX VQ lock */ +static void tx_poll_stop(struct vhost_net *net) +{ + if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) + return; + vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); + net->tx_poll_state = VHOST_NET_POLL_STOPPED; +} + +/* Caller must have TX VQ lock */ +static void tx_poll_start(struct vhost_net *net, struct socket *sock) +{ + if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) + return; + vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); + net->tx_poll_state = VHOST_NET_POLL_STARTED; +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_tx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + unsigned head, out, in, s; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + size_t len, total_len = 0; + int err, wmem; + size_t hdr_size; + struct socket *sock = rcu_dereference(vq->private_data); + if (!sock) + return; + + wmem = atomic_read(&sock->sk->sk_wmem_alloc); + if (wmem >= sock->sk->sk_sndbuf) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_disable_notify(vq); + + if (wmem < sock->sk->sk_sndbuf * 2) + tx_poll_stop(net); + hdr_size = vq->hdr_size; + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, + NULL, NULL); + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) { + wmem = atomic_read(&sock->sk->sk_wmem_alloc); + if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { + tx_poll_start(net, sock); + set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + break; + } + if (unlikely(vhost_enable_notify(vq))) { + vhost_disable_notify(vq); + continue; + } + break; + } + if (in) { + vq_err(vq, "Unexpected descriptor format for TX: " + "out %d, int %d\n", out, in); + break; + } + /* Skip header. TODO: support TSO. */ + s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); + msg.msg_iovlen = out; + len = iov_length(vq->iov, out); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for TX: " + "%zd expected %zd\n", + iov_length(vq->hdr, s), hdr_size); + break; + } + /* TODO: Check specific error and bomb out unless ENOBUFS? 
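+	 * One possible shape (hypothetical, not implemented by this patch):
+	 *
+	 *	if (err != -ENOBUFS)
+	 *		vq_err(vq, "sendmsg: errno %d\n", -err);
+	 *
+	 * -ENOBUFS would just mean the backend queue is full, in which
+	 * case the discard-and-repoll path below already retries.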
*/ + err = sock->ops->sendmsg(NULL, sock, &msg, len); + if (unlikely(err < 0)) { + vhost_discard_vq_desc(vq); + tx_poll_start(net, sock); + break; + } + if (err != len) + pr_err("Truncated TX packet: " + " len %d != %zd\n", err, len); + vhost_add_used_and_signal(&net->dev, vq, head, 0); + total_len += len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_rx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + unsigned head, out, in, log, s; + struct vhost_log *vq_log; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. */ + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + size_t len, total_len = 0; + int err; + size_t hdr_size; + struct socket *sock = rcu_dereference(vq->private_data); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_disable_notify(vq); + hdr_size = vq->hdr_size; + + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, + vq_log, &log); + /* OK, now we need to know about added descriptors. */ + if (head == vq->num) { + if (unlikely(vhost_enable_notify(vq))) { + /* They have slipped one in as we were + * doing that: check again. */ + vhost_disable_notify(vq); + continue; + } + /* Nothing new? Wait for eventfd to tell us + * they refilled. */ + break; + } + /* We don't need to be notified again. */ + if (out) { + vq_err(vq, "Unexpected descriptor format for RX: " + "out %d, int %d\n", + out, in); + break; + } + /* Skip header. TODO: support TSO/mergeable rx buffers. */ + s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); + msg.msg_iovlen = in; + len = iov_length(vq->iov, in); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for RX: " + "%zd expected %zd\n", + iov_length(vq->hdr, s), hdr_size); + break; + } + err = sock->ops->recvmsg(NULL, sock, &msg, + len, MSG_DONTWAIT | MSG_TRUNC); + /* TODO: Check specific error and bomb out unless EAGAIN? */ + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + /* TODO: Should check and handle checksum. 
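+	 * (Hypothetical direction, not implemented here: fill in
+	 * VIRTIO_NET_HDR_F_NEEDS_CSUM plus csum_start/csum_offset in the
+	 * header written to the guest below, instead of always sending a
+	 * blank virtio_net_hdr.)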
*/ + if (err > len) { + pr_err("Discarded truncated rx packet: " + " len %d > %zd\n", err, len); + vhost_discard_vq_desc(vq); + continue; + } + len = err; + err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); + if (err) { + vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", + vq->iov->iov_base, err); + break; + } + len += hdr_size; + vhost_add_used_and_signal(&net->dev, vq, head, len); + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, len); + total_len += len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +static void handle_tx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); +} + +static void handle_rx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); +} + +static void handle_tx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + handle_tx(net); +} + +static void handle_rx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + handle_rx(net); +} + +static int vhost_net_open(struct inode *inode, struct file *f) +{ + struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + int r; + if (!n) + return -ENOMEM; + n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; + r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + if (r < 0) { + kfree(n); + return r; + } + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + n->tx_poll_state = VHOST_NET_POLL_DISABLED; + + f->private_data = n; + + return 0; +} + +static void vhost_net_disable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + if (!vq->private_data) + return; + if (vq == n->vqs + VHOST_NET_VQ_TX) { + tx_poll_stop(n); + n->tx_poll_state = VHOST_NET_POLL_DISABLED; + } else + vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); +} + +static void vhost_net_enable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct socket *sock = vq->private_data; + if (!sock) + return; + if (vq == n->vqs + VHOST_NET_VQ_TX) { + n->tx_poll_state = VHOST_NET_POLL_STOPPED; + tx_poll_start(n, sock); + } else + vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); +} + +static struct socket *vhost_net_stop_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct socket *sock; + + mutex_lock(&vq->mutex); + sock = vq->private_data; + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, NULL); + mutex_unlock(&vq->mutex); + return sock; +} + +static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, + struct socket **rx_sock) +{ + *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); + *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); +} + +static void vhost_net_flush_vq(struct vhost_net *n, int index) +{ + vhost_poll_flush(n->poll + index); + vhost_poll_flush(&n->dev.vqs[index].poll); +} + +static void vhost_net_flush(struct vhost_net *n) +{ + vhost_net_flush_vq(n, VHOST_NET_VQ_TX); + vhost_net_flush_vq(n, VHOST_NET_VQ_RX); 
+} + +static int vhost_net_release(struct inode *inode, struct file *f) +{ + struct vhost_net *n = f->private_data; + struct socket *tx_sock; + struct socket *rx_sock; + + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + vhost_dev_cleanup(&n->dev); + if (tx_sock) + fput(tx_sock->file); + if (rx_sock) + fput(rx_sock->file); + /* We do an extra flush before freeing memory, + * since jobs can re-queue themselves. */ + vhost_net_flush(n); + kfree(n); + return 0; +} + +static struct socket *get_raw_socket(int fd) +{ + struct { + struct sockaddr_ll sa; + char buf[MAX_ADDR_LEN]; + } uaddr; + int uaddr_len = sizeof uaddr, r; + struct socket *sock = sockfd_lookup(fd, &r); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + /* Parameter checking */ + if (sock->sk->sk_type != SOCK_RAW) { + r = -ESOCKTNOSUPPORT; + goto err; + } + + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, + &uaddr_len, 0); + if (r) + goto err; + + if (uaddr.sa.sll_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto err; + } + return sock; +err: + fput(sock->file); + return ERR_PTR(r); +} + +static struct socket *get_tun_socket(int fd) +{ + struct file *file = fget(fd); + struct socket *sock; + if (!file) + return ERR_PTR(-EBADF); + sock = tun_get_socket(file); + if (IS_ERR(sock)) + fput(file); + return sock; +} + +static struct socket *get_socket(int fd) +{ + struct socket *sock; + /* special case to disable backend */ + if (fd == -1) + return NULL; + sock = get_raw_socket(fd); + if (!IS_ERR(sock)) + return sock; + sock = get_tun_socket(fd); + if (!IS_ERR(sock)) + return sock; + return ERR_PTR(-ENOTSOCK); +} + +static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) +{ + struct socket *sock, *oldsock; + struct vhost_virtqueue *vq; + int r; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto err; + + if (index >= VHOST_NET_VQ_MAX) { + r = -ENOBUFS; + goto err; + } + vq = n->vqs + index; + mutex_lock(&vq->mutex); + + /* Verify that ring has been setup correctly. */ + if (!vhost_vq_access_ok(vq)) { + r = -EFAULT; + goto err; + } + sock = get_socket(fd); + if (IS_ERR(sock)) { + r = PTR_ERR(sock); + goto err; + } + + /* start polling new socket */ + oldsock = vq->private_data; + if (sock == oldsock) + goto done; + + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, sock); + vhost_net_enable_vq(n, vq); + mutex_unlock(&vq->mutex); +done: + if (oldsock) { + vhost_net_flush_vq(n, index); + fput(oldsock->file); + } +err: + mutex_unlock(&n->dev.mutex); + return r; +} + +static long vhost_net_reset_owner(struct vhost_net *n) +{ + struct socket *tx_sock = NULL; + struct socket *rx_sock = NULL; + long err; + mutex_lock(&n->dev.mutex); + err = vhost_dev_check_owner(&n->dev); + if (err) + goto done; + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + err = vhost_dev_reset_owner(&n->dev); +done: + mutex_unlock(&n->dev.mutex); + if (tx_sock) + fput(tx_sock->file); + if (rx_sock) + fput(rx_sock->file); + return err; +} + +static int vhost_net_set_features(struct vhost_net *n, u64 features) +{ + size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 
+ sizeof(struct virtio_net_hdr) : 0; + int i; + mutex_lock(&n->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&n->dev)) { + mutex_unlock(&n->dev.mutex); + return -EFAULT; + } + n->dev.acked_features = features; + smp_wmb(); + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + mutex_lock(&n->vqs[i].mutex); + n->vqs[i].hdr_size = hdr_size; + mutex_unlock(&n->vqs[i].mutex); + } + vhost_net_flush(n); + mutex_unlock(&n->dev.mutex); + return 0; +} + +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_net *n = f->private_data; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + struct vhost_vring_file backend; + u64 features; + int r; + switch (ioctl) { + case VHOST_NET_SET_BACKEND: + r = copy_from_user(&backend, argp, sizeof backend); + if (r < 0) + return r; + return vhost_net_set_backend(n, backend.index, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_FEATURES; + return copy_to_user(featurep, &features, sizeof features); + case VHOST_SET_FEATURES: + r = copy_from_user(&features, featurep, sizeof features); + if (r < 0) + return r; + if (features & ~VHOST_FEATURES) + return -EOPNOTSUPP; + return vhost_net_set_features(n, features); + case VHOST_RESET_OWNER: + return vhost_net_reset_owner(n); + default: + mutex_lock(&n->dev.mutex); + r = vhost_dev_ioctl(&n->dev, ioctl, arg); + vhost_net_flush(n); + mutex_unlock(&n->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +const static struct file_operations vhost_net_fops = { + .owner = THIS_MODULE, + .release = vhost_net_release, + .unlocked_ioctl = vhost_net_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_net_compat_ioctl, +#endif + .open = vhost_net_open, +}; + +static struct miscdevice vhost_net_misc = { + VHOST_NET_MINOR, + "vhost-net", + &vhost_net_fops, +}; + +int vhost_net_init(void) +{ + int r = vhost_init(); + if (r) + goto err_init; + r = misc_register(&vhost_net_misc); + if (r) + goto err_reg; + return 0; +err_reg: + vhost_cleanup(); +err_init: + return r; + +} +module_init(vhost_net_init); + +void vhost_net_exit(void) +{ + misc_deregister(&vhost_net_misc); + vhost_cleanup(); +} +module_exit(vhost_net_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c new file mode 100644 index 000000000000..c8c25dbc5857 --- /dev/null +++ b/drivers/vhost/vhost.c @@ -0,0 +1,1098 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin + * + * Inspiration, some code, and most witty comments come from + * Documentation/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Generic code for virtio server in host kernel. 
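+ *
+ * A rough sketch of the userspace setup sequence this code expects
+ * (illustrative only; ordering simplified, error handling elided, and
+ * VHOST_NET_SET_BACKEND lives in net.c rather than here):
+ *
+ *	vhost = open("/dev/vhost-net", O_RDWR);
+ *	ioctl(vhost, VHOST_SET_OWNER, NULL);
+ *	ioctl(vhost, VHOST_SET_MEM_TABLE, &mem);
+ *	ioctl(vhost, VHOST_SET_VRING_NUM, &state);
+ *	ioctl(vhost, VHOST_SET_VRING_BASE, &state);
+ *	ioctl(vhost, VHOST_SET_VRING_ADDR, &addr);
+ *	ioctl(vhost, VHOST_SET_VRING_KICK, &file);	/* eventfd */
+ *	ioctl(vhost, VHOST_SET_VRING_CALL, &file);	/* eventfd */
+ *	ioctl(vhost, VHOST_NET_SET_BACKEND, &backend);	/* tap or raw fd */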
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "vhost.h" + +enum { + VHOST_MEMORY_MAX_NREGIONS = 64, + VHOST_MEMORY_F_LOG = 0x1, +}; + +static struct workqueue_struct *vhost_workqueue; + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + poll = container_of(pt, struct vhost_poll, table); + + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll; + poll = container_of(wait, struct vhost_poll, wait); + if (!((unsigned long)key & poll->mask)) + return 0; + + queue_work(vhost_workqueue, &poll->work); + return 0; +} + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask) +{ + INIT_WORK(&poll->work, func); + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; +} + +/* Start polling a file. We add ourselves to file's wait queue. The caller must + * keep a reference to a file until after vhost_poll_stop is called. */ +void vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + unsigned long mask; + mask = file->f_op->poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); +} + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + remove_wait_queue(poll->wqh, &poll->wait); +} + +/* Flush any work that has been scheduled. When calling this, don't hold any + * locks that are also used by the callback. */ +void vhost_poll_flush(struct vhost_poll *poll) +{ + flush_work(&poll->work); +} + +void vhost_poll_queue(struct vhost_poll *poll) +{ + queue_work(vhost_workqueue, &poll->work); +} + +static void vhost_vq_reset(struct vhost_dev *dev, + struct vhost_virtqueue *vq) +{ + vq->num = 1; + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + vq->last_avail_idx = 0; + vq->avail_idx = 0; + vq->last_used_idx = 0; + vq->used_flags = 0; + vq->used_flags = 0; + vq->log_used = false; + vq->log_addr = -1ull; + vq->hdr_size = 0; + vq->private_data = NULL; + vq->log_base = NULL; + vq->error_ctx = NULL; + vq->error = NULL; + vq->kick = NULL; + vq->call_ctx = NULL; + vq->call = NULL; +} + +long vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue *vqs, int nvqs) +{ + int i; + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + dev->log_ctx = NULL; + dev->log_file = NULL; + dev->memory = NULL; + dev->mm = NULL; + + for (i = 0; i < dev->nvqs; ++i) { + dev->vqs[i].dev = dev; + mutex_init(&dev->vqs[i].mutex); + vhost_vq_reset(dev, dev->vqs + i); + if (dev->vqs[i].handle_kick) + vhost_poll_init(&dev->vqs[i].poll, + dev->vqs[i].handle_kick, + POLLIN); + } + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_check_owner(struct vhost_dev *dev) +{ + /* Are you the owner? If not, I don't think you mean to do that */ + return dev->mm == current->mm ? 0 : -EPERM; +} + +/* Caller should have device mutex */ +static long vhost_dev_set_owner(struct vhost_dev *dev) +{ + /* Is there an owner already? 
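+	 * (The owner is simply the first process to issue VHOST_SET_OWNER:
+	 * its mm is pinned with get_task_mm() below and dropped again in
+	 * vhost_dev_cleanup().)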
*/ + if (dev->mm) + return -EBUSY; + /* No owner, become one */ + dev->mm = get_task_mm(current); + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_reset_owner(struct vhost_dev *dev) +{ + struct vhost_memory *memory; + + /* Restore memory to default empty mapping. */ + memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); + if (!memory) + return -ENOMEM; + + vhost_dev_cleanup(dev); + + memory->nregions = 0; + dev->memory = memory; + return 0; +} + +/* Caller should have device mutex */ +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { + vhost_poll_stop(&dev->vqs[i].poll); + vhost_poll_flush(&dev->vqs[i].poll); + } + if (dev->vqs[i].error_ctx) + eventfd_ctx_put(dev->vqs[i].error_ctx); + if (dev->vqs[i].error) + fput(dev->vqs[i].error); + if (dev->vqs[i].kick) + fput(dev->vqs[i].kick); + if (dev->vqs[i].call_ctx) + eventfd_ctx_put(dev->vqs[i].call_ctx); + if (dev->vqs[i].call) + fput(dev->vqs[i].call); + vhost_vq_reset(dev, dev->vqs + i); + } + if (dev->log_ctx) + eventfd_ctx_put(dev->log_ctx); + dev->log_ctx = NULL; + if (dev->log_file) + fput(dev->log_file); + dev->log_file = NULL; + /* No one will access memory at this point */ + kfree(dev->memory); + dev->memory = NULL; + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} + +static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +{ + u64 a = addr / VHOST_PAGE_SIZE / 8; + /* Make sure 64 bit math will not overflow. */ + if (a > ULONG_MAX - (unsigned long)log_base || + a + (unsigned long)log_base > ULONG_MAX) + return -EFAULT; + + return access_ok(VERIFY_WRITE, log_base + a, + (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); +} + +/* Caller should have vq mutex and device mutex. */ +static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem, + int log_all) +{ + int i; + for (i = 0; i < mem->nregions; ++i) { + struct vhost_memory_region *m = mem->regions + i; + unsigned long a = m->userspace_addr; + if (m->memory_size > ULONG_MAX) + return 0; + else if (!access_ok(VERIFY_WRITE, (void __user *)a, + m->memory_size)) + return 0; + else if (log_all && !log_access_ok(log_base, + m->guest_phys_addr, + m->memory_size)) + return 0; + } + return 1; +} + +/* Can we switch to this memory table? */ +/* Caller should have device mutex but not vq mutex */ +static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, + int log_all) +{ + int i; + for (i = 0; i < d->nvqs; ++i) { + int ok; + mutex_lock(&d->vqs[i].mutex); + /* If ring is inactive, will check when it's enabled. */ + if (d->vqs[i].private_data) + ok = vq_memory_access_ok(d->vqs[i].log_base, mem, + log_all); + else + ok = 1; + mutex_unlock(&d->vqs[i].mutex); + if (!ok) + return 0; + } + return 1; +} + +static int vq_access_ok(unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + return access_ok(VERIFY_READ, desc, num * sizeof *desc) && + access_ok(VERIFY_READ, avail, + sizeof *avail + num * sizeof *avail->ring) && + access_ok(VERIFY_WRITE, used, + sizeof *used + num * sizeof *used->ring); +} + +/* Can we log writes? */ +/* Caller should have device mutex but not vq mutex */ +int vhost_log_access_ok(struct vhost_dev *dev) +{ + return memory_access_ok(dev, dev->memory, 1); +} + +/* Verify access for write logging. 
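+ * In other words: log_base must be writable for every region in the
+ * memory table and, when the used ring itself is logged, for the used
+ * ring's log range too (a summary of the two checks below).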
*/ +/* Caller should have vq mutex and device mutex */ +static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) +{ + return vq_memory_access_ok(log_base, vq->dev->memory, + vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && + (!vq->log_used || log_access_ok(log_base, vq->log_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring)); +} + +/* Can we start vq? */ +/* Caller should have vq mutex and device mutex */ +int vhost_vq_access_ok(struct vhost_virtqueue *vq) +{ + return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) && + vq_log_access_ok(vq, vq->log_base); +} + +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) +{ + struct vhost_memory mem, *newmem, *oldmem; + unsigned long size = offsetof(struct vhost_memory, regions); + long r; + r = copy_from_user(&mem, m, size); + if (r) + return r; + if (mem.padding) + return -EOPNOTSUPP; + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) + return -E2BIG; + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); + if (!newmem) + return -ENOMEM; + + memcpy(newmem, &mem, size); + r = copy_from_user(newmem->regions, m->regions, + mem.nregions * sizeof *m->regions); + if (r) { + kfree(newmem); + return r; + } + + if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) + return -EFAULT; + oldmem = d->memory; + rcu_assign_pointer(d->memory, newmem); + synchronize_rcu(); + kfree(oldmem); + return 0; +} + +static int init_used(struct vhost_virtqueue *vq, + struct vring_used __user *used) +{ + int r = put_user(vq->used_flags, &used->flags); + if (r) + return r; + return get_user(vq->last_used_idx, &used->idx); +} + +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) +{ + struct file *eventfp, *filep = NULL, + *pollstart = NULL, *pollstop = NULL; + struct eventfd_ctx *ctx = NULL; + u32 __user *idxp = argp; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + struct vhost_vring_file f; + struct vhost_vring_addr a; + u32 idx; + long r; + + r = get_user(idx, idxp); + if (r < 0) + return r; + if (idx > d->nvqs) + return -ENOBUFS; + + vq = d->vqs + idx; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + /* Resizing ring with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { + r = -EINVAL; + break; + } + vq->num = s.num; + break; + case VHOST_SET_VRING_BASE: + /* Moving base with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; + /* Forget the cached index value. */ + vq->avail_idx = vq->last_avail_idx; + break; + case VHOST_GET_VRING_BASE: + s.index = idx; + s.num = vq->last_avail_idx; + r = copy_to_user(argp, &s, sizeof s); + break; + case VHOST_SET_VRING_ADDR: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { + r = -EOPNOTSUPP; + break; + } + /* For 32bit, verify that the top 32bits of the user + data are set to zero. 
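+	 * Otherwise the (unsigned long) casts below would silently
+	 * truncate a 64-bit address handed in through the 32-bit compat
+	 * ioctl path.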
*/ + if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || + (u64)(unsigned long)a.used_user_addr != a.used_user_addr || + (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { + r = -EFAULT; + break; + } + if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) || + (a.used_user_addr & (sizeof *vq->used->ring - 1)) || + (a.log_guest_addr & (sizeof *vq->used->ring - 1))) { + r = -EINVAL; + break; + } + + /* We only verify access here if backend is configured. + * If it is not, we don't as size might not have been setup. + * We will verify when backend is configured. */ + if (vq->private_data) { + if (!vq_access_ok(vq->num, + (void __user *)(unsigned long)a.desc_user_addr, + (void __user *)(unsigned long)a.avail_user_addr, + (void __user *)(unsigned long)a.used_user_addr)) { + r = -EINVAL; + break; + } + + /* Also validate log access for used ring if enabled. */ + if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && + !log_access_ok(vq->log_base, a.log_guest_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring)) { + r = -EINVAL; + break; + } + } + + r = init_used(vq, (struct vring_used __user *)(unsigned long) + a.used_user_addr); + if (r) + break; + vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); + vq->desc = (void __user *)(unsigned long)a.desc_user_addr; + vq->avail = (void __user *)(unsigned long)a.avail_user_addr; + vq->log_addr = a.log_guest_addr; + vq->used = (void __user *)(unsigned long)a.used_user_addr; + break; + case VHOST_SET_VRING_KICK: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->kick) { + pollstop = filep = vq->kick; + pollstart = vq->kick = eventfp; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_CALL: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->call) { + filep = vq->call; + ctx = vq->call_ctx; + vq->call = eventfp; + vq->call_ctx = eventfp ? + eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_ERR: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->error) { + filep = vq->error; + vq->error = eventfp; + ctx = vq->error_ctx; + vq->error_ctx = eventfp ? 
+ eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + default: + r = -ENOIOCTLCMD; + } + + if (pollstop && vq->handle_kick) + vhost_poll_stop(&vq->poll); + + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + + if (pollstart && vq->handle_kick) + vhost_poll_start(&vq->poll, vq->kick); + + mutex_unlock(&vq->mutex); + + if (pollstop && vq->handle_kick) + vhost_poll_flush(&vq->poll); + return r; +} + +/* Caller must have device mutex */ +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct file *eventfp, *filep = NULL; + struct eventfd_ctx *ctx = NULL; + u64 p; + long r; + int i, fd; + + /* If you are not the owner, you can become one */ + if (ioctl == VHOST_SET_OWNER) { + r = vhost_dev_set_owner(d); + goto done; + } + + /* You must be the owner to do anything else */ + r = vhost_dev_check_owner(d); + if (r) + goto done; + + switch (ioctl) { + case VHOST_SET_MEM_TABLE: + r = vhost_set_memory(d, argp); + break; + case VHOST_SET_LOG_BASE: + r = copy_from_user(&p, argp, sizeof p); + if (r < 0) + break; + if ((u64)(unsigned long)p != p) { + r = -EFAULT; + break; + } + for (i = 0; i < d->nvqs; ++i) { + struct vhost_virtqueue *vq; + void __user *base = (void __user *)(unsigned long)p; + vq = d->vqs + i; + mutex_lock(&vq->mutex); + /* If ring is inactive, will check when it's enabled. */ + if (vq->private_data && !vq_log_access_ok(vq, base)) + r = -EFAULT; + else + vq->log_base = base; + mutex_unlock(&vq->mutex); + } + break; + case VHOST_SET_LOG_FD: + r = get_user(fd, (int __user *)argp); + if (r < 0) + break; + eventfp = fd == -1 ? NULL : eventfd_fget(fd); + if (IS_ERR(eventfp)) { + r = PTR_ERR(eventfp); + break; + } + if (eventfp != d->log_file) { + filep = d->log_file; + ctx = d->log_ctx; + d->log_ctx = eventfp ? + eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i].mutex); + d->vqs[i].log_ctx = d->log_ctx; + mutex_unlock(&d->vqs[i].mutex); + } + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + break; + default: + r = vhost_set_vring(d, ioctl, argp); + break; + } +done: + return r; +} + +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, + __u64 addr, __u32 len) +{ + struct vhost_memory_region *reg; + int i; + /* linear search is not brilliant, but we really have on the order of 6 + * regions in practice */ + for (i = 0; i < mem->nregions; ++i) { + reg = mem->regions + i; + if (reg->guest_phys_addr <= addr && + reg->guest_phys_addr + reg->memory_size - 1 >= addr) + return reg; + } + return NULL; +} + +/* TODO: This is really inefficient. We need something like get_user() + * (instruction directly accesses the data, with an exception table entry + * returning -EFAULT). See Documentation/x86/exception-tables.txt. 
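+ * Until that exists, we pin the page with get_user_pages_fast() and set the
+ * bit through a kmap_atomic() mapping.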
+ */
+static int set_bit_to_user(int nr, void __user *addr)
+{
+	unsigned long log = (unsigned long)addr;
+	struct page *page;
+	void *base;
+	int bit = nr + (log % PAGE_SIZE) * 8;
+	int r;
+	r = get_user_pages_fast(log, 1, 1, &page);
+	if (r < 0)
+		return r;
+	BUG_ON(r != 1);
+	base = kmap_atomic(page, KM_USER0);
+	set_bit(bit, base);
+	kunmap_atomic(base, KM_USER0);
+	set_page_dirty_lock(page);
+	put_page(page);
+	return 0;
+}
+
+static int log_write(void __user *log_base,
+		     u64 write_address, u64 write_length)
+{
+	int r;
+	if (!write_length)
+		return 0;
+	write_address /= VHOST_PAGE_SIZE;
+	for (;;) {
+		u64 base = (u64)(unsigned long)log_base;
+		u64 log = base + write_address / 8;
+		int bit = write_address % 8;
+		if ((u64)(unsigned long)log != log)
+			return -EFAULT;
+		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
+		if (r < 0)
+			return r;
+		if (write_length <= VHOST_PAGE_SIZE)
+			break;
+		write_length -= VHOST_PAGE_SIZE;
+		write_address += VHOST_PAGE_SIZE;
+	}
+	return r;
+}
+
+int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
+		    unsigned int log_num, u64 len)
+{
+	int i, r;
+
+	/* Make sure data written is seen before log. */
+	wmb();
+	for (i = 0; i < log_num; ++i) {
+		u64 l = min(log[i].len, len);
+		r = log_write(vq->log_base, log[i].addr, l);
+		if (r < 0)
+			return r;
+		len -= l;
+		if (!len)
+			return 0;
+	}
+	if (vq->log_ctx)
+		eventfd_signal(vq->log_ctx, 1);
+	/* Length written exceeds what we have stored. This is a bug. */
+	BUG();
+	return 0;
+}
+
+static int translate_desc(struct vhost_dev *dev, u64 addr, u32 len,
+			  struct iovec iov[], int iov_size)
+{
+	const struct vhost_memory_region *reg;
+	struct vhost_memory *mem;
+	struct iovec *_iov;
+	u64 s = 0;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	mem = rcu_dereference(dev->memory);
+	while ((u64)len > s) {
+		u64 size;
+		if (ret >= iov_size) {
+			ret = -ENOBUFS;
+			break;
+		}
+		reg = find_region(mem, addr, len);
+		if (!reg) {
+			ret = -EFAULT;
+			break;
+		}
+		_iov = iov + ret;
+		size = reg->memory_size - addr + reg->guest_phys_addr;
+		_iov->iov_len = min((u64)len - s, size);
+		_iov->iov_base = (void *)(unsigned long)
+			(reg->userspace_addr + addr - reg->guest_phys_addr);
+		s += size;
+		addr += size;
+		++ret;
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain,
+ * or -1U if we're at the end. */
+static unsigned next_desc(struct vring_desc *desc)
+{
+	unsigned int next;
+
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!(desc->flags & VRING_DESC_F_NEXT))
+		return -1U;
+
+	/* Check they're not leading us off end of descriptors. */
+	next = desc->next;
+	/* Make sure compiler knows to grab that: we don't want it changing! */
+	/* We will use the result as an index in an array, so most
+	 * architectures only need a compiler barrier here.
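(On everything except Alpha, read_barrier_depends() expands to nothing.)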
*/ + read_barrier_depends(); + + return next; +} + +static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num, + struct vring_desc *indirect) +{ + struct vring_desc desc; + unsigned int i = 0, count, found = 0; + int ret; + + /* Sanity check */ + if (indirect->len % sizeof desc) { + vq_err(vq, "Invalid length in indirect descriptor: " + "len 0x%llx not multiple of 0x%zx\n", + (unsigned long long)indirect->len, + sizeof desc); + return -EINVAL; + } + + ret = translate_desc(dev, indirect->addr, indirect->len, vq->indirect, + ARRAY_SIZE(vq->indirect)); + if (ret < 0) { + vq_err(vq, "Translation failure %d in indirect.\n", ret); + return ret; + } + + /* We will use the result as an address to read from, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + count = indirect->len / sizeof desc; + /* Buffers are chained via a 16 bit next field, so + * we can have at most 2^16 of these. */ + if (count > USHORT_MAX + 1) { + vq_err(vq, "Indirect buffer length too big: %d\n", + indirect->len); + return -E2BIG; + } + + do { + unsigned iov_count = *in_num + *out_num; + if (++found > count) { + vq_err(vq, "Loop detected: last one at %u " + "indirect size %u\n", + i, count); + return -EINVAL; + } + if (memcpy_fromiovec((unsigned char *)&desc, vq->indirect, + sizeof desc)) { + vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", + i, (size_t)indirect->addr + i * sizeof desc); + return -EINVAL; + } + if (desc.flags & VRING_DESC_F_INDIRECT) { + vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", + i, (size_t)indirect->addr + i * sizeof desc); + return -EINVAL; + } + + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, + iov_size - iov_count); + if (ret < 0) { + vq_err(vq, "Translation failure %d indirect idx %d\n", + ret, i); + return ret; + } + /* If this is an input descriptor, increment that count. */ + if (desc.flags & VRING_DESC_F_WRITE) { + *in_num += ret; + if (unlikely(log)) { + log[*log_num].addr = desc.addr; + log[*log_num].len = desc.len; + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Indirect descriptor " + "has out after in: idx %d\n", i); + return -EINVAL; + } + *out_num += ret; + } + } while ((i = next_desc(&desc)) != -1); + return 0; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which + * is never a valid descriptor number) if none was found. */ +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num) +{ + struct vring_desc desc; + unsigned int i, head, found = 0; + u16 last_avail_idx; + int ret; + + /* Check it isn't doing very strange things with descriptor numbers. 
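(The avail index is free-running, so the range check below relies on u16 wraparound arithmetic.)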
*/ + last_avail_idx = vq->last_avail_idx; + if (get_user(vq->avail_idx, &vq->avail->idx)) { + vq_err(vq, "Failed to access avail idx at %p\n", + &vq->avail->idx); + return vq->num; + } + + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, vq->avail_idx); + return vq->num; + } + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->avail_idx == last_avail_idx) + return vq->num; + + /* Only get avail ring entries after they have been exposed by guest. */ + rmb(); + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + last_avail_idx, + &vq->avail->ring[last_avail_idx % vq->num]); + return vq->num; + } + + /* If their number is silly, that's an error. */ + if (head >= vq->num) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return vq->num; + } + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + if (unlikely(log)) + *log_num = 0; + + i = head; + do { + unsigned iov_count = *in_num + *out_num; + if (i >= vq->num) { + vq_err(vq, "Desc index is %u > %u, head = %u", + i, vq->num, head); + return vq->num; + } + if (++found > vq->num) { + vq_err(vq, "Loop detected: last one at %u " + "vq size %u head %u\n", + i, vq->num, head); + return vq->num; + } + ret = copy_from_user(&desc, vq->desc + i, sizeof desc); + if (ret) { + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", + i, vq->desc + i); + return vq->num; + } + if (desc.flags & VRING_DESC_F_INDIRECT) { + ret = get_indirect(dev, vq, iov, iov_size, + out_num, in_num, + log, log_num, &desc); + if (ret < 0) { + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); + return vq->num; + } + continue; + } + + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, + iov_size - iov_count); + if (ret < 0) { + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); + return vq->num; + } + if (desc.flags & VRING_DESC_F_WRITE) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += ret; + if (unlikely(log)) { + log[*log_num].addr = desc.addr; + log[*log_num].len = desc.len; + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return vq->num; + } + *out_num += ret; + } + } while ((i = next_desc(&desc)) != -1); + + /* On success, increment avail index. */ + vq->last_avail_idx++; + return head; +} + +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) +{ + vq->last_avail_idx--; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. 
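(All stores go through put_user() because the ring lives in userspace memory.)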
*/ + used = &vq->used->ring[vq->last_used_idx % vq->num]; + if (put_user(head, &used->id)) { + vq_err(vq, "Failed to write used id"); + return -EFAULT; + } + if (put_user(len, &used->len)) { + vq_err(vq, "Failed to write used len"); + return -EFAULT; + } + /* Make sure buffer is written before we update index. */ + wmb(); + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { + vq_err(vq, "Failed to increment used idx"); + return -EFAULT; + } + if (unlikely(vq->log_used)) { + /* Make sure data is seen before log. */ + wmb(); + log_write(vq->log_base, vq->log_addr + sizeof *vq->used->ring * + (vq->last_used_idx % vq->num), + sizeof *vq->used->ring); + log_write(vq->log_base, vq->log_addr, sizeof *vq->used->ring); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + vq->last_used_idx++; + return 0; +} + +/* This actually signals the guest, using eventfd. */ +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __u16 flags = 0; + if (get_user(flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return; + } + + /* If they don't want an interrupt, don't signal, unless empty. */ + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && + (vq->avail_idx != vq->last_avail_idx || + !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY))) + return; + + /* Signal the Guest tell them we used something up. */ + if (vq->call_ctx) + eventfd_signal(vq->call_ctx, 1); +} + +/* And here's the combo meal deal. Supersize me! */ +void vhost_add_used_and_signal(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_signal(dev, vq); +} + +/* OK, now we need to know about added descriptors. */ +bool vhost_enable_notify(struct vhost_virtqueue *vq) +{ + u16 avail_idx; + int r; + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) + return false; + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) { + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + return false; + } + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + mb(); + r = get_user(avail_idx, &vq->avail->idx); + if (r) { + vq_err(vq, "Failed to check avail idx at %p: %d\n", + &vq->avail->idx, r); + return false; + } + + return avail_idx != vq->last_avail_idx; +} + +/* We don't need to be notified again. */ +void vhost_disable_notify(struct vhost_virtqueue *vq) +{ + int r; + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) + return; + vq->used_flags |= VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); +} + +int vhost_init(void) +{ + vhost_workqueue = create_singlethread_workqueue("vhost"); + if (!vhost_workqueue) + return -ENOMEM; + return 0; +} + +void vhost_cleanup(void) +{ + destroy_workqueue(vhost_workqueue); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h new file mode 100644 index 000000000000..44591ba9b07a --- /dev/null +++ b/drivers/vhost/vhost.h @@ -0,0 +1,161 @@ +#ifndef _VHOST_H +#define _VHOST_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vhost_device; + +enum { + /* Enough place for all fragments, head, and virtio net header. */ + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. 
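(It ties a waitqueue entry to a work item so that wakeups on the polled file are handled from the vhost workqueue.)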
 */
+struct vhost_poll {
+	poll_table table;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	/* struct which will handle all actual work. */
+	struct work_struct work;
+	unsigned long mask;
+};
+
+void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
+		     unsigned long mask);
+void vhost_poll_start(struct vhost_poll *poll, struct file *file);
+void vhost_poll_stop(struct vhost_poll *poll);
+void vhost_poll_flush(struct vhost_poll *poll);
+void vhost_poll_queue(struct vhost_poll *poll);
+
+struct vhost_log {
+	u64 addr;
+	u64 len;
+};
+
+/* The virtqueue structure describes a queue attached to a device. */
+struct vhost_virtqueue {
+	struct vhost_dev *dev;
+
+	/* The actual ring of buffers. */
+	struct mutex mutex;
+	unsigned int num;
+	struct vring_desc __user *desc;
+	struct vring_avail __user *avail;
+	struct vring_used __user *used;
+	struct file *kick;
+	struct file *call;
+	struct file *error;
+	struct eventfd_ctx *call_ctx;
+	struct eventfd_ctx *error_ctx;
+	struct eventfd_ctx *log_ctx;
+
+	struct vhost_poll poll;
+
+	/* The routine to call when the Guest pings us, or timeout. */
+	work_func_t handle_kick;
+
+	/* Last available index we saw. */
+	u16 last_avail_idx;
+
+	/* Caches available index value from user. */
+	u16 avail_idx;
+
+	/* Last index we used. */
+	u16 last_used_idx;
+
+	/* Used flags */
+	u16 used_flags;
+
+	/* Log writes to used structure. */
+	bool log_used;
+	u64 log_addr;
+
+	struct iovec indirect[VHOST_NET_MAX_SG];
+	struct iovec iov[VHOST_NET_MAX_SG];
+	struct iovec hdr[VHOST_NET_MAX_SG];
+	size_t hdr_size;
+	/* We use a kind of RCU to access private pointer.
+	 * All readers access it from workqueue, which makes it possible to
+	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
+	 * work item execution acts instead of rcu_read_lock() and the end of
+	 * work item execution acts instead of rcu_read_unlock().
+	 * Writers use virtqueue mutex. */
+	void *private_data;
+	/* Log write descriptors */
+	void __user *log_base;
+	struct vhost_log log[VHOST_NET_MAX_SG];
+};
+
+struct vhost_dev {
+	/* Readers use RCU to access memory table pointer
+	 * log base pointer and features.
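+	 * (vhost_set_memory() publishes a new table with rcu_assign_pointer()
+	 * and frees the old one only after synchronize_rcu().)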
+ * Writers use mutex below.*/ + struct vhost_memory *memory; + struct mm_struct *mm; + struct mutex mutex; + unsigned acked_features; + struct vhost_virtqueue *vqs; + int nvqs; + struct file *log_file; + struct eventfd_ctx *log_ctx; +}; + +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_check_owner(struct vhost_dev *); +long vhost_dev_reset_owner(struct vhost_dev *); +void vhost_dev_cleanup(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); +int vhost_vq_access_ok(struct vhost_virtqueue *vq); +int vhost_log_access_ok(struct vhost_dev *); + +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, + struct iovec iov[], unsigned int iov_count, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *); + +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); +void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, + unsigned int head, int len); +void vhost_disable_notify(struct vhost_virtqueue *); +bool vhost_enable_notify(struct vhost_virtqueue *); + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len); + +int vhost_init(void); +void vhost_cleanup(void); + +#define vq_err(vq, fmt, ...) do { \ + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +enum { + VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1 << VIRTIO_RING_F_INDIRECT_DESC) | + (1 << VHOST_F_LOG_ALL) | + (1 << VHOST_NET_F_VIRTIO_NET_HDR), +}; + +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) +{ + unsigned acked_features = rcu_dereference(dev->acked_features); + return acked_features & (1 << bit); +} + +#endif diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 756f831cbdd5..d93080748a91 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -362,6 +362,7 @@ unifdef-y += uio.h unifdef-y += unistd.h unifdef-y += usbdevice_fs.h unifdef-y += utsname.h +unifdef-y += vhost.h unifdef-y += videodev2.h unifdef-y += videodev.h unifdef-y += virtio_config.h diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index adaf3c15e449..8b5f7cc0fba6 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -30,6 +30,7 @@ #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 +#define VHOST_NET_MINOR 233 #define MISC_DYNAMIC_MINOR 255 struct device; diff --git a/include/linux/vhost.h b/include/linux/vhost.h new file mode 100644 index 000000000000..e847f1e30756 --- /dev/null +++ b/include/linux/vhost.h @@ -0,0 +1,130 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include +#include +#include +#include +#include + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; /* Pass -1 to unbind from file. */ + +}; + +struct vhost_vring_addr { + unsigned int index; + /* Option flags. 
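(Only VHOST_VRING_F_LOG is defined; other bits are rejected with -EOPNOTSUPP.)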
 */
+	unsigned int flags;
+	/* Flag values: */
+	/* Whether log address is valid. If set enables logging. */
+#define VHOST_VRING_F_LOG 0
+
+	/* Start of array of descriptors (virtually contiguous) */
+	__u64 desc_user_addr;
+	/* Used structure address. Must be 32 bit aligned */
+	__u64 used_user_addr;
+	/* Available structure address. Must be 16 bit aligned */
+	__u64 avail_user_addr;
+	/* Logging support. */
+	/* Log writes to used structure, at offset calculated from specified
+	 * address. Address must be 32 bit aligned. */
+	__u64 log_guest_addr;
+};
+
+struct vhost_memory_region {
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr;
+	__u64 flags_padding; /* No flags are currently specified. */
+};
+
+/* All region addresses and sizes must be 4K aligned. */
+#define VHOST_PAGE_SIZE 0x1000
+
+struct vhost_memory {
+	__u32 nregions;
+	__u32 padding;
+	struct vhost_memory_region regions[0];
+};
+
+/* ioctls */
+
+#define VHOST_VIRTIO 0xAF
+
+/* Features bitmask for forward compatibility. Transport bits are used for
+ * vhost specific features. */
+#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
+
+/* Set current process as the (exclusive) owner of this file descriptor. This
+ * must be called before any other vhost command. Further calls to
+ * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+/* Give up ownership, and reset the device to default values.
+ * Allows subsequent call to VHOST_SET_OWNER to succeed. */
+#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
+
+/* Set up/modify memory layout */
+#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+
+/* Write logging setup. */
+/* Memory writes can optionally be logged by setting bit at an offset
+ * (calculated from the physical address) from specified log base.
+ * The bit is set using an atomic 32 bit operation. */
+/* Set base address for logging. */
+#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+/* Specify an eventfd file descriptor to signal on log write. */
+#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+
+/* Ring setup. */
+/* Set number of descriptors in ring. This parameter cannot
+ * be modified while the ring is running (bound to a device). */
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+/* Set addresses for the ring. */
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+/* Base value where queue looks for available descriptors */
+#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+/* Get accessor: reads index, writes value in num */
+#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+
+/* The following ioctls use eventfd file descriptors to signal and poll
+ * for events. */
+
+/* Set eventfd to poll for added buffers */
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+/* Set eventfd to signal when buffers have been used */
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+/* Set eventfd to signal an error */
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+
+/* VHOST_NET specific defines */
+
+/* Attach virtio net ring to a raw socket, or tap device.
+ * The socket must already be bound to an ethernet device; this device will be
+ * used for transmit.
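The same socket is also used on the receive side.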
Pass fd -1 to unbind from the socket and the transmit
+ * device. This can be used to stop the ring (e.g. for migration). */
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+/* Feature bits */
+/* Log all write descriptors. Can be changed while device is active. */
+#define VHOST_F_LOG_ALL 26
+/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
+#define VHOST_NET_F_VIRTIO_NET_HDR 27
+
+#endif
-- 
cgit v1.2.3


From c955fe8e0bdd7be7a6bc2d49245d570a816f7cc5 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy
Date: Thu, 15 Oct 2009 14:31:30 +0400
Subject: POWER: Add support for cycle_count

Signed-off-by: Alexey Starikovskiy
Signed-off-by: Len Brown
---
 drivers/power/power_supply_sysfs.c | 1 +
 include/linux/power_supply.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index c790e0c77d4b..ff05e6189768 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -99,6 +99,7 @@ static struct device_attribute power_supply_attrs[] = {
 	POWER_SUPPLY_ATTR(present),
 	POWER_SUPPLY_ATTR(online),
 	POWER_SUPPLY_ATTR(technology),
+	POWER_SUPPLY_ATTR(cycle_count),
 	POWER_SUPPLY_ATTR(voltage_max),
 	POWER_SUPPLY_ATTR(voltage_min),
 	POWER_SUPPLY_ATTR(voltage_max_design),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index b5d096d3a9be..ebd2b8fb00d0 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -82,6 +82,7 @@ enum power_supply_property {
 	POWER_SUPPLY_PROP_PRESENT,
 	POWER_SUPPLY_PROP_ONLINE,
 	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
 	POWER_SUPPLY_PROP_VOLTAGE_MAX,
 	POWER_SUPPLY_PROP_VOLTAGE_MIN,
 	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
-- 
cgit v1.2.3


From 889ff0150661512d79484219612b7e2e024b6c07 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 9 Jan 2010 20:04:47 +0100
Subject: perf/core: Split context's event group list into pinned and non-pinned lists

Split up struct perf_event_context::group_list into pinned_groups and
flexible_groups (non-pinned).

At first sight this looks useless, as it duplicates the various loops
around the group list handling. But it scales better on the fast path
of perf_sched_in(): we no longer iterate twice through the entire list
to separate pinned from non-pinned scheduling; instead we iterate
through two distinct lists.

Another desired effect is that it makes it easier to define distinct
scheduling rules for each list.

Changes in v2:
- Respectively rename pinned_grp_list and volatile_grp_list into
  pinned_groups and flexible_groups, as per Ingo's suggestion.
- Various cleanups Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 3 +- kernel/perf_event.c | 227 ++++++++++++++++++++++++++++++--------------- 2 files changed, 153 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9a1d276db754..cdbc2aa64a0b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -683,7 +683,8 @@ struct perf_event_context { */ struct mutex mutex; - struct list_head group_list; + struct list_head pinned_groups; + struct list_head flexible_groups; struct list_head event_list; int nr_events; int nr_active; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27f69a04541d..c9f8a757649d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -289,6 +289,15 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -303,9 +312,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { + if (group_leader == event) { + struct list_head *list; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } else { list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -355,8 +367,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * to the context list directly: */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + struct list_head *list; - list_move_tail(&sibling->group_entry, &ctx->group_list); + list = ctx_group_list(event, ctx); + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; } } @@ -1056,7 +1070,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } perf_enable(); @@ -1271,9 +1288,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; if (event->cpu != -1 && event->cpu != cpu) continue; @@ -1291,15 +1307,10 @@ __perf_event_sched_in(struct perf_event_context *ctx, } } - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. 
- */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) continue; - /* * Listen to the 'cpu' scheduling filter constraint * of events: @@ -1453,8 +1464,13 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->pinned_groups); + break; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->flexible_groups); break; } perf_enable(); @@ -1490,6 +1506,21 @@ void perf_event_task_tick(struct task_struct *curr) perf_event_task_sched_in(curr); } +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. @@ -1500,6 +1531,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1510,14 +1542,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; } /* @@ -1591,7 +1625,8 @@ __perf_event_init_context(struct perf_event_context *ctx, { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; @@ -5032,7 +5067,11 @@ void perf_event_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, group_entry) __perf_event_exit_task(child_event, child_ctx, child); @@ -5041,7 +5080,8 @@ again: * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. 
*/ - if (!list_empty(&child_ctx->group_list)) + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) goto again; mutex_unlock(&child_ctx->mutex); @@ -5049,6 +5089,24 @@ again: put_ctx(child_ctx); } +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); +} + /* * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. @@ -5063,36 +5121,70 @@ void perf_event_free_task(struct task_struct *task) mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { - struct perf_event *parent = event->parent; + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + perf_free_event(event, ctx); - if (WARN_ON_ONCE(!parent)) - continue; + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - fput(parent->filp); + mutex_unlock(&ctx->mutex); - list_del_event(event, ctx); - free_event(event); + put_ctx(ctx); +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx = child->perf_event_ctxp; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; } - if (!list_empty(&ctx->group_list)) - goto again; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ - mutex_unlock(&ctx->mutex); + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; - put_ctx(ctx); + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; } + /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx = NULL, *parent_ctx; + struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5130,41 +5222,22 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - - if (!event->attr.inherit) { - inherited_all = 0; - continue; - } - - if (!child->perf_event_ctxp) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. 
-	 */
-
-			child_ctx = kzalloc(sizeof(struct perf_event_context),
-					    GFP_KERNEL);
-			if (!child_ctx) {
-				ret = -ENOMEM;
-				break;
-			}
-
-			__perf_event_init_context(child_ctx, child);
-			child->perf_event_ctxp = child_ctx;
-			get_task_struct(child);
-		}
+	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
+	}
 
-		ret = inherit_group(event, parent, parent_ctx,
-				    child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
+	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
 			break;
-		}
 	}
 
+	child_ctx = child->perf_event_ctxp;
+
 	if (child_ctx && inherited_all) {
 		/*
 		 * Mark the child context as a clone of the parent
@@ -5213,7 +5286,9 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		__perf_event_remove_from_context(event);
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
-- 
cgit v1.2.3


From 5908cdc85eb30f8d07f2cb11d4a62334d7229048 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 9 Jan 2010 20:53:14 +0100
Subject: list: Introduce list_rotate_left()

Bring a new list_rotate_left() helper that rotates a list to the left.
This is useful for code that needs to round-robin the elements of a
queue whose priority increases from tail to head.

Signed-off-by: Frederic Weisbecker
Acked-by: Peter Zijlstra
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Arnaldo Carvalho de Melo
---
 include/linux/list.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 969f6e92d089..5d9c6558e8ab 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -205,6 +205,20 @@ static inline int list_empty_careful(const struct list_head *head)
 	return (next == head) && (next == head->prev);
 }
 
+/**
+ * list_rotate_left - rotate the list to the left
+ * @head: the head of the list
+ */
+static inline void list_rotate_left(struct list_head *head)
+{
+	struct list_head *first;
+
+	if (!list_empty(head)) {
+		first = head->next;
+		list_move_tail(first, head);
+	}
+}
+
 /**
  * list_is_singular - tests whether a list has just one entry.
  * @head: the list to test.
-- 
cgit v1.2.3


From d6f962b57bfaab62891c7abbf1469212a56d6103 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 10 Jan 2010 01:25:51 +0100
Subject: perf: Export software-only event group characteristic as a flag

Before scheduling an event group, we first check whether the group can
go on. Part of that check is whether the group consists of software
events only, in which case it can always be scheduled in. For that
purpose we iterate through the whole group, which is wasteful: we could
do the check once, when an event is added to or removed from the group.

So we create a group_flags field in struct perf_event that can host
group-wide characteristics, starting with a PERF_GROUP_SOFTWARE flag
that reduces the check on the fast path.
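To make the fast path concrete, here is a minimal user-space model of the idea (an illustration only, not kernel code: struct event and the helpers below are simplified stand-ins for struct perf_event and its group handling):

#include <stdbool.h>

enum { PERF_GROUP_SOFTWARE = 0x1 };

struct event {
	bool software;		/* software event, e.g. a tracepoint */
	int group_flags;	/* only meaningful on a group leader */
};

/* A fresh leader is software-only iff it is itself a software event. */
static void group_init(struct event *leader)
{
	leader->group_flags = leader->software ? PERF_GROUP_SOFTWARE : 0;
}

/* One hardware sibling makes the whole group "not software-only". */
static void group_add_sibling(struct event *leader, const struct event *ev)
{
	if (!ev->software)
		leader->group_flags &= ~PERF_GROUP_SOFTWARE;
}

/* The scheduling fast path is now a bit test, not a sibling-list walk. */
static bool group_is_software_only(const struct event *leader)
{
	return leader->group_flags & PERF_GROUP_SOFTWARE;
}

The kernel patch below does the same bookkeeping in list_add_event()/list_del_event() and tests the flag in group_can_go_on().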
Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 5 +++++ kernel/perf_event.c | 30 +++++++++++------------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cdbc2aa64a0b..c6f812e4d058 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -565,6 +565,10 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, int, struct perf_sample_data *, struct pt_regs *regs); +enum perf_group_flag { + PERF_GROUP_SOFTWARE = 0x1, +}; + /** * struct perf_event - performance event kernel representation: */ @@ -574,6 +578,7 @@ struct perf_event { struct list_head event_entry; struct list_head sibling_list; int nr_siblings; + int group_flags; struct perf_event *group_leader; struct perf_event *output; const struct pmu *pmu; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bbebe2832639..eae6ff693604 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -315,9 +315,16 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (group_leader == event) { struct list_head *list; + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); } else { + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -372,6 +379,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } @@ -699,24 +709,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; - - return 1; -} - /* * Work out whether we can put this event group on the CPU now. */ @@ -727,7 +719,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware -- cgit v1.2.3 From 73c89c15b959adf06366722c4be8d2eddec0a529 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Sun, 17 Jan 2010 21:52:11 +1100 Subject: crypto: gcm - Add RFC4543 wrapper for GCM This patch adds the RFC4543 (GMAC) wrapper for GCM similar to the existing RFC4106 wrapper. The main differences between GCM and GMAC are the contents of the AAD and that the plaintext is empty for the latter. 
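The relationship between the two modes is easy to demonstrate outside the kernel: a GCM implementation yields the GMAC tag when all input is presented as AAD and the plaintext is empty. A rough sketch of that, using OpenSSL's EVP interface as a stand-in for the kernel AEAD API (the real rfc4543 template additionally folds the 8-byte IV into the AAD and derives a 4-byte nonce salt from the key, which this toy omits):

#include <openssl/evp.h>

/* Compute an AES-128-GMAC tag over 'data'; returns 0 on success. */
static int aes_gmac(const unsigned char key[16], const unsigned char iv[12],
		    const unsigned char *data, int len, unsigned char tag[16])
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	unsigned char dummy[16];
	int outl, ok;

	if (!ctx)
		return -1;
	ok = EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, key, iv) &&
	     /* Everything goes in as AAD (out == NULL); nothing is encrypted. */
	     EVP_EncryptUpdate(ctx, NULL, &outl, data, len) &&
	     EVP_EncryptFinal_ex(ctx, dummy, &outl) &&
	     EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, 16, tag);
	EVP_CIPHER_CTX_free(ctx);
	return ok ? 0 : -1;
}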
Signed-off-by: Tobias Brunner Signed-off-by: Herbert Xu --- crypto/gcm.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pfkeyv2.h | 1 + net/xfrm/xfrm_algo.c | 16 +++ 3 files changed, 304 insertions(+) (limited to 'include/linux') diff --git a/crypto/gcm.c b/crypto/gcm.c index c6547130624c..2f5fbba6576c 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -37,6 +37,19 @@ struct crypto_rfc4106_ctx { u8 nonce[4]; }; +struct crypto_rfc4543_ctx { + struct crypto_aead *child; + u8 nonce[4]; +}; + +struct crypto_rfc4543_req_ctx { + u8 auth_tag[16]; + struct scatterlist cipher[1]; + struct scatterlist payload[2]; + struct scatterlist assoc[2]; + struct aead_request subreq; +}; + struct crypto_gcm_ghash_ctx { unsigned int cryptlen; struct scatterlist *src; @@ -1047,6 +1060,272 @@ static struct crypto_template crypto_rfc4106_tmpl = { .module = THIS_MODULE, }; +static inline struct crypto_rfc4543_req_ctx *crypto_rfc4543_reqctx( + struct aead_request *req) +{ + unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); + + return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); +} + +static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + struct crypto_aead *child = ctx->child; + int err; + + if (keylen < 4) + return -EINVAL; + + keylen -= 4; + memcpy(ctx->nonce, key + keylen, 4); + + crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_aead_setkey(child, key, keylen); + crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & + CRYPTO_TFM_RES_MASK); + + return err; +} + +static int crypto_rfc4543_setauthsize(struct crypto_aead *parent, + unsigned int authsize) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + + if (authsize != 16) + return -EINVAL; + + return crypto_aead_setauthsize(ctx->child, authsize); +} + +/* this is the same as crypto_authenc_chain */ +static void crypto_rfc4543_chain(struct scatterlist *head, + struct scatterlist *sg, int chain) +{ + if (chain) { + head->length += sg->length; + sg = scatterwalk_sg_next(sg); + } + + if (sg) + scatterwalk_sg_chain(head, 2, sg); + else + sg_mark_end(head); +} + +static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req, + int enc) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq = &rctx->subreq; + struct scatterlist *dst = req->dst; + struct scatterlist *cipher = rctx->cipher; + struct scatterlist *payload = rctx->payload; + struct scatterlist *assoc = rctx->assoc; + unsigned int authsize = crypto_aead_authsize(aead); + unsigned int assoclen = req->assoclen; + struct page *dstp; + u8 *vdst; + u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child), + crypto_aead_alignmask(ctx->child) + 1); + + memcpy(iv, ctx->nonce, 4); + memcpy(iv + 4, req->iv, 8); + + /* construct cipher/plaintext */ + if (enc) + memset(rctx->auth_tag, 0, authsize); + else + scatterwalk_map_and_copy(rctx->auth_tag, dst, + req->cryptlen - authsize, + authsize, 0); + + sg_init_one(cipher, rctx->auth_tag, authsize); + + /* construct the aad */ + dstp = sg_page(dst); + vdst = PageHighMem(dstp) ? 
NULL : page_address(dstp) + dst->offset; + + sg_init_table(payload, 2); + sg_set_buf(payload, req->iv, 8); + crypto_rfc4543_chain(payload, dst, vdst == req->iv + 8); + assoclen += 8 + req->cryptlen - (enc ? 0 : authsize); + + sg_init_table(assoc, 2); + sg_set_page(assoc, sg_page(req->assoc), req->assoc->length, + req->assoc->offset); + crypto_rfc4543_chain(assoc, payload, 0); + + aead_request_set_tfm(subreq, ctx->child); + aead_request_set_callback(subreq, req->base.flags, req->base.complete, + req->base.data); + aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv); + aead_request_set_assoc(subreq, assoc, assoclen); + + return subreq; +} + +static int crypto_rfc4543_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq; + int err; + + subreq = crypto_rfc4543_crypt(req, 1); + err = crypto_aead_encrypt(subreq); + if (err) + return err; + + scatterwalk_map_and_copy(rctx->auth_tag, req->dst, req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int crypto_rfc4543_decrypt(struct aead_request *req) +{ + req = crypto_rfc4543_crypt(req, 0); + + return crypto_aead_decrypt(req); +} + +static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = (void *)tfm->__crt_alg; + struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_aead *aead; + unsigned long align; + + aead = crypto_spawn_aead(spawn); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + ctx->child = aead; + + align = crypto_aead_alignmask(aead); + align &= ~(crypto_tfm_ctx_alignment() - 1); + tfm->crt_aead.reqsize = sizeof(struct crypto_rfc4543_req_ctx) + + ALIGN(crypto_aead_reqsize(aead), + crypto_tfm_ctx_alignment()) + + align + 16; + + return 0; +} + +static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_aead(ctx->child); +} + +static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb) +{ + struct crypto_attr_type *algt; + struct crypto_instance *inst; + struct crypto_aead_spawn *spawn; + struct crypto_alg *alg; + const char *ccm_name; + int err; + + algt = crypto_get_attr_type(tb); + err = PTR_ERR(algt); + if (IS_ERR(algt)) + return ERR_PTR(err); + + if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask) + return ERR_PTR(-EINVAL); + + ccm_name = crypto_attr_alg_name(tb[1]); + err = PTR_ERR(ccm_name); + if (IS_ERR(ccm_name)) + return ERR_PTR(err); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return ERR_PTR(-ENOMEM); + + spawn = crypto_instance_ctx(inst); + crypto_set_aead_spawn(spawn, inst); + err = crypto_grab_aead(spawn, ccm_name, 0, + crypto_requires_sync(algt->type, algt->mask)); + if (err) + goto out_free_inst; + + alg = crypto_aead_spawn_alg(spawn); + + err = -EINVAL; + + /* We only support 16-byte blocks. */ + if (alg->cra_aead.ivsize != 16) + goto out_drop_alg; + + /* Not a stream cipher? 
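(GCM drives its block cipher in CTR mode, which makes it a stream cipher, hence blocksize 1.)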
*/ + if (alg->cra_blocksize != 1) + goto out_drop_alg; + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_name) >= CRYPTO_MAX_ALG_NAME || + snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_driver_name) >= + CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD; + inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = 1; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = &crypto_nivaead_type; + + inst->alg.cra_aead.ivsize = 8; + inst->alg.cra_aead.maxauthsize = 16; + + inst->alg.cra_ctxsize = sizeof(struct crypto_rfc4543_ctx); + + inst->alg.cra_init = crypto_rfc4543_init_tfm; + inst->alg.cra_exit = crypto_rfc4543_exit_tfm; + + inst->alg.cra_aead.setkey = crypto_rfc4543_setkey; + inst->alg.cra_aead.setauthsize = crypto_rfc4543_setauthsize; + inst->alg.cra_aead.encrypt = crypto_rfc4543_encrypt; + inst->alg.cra_aead.decrypt = crypto_rfc4543_decrypt; + + inst->alg.cra_aead.geniv = "seqiv"; + +out: + return inst; + +out_drop_alg: + crypto_drop_aead(spawn); +out_free_inst: + kfree(inst); + inst = ERR_PTR(err); + goto out; +} + +static void crypto_rfc4543_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_rfc4543_tmpl = { + .name = "rfc4543", + .alloc = crypto_rfc4543_alloc, + .free = crypto_rfc4543_free, + .module = THIS_MODULE, +}; + static int __init crypto_gcm_module_init(void) { int err; @@ -1067,8 +1346,14 @@ static int __init crypto_gcm_module_init(void) if (err) goto out_undo_gcm; + err = crypto_register_template(&crypto_rfc4543_tmpl); + if (err) + goto out_undo_rfc4106; + return 0; +out_undo_rfc4106: + crypto_unregister_template(&crypto_rfc4106_tmpl); out_undo_gcm: crypto_unregister_template(&crypto_gcm_tmpl); out_undo_base: @@ -1081,6 +1366,7 @@ out: static void __exit crypto_gcm_module_exit(void) { kfree(gcm_zeroes); + crypto_unregister_template(&crypto_rfc4543_tmpl); crypto_unregister_template(&crypto_rfc4106_tmpl); crypto_unregister_template(&crypto_gcm_tmpl); crypto_unregister_template(&crypto_gcm_base_tmpl); @@ -1094,3 +1380,4 @@ MODULE_DESCRIPTION("Galois/Counter Mode"); MODULE_AUTHOR("Mikko Herranen "); MODULE_ALIAS("gcm_base"); MODULE_ALIAS("rfc4106"); +MODULE_ALIAS("rfc4543"); diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h index 228b0b6306b0..0b80c806631f 100644 --- a/include/linux/pfkeyv2.h +++ b/include/linux/pfkeyv2.h @@ -315,6 +315,7 @@ struct sadb_x_kmaddress { #define SADB_X_EALG_AES_GCM_ICV12 19 #define SADB_X_EALG_AES_GCM_ICV16 20 #define SADB_X_EALG_CAMELLIACBC 22 +#define SADB_X_EALG_NULL_AES_GMAC 23 #define SADB_EALG_MAX 253 /* last EALG */ /* private allocations should use 249-255 (RFC2407) */ #define SADB_X_EALG_SERPENTCBC 252 /* draft-ietf-ipsec-ciph-aes-cbc-00 */ diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 743c0134a6a9..8b4d6e3246e5 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -125,6 +125,22 @@ static struct xfrm_algo_desc aead_list[] = { .sadb_alg_maxbits = 256 } }, +{ + .name = "rfc4543(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, }; static struct xfrm_algo_desc aalg_list[] = { -- cgit v1.2.3 From 
8b0e58a70a7a41443c779de074288035b014cb94 Mon Sep 17 00:00:00 2001
From: Stephane Chatty
Date: Wed, 13 Jan 2010 21:52:34 +0100
Subject: HID: let hid-input accept digitizers

Extended IS_INPUT_APPLICATION to accept digitizers that are actual
input devices (touchscreens, light pens, touch pads, white boards).

Signed-off-by: Stephane Chatty
Signed-off-by: Jiri Kosina
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 87093652dda8..b978c1e2e74d 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -663,7 +663,7 @@ struct hid_ll_driver {
 /* Applications from HID Usage Tables 4/8/99 Version 1.1 */
 /* We ignore a few input applications that are not widely used */
-#define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || (a == 0x000d0002))
+#define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || ((a >= 0x000d0002) && (a <= 0x000d0006)))
 
 /* HID core API */
-- 
cgit v1.2.3


From a83d8e8d099fc373a5ca7112ad08c553bb2c180f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Mon, 18 Jan 2010 08:21:13 +0100
Subject: netfilter: xtables: add struct xt_mtchk_param::net

Some complex match modules (like xt_hashlimit/xt_recent) want netns
information at constructor and destructor time. We probably could play
games at match destruction time, because the netns can be recovered
from the object, but I think it's cleaner to pass the netns explicitly.

Add ->net, and make sure it's set from the ebtables/iptables/ip6tables
code.

Signed-off-by: Alexey Dobriyan
Signed-off-by: Patrick McHardy
---
 include/linux/netfilter/x_tables.h | 1 +
 net/bridge/netfilter/ebtables.c | 14 +++++++++-----
 net/ipv4/netfilter/ip_tables.c | 24 ++++++++++++++----------
 net/ipv6/netfilter/ip6_tables.c | 14 ++++++++------
 4 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 378f27ae7772..88261b9829a7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -205,6 +205,7 @@ struct xt_match_param {
  * @hook_mask: via which hooks the new rule is reachable
  */
 struct xt_mtchk_param {
+	struct net *net;
 	const char *table;
 	const void *entryinfo;
 	const struct xt_match *match;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index bd1c65425d4f..c77bab986696 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -619,7 +619,9 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt)
 }
 
 static inline int
-ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
+ebt_check_entry(struct ebt_entry *e,
+   struct net *net,
+   struct ebt_table_info *newinfo,
    const char *name, unsigned int *cnt,
    struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
 {
@@ -671,6 +673,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	}
 	i = 0;
+	mtpar.net = net;
 	mtpar.table = tgpar.table = name;
 	mtpar.entryinfo = tgpar.entryinfo = e;
 	mtpar.hook_mask = tgpar.hook_mask = hookmask;
@@ -808,7 +811,8 @@ letscontinue:
 }
 
 /* do the parsing of the table/chains/entries/matches/watchers/targets, heh */
-static int translate_table(char *name, struct ebt_table_info *newinfo)
+static int translate_table(struct net *net, char *name,
+			   struct ebt_table_info *newinfo)
 {
 	unsigned int i, j, k, udc_cnt;
 	int ret;
@@ -917,7 +921,7 @@ static int
translate_table(char *name, struct ebt_table_info *newinfo) /* used to know what we need to clean up if something goes wrong */ i = 0; ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_check_entry, newinfo, name, &i, cl_s, udc_cnt); + ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); if (ret != 0) { EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, &i); @@ -1017,7 +1021,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_counterstmp; - ret = translate_table(tmp.name, newinfo); + ret = translate_table(net, tmp.name, newinfo); if (ret != 0) goto free_counterstmp; @@ -1154,7 +1158,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) newinfo->hook_entry[i] = p + ((char *)repl->hook_entry[i] - repl->entries); } - ret = translate_table(repl->name, newinfo); + ret = translate_table(net, repl->name, newinfo); if (ret != 0) { BUGPRINT("Translate_table failed\n"); goto free_chainstack; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 572330a552ef..a069d72d9482 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -661,8 +661,8 @@ static int check_target(struct ipt_entry *e, const char *name) } static int -find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ipt_entry *e, struct net *net, const char *name, + unsigned int size, unsigned int *i) { struct ipt_entry_target *t; struct xt_target *target; @@ -675,6 +675,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; @@ -798,7 +799,8 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, +translate_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, void *entry0, @@ -860,7 +862,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + find_check_entry, net, name, size, &i); if (ret != 0) { IPT_ENTRY_ITERATE(entry0, newinfo->size, @@ -1303,7 +1305,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, + ret = translate_table(net, tmp.name, tmp.valid_hooks, newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) @@ -1655,7 +1657,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, } static int -compat_check_entry(struct ipt_entry *e, const char *name, +compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int *i) { struct xt_mtchk_param mtpar; @@ -1663,6 +1665,7 @@ compat_check_entry(struct ipt_entry *e, const char *name, int ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; @@ -1684,7 +1687,8 @@ compat_check_entry(struct ipt_entry *e, const char *name, } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1773,7 +1777,7 @@ translate_compat_table(const char *name, i = 0; ret = IPT_ENTRY_ITERATE(entry1, 
newinfo->size, compat_check_entry, - name, &i); + net, name, &i); if (ret) { j -= i; COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, compat_release_entry, &j); @@ -1833,7 +1837,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -2086,7 +2090,7 @@ struct xt_table *ipt_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, + ret = translate_table(net, table->name, table->valid_hooks, newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 480d7f8c9802..a825940a92ef 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -693,8 +693,8 @@ static int check_target(struct ip6t_entry *e, const char *name) } static int -find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size, unsigned int *i) { struct ip6t_entry_target *t; struct xt_target *target; @@ -707,6 +707,7 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; @@ -830,7 +831,8 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, +translate_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, void *entry0, @@ -892,7 +894,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + find_check_entry, net, name, size, &i); if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, @@ -1336,7 +1338,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, + ret = translate_table(net, tmp.name, tmp.valid_hooks, newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) @@ -2121,7 +2123,7 @@ struct xt_table *ip6t_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, + ret = translate_table(net, table->name, table->valid_hooks, newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, -- cgit v1.2.3 From f54e9367f8499a9bf6b2afbc0dce63e1d53c525a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 18 Jan 2010 08:25:47 +0100 Subject: netfilter: xtables: add struct xt_mtdtor_param::net Add ->net to the match destructor parameter list, like ->net in the constructor parameter list. Make sure it's set in ebtables/iptables/ip6tables; this requires propagating the netns up to *_unregister_table().
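For illustration of the completed interface, not part of this patch: with ->net now present in both xt_mtchk_param and xt_mtdtor_param, a match module that keeps per-netns state can reach the same struct net from both callbacks. A minimal sketch, assuming hypothetical foo_state_get()/foo_state_put() helpers (in this kernel era the checkentry hook still returns bool):

#include <linux/netfilter/x_tables.h>

/* Hypothetical per-netns state helpers, not from this patch
 * or from the kernel tree. */
static int foo_state_get(struct net *net);
static void foo_state_put(struct net *net);

static bool foo_mt_check(const struct xt_mtchk_param *par)
{
	/* par->net is the netns the new rule is being inserted into */
	return foo_state_get(par->net) == 0;
}

static void foo_mt_destroy(const struct xt_mtdtor_param *par)
{
	/* the same netns is now known at destruction time as well */
	foo_state_put(par->net);
}

The xt_hashlimit/xt_recent modules named in the previous commit are the motivating users: their state (proc entries, for instance) is per-netns, so both rule insertion and rule removal need to look it up in the right namespace.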
Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_bridge/ebtables.h | 2 +- include/linux/netfilter_ipv4/ip_tables.h | 2 +- include/linux/netfilter_ipv6/ip6_tables.h | 2 +- net/bridge/netfilter/ebtable_broute.c | 2 +- net/bridge/netfilter/ebtable_filter.c | 2 +- net/bridge/netfilter/ebtable_nat.c | 2 +- net/bridge/netfilter/ebtables.c | 19 ++++++++-------- net/ipv4/netfilter/ip_tables.c | 25 +++++++++++---------- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv4/netfilter/nf_nat_rule.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 37 +++++++++++++++++-------------- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- 19 files changed, 59 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 88261b9829a7..3caf5e151102 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -216,6 +216,7 @@ struct xt_mtchk_param { /* Match destructor parameters */ struct xt_mtdtor_param { + struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 3cc40c131cc3..1c6f0c5f530e 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -289,7 +289,7 @@ struct ebt_table { ~(__alignof__(struct ebt_replace)-1)) extern struct ebt_table *ebt_register_table(struct net *net, const struct ebt_table *table); -extern void ebt_unregister_table(struct ebt_table *table); +extern void ebt_unregister_table(struct net *net, struct ebt_table *table); extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, struct ebt_table *table); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 27b3f5807305..8d1f273d350b 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -242,7 +242,7 @@ extern void ipt_init(void) __init; extern struct xt_table *ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl); -extern void ipt_unregister_table(struct xt_table *table); +extern void ipt_unregister_table(struct net *net, struct xt_table *table); /* Standard entry. 
*/ struct ipt_standard { diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index b31050d20ae4..d2952d2fa658 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -300,7 +300,7 @@ extern void ip6t_init(void) __init; extern struct xt_table *ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl); -extern void ip6t_unregister_table(struct xt_table *table); +extern void ip6t_unregister_table(struct net *net, struct xt_table *table); extern unsigned int ip6t_do_table(struct sk_buff *skb, unsigned int hook, const struct net_device *in, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index d32ab13e728c..ae3f106c3908 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -71,7 +71,7 @@ static int __net_init broute_net_init(struct net *net) static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net->xt.broute_table); + ebt_unregister_table(net, net->xt.broute_table); } static struct pernet_operations broute_net_ops = { diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 60b1a6ca7185..42e6bd094574 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -107,7 +107,7 @@ static int __net_init frame_filter_net_init(struct net *net) static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net->xt.frame_filter); + ebt_unregister_table(net, net->xt.frame_filter); } static struct pernet_operations frame_filter_net_ops = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 4a98804203b0..6dc2f878ae05 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -107,7 +107,7 @@ static int __net_init frame_nat_net_init(struct net *net) static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net->xt.frame_nat); + ebt_unregister_table(net, net->xt.frame_nat); } static struct pernet_operations frame_nat_net_ops = { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c77bab986696..1aa0e4c1f52d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -561,13 +561,14 @@ ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, } static inline int -ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) +ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.match; par.matchinfo = m->data; par.family = NFPROTO_BRIDGE; @@ -595,7 +596,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) } static inline int -ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) +ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) { struct xt_tgdtor_param par; struct ebt_entry_target *t; @@ -606,7 +607,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) if (cnt && (*cnt)-- == 0) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); - EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); par.target = t->u.target; @@ -731,7 +732,7 @@ ebt_check_entry(struct ebt_entry *e, cleanup_watchers: 
EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j); cleanup_matches: - EBT_MATCH_ITERATE(e, ebt_cleanup_match, &i); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); return ret; } @@ -924,7 +925,7 @@ static int translate_table(struct net *net, char *name, ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); if (ret != 0) { EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_cleanup_entry, &i); + ebt_cleanup_entry, net, &i); } vfree(cl_s); return ret; @@ -1074,7 +1075,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); vfree(table->entries); if (table->chainstack) { @@ -1091,7 +1092,7 @@ free_unlock: mutex_unlock(&ebt_mutex); free_iterate: EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ @@ -1208,7 +1209,7 @@ out: return ERR_PTR(ret); } -void ebt_unregister_table(struct ebt_table *table) +void ebt_unregister_table(struct net *net, struct ebt_table *table) { int i; @@ -1220,7 +1221,7 @@ void ebt_unregister_table(struct ebt_table *table) list_del(&table->list); mutex_unlock(&ebt_mutex); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); if (table->private->nentries) module_put(table->me); vfree(table->private->entries); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a069d72d9482..cfaba0e2e6fc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -553,13 +553,14 @@ mark_source_chains(struct xt_table_info *newinfo, } static int -cleanup_match(struct ipt_entry_match *m, unsigned int *i) +cleanup_match(struct ipt_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; @@ -705,7 +706,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, err: module_put(t->u.kernel.target->me); cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + IPT_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -775,7 +776,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, } static int -cleanup_entry(struct ipt_entry *e, unsigned int *i) +cleanup_entry(struct ipt_entry *e, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; struct ipt_entry_target *t; @@ -784,7 +785,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, cleanup_match, NULL); + IPT_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ipt_get_target(e); par.target = t->u.kernel.target; @@ -866,7 +867,7 @@ translate_table(struct net *net, if (ret != 0) { IPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, net, &i); return ret; } @@ -1260,7 +1261,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + net, NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1320,7 +1321,7 @@ 
do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1682,7 +1683,7 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, return 0; cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + IPT_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -1782,7 +1783,7 @@ translate_compat_table(struct net *net, j -= i; COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, compat_release_entry, &j); - IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, net, &i); xt_free_table_info(newinfo); return ret; } @@ -1853,7 +1854,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -2112,7 +2113,7 @@ out: return ERR_PTR(ret); } -void ipt_unregister_table(struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -2122,7 +2123,7 @@ void ipt_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, net, NULL); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index df566cbd68e5..dee90eb8aa47 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -138,7 +138,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_filter); + ipt_unregister_table(net, net->ipv4.iptable_filter); } static struct pernet_operations iptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index fae78c3076c4..e07bf242343a 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -208,7 +208,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_mangle); + ipt_unregister_table(net, net->ipv4.iptable_mangle); } static struct pernet_operations iptable_mangle_net_ops = { diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 993edc23be09..40f2b9f611a2 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -100,7 +100,7 @@ static int __net_init iptable_raw_net_init(struct net *net) static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_raw); + ipt_unregister_table(net, net->ipv4.iptable_raw); } static struct pernet_operations iptable_raw_net_ops = { diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 3bd3d6388da5..7ce2366e4305 100644 --- 
a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -138,7 +138,7 @@ static int __net_init iptable_security_net_init(struct net *net) static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_security); + ipt_unregister_table(net, net->ipv4.iptable_security); } static struct pernet_operations iptable_security_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 9e81e0dfb4ec..85da34fdc755 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -195,7 +195,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net) static void __net_exit nf_nat_rule_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.nat_table); + ipt_unregister_table(net, net->ipv4.nat_table); } static struct pernet_operations nf_nat_rule_net_ops = { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index a825940a92ef..9f1d45f2ba8f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -585,13 +585,14 @@ mark_source_chains(struct xt_table_info *newinfo, } static int -cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +cleanup_match(struct ip6t_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; @@ -737,7 +738,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, err: module_put(t->u.kernel.target->me); cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + IP6T_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -807,7 +808,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, } static int -cleanup_entry(struct ip6t_entry *e, unsigned int *i) +cleanup_entry(struct ip6t_entry *e, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; struct ip6t_entry_target *t; @@ -816,7 +817,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + IP6T_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ip6t_get_target(e); par.target = t->u.kernel.target; @@ -898,7 +899,7 @@ translate_table(struct net *net, if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, net, &i); return ret; } @@ -1293,7 +1294,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + net, NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1353,7 +1354,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1692,14 +1693,15 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, return ret; } -static int compat_check_entry(struct ip6t_entry *e, const char *name, - unsigned int *i) +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name, unsigned int *i) { unsigned int j; int ret; struct xt_mtchk_param mtpar; 
j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; @@ -1716,12 +1718,13 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name, return 0; cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + IP6T_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1810,12 +1813,12 @@ translate_compat_table(const char *name, i = 0; ret = IP6T_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + net, name, &i); if (ret) { j -= i; COMPAT_IP6T_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, compat_release_entry, &j); - IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, net, &i); xt_free_table_info(newinfo); return ret; } @@ -1870,7 +1873,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1886,7 +1889,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -2144,7 +2147,7 @@ out: return ERR_PTR(ret); } -void ip6t_unregister_table(struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -2154,7 +2157,7 @@ void ip6t_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, net, NULL); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index ad378efd0eb8..33ddfe53e18d 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -131,7 +131,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_filter); + ip6t_unregister_table(net, net->ipv6.ip6table_filter); } static struct pernet_operations ip6table_filter_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a929c19d30e3..9bc483f000e5 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -182,7 +182,7 @@ static int __net_init ip6table_mangle_net_init(struct net *net) static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_mangle); + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); } static struct pernet_operations ip6table_mangle_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index ed1a1180f3b3..4c90b552e433 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ 
b/net/ipv6/netfilter/ip6table_raw.c @@ -94,7 +94,7 @@ static int __net_init ip6table_raw_net_init(struct net *net) static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_raw); + ip6t_unregister_table(net, net->ipv6.ip6table_raw); } static struct pernet_operations ip6table_raw_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 41b444c60934..baa8d4ef3b0a 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -134,7 +134,7 @@ static int __net_init ip6table_security_net_init(struct net *net) static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_security); + ip6t_unregister_table(net, net->ipv6.ip6table_security); } static struct pernet_operations ip6table_security_net_ops = { -- cgit v1.2.3 From d2d4e780aff2fab46a792ebc89f80d1a6872b325 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:20:28 +0000 Subject: ide: add drive->pio_mode field Add a pio_mode field to ide_drive_t matching the pio_mode field used in struct ata_device. The validity of the field is currently restricted to the ->set_pio_mode method in the IDE subsystem. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-devsets.c | 2 ++ drivers/ide/ide-probe.c | 2 ++ drivers/ide/ide-xfer-mode.c | 3 +++ include/linux/ide.h | 1 + 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 1099bf7cf968..cb3341ce655c 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -105,6 +105,8 @@ static int set_pio_mode(ide_drive_t *drive, int arg) return -ENOSYS; if (set_pio_mode_abuse(drive->hwif, arg)) { + drive->pio_mode = arg + XFER_PIO_0; + if (arg == 8 || arg == 9) { unsigned long flags; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 4d76ba473097..9a9f10f4cf9f 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1043,6 +1043,8 @@ static void ide_port_init_devices(ide_hwif_t *hwif) if (hwif->host_flags & IDE_HFLAG_NO_UNMASK_IRQS) drive->dev_flags |= IDE_DFLAG_NO_UNMASK; + drive->pio_mode = XFER_PIO_0; + if (port_ops && port_ops->init_dev) port_ops->init_dev(drive); } diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 46d203ce60cc..cdae463f6b41 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -135,6 +135,7 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) * set transfer mode on the device in ->set_pio_mode method...
*/ if (port_ops->set_dma_mode == NULL) { + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return 0; } @@ -142,9 +143,11 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { if (ide_config_drive_speed(drive, mode)) return -1; + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return 0; } else { + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return ide_config_drive_speed(drive, mode); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 0ec612959042..b5d2e9655059 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -515,6 +515,7 @@ struct ide_drive_s { u8 init_speed; /* transfer rate set at boot */ u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ + u8 pio_mode; /* for ->set_pio_mode _only_ */ u8 dn; /* now wide spread use */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ -- cgit v1.2.3 From 3fccaa192b9501e79a57e02e62b6bf420d2b461e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:20:35 +0000 Subject: ide: add drive->dma_mode field Add a dma_mode field to ide_drive_t matching the dma_mode field used in struct ata_device. The validity of the field is currently restricted to the ->set_dma_mode method in the IDE subsystem. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/aec62xx.c | 1 + drivers/ide/ide-xfer-mode.c | 2 ++ include/linux/ide.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 878f8ec6dbe1..4c869872eb9a 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -136,6 +136,7 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) { + drive->dma_mode = pio + XFER_PIO_0; drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0); } diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index cdae463f6b41..c2323869d92a 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -167,9 +167,11 @@ int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { if (ide_config_drive_speed(drive, mode)) return -1; + drive->dma_mode = mode; port_ops->set_dma_mode(drive, mode); return 0; } else { + drive->dma_mode = mode; port_ops->set_dma_mode(drive, mode); return ide_config_drive_speed(drive, mode); } diff --git a/include/linux/ide.h b/include/linux/ide.h index b5d2e9655059..746ef9fdabcb 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -516,6 +516,7 @@ struct ide_drive_s { u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ u8 pio_mode; /* for ->set_pio_mode _only_ */ + u8 dma_mode; /* for ->set_dma_mode _only_ */ u8 dn; /* now wide spread use */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ -- cgit v1.2.3 From e085b3cae85af47eb0a3eda3186bd898310fb322 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 19 Jan 2010 01:44:41 -0800 Subject: ide: change ->set_pio_mode method parameters Change ->set_pio_mode method parameters to match ->set_piomode method used in struct ata_port_operations. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S.
Miller --- drivers/ide/aec62xx.c | 6 +++--- drivers/ide/ali14xx.c | 3 ++- drivers/ide/alim15x3.c | 7 +++---- drivers/ide/amd74xx.c | 4 ++-- drivers/ide/at91_ide.c | 5 +++-- drivers/ide/atiixp.c | 7 ++++--- drivers/ide/au1xxx-ide.c | 5 ++--- drivers/ide/cmd640.c | 3 ++- drivers/ide/cmd64x.c | 4 +++- drivers/ide/cs5520.c | 7 ++++--- drivers/ide/cs5530.c | 7 ++++--- drivers/ide/cs5535.c | 6 +++--- drivers/ide/cs5536.c | 7 ++++--- drivers/ide/cy82c693.c | 5 ++--- drivers/ide/dtc2278.c | 4 ++-- drivers/ide/hpt366.c | 4 ++-- drivers/ide/ht6560b.c | 3 ++- drivers/ide/ide-devsets.c | 4 ++-- drivers/ide/ide-xfer-mode.c | 6 +++--- drivers/ide/it8172.c | 10 +++++----- drivers/ide/it8213.c | 14 +++++++------- drivers/ide/it821x.c | 6 +++--- drivers/ide/jmicron.c | 2 +- drivers/ide/opti621.c | 6 +++--- drivers/ide/palm_bk3710.c | 5 +++-- drivers/ide/pdc202xx_new.c | 4 ++-- drivers/ide/pdc202xx_old.c | 4 ++-- drivers/ide/piix.c | 14 +++++++------- drivers/ide/pmac.c | 5 ++--- drivers/ide/qd65xx.c | 10 ++++------ drivers/ide/sc1200.c | 4 ++-- drivers/ide/scc_pata.c | 6 +++--- drivers/ide/serverworks.c | 5 +++-- drivers/ide/siimage.c | 6 +++--- drivers/ide/sis5513.c | 4 ++-- drivers/ide/sl82c105.c | 5 +++-- drivers/ide/slc90e66.c | 13 +++++++------ drivers/ide/tc86c001.c | 4 ++-- drivers/ide/triflex.c | 4 ++-- drivers/ide/tx4938ide.c | 5 ++--- drivers/ide/tx4939ide.c | 4 ++-- drivers/ide/umc8672.c | 5 +++-- drivers/ide/via82cxxx.c | 6 +++--- include/linux/ide.h | 2 +- 44 files changed, 129 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 4c869872eb9a..3790847361c3 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -134,10 +134,10 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) local_irq_restore(flags); } -static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void aec_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - drive->dma_mode = pio + XFER_PIO_0; - drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0); + drive->dma_mode = drive->pio_mode; + hwif->port_ops->set_dma_mode(drive, drive->dma_mode); } static int init_chipset_aec62xx(struct pci_dev *dev) diff --git a/drivers/ide/ali14xx.c b/drivers/ide/ali14xx.c index 90da1f953ed0..25b9fe3a9f8e 100644 --- a/drivers/ide/ali14xx.c +++ b/drivers/ide/ali14xx.c @@ -109,13 +109,14 @@ static DEFINE_SPINLOCK(ali14xx_lock); * This function computes timing parameters * and sets controller registers accordingly. */ -static void ali14xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ali14xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int driveNum; int time1, time2; u8 param1, param2, param3, param4; unsigned long flags; int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio); /* calculate timing, according to PIO mode */ diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 8f03cce055fa..28cee1055f76 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -63,15 +63,14 @@ static void ali_fifo_control(ide_hwif_t *hwif, ide_drive_t *drive, int on) /** * ali_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Program the controller for the given PIO mode. 
*/ -static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ali_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int bus_speed = ide_pci_clk ? ide_pci_clk : 33; unsigned long T = 1000000 / bus_speed; /* PCI clock based */ @@ -79,7 +78,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 unit = drive->dn & 1; struct ide_timing t; - ide_timing_compute(drive, XFER_PIO_0 + pio, &t, T, 1); + ide_timing_compute(drive, drive->pio_mode, &t, T, 1); t.setup = clamp_val(t.setup, 1, 8) & 7; t.active = clamp_val(t.active, 1, 8) & 7; diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 108e9b676859..3eee7be7ca6f 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -108,9 +108,9 @@ static void amd_set_drive(ide_drive_t *drive, const u8 speed) * amd_set_pio_mode() is a callback from upper layers for PIO-only tuning. */ -static void amd_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void amd_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - amd_set_drive(drive, XFER_PIO_0 + pio); + amd_set_drive(drive, drive->pio_mode); } static void amd7409_cable_detect(struct pci_dev *dev) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 248219a89a68..000a78e5246c 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -172,11 +172,12 @@ static void at91_ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, leave_16bit(chipselect, mode); } -static void at91_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void at91_ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { struct ide_timing *timing; - u8 chipselect = drive->hwif->select_data; + u8 chipselect = hwif->select_data; int use_iordy = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; pdbg("chipselect %u pio %u\n", chipselect, pio); diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c index 837322b10a4c..b6848dfb93b0 100644 --- a/drivers/ide/atiixp.c +++ b/drivers/ide/atiixp.c @@ -42,19 +42,20 @@ static DEFINE_SPINLOCK(atiixp_lock); /** * atiixp_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO mode. */ -static void atiixp_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void atiixp_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long flags; int timing_shift = (drive->dn ^ 1) * 8; u32 pio_timing_data; u16 pio_mode_data; + const u8 pio = drive->pio_mode - XFER_PIO_0; spin_lock_irqsave(&atiixp_lock, flags); diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 87cef0c440ad..c90e9b0a9f6e 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -99,12 +99,11 @@ static void au1xxx_output_data(ide_drive_t *drive, struct ide_cmd *cmd, } #endif -static void au1xxx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void au1xxx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int mem_sttime = 0, mem_stcfg = au_readl(MEM_STCFG2); - /* set pio mode! 
*/ - switch(pio) { + switch (drive->pio_mode - XFER_PIO_0) { case 0: mem_sttime = SBC_IDE_TIMING(PIO0); diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index 1a32d62ed86b..c7d46a3d347a 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -572,9 +572,10 @@ static void cmd640_set_mode(ide_drive_t *drive, unsigned int index, program_drive_counts(drive, index); } -static void cmd640_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cmd640_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned int index = 0, cycle_time; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 b; switch (pio) { diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index 9f89f3116df0..0b11745937e7 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -127,8 +127,10 @@ static void cmd64x_program_timings(ide_drive_t *drive, u8 mode) * Special cases are 8: prefetch off, 9: prefetch on (both never worked) */ -static void cmd64x_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cmd64x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + const u8 pio = drive->pio_mode - XFER_PIO_0; + /* * Filter out the prefetch control values * to prevent PIO5 from being programmed diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index 09f98ed0731f..b8094f049f3e 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -57,11 +57,11 @@ static struct pio_clocks cs5520_pio_clocks[]={ {1, 2, 1} }; -static void cs5520_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5520_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *pdev = to_pci_dev(hwif->dev); int controller = drive->dn > 1 ? 1 : 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* 8bit CAT/CRT - 8bit command timing for channel */ pci_write_config_byte(pdev, 0x62 + controller, @@ -85,7 +85,8 @@ static void cs5520_set_dma_mode(ide_drive_t *drive, const u8 speed) { printk(KERN_ERR "cs55x0: bad ide timing.\n"); - cs5520_set_pio_mode(drive, 0); + drive->pio_mode = XFER_PIO_0 + 0; + cs5520_set_pio_mode(drive->hwif, drive); } static const struct ide_port_ops cs5520_port_ops = { diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index 40bf05eddf6e..4ced40255ad6 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -41,8 +41,8 @@ static unsigned int cs5530_pio_timings[2][5] = { /** * cs5530_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Handles setting of PIO mode for the chipset. * @@ -50,10 +50,11 @@ static unsigned int cs5530_pio_timings[2][5] = { * will have valid default PIO timings set up before we get here. */ -static void cs5530_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5530_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - unsigned long basereg = CS5530_BASEREG(drive->hwif); + unsigned long basereg = CS5530_BASEREG(hwif); unsigned int format = (inl(basereg + 4) >> 31) & 1; + const u8 pio = drive->pio_mode - XFER_PIO_0; outl(cs5530_pio_timings[format][pio], basereg + ((drive->dn & 1)<<3)); } diff --git a/drivers/ide/cs5535.c b/drivers/ide/cs5535.c index b883838adc24..7974415ea89f 100644 --- a/drivers/ide/cs5535.c +++ b/drivers/ide/cs5535.c @@ -142,15 +142,15 @@ static void cs5535_set_dma_mode(ide_drive_t *drive, const u8 speed) /** * cs5535_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * A callback from the upper layers for PIO-only tuning. 
*/ -static void cs5535_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5535_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - cs5535_set_speed(drive, XFER_PIO_0 + pio); + cs5535_set_speed(drive, drive->pio_mode); } static u8 cs5535_cable_detect(ide_hwif_t *hwif) diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index 9623b852c616..b518ef0e9a35 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -125,11 +125,11 @@ static u8 cs5536_cable_detect(ide_hwif_t *hwif) /** * cs5536_set_pio_mode - PIO timing setup + * @hwif: ATA port * @drive: ATA device - * @pio: PIO mode number */ -static void cs5536_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5536_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 drv_timings[5] = { 0x98, 0x55, 0x32, 0x21, 0x20, @@ -143,11 +143,12 @@ static void cs5536_set_pio_mode(ide_drive_t *drive, const u8 pio) 0x99, 0x92, 0x90, 0x22, 0x20, }; - struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); + struct pci_dev *pdev = to_pci_dev(hwif->dev); ide_drive_t *pair = ide_get_pair_dev(drive); int cshift = (drive->dn & 1) ? IDE_CAST_D1_SHIFT : IDE_CAST_D0_SHIFT; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 cast; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 cmd_pio = pio; if (pair) diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c index fbf3dcc26577..ead65c394f00 100644 --- a/drivers/ide/cy82c693.c +++ b/drivers/ide/cy82c693.c @@ -80,9 +80,8 @@ static void cy82c693_set_dma_mode(ide_drive_t *drive, const u8 mode) outb(data, CY82_DATA_PORT); } -static void cy82c693_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cy82c693_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int bus_speed = ide_pci_clk ? ide_pci_clk : 33; const unsigned long T = 1000000 / bus_speed; @@ -101,7 +100,7 @@ static void cy82c693_set_pio_mode(ide_drive_t *drive, const u8 pio) } } - ide_timing_compute(drive, XFER_PIO_0 + pio, &t, T, 1); + ide_timing_compute(drive, drive->pio_mode, &t, T, 1); time_16 = clamp_val(t.recover - 1, 0, 15) | (clamp_val(t.active - 1, 0, 15) << 4); diff --git a/drivers/ide/dtc2278.c b/drivers/ide/dtc2278.c index c6b138122981..6929f7fce93a 100644 --- a/drivers/ide/dtc2278.c +++ b/drivers/ide/dtc2278.c @@ -68,11 +68,11 @@ static void sub22 (char b, char c) static DEFINE_SPINLOCK(dtc2278_lock); -static void dtc2278_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void dtc2278_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long flags; - if (pio >= 3) { + if (drive->pio_mode >= XFER_PIO_3) { spin_lock_irqsave(&dtc2278_lock, flags); /* * This enables PIO mode4 (3?) 
on the first interface diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 4d90ac2dbb1b..f1dec519a9e6 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -651,9 +651,9 @@ static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) pci_write_config_dword(dev, itr_addr, new_itr); } -static void hpt3xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void hpt3xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - hpt3xx_set_mode(drive, XFER_PIO_0 + pio); + hpt3xx_set_mode(drive, drive->pio_mode); } static void hpt3xx_maskproc(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index aafed8060e17..d81e49680c3f 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -279,9 +279,10 @@ static void ht_set_prefetch(ide_drive_t *drive, u8 state) #endif } -static void ht6560b_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ht6560b_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long flags, config; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 timing; switch (pio) { diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index cb3341ce655c..c6935c78757c 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -112,10 +112,10 @@ static int set_pio_mode(ide_drive_t *drive, int arg) /* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */ spin_lock_irqsave(&hwif->lock, flags); - port_ops->set_pio_mode(drive, arg); + port_ops->set_pio_mode(hwif, drive); spin_unlock_irqrestore(&hwif->lock, flags); } else - port_ops->set_pio_mode(drive, arg); + port_ops->set_pio_mode(hwif, drive); } else { int keep_dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index c2323869d92a..a62fb03fc1cc 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -136,7 +136,7 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) */ if (port_ops->set_dma_mode == NULL) { drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return 0; } @@ -144,11 +144,11 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) if (ide_config_drive_speed(drive, mode)) return -1; drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return 0; } else { drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return ide_config_drive_speed(drive, mode); } } diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c index 0d266a5b524d..9dfdc8741a7b 100644 --- a/drivers/ide/it8172.c +++ b/drivers/ide/it8172.c @@ -37,12 +37,12 @@ #define DRV_NAME "IT8172" -static void it8172_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it8172_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u16 drive_enables; u32 drive_timing; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * The highest value of DIOR/DIOW pulse width and recovery time @@ -98,14 +98,14 @@ static void it8172_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x4a, reg4a | u_speed); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); pci_write_config_byte(dev, 0x4a, reg4a & ~a_speed); - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; - 
it8172_set_pio_mode(drive, pio); + it8172_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c index 47976167796a..492c07d5f4f3 100644 --- a/drivers/ide/it8213.c +++ b/drivers/ide/it8213.c @@ -17,15 +17,14 @@ /** * it8213_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO mode. */ -static void it8213_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it8213_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = 0x40; @@ -35,6 +34,7 @@ static void it8213_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 slave_data; static DEFINE_SPINLOCK(tune_lock); int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; static const u8 timings[][2] = { { 0, 0 }, @@ -120,7 +120,6 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x54, reg54 & ~v_flag); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); @@ -132,11 +131,12 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - it8213_set_pio_mode(drive, pio); + it8213_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 51aa745246dc..69becb7b9656 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -228,18 +228,18 @@ static void it821x_clock_strategy(ide_drive_t *drive) /** * it821x_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Tune the host to the desired PIO mode taking into the consideration * the maximum PIO mode supported by the other device on the cable. 
*/ -static void it821x_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it821x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct it821x_dev *itdev = ide_get_hwifdata(hwif); ide_drive_t *pair = ide_get_pair_dev(drive); + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 unit = drive->dn & 1, set_pio = pio; /* Spec says 89 ref driver uses 88 */ diff --git a/drivers/ide/jmicron.c b/drivers/ide/jmicron.c index bf2be6431b20..ebffb904ed24 100644 --- a/drivers/ide/jmicron.c +++ b/drivers/ide/jmicron.c @@ -80,7 +80,7 @@ static u8 jmicron_cable_detect(ide_hwif_t *hwif) return ATA_CBL_PATA80; } -static void jmicron_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void jmicron_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/opti621.c b/drivers/ide/opti621.c index 2052788fab7a..1a53a4c375ed 100644 --- a/drivers/ide/opti621.c +++ b/drivers/ide/opti621.c @@ -62,12 +62,12 @@ static u8 read_reg(int reg) return ret; } -static void opti621_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void opti621_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; ide_drive_t *pair = ide_get_pair_dev(drive); unsigned long flags; - unsigned long mode = XFER_PIO_0 + pio, pair_mode; + unsigned long mode = drive->pio_mode, pair_mode; + const u8 pio = mode - XFER_PIO_0; u8 tim, misc, addr_pio = pio, clk; /* DRDY is default 2 (by OPTi Databook) */ diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index f8eddf05ecb8..0f262d07c378 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -203,12 +203,13 @@ static void palm_bk3710_set_dma_mode(ide_drive_t *drive, u8 xferspeed) } } -static void palm_bk3710_set_pio_mode(ide_drive_t *drive, u8 pio) +static void palm_bk3710_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned int cycle_time; int is_slave = drive->dn & 1; ide_drive_t *mate; - void __iomem *base = (void *)drive->hwif->dma_base; + void __iomem *base = (void *)hwif->dma_base; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * Obtain the drive PIO data for tuning the Palm Chip registers diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index 65ba8239e7b5..874acd2bb6e6 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -167,11 +167,11 @@ static void pdcnew_set_dma_mode(ide_drive_t *drive, const u8 speed) } } -static void pdcnew_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pdcnew_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 adj = (drive->dn & 1) ? 
0x08 : 0x00; + const u8 pio = drive->pio_mode - XFER_PIO_0; if (max_dma_rate(dev) == 4) { set_indexed_reg(hwif, 0x0c + adj, pio_timings[pio].reg0c); diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 1d20594ee420..402aab7f3baa 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -76,9 +76,9 @@ static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) } } -static void pdc202xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pdc202xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - pdc202xx_set_mode(drive, XFER_PIO_0 + pio); + pdc202xx_set_mode(drive, drive->pio_mode); } static int pdc202xx_test_irq(ide_hwif_t *hwif) diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index bf14f39bd3a7..64b3041daa60 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -59,15 +59,14 @@ static int no_piix_dma; /** * piix_set_pio_mode - set host controller for PIO mode + * @port: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO mode based upon the settings done by AMI BIOS. */ -static void piix_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void piix_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = hwif->channel ? 0x42 : 0x40; @@ -77,6 +76,7 @@ static void piix_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 slave_data; static DEFINE_SPINLOCK(tune_lock); int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* ISP RTC */ static const u8 timings[][2]= { @@ -176,7 +176,6 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x54, reg54 & ~v_flag); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); @@ -188,11 +187,12 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - piix_set_pio_mode(drive, pio); + piix_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 7a4e788cab2f..a167968a2d42 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -496,12 +496,11 @@ static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl) /* * Old tuning functions (called on hdparm -p), sets up drive PIO timings */ -static void -pmac_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pmac_ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *tim = ide_timing_find_mode(XFER_PIO_0 + pio); u32 *timings, t; unsigned accessTicks, recTicks; diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 74696edc8d1d..3f0244fd8e62 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -189,15 +189,13 @@ static void qd_set_timing (ide_drive_t *drive, u8 timing) printk(KERN_DEBUG "%s: %#x\n", drive->name, timing); } -static void qd6500_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void qd6500_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { u16 *id = drive->id; int active_time = 175; int recovery_time = 415; /* worst 
case values from the dos driver */ - /* - * FIXME: use "pio" value - */ + /* FIXME: use drive->pio_mode value */ if (!qd_find_disk_type(drive, &active_time, &recovery_time) && (id[ATA_ID_OLD_PIO_MODES] & 0xff) && (id[ATA_ID_FIELD_VALID] & 2) && id[ATA_ID_EIDE_PIO] >= 240) { @@ -211,9 +209,9 @@ static void qd6500_set_pio_mode(ide_drive_t *drive, const u8 pio) active_time, recovery_time)); } -static void qd6580_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void qd6580_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio); unsigned int cycle_time; int active_time = 175; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index d467478d68da..bb0166e460ab 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -193,10 +193,10 @@ static int sc1200_dma_end(ide_drive_t *drive) * will have valid default PIO timings set up before we get here. */ -static void sc1200_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sc1200_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; int mode = -1; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * bad abuse of ->set_pio_mode interface diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 1104bb301eb9..23e16e4460ee 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -199,16 +199,15 @@ scc_ide_outsl(unsigned long port, void *addr, u32 count) /** * scc_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Load the timing settings for this device mode into the * controller. */ -static void scc_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void scc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct scc_ports *ports = ide_get_hwifdata(hwif); unsigned long ctl_base = ports->ctl; unsigned long cckctrl_port = ctl_base + 0xff0; @@ -216,6 +215,7 @@ static void scc_set_pio_mode(ide_drive_t *drive, const u8 pio) unsigned long pioct_port = ctl_base + 0x004; unsigned long reg; int offset; + const u8 pio = drive->pio_mode - XFER_PIO_0; reg = in_be32((void __iomem *)cckctrl_port); if (reg & CCKCTRL_ATACLKOEN) { diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index 657f0433ec50..a56bc51ae032 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -106,12 +106,13 @@ static u8 svwks_csb_check (struct pci_dev *dev) return 0; } -static void svwks_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 pio_modes[] = { 0x5d, 0x47, 0x34, 0x22, 0x20 }; static const u8 drive_pci[] = { 0x41, 0x40, 0x43, 0x42 }; - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 pio = drive->pio_mode - XFER_PIO_0; pci_write_config_byte(dev, drive_pci[drive->dn], pio_modes[pio]); diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index d95df528562f..97266958f744 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -229,19 +229,18 @@ static u8 sil_sata_udma_filter(ide_drive_t *drive) /** * sil_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Load the timing settings for this device mode into the * controller. 
*/ -static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) +static void sil_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u16 tf_speed[] = { 0x328a, 0x2283, 0x1281, 0x10c3, 0x10c1 }; static const u16 data_speed[] = { 0x328a, 0x2283, 0x1104, 0x10c3, 0x10c1 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); ide_drive_t *pair = ide_get_pair_dev(drive); u32 speedt = 0; @@ -249,6 +248,7 @@ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) unsigned long addr = siimage_seldev(drive, 0x04); unsigned long tfaddr = siimage_selreg(hwif, 0x02); unsigned long base = (unsigned long)hwif->hwif_data; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 tf_pio = pio; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; u8 addr_mask = hwif->channel ? (mmio ? 0xF4 : 0x84) diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index 468706082fb5..5a0192060531 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -290,10 +290,10 @@ static void config_drive_art_rwp(ide_drive_t *drive) pci_write_config_byte(dev, 0x4b, rw_prefetch); } -static void sis_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sis_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { config_drive_art_rwp(drive); - sis_program_timings(drive, XFER_PIO_0 + pio); + sis_program_timings(drive, drive->pio_mode); } static void sis_ata133_program_udma_timings(ide_drive_t *drive, const u8 mode) diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 3c2bbf0057ea..419cd3bc6c84 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -63,12 +63,13 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio) /* * Configure the chipset for PIO mode. */ -static void sl82c105_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sl82c105_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long timings = (unsigned long)ide_get_drivedata(drive); int reg = 0x44 + drive->dn * 4; u16 drv_ctrl; + const u8 pio = drive->pio_mode - XFER_PIO_0; drv_ctrl = get_pio_timings(drive, pio); diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index 1ccfb40e7215..019777522cd2 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -18,9 +18,8 @@ static DEFINE_SPINLOCK(slc90e66_lock); -static void slc90e66_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void slc90e66_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = hwif->channel ? 
0x42 : 0x40; @@ -29,6 +28,8 @@ static void slc90e66_set_pio_mode(ide_drive_t *drive, const u8 pio) u16 master_data; u8 slave_data; int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; + /* ISP RTC */ static const u8 timings[][2] = { { 0, 0 }, @@ -98,7 +99,6 @@ static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) } } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_word(dev, 0x48, reg48 & ~u_flag); @@ -106,11 +106,12 @@ static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_word(dev, 0x4a, reg4a & ~a_speed); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - slc90e66_set_pio_mode(drive, pio); + slc90e66_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 05a93d6baecc..f2cb62bf3f22 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -41,9 +41,9 @@ static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) outw(scr, scr_port); } -static void tc86c001_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tc86c001_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - tc86c001_set_mode(drive, XFER_PIO_0 + pio); + tc86c001_set_mode(drive, drive->pio_mode); } /* diff --git a/drivers/ide/triflex.c b/drivers/ide/triflex.c index 8773c3ba7462..d34a7eecdea5 100644 --- a/drivers/ide/triflex.c +++ b/drivers/ide/triflex.c @@ -82,9 +82,9 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) pci_write_config_dword(dev, channel_offset, triflex_timings); } -static void triflex_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void triflex_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - triflex_set_mode(drive, XFER_PIO_0 + pio); + triflex_set_mode(drive, drive->pio_mode); } static const struct ide_port_ops triflex_port_ops = { diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index fd59c0d235b5..326d4683488b 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -56,11 +56,10 @@ static void tx4938ide_tune_ebusc(unsigned int ebus_ch, &tx4938_ebuscptr->cr[ebus_ch]); } -static void tx4938ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tx4938ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct tx4938ide_platform_info *pdata = hwif->dev->platform_data; - u8 safe = pio; + u8 safe = drive->pio_mode - XFER_PIO_0; ide_drive_t *pair; pair = ide_get_pair_dev(drive); diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 64b58ecc3f0e..5228a4786de5 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -104,11 +104,11 @@ static void tx4939ide_writeb(u8 val, void __iomem *base, u32 reg) #define TX4939IDE_BASE(hwif) ((void __iomem *)(hwif)->extra_base) -static void tx4939ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tx4939ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; int is_slave = drive->dn; u32 mask, val; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 safe = pio; ide_drive_t *pair; diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c index 60f936e2319c..47adcd09cb26 100644 --- a/drivers/ide/umc8672.c +++ b/drivers/ide/umc8672.c @@ -104,10 +104,11 @@ static void umc_set_speeds(u8 speeds[]) speeds[0], speeds[1], speeds[2], speeds[3]); } -static void 
umc_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void umc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif, *mate = hwif->mate; + ide_hwif_t *mate = hwif->mate; unsigned long uninitialized_var(flags); + const u8 pio = drive->pio_mode - XFER_PIO_0; printk("%s: setting umc8672 to PIO mode%d (speed %d)\n", drive->name, pio, pio_to_umc[pio]); diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index fbecf8ea8207..6d995fc9d4f5 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -208,15 +208,15 @@ static void via_set_drive(ide_drive_t *drive, const u8 speed) /** * via_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * A callback from the upper layers for PIO-only tuning. */ -static void via_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void via_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - via_set_drive(drive, XFER_PIO_0 + pio); + via_set_drive(drive, drive->pio_mode); } static struct via_isa_bridge *via_config_find(struct pci_dev **isa) diff --git a/include/linux/ide.h b/include/linux/ide.h index 746ef9fdabcb..803ec306883c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -624,7 +624,7 @@ extern const struct ide_tp_ops default_tp_ops; */ struct ide_port_ops { void (*init_dev)(ide_drive_t *); - void (*set_pio_mode)(ide_drive_t *, const u8); + void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); void (*set_dma_mode)(ide_drive_t *, const u8); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); -- cgit v1.2.3 From 8776168ca2151850164af1de5565d01f7b8b2c53 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 19 Jan 2010 01:45:29 -0800 Subject: ide: change ->set_dma_mode method parameters Change ->set_dma_mode method parameters to match ->set_dmamode method used in struct ata_port_operations. 
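In the new convention the mode to program travels in the drive itself rather than being passed as a u8 argument. A minimal sketch of the resulting driver pattern (the foo_* names are hypothetical and the types come from <linux/ide.h>; the stash-and-reuse of drive->dma_mode for PIO setup follows the conversions in this patch):

static void foo_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
{
	/* the requested mode now arrives in the drive, not as an argument */
	const u8 speed = drive->dma_mode;

	/* program hwif/drive timing registers for "speed", as before */
}

static void foo_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
{
	/* drivers with a single shared timing helper stash the PIO mode */
	drive->dma_mode = drive->pio_mode;
	foo_set_dma_mode(hwif, drive);
}

The IDE core likewise sets drive->dma_mode first and then calls port_ops->set_dma_mode(hwif, drive), as in the ide_set_dma_mode() hunk below.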
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/aec62xx.c | 10 +++++----- drivers/ide/alim15x3.c | 6 +++--- drivers/ide/amd74xx.c | 7 ++++--- drivers/ide/atiixp.c | 7 ++++--- drivers/ide/au1xxx-ide.c | 4 ++-- drivers/ide/cmd64x.c | 4 ++-- drivers/ide/cs5520.c | 4 ++-- drivers/ide/cs5530.c | 6 +++--- drivers/ide/cs5535.c | 6 +++--- drivers/ide/cs5536.c | 7 ++++--- drivers/ide/cy82c693.c | 4 ++-- drivers/ide/hpt366.c | 7 ++++--- drivers/ide/icside.c | 3 ++- drivers/ide/ide-xfer-mode.c | 4 ++-- drivers/ide/it8172.c | 4 ++-- drivers/ide/it8213.c | 6 +++--- drivers/ide/it821x.c | 6 ++++-- drivers/ide/jmicron.c | 4 ++-- drivers/ide/palm_bk3710.c | 5 +++-- drivers/ide/pdc202xx_new.c | 4 ++-- drivers/ide/pdc202xx_old.c | 7 ++++--- drivers/ide/piix.c | 6 +++--- drivers/ide/pmac.c | 4 ++-- drivers/ide/sc1200.c | 4 ++-- drivers/ide/scc_pata.c | 6 +++--- drivers/ide/serverworks.c | 4 ++-- drivers/ide/sgiioc4.c | 2 +- drivers/ide/siimage.c | 6 +++--- drivers/ide/sis5513.c | 4 +++- drivers/ide/sl82c105.c | 3 ++- drivers/ide/slc90e66.c | 4 ++-- drivers/ide/tc86c001.c | 7 ++++--- drivers/ide/triflex.c | 8 ++++---- drivers/ide/tx4939ide.c | 4 ++-- drivers/ide/via82cxxx.c | 9 +++++---- include/linux/ide.h | 2 +- 36 files changed, 101 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 3790847361c3..57d00caefc86 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -81,15 +81,15 @@ static u8 pci_bus_clock_list_ultra (u8 speed, struct chipset_bus_clock_list_entr return chipset_table->ultra_settings; } -static void aec6210_set_mode(ide_drive_t *drive, const u8 speed) +static void aec6210_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct chipset_bus_clock_list_entry *bus_clock = host->host_priv; u16 d_conf = 0; u8 ultra = 0, ultra_conf = 0; u8 tmp0 = 0, tmp1 = 0, tmp2 = 0; + const u8 speed = drive->dma_mode; unsigned long flags; local_irq_save(flags); @@ -109,15 +109,15 @@ static void aec6210_set_mode(ide_drive_t *drive, const u8 speed) local_irq_restore(flags); } -static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) +static void aec6260_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct chipset_bus_clock_list_entry *bus_clock = host->host_priv; u8 unit = drive->dn & 1; u8 tmp1 = 0, tmp2 = 0; u8 ultra = 0, drive_conf = 0, ultra_conf = 0; + const u8 speed = drive->dma_mode; unsigned long flags; local_irq_save(flags); @@ -137,7 +137,7 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) static void aec_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { drive->dma_mode = drive->pio_mode; - hwif->port_ops->set_dma_mode(drive, drive->dma_mode); + hwif->port_ops->set_dma_mode(hwif, drive); } static int init_chipset_aec62xx(struct pci_dev *dev) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 28cee1055f76..6f0debae4e27 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -121,16 +121,16 @@ static u8 ali_udma_filter(ide_drive_t *drive) /** * ali_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Configure the hardware for the desired IDE transfer mode. 
*/ -static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void ali_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 speed = drive->dma_mode; u8 speed1 = speed; u8 unit = drive->dn & 1; u8 tmpbyte = 0x00; diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 3eee7be7ca6f..b7e105338205 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -79,14 +79,14 @@ static void amd_set_speed(struct pci_dev *dev, u8 dn, u8 udma_mask, * to a desired transfer mode. It also can be called by upper layers. */ -static void amd_set_drive(ide_drive_t *drive, const u8 speed) +static void amd_set_drive(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); ide_drive_t *peer = ide_get_pair_dev(drive); struct ide_timing t, p; int T, UT; u8 udma_mask = hwif->ultra_mask; + const u8 speed = drive->dma_mode; T = 1000000000 / amd_clock; UT = (udma_mask == ATA_UDMA2) ? T : (T / 2); @@ -110,7 +110,8 @@ static void amd_set_drive(ide_drive_t *drive, const u8 speed) static void amd_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - amd_set_drive(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + amd_set_drive(hwif, drive); } static void amd7409_cable_detect(struct pci_dev *dev) diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c index b6848dfb93b0..15f0ead89f5c 100644 --- a/drivers/ide/atiixp.c +++ b/drivers/ide/atiixp.c @@ -75,21 +75,22 @@ static void atiixp_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * atiixp_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Set a ATIIXP host controller to the desired DMA mode. This involves * programming the right timing data into the PCI configuration space. */ -static void atiixp_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void atiixp_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long flags; int timing_shift = (drive->dn ^ 1) * 8; u32 tmp32; u16 tmp16; u16 udma_ctl = 0; + const u8 speed = drive->dma_mode; spin_lock_irqsave(&atiixp_lock, flags); diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index c90e9b0a9f6e..e2fd378ba9de 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -160,11 +160,11 @@ static void au1xxx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) au_writel(mem_stcfg,MEM_STCFG2); } -static void auide_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void auide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int mem_sttime = 0, mem_stcfg = au_readl(MEM_STCFG2); - switch(speed) { + switch (drive->dma_mode) { #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA case XFER_MW_DMA_2: mem_sttime = SBC_IDE_TIMING(MDMA2); diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index 0b11745937e7..a65a69171250 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -141,12 +141,12 @@ static void cmd64x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) cmd64x_program_timings(drive, XFER_PIO_0 + pio); } -static void cmd64x_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cmd64x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 unit = drive->dn & 0x01; u8 regU = 0, pciU = hwif->channel ? 
UDIDETCR1 : UDIDETCR0; + const u8 speed = drive->dma_mode; pci_read_config_byte(dev, pciU, &regU); regU &= ~(unit ? 0xCA : 0x35); diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index b8094f049f3e..2c1e5f7cd261 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -81,12 +81,12 @@ static void cs5520_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) (cs5520_pio_clocks[pio].assert)); } -static void cs5520_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cs5520_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { printk(KERN_ERR "cs55x0: bad ide timing.\n"); drive->pio_mode = XFER_PIO_0 + 0; - cs5520_set_pio_mode(drive->hwif, drive); + cs5520_set_pio_mode(hwif, drive); } static const struct ide_port_ops cs5520_port_ops = { diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index 4ced40255ad6..4dc4eb92b076 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -100,12 +100,12 @@ out: return mask; } -static void cs5530_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cs5530_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long basereg; unsigned int reg, timings = 0; - switch (mode) { + switch (drive->dma_mode) { case XFER_UDMA_0: timings = 0x00921250; break; case XFER_UDMA_1: timings = 0x00911140; break; case XFER_UDMA_2: timings = 0x00911030; break; @@ -113,7 +113,7 @@ static void cs5530_set_dma_mode(ide_drive_t *drive, const u8 mode) case XFER_MW_DMA_1: timings = 0x00012121; break; case XFER_MW_DMA_2: timings = 0x00002020; break; } - basereg = CS5530_BASEREG(drive->hwif); + basereg = CS5530_BASEREG(hwif); reg = inl(basereg + 4); /* get drive0 config register */ timings |= reg & 0x80000000; /* preserve PIO format bit */ if ((drive->dn & 1) == 0) { /* are we configuring drive0? */ diff --git a/drivers/ide/cs5535.c b/drivers/ide/cs5535.c index 7974415ea89f..740002b2f3e8 100644 --- a/drivers/ide/cs5535.c +++ b/drivers/ide/cs5535.c @@ -129,15 +129,15 @@ static void cs5535_set_speed(ide_drive_t *drive, const u8 speed) /** * cs5535_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Programs the chipset for DMA mode. */ -static void cs5535_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cs5535_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - cs5535_set_speed(drive, speed); + cs5535_set_speed(drive, drive->dma_mode); } /** diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index b518ef0e9a35..70871fbc3c0a 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -173,11 +173,11 @@ static void cs5536_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * cs5536_set_dma_mode - DMA timing setup + * @hwif: ATA port * @drive: ATA device - * @mode: DMA mode */ -static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cs5536_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 udma_timings[6] = { 0xc2, 0xc1, 0xc0, 0xc4, 0xc5, 0xc6, @@ -187,10 +187,11 @@ static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) 0x67, 0x21, 0x20, }; - struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); + struct pci_dev *pdev = to_pci_dev(hwif->dev); int dshift = (drive->dn & 1) ?
IDE_D1_SHIFT : IDE_D0_SHIFT; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 etc; + const u8 mode = drive->dma_mode; cs5536_read(pdev, ETC, &etc); diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c index ead65c394f00..9383f67deae1 100644 --- a/drivers/ide/cy82c693.c +++ b/drivers/ide/cy82c693.c @@ -53,9 +53,9 @@ * set DMA mode a specific channel for CY82C693 */ -static void cy82c693_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cy82c693_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; + const u8 mode = drive->dma_mode; u8 single = (mode & 0x10) >> 4, index = 0, data = 0; index = hwif->channel ? CY82_INDEX_CHANNEL1 : CY82_INDEX_CHANNEL0; diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index f1dec519a9e6..b885c1d548f5 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -627,14 +627,14 @@ static u32 get_speed_setting(u8 speed, struct hpt_info *info) return info->timings->clock_table[info->clock][i]; } -static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) +static void hpt3xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); struct hpt_timings *t = info->timings; u8 itr_addr = 0x40 + (drive->dn * 4); u32 old_itr = 0; + const u8 speed = drive->dma_mode; u32 new_itr = get_speed_setting(speed, info); u32 itr_mask = speed < XFER_MW_DMA_0 ? t->pio_mask : (speed < XFER_UDMA_0 ? t->dma_mask : @@ -653,7 +653,8 @@ static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) static void hpt3xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - hpt3xx_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + hpt3xx_set_mode(hwif, drive); } static void hpt3xx_maskproc(ide_drive_t *drive, int mask) diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 0f67f1abbbd3..26b6c0a1f772 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -185,10 +185,11 @@ static const expansioncard_ops_t icside_ops_arcin_v6 = { * MW1 80 50 50 150 C * MW2 70 25 25 120 C */ -static void icside_set_dma_mode(ide_drive_t *drive, const u8 xfer_mode) +static void icside_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long cycle_time; int use_dma_info = 0; + const u8 xfer_mode = drive->dma_mode; switch (xfer_mode) { case XFER_MW_DMA_2: diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index a62fb03fc1cc..9b549e4d1848 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -168,11 +168,11 @@ int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) if (ide_config_drive_speed(drive, mode)) return -1; drive->dma_mode = mode; - port_ops->set_dma_mode(drive, mode); + port_ops->set_dma_mode(hwif, drive); return 0; } else { drive->dma_mode = mode; - port_ops->set_dma_mode(drive, mode); + port_ops->set_dma_mode(hwif, drive); return ide_config_drive_speed(drive, mode); } } diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c index 9dfdc8741a7b..560e66d07659 100644 --- a/drivers/ide/it8172.c +++ b/drivers/ide/it8172.c @@ -77,14 +77,14 @@ static void it8172_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) pci_write_config_dword(dev, 0x44, drive_timing); } -static void it8172_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it8172_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int a_speed = 3 << (drive->dn * 4); 
int u_flag = 1 << drive->dn; int u_speed = 0; u8 reg48, reg4a; + const u8 speed = drive->dma_mode; pci_read_config_byte(dev, 0x48, &reg48); pci_read_config_byte(dev, 0x4a, &reg4a); diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c index 492c07d5f4f3..46816ba26416 100644 --- a/drivers/ide/it8213.c +++ b/drivers/ide/it8213.c @@ -74,15 +74,14 @@ static void it8213_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * it8213_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the ITE chipset for the DMA mode. */ -static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it8213_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = 0x40; int a_speed = 3 << (drive->dn * 4); @@ -92,6 +91,7 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) int u_speed = 0; u16 reg4042, reg4a; u8 reg48, reg54, reg55; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); pci_read_config_byte(dev, 0x48, &reg48); diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 69becb7b9656..56b79194156b 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -393,14 +393,16 @@ static int it821x_dma_end(ide_drive_t *drive) /** * it821x_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the ITE chipset for the desired DMA mode. */ -static void it821x_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it821x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + const u8 speed = drive->dma_mode; + /* * MWDMA tuning is really hard because our MWDMA and PIO * timings are kept in the same place. We can switch in the diff --git a/drivers/ide/jmicron.c b/drivers/ide/jmicron.c index ebffb904ed24..74c2c4a6d909 100644 --- a/drivers/ide/jmicron.c +++ b/drivers/ide/jmicron.c @@ -86,13 +86,13 @@ static void jmicron_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * jmicron_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @mode: DMA mode * * As the JMicron snoops for timings we don't need to do anything here.
*/ -static void jmicron_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void jmicron_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 0f262d07c378..35448c91b8c8 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -188,10 +188,11 @@ static void palm_bk3710_setpiomode(void __iomem *base, ide_drive_t *mate, writel(val32, base + BK3710_REGRCVR); } -static void palm_bk3710_set_dma_mode(ide_drive_t *drive, u8 xferspeed) +static void palm_bk3710_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int is_slave = drive->dn & 1; - void __iomem *base = (void *)drive->hwif->dma_base; + void __iomem *base = (void *)hwif->dma_base; + const u8 xferspeed = drive->dma_mode; if (xferspeed >= XFER_UDMA_0) { palm_bk3710_setudmamode(base, is_slave, diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index 874acd2bb6e6..9546fe2a93f7 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -129,11 +129,11 @@ static struct udma_timing { { 0x1a, 0x01, 0xcb }, /* UDMA mode 6 */ }; -static void pdcnew_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void pdcnew_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 adj = (drive->dn & 1) ? 0x08 : 0x00; + const u8 speed = drive->dma_mode; /* * IDE core issues SETFEATURES_XFER to the drive first (thanks to diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 402aab7f3baa..07cd37516ba6 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -21,11 +21,11 @@ #define DRV_NAME "pdc202xx_old" -static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) +static void pdc202xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 drive_pci = 0x60 + (drive->dn << 2); + const u8 speed = drive->dma_mode; u8 AP = 0, BP = 0, CP = 0; u8 TA = 0, TB = 0, TC = 0; @@ -78,7 +78,8 @@ static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) static void pdc202xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - pdc202xx_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + pdc202xx_set_mode(hwif, drive); } static int pdc202xx_test_irq(ide_hwif_t *hwif) diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index 64b3041daa60..1bdca49e5a03 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -127,16 +127,15 @@ static void piix_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * piix_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Set a PIIX host controller to the desired DMA mode. This involves * programming the right timing data into the PCI configuration space. */ -static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void piix_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = hwif->channel ? 0x42 : 0x40; int a_speed = 3 << (drive->dn * 4); @@ -147,6 +146,7 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) int sitre; u16 reg4042, reg4a; u8 reg48, reg54, reg55; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); sitre = (reg4042 & 0x4000) ?
1 : 0; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index a167968a2d42..9fae1fb1468b 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -777,14 +777,14 @@ set_timings_mdma(ide_drive_t *drive, int intf_type, u32 *timings, u32 *timings2, #endif } -static void pmac_ide_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void pmac_ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); int ret = 0; u32 *timings, *timings2, tl[2]; u8 unit = drive->dn & 1; + const u8 speed = drive->dma_mode; timings = &pmif->timings[unit]; timings2 = &pmif->timings[unit+2]; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index bb0166e460ab..134f1fd13866 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -122,13 +122,13 @@ out: return mask; } -static void sc1200_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void sc1200_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned int reg, timings; unsigned short pci_clock; unsigned int basereg = hwif->channel ? 0x50 : 0x40; + const u8 mode = drive->dma_mode; static const u32 udma_timing[3][3] = { { 0x00921250, 0x00911140, 0x00911030 }, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 23e16e4460ee..e9d4b441d1c3 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -231,16 +231,15 @@ static void scc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * scc_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Load the timing settings for this device mode into the * controller. */ -static void scc_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void scc_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct scc_ports *ports = ide_get_hwifdata(hwif); unsigned long ctl_base = ports->ctl; unsigned long cckctrl_port = ctl_base + 0xff0; @@ -254,6 +253,7 @@ static void scc_set_dma_mode(ide_drive_t *drive, const u8 speed) int offset, idx; unsigned long reg; unsigned long jcactsel; + const u8 speed = drive->dma_mode; reg = in_be32((void __iomem *)cckctrl_port); if (reg & CCKCTRL_ATACLKOEN) { diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index a56bc51ae032..35fb8dabb55d 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -128,14 +128,14 @@ static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) } } -static void svwks_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void svwks_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 udma_modes[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05 }; static const u8 dma_modes[] = { 0x77, 0x21, 0x20 }; static const u8 drive_pci2[] = { 0x45, 0x44, 0x47, 0x46 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 speed = drive->dma_mode; u8 unit = drive->dn & 1; u8 ultra_enable = 0, ultra_timing = 0, dma_timing = 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b7d61dc64096..e3ea591f66d3 100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -255,7 +255,7 @@ static int sgiioc4_dma_end(ide_drive_t *drive) return dma_stat; } -static void sgiioc4_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sgiioc4_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c 
index 97266958f744..2009ac2ff658 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -289,19 +289,18 @@ static void sil_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * sil_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the SiI chipset for the desired DMA mode. */ -static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sil_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 ultra6[] = { 0x0F, 0x0B, 0x07, 0x05, 0x03, 0x02, 0x01 }; static const u8 ultra5[] = { 0x0C, 0x07, 0x05, 0x04, 0x02, 0x01 }; static const u16 dma[] = { 0x2208, 0x10C2, 0x10C1 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long base = (unsigned long)hwif->hwif_data; u16 ultra = 0, multi = 0; @@ -311,6 +310,7 @@ static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) : (mmio ? 0xB4 : 0x80); unsigned long ma = siimage_seldev(drive, 0x08); unsigned long ua = siimage_seldev(drive, 0x0C); + const u8 speed = drive->dma_mode; scsc = sil_ioread8 (dev, base + (mmio ? 0x4A : 0x8A)); mode = sil_ioread8 (dev, base + addr_mask); diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index 5a0192060531..db7f4e761dbc 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -340,8 +340,10 @@ static void sis_program_udma_timings(ide_drive_t *drive, const u8 mode) sis_ata33_program_udma_timings(drive, mode); } -static void sis_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sis_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + const u8 speed = drive->dma_mode; + if (speed >= XFER_UDMA_0) sis_program_udma_timings(drive, speed); else diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 419cd3bc6c84..f21dc2ad7682 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -92,11 +92,12 @@ static void sl82c105_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /* * Configure the chipset for DMA mode. */ -static void sl82c105_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sl82c105_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static u16 mwdma_timings[] = {0x0707, 0x0201, 0x0200}; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u16 drv_ctrl; + const u8 speed = drive->dma_mode; drv_ctrl = mwdma_timings[speed - XFER_MW_DMA_0]; diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index 019777522cd2..864ffe0e26d9 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -72,14 +72,14 @@ static void slc90e66_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) spin_unlock_irqrestore(&slc90e66_lock, flags); } -static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void slc90e66_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = hwif->channel ? 0x42 : 0x40; int sitre = 0, a_speed = 7 << (drive->dn * 4); int u_speed = 0, u_flag = 1 << drive->dn; u16 reg4042, reg44, reg48, reg4a; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); sitre = (reg4042 & 0x4000) ?
1 : 0; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index f2cb62bf3f22..e444d24934b3 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -13,11 +13,11 @@ #define DRV_NAME "tc86c001" -static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) +static void tc86c001_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; unsigned long scr_port = hwif->config_data + (drive->dn ? 0x02 : 0x00); u16 mode, scr = inw(scr_port); + const u8 speed = drive->dma_mode; switch (speed) { case XFER_UDMA_4: mode = 0x00c0; break; @@ -43,7 +43,8 @@ static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) static void tc86c001_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - tc86c001_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + tc86c001_set_mode(hwif, drive); } /* diff --git a/drivers/ide/triflex.c b/drivers/ide/triflex.c index d34a7eecdea5..7953447eae0f 100644 --- a/drivers/ide/triflex.c +++ b/drivers/ide/triflex.c @@ -34,9 +34,8 @@ #define DRV_NAME "triflex" -static void triflex_set_mode(ide_drive_t *drive, const u8 speed) +static void triflex_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u32 triflex_timings = 0; u16 timing = 0; @@ -44,7 +43,7 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) pci_read_config_dword(dev, channel_offset, &triflex_timings); - switch(speed) { + switch (drive->dma_mode) { case XFER_MW_DMA_2: timing = 0x0103; break; @@ -84,7 +83,8 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) static void triflex_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - triflex_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + triflex_set_mode(hwif, drive); } static const struct ide_port_ops triflex_port_ops = { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 5228a4786de5..f210633a3d57 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -125,10 +125,10 @@ static void tx4939ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /* tx4939ide_tf_load_fixup() will set the Sys_Ctl register */ } -static void tx4939ide_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void tx4939ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; u32 mask, val; + const u8 mode = drive->dma_mode; /* Update Data Transfer Mode for this drive. */ if (mode >= XFER_UDMA_0) diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index 6d995fc9d4f5..6769fe252b07 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -169,22 +169,22 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) /** * via_set_drive - configure transfer mode + * @hwif: port * @drive: Drive to set up - * @speed: desired speed * * via_set_drive() computes timing values configures the chipset to * a desired transfer mode. It also can be called by upper layers. 
*/ -static void via_set_drive(ide_drive_t *drive, const u8 speed) +static void via_set_drive(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; ide_drive_t *peer = ide_get_pair_dev(drive); struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct via82cxxx_dev *vdev = host->host_priv; struct ide_timing t, p; unsigned int T, UT; + const u8 speed = drive->dma_mode; T = 1000000000 / via_clock; @@ -216,7 +216,8 @@ static void via_set_drive(ide_drive_t *drive, const u8 speed) static void via_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - via_set_drive(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + via_set_drive(hwif, drive); } static struct via_isa_bridge *via_config_find(struct pci_dev **isa) diff --git a/include/linux/ide.h b/include/linux/ide.h index 803ec306883c..53ecdba82d72 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -625,7 +625,7 @@ extern const struct ide_tp_ops default_tp_ops; struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); - void (*set_dma_mode)(ide_drive_t *, const u8); + void (*set_dma_mode)(struct hwif_s *, ide_drive_t *); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); -- cgit v1.2.3 From 220c58bc6d1198c4c4e69a385d364602c38b6b1c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:22:38 +0000 Subject: ide: make ide_get_best_pio_mode() static Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-xfer-mode.c | 3 +-- include/linux/ide.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 9b549e4d1848..5fc8d5c17de9 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL(ide_xfer_verbose); * This is used by most chipset support modules when "auto-tuning". */ -u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) +static u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) { u16 *id = drive->id; int pio_mode = -1, overridden = 0; @@ -105,7 +105,6 @@ u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) return pio_mode; } -EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); int ide_pio_need_iordy(ide_drive_t *drive, const u8 pio) { diff --git a/include/linux/ide.h b/include/linux/ide.h index 53ecdba82d72..97e6ab435184 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1496,7 +1496,6 @@ int ide_timing_compute(ide_drive_t *, u8, struct ide_timing *, int, int); #ifdef CONFIG_IDE_XFER_MODE int ide_scan_pio_blacklist(char *); const char *ide_xfer_verbose(u8); -u8 ide_get_best_pio_mode(ide_drive_t *, u8, u8); int ide_pio_need_iordy(ide_drive_t *, const u8); int ide_set_pio_mode(ide_drive_t *, u8); int ide_set_dma_mode(ide_drive_t *, u8); -- cgit v1.2.3 From 11380a4b2d86fae9a6bce75c9373668cc323fe57 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jan 2010 13:46:10 -0800 Subject: net: Unexport napi_gro_flush(). Nothing outside of net/core/dev.c uses it. Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3fccc85b1a0..468a11dea58c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1527,7 +1527,6 @@ extern int netif_rx(struct sk_buff *skb); extern int netif_rx_ni(struct sk_buff *skb); #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); -extern void napi_gro_flush(struct napi_struct *napi); extern gro_result_t dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index a008f6987a95..5747b9edc1bb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2582,7 +2582,7 @@ out: return netif_receive_skb(skb); } -void napi_gro_flush(struct napi_struct *napi) +static void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -2595,7 +2595,6 @@ void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } -EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.3 From 552e450929a7298cc8834fd2824a60b2e914f70e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 20 Jan 2010 13:49:45 -0700 Subject: spi/dw_spi: refine the IRQ mode working flow Now dw_spi core fully supports 3 transfer modes: pure polling, DMA and IRQ mode. IRQ mode will use the FIFO half empty as the IRQ trigger, so each interface driver need set the fifo_len, so that core driver can handle it properly Signed-off-by: Feng Tang Signed-off-by: Grant Likely --- drivers/spi/dw_spi.c | 64 ++++++++++++++++++++++++++++------------------ drivers/spi/dw_spi_pci.c | 1 + include/linux/spi/dw_spi.h | 1 + 3 files changed, 41 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/dw_spi.c b/drivers/spi/dw_spi.c index 521d680af289..1bb709b3920f 100644 --- a/drivers/spi/dw_spi.c +++ b/drivers/spi/dw_spi.c @@ -358,6 +358,8 @@ static void transfer_complete(struct dw_spi *dws) static irqreturn_t interrupt_transfer(struct dw_spi *dws) { u16 irq_status, irq_mask = 0x3f; + u32 int_level = dws->fifo_len / 2; + u32 left; irq_status = dw_readw(dws, isr) & irq_mask; /* Error handling */ @@ -369,22 +371,23 @@ static irqreturn_t interrupt_transfer(struct dw_spi *dws) return IRQ_HANDLED; } - /* INT comes from tx */ - if (dws->tx && (irq_status & SPI_INT_TXEI)) { - while (dws->tx < dws->tx_end) + if (irq_status & SPI_INT_TXEI) { + spi_mask_intr(dws, SPI_INT_TXEI); + + left = (dws->tx_end - dws->tx) / dws->n_bytes; + left = (left > int_level) ? 
int_level : left; + + while (left--) dws->write(dws); + dws->read(dws); - if (dws->tx == dws->tx_end) { - spi_mask_intr(dws, SPI_INT_TXEI); + /* Re-enable the IRQ if there is still data left to tx */ + if (dws->tx_end > dws->tx) + spi_umask_intr(dws, SPI_INT_TXEI); + else transfer_complete(dws); - } } - /* INT comes from rx */ - if (dws->rx && (irq_status & SPI_INT_RXFI)) { - if (dws->read(dws)) - transfer_complete(dws); - } return IRQ_HANDLED; } @@ -428,6 +431,7 @@ static void pump_transfers(unsigned long data) u8 bits = 0; u8 imask = 0; u8 cs_change = 0; + u16 txint_level = 0; u16 clk_div = 0; u32 speed = 0; u32 cr0 = 0; @@ -438,6 +442,9 @@ static void pump_transfers(unsigned long data) chip = dws->cur_chip; spi = message->spi; + if (unlikely(!chip->clk_div)) + chip->clk_div = dws->max_freq / chip->speed_hz; + if (message->state == ERROR_STATE) { message->status = -EIO; goto early_exit; @@ -492,7 +499,7 @@ static void pump_transfers(unsigned long data) /* clk_div doesn't support odd number */ clk_div = dws->max_freq / speed; - clk_div = (clk_div >> 1) << 1; + clk_div = (clk_div + 1) & 0xfffe; chip->speed_hz = speed; chip->clk_div = clk_div; @@ -535,11 +542,16 @@ static void pump_transfers(unsigned long data) /* Check if current transfer is a DMA transaction */ dws->dma_mapped = map_dma_buffers(dws); + /* + * Interrupt mode + * we only need set the TXEI IRQ, as TX/RX always happen syncronizely + */ if (!dws->dma_mapped && !chip->poll_mode) { - if (dws->rx) - imask |= SPI_INT_RXFI; - if (dws->tx) - imask |= SPI_INT_TXEI; + int templen = dws->len / dws->n_bytes; + txint_level = dws->fifo_len / 2; + txint_level = (templen > txint_level) ? txint_level : templen; + + imask |= SPI_INT_TXEI; dws->transfer_handler = interrupt_transfer; } @@ -549,21 +561,23 @@ static void pump_transfers(unsigned long data) * 2. clk_div is changed * 3. control value changes */ - if (dw_readw(dws, ctrl0) != cr0 || cs_change || clk_div) { + if (dw_readw(dws, ctrl0) != cr0 || cs_change || clk_div || imask) { spi_enable_chip(dws, 0); if (dw_readw(dws, ctrl0) != cr0) dw_writew(dws, ctrl0, cr0); + spi_set_clk(dws, clk_div ? clk_div : chip->clk_div); + spi_chip_sel(dws, spi->chip_select); + /* Set the interrupt mask, for poll mode just diable all int */ spi_mask_intr(dws, 0xff); - if (!chip->poll_mode) + if (imask) spi_umask_intr(dws, imask); + if (txint_level) + dw_writew(dws, txfltr, txint_level); - spi_set_clk(dws, clk_div ? 
clk_div : chip->clk_div); - spi_chip_sel(dws, spi->chip_select); spi_enable_chip(dws, 1); - if (cs_change) dws->prev_chip = chip; } @@ -712,11 +726,11 @@ static int dw_spi_setup(struct spi_device *spi) } chip->bits_per_word = spi->bits_per_word; + if (!spi->max_speed_hz) { + dev_err(&spi->dev, "No max speed HZ parameter\n"); + return -EINVAL; + } chip->speed_hz = spi->max_speed_hz; - if (chip->speed_hz) - chip->clk_div = 25000000 / chip->speed_hz; - else - chip->clk_div = 8; /* default value */ chip->tmode = 0; /* Tx & Rx */ /* Default SPI mode is SCPOL = 0, SCPH = 0 */ diff --git a/drivers/spi/dw_spi_pci.c b/drivers/spi/dw_spi_pci.c index 7980f1443ce1..1f0735f9cc76 100644 --- a/drivers/spi/dw_spi_pci.c +++ b/drivers/spi/dw_spi_pci.c @@ -73,6 +73,7 @@ static int __devinit spi_pci_probe(struct pci_dev *pdev, dws->num_cs = 4; dws->max_freq = 25000000; /* for Moorestwon */ dws->irq = pdev->irq; + dws->fifo_len = 40; /* FIFO has 40 words buffer */ ret = dw_spi_add_host(dws); if (ret) diff --git a/include/linux/spi/dw_spi.h b/include/linux/spi/dw_spi.h index 51b3e771a9a3..1a127a31e017 100644 --- a/include/linux/spi/dw_spi.h +++ b/include/linux/spi/dw_spi.h @@ -90,6 +90,7 @@ struct dw_spi { unsigned long paddr; u32 iolen; int irq; + u32 fifo_len; /* depth of the FIFO buffer */ u32 max_freq; /* max bus freq supported */ u16 bus_num; -- cgit v1.2.3 From 3bf127637e22ddf95e67e10a23c339cee3d52429 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 21 Jan 2010 00:02:36 -0800 Subject: Input: sh_keysc - add mode 4 and mode 5 support Add Mode 4 and Mode 5 support to the SH_KEYSC driver. These modes allow slightly larger key pad matrixes. While at it, make use of resource_size(). Signed-off-by: Magnus Damm Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/sh_keysc.c | 6 +++--- include/linux/input/sh_keysc.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/sh_keysc.c b/drivers/input/keyboard/sh_keysc.c index 076111fc72d2..25706f802258 100644 --- a/drivers/input/keyboard/sh_keysc.c +++ b/drivers/input/keyboard/sh_keysc.c @@ -36,6 +36,8 @@ static const struct { [SH_KEYSC_MODE_1] = { 0, 6, 5 }, [SH_KEYSC_MODE_2] = { 1, 5, 6 }, [SH_KEYSC_MODE_3] = { 2, 4, 7 }, + [SH_KEYSC_MODE_4] = { 3, 6, 6 }, + [SH_KEYSC_MODE_5] = { 4, 6, 7 }, }; struct sh_keysc_priv { @@ -122,8 +124,6 @@ static irqreturn_t sh_keysc_isr(int irq, void *dev_id) return IRQ_HANDLED; } -#define res_size(res) ((res)->end - (res)->start + 1) - static int __devinit sh_keysc_probe(struct platform_device *pdev) { struct sh_keysc_priv *priv; @@ -164,7 +164,7 @@ static int __devinit sh_keysc_probe(struct platform_device *pdev) memcpy(&priv->pdata, pdev->dev.platform_data, sizeof(priv->pdata)); pdata = &priv->pdata; - priv->iomem_base = ioremap_nocache(res->start, res_size(res)); + priv->iomem_base = ioremap_nocache(res->start, resource_size(res)); if (priv->iomem_base == NULL) { dev_err(&pdev->dev, "failed to remap I/O memory\n"); error = -ENXIO; diff --git a/include/linux/input/sh_keysc.h b/include/linux/input/sh_keysc.h index c211b5cf08e6..2aff38bcf2ba 100644 --- a/include/linux/input/sh_keysc.h +++ b/include/linux/input/sh_keysc.h @@ -1,10 +1,11 @@ #ifndef __SH_KEYSC_H__ #define __SH_KEYSC_H__ -#define SH_KEYSC_MAXKEYS 30 +#define SH_KEYSC_MAXKEYS 42 struct sh_keysc_info { - enum { SH_KEYSC_MODE_1, SH_KEYSC_MODE_2, SH_KEYSC_MODE_3 } mode; + enum { SH_KEYSC_MODE_1, SH_KEYSC_MODE_2, SH_KEYSC_MODE_3, + SH_KEYSC_MODE_4, SH_KEYSC_MODE_5 } mode; int scan_timing; /* 0 -> 7, 
see KYCR1, SCN[2:0] */ int delay; int kycr2_delay; -- cgit v1.2.3 From 3d45fd804a95055ecab5b3eed81f5ab2dbb047a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 17:12:46 +0100 Subject: sched: Remove the sched_class load_balance methods Take out the sched_class methods for load-balancing. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ------ kernel/sched.c | 26 ------------------- kernel/sched_fair.c | 66 +++++++++++++++++++++++++++---------------------- kernel/sched_idletask.c | 21 ---------------- kernel/sched_rt.c | 20 --------------- 5 files changed, 37 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f842db03ce..50d685cde70e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1087,14 +1087,6 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); - unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio); - - int (*move_one_task) (struct rq *this_rq, int this_cpu, - struct rq *busiest, struct sched_domain *sd, - enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index 13a2acf18b2d..c0be07932a8d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1390,32 +1390,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator); -#endif - /* Time spent by the tasks of the cpu accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... 
user mode */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5116b81d7727..faf9a2f099ab 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1851,6 +1851,24 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: + */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); + + static unsigned long __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, @@ -1929,8 +1947,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, #endif static int -move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { struct cfs_rq *busy_cfs_rq; struct rq_iterator cfs_rq_iterator; @@ -2094,16 +2124,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; + unsigned long total_load_moved = 0, load_moved; int this_best_prio = this_rq->curr->prio; do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, + load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); - class = class->next; + + total_load_moved += load_moved; #ifdef CONFIG_PREEMPT /* @@ -2114,7 +2143,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; #endif - } while (class && max_load_move > total_load_moved); + } while (load_moved && max_load_move > total_load_moved); return total_load_moved > 0; } @@ -2145,25 +2174,6 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - const struct sched_class *class; - - for_each_class(class) { - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - } - - return 0; -} /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain @@ -3873,8 +3883,6 @@ static const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - .load_balance = load_balance_fair, - .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 01332bfc61a7..a8a6d8a50947 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } -#ifdef CONFIG_SMP -static unsigned long -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return 0; -} - -static int -move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - return 0; -} -#endif - static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } @@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - - .load_balance = load_balance_idle, - .move_one_task = move_one_task_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 072b3fcee8d8..502bb614e40a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1481,24 +1481,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static unsigned long -load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - /* don't touch RT tasks */ - return 0; -} - -static int -move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - /* don't touch RT tasks */ - return 0; -} - static void set_cpus_allowed_rt(struct task_struct *p, const struct cpumask *new_mask) { @@ -1746,8 +1728,6 @@ static const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .load_balance = load_balance_rt, - .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, -- cgit v1.2.3 From 7c9414385ebfdd87cc542d4e7e3bb0dbb2d3ce25 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Wed, 20 Jan 2010 13:26:18 +0100 Subject: sched: Remove USER_SCHED Remove the USER_SCHED feature. 
It has been scheduled to be removed in 2.6.34 as per http://marc.info/?l=linux-kernel&m=125728479022976&w=2 Signed-off-by: Dhaval Giani Signed-off-by: Peter Zijlstra LKML-Reference: <1263990378.24844.3.camel@localhost> Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 15 -- include/linux/sched.h | 14 +- init/Kconfig | 81 +++----- kernel/ksysfs.c | 8 - kernel/sched.c | 114 +---------- kernel/sys.c | 5 - kernel/user.c | 305 ----------------------------- 7 files changed, 38 insertions(+), 504 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 870d190fe617..04a3fc3d139b 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,21 +6,6 @@ be removed from this file. --------------------------- -What: USER_SCHED -When: 2.6.34 - -Why: USER_SCHED was implemented as a proof of concept for group scheduling. - The effect of USER_SCHED can already be achieved from userspace with - the help of libcgroup. The removal of USER_SCHED will also simplify - the scheduler code with the removal of one major ifdef. There are also - issues USER_SCHED has with USER_NS. A decision was taken not to fix - those and instead remove USER_SCHED. Also new group scheduling - features will not be implemented for USER_SCHED. - -Who: Dhaval Giani - ---------------------------- - What: PRISM54 When: 2.6.34 diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d685cde70e..8b079735ae5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -731,14 +731,6 @@ struct user_struct { uid_t uid; struct user_namespace *user_ns; -#ifdef CONFIG_USER_SCHED - struct task_group *tg; -#ifdef CONFIG_SYSFS - struct kobject kobj; - struct delayed_work work; -#endif -#endif - #ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif @@ -2502,13 +2494,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED extern struct task_group init_task_group; -#ifdef CONFIG_USER_SCHED -extern struct task_group root_task_group; -extern void set_tg_uid(struct user_struct *user); -#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..e9fa3007a6fc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -435,57 +435,6 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -config GROUP_SCHED - bool "Group CPU scheduler" - depends on EXPERIMENTAL - default n - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. - In order to create a group from arbitrary set of processes, use - CONFIG_CGROUPS. (See Control Group support.) - -config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on GROUP_SCHED - default GROUP_SCHED - -config RT_GROUP_SCHED - bool "Group scheduling for SCHED_RR/FIFO" - depends on EXPERIMENTAL - depends on GROUP_SCHED - default n - help - This feature lets you explicitly allocate real CPU bandwidth - to users or control groups (depending on the "Basis for grouping tasks" - setting below. If enabled, it will also make it impossible to - schedule realtime tasks for non-root users until you allocate - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.txt for more information. 
- -choice - depends on GROUP_SCHED - prompt "Basis for grouping tasks" - default USER_SCHED - -config USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal CPU bandwidth to each user. - -config CGROUP_SCHED - bool "Control groups" - depends on CGROUPS - help - This option allows you to create arbitrary task groups - using the "cgroup" pseudo filesystem and control - the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups/cgroups.txt for more - information on "cgroup" pseudo filesystem. - -endchoice - menuconfig CGROUPS boolean "Control Group support" help @@ -606,6 +555,36 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + depends on EXPERIMENTAL && CGROUPS + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on EXPERIMENTAL + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to users or control groups (depending on the "Basis for grouping tasks" + setting below. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. + +endif #CGROUP_SCHED + endif # CGROUPS config MM_OWNER diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..6b1ccc3f0205 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -197,16 +197,8 @@ static int __init ksysfs_init(void) goto group_exit; } - /* create the /sys/kernel/uids/ directory */ - error = uids_sysfs_init(); - if (error) - goto notes_exit; - return 0; -notes_exit: - if (notes_size > 0) - sysfs_remove_bin_file(kernel_kobj, ¬es_attr); group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: diff --git a/kernel/sched.c b/kernel/sched.c index c0be07932a8d..41e76d325648 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) */ static DEFINE_MUTEX(sched_domains_mutex); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED #include @@ -243,13 +243,7 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED - uid_t uid; -#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ @@ -274,35 +268,7 @@ struct task_group { struct list_head children; }; -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void set_tg_uid(struct user_struct *user) -{ - user->tg->uid = user->uid; -} - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. 
- */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -318,11 +284,7 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ /* * A weight of 0 or 1 can cause arithmetics problems. @@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_USER_SCHED - rcu_read_lock(); - tg = __task_cred(p)->user->tg; - rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) +#ifdef CONFIG_CGROUP_SCHED tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p) return NULL; } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -7678,9 +7636,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_USER_SCHED - alloc_size *= 2; -#endif #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif @@ -7694,13 +7649,6 @@ void __init sched_init(void) init_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -7709,13 +7657,6 @@ void __init sched_init(void) init_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { @@ -7735,22 +7676,13 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&init_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); -#ifdef CONFIG_USER_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); -#ifdef CONFIG_USER_SCHED - INIT_LIST_HEAD(&root_task_group.children); - init_task_group.parent = &root_task_group; - 
list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), @@ -7790,25 +7722,6 @@ void __init sched_init(void) * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - root_task_group.shares = NICE_0_LOAD; - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); - /* - * In case of task-groups formed thr' the user id of tasks, - * init_task_group represents tasks belonging to root user. - * Hence it forms a sibling of all subsequent groups formed. - * In this case, init_task_group gets only a fraction of overall - * system cpu resource, based on the weight assigned to root - * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished - * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_tg_cfs_rq) and having one entity represent this group of - * tasks in rq->cfs (i.e init_task_group->se[] != NULL). - */ - init_tg_cfs_entry(&init_task_group, - &per_cpu(init_tg_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, - root_task_group.se[i]); - #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7817,12 +7730,6 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); - init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq_var, i), - &per_cpu(init_sched_rt_entity, i), i, 1, - root_task_group.rt_se[i]); #endif #endif @@ -8218,7 +8125,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -8327,7 +8234,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares) @@ -8469,13 +8376,6 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } -#ifdef CONFIG_USER_SCHED - if (tg == &root_task_group) { - period = global_rt_period(); - runtime = global_rt_runtime(); - } -#endif - /* * Cannot have more runtime than the period. 
*/ diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..f75bf0936f47 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -569,11 +569,6 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70c..766467b3bcb7 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -56,9 +56,6 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .user_ns = &init_user_ns, -#ifdef CONFIG_USER_SCHED - .tg = &init_task_group, -#endif }; /* @@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up) put_user_ns(up->user_ns); } -#ifdef CONFIG_USER_SCHED - -static void sched_destroy_user(struct user_struct *up) -{ - sched_destroy_group(up->tg); -} - -static int sched_create_user(struct user_struct *up) -{ - int rc = 0; - - up->tg = sched_create_group(&root_task_group); - if (IS_ERR(up->tg)) - rc = -ENOMEM; - - set_tg_uid(up); - - return rc; -} - -#else /* CONFIG_USER_SCHED */ - -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } - -#endif /* CONFIG_USER_SCHED */ - -#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) - -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - /* possibly resurrect an "almost deleted" object */ - if (atomic_inc_return(&user->__count) == 1) - cancel_delayed_work(&user->work); - return user; - } - } - - return NULL; -} - -static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ -static DEFINE_MUTEX(uids_mutex); - -static inline void uids_mutex_lock(void) -{ - mutex_lock(&uids_mutex); -} - -static inline void uids_mutex_unlock(void) -{ - mutex_unlock(&uids_mutex); -} - -/* uid directory attributes */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static ssize_t cpu_shares_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); -} - -static ssize_t cpu_shares_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long shares; - int rc; - - sscanf(buf, "%lu", &shares); - - rc = sched_group_set_shares(up->tg, shares); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_share_attr = - __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static ssize_t cpu_rt_runtime_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); -} - -static ssize_t cpu_rt_runtime_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_runtime; - int rc; - - sscanf(buf, "%ld", &rt_runtime); - - rc = sched_group_set_rt_runtime(up->tg, rt_runtime); - - return (rc ? 
rc : size); -} - -static struct kobj_attribute cpu_rt_runtime_attr = - __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); - -static ssize_t cpu_rt_period_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); -} - -static ssize_t cpu_rt_period_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_period; - int rc; - - sscanf(buf, "%lu", &rt_period); - - rc = sched_group_set_rt_period(up->tg, rt_period); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_rt_period_attr = - __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); -#endif - -/* default attributes per uid directory */ -static struct attribute *uids_attributes[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - &cpu_share_attr.attr, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - &cpu_rt_runtime_attr.attr, - &cpu_rt_period_attr.attr, -#endif - NULL -}; - -/* the lifetime of user_struct is not managed by the core (now) */ -static void uids_release(struct kobject *kobj) -{ - return; -} - -static struct kobj_type uids_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = uids_attributes, - .release = uids_release, -}; - -/* - * Create /sys/kernel/uids//cpu_share file for this user - * We do not create this file for users in a user namespace (until - * sysfs tagging is implemented). - * - * See Documentation/scheduler/sched-design-CFS.txt for ramifications. - */ -static int uids_user_create(struct user_struct *up) -{ - struct kobject *kobj = &up->kobj; - int error; - - memset(kobj, 0, sizeof(struct kobject)); - if (up->user_ns != &init_user_ns) - return 0; - kobj->kset = uids_kset; - error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); - if (error) { - kobject_put(kobj); - goto done; - } - - kobject_uevent(kobj, KOBJ_ADD); -done: - return error; -} - -/* create these entries in sysfs: - * "/sys/kernel/uids" directory - * "/sys/kernel/uids/0" directory (for root user) - * "/sys/kernel/uids/0/cpu_share" file (for root user) - */ -int __init uids_sysfs_init(void) -{ - uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); - if (!uids_kset) - return -ENOMEM; - - return uids_user_create(&root_user); -} - -/* delayed work function to remove sysfs directory for a user and free up - * corresponding structures. - */ -static void cleanup_user_struct(struct work_struct *w) -{ - struct user_struct *up = container_of(w, struct user_struct, work.work); - unsigned long flags; - int remove_user = 0; - - /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() - * atomic. - */ - uids_mutex_lock(); - - spin_lock_irqsave(&uidhash_lock, flags); - if (atomic_read(&up->__count) == 0) { - uid_hash_remove(up); - remove_user = 1; - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - if (!remove_user) - goto done; - - if (up->user_ns == &init_user_ns) { - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); - } - - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - -done: - uids_mutex_unlock(); -} - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. 
- */ -static void free_user(struct user_struct *up, unsigned long flags) -{ - INIT_DELAYED_WORK(&up->work, cleanup_user_struct); - schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); - spin_unlock_irqrestore(&uidhash_lock, flags); -} - -#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ - static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; @@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -int uids_sysfs_init(void) { return 0; } -static inline int uids_user_create(struct user_struct *up) { return 0; } -static inline void uids_mutex_lock(void) { } -static inline void uids_mutex_unlock(void) { } - /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } -#endif - -#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) -/* - * We need to check if a setuid can take place. This function should be called - * before successfully completing the setuid. - */ -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - - return sched_rt_can_attach(up->tg, tsk); - -} -#else -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - return 1; -} -#endif - /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). @@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() * atomic. */ - uids_mutex_lock(); - spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) new->uid = uid; atomic_set(&new->__count, 1); - if (sched_create_user(new) < 0) - goto out_free_user; - new->user_ns = get_user_ns(ns); - if (uids_user_create(new)) - goto out_destoy_sched; - /* * Before adding this, check whether we raced * on adding the same user already.. @@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); } - uids_mutex_unlock(); - return up; -out_destoy_sched: - sched_destroy_user(new); put_user_ns(new->user_ns); -out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: - uids_mutex_unlock(); return NULL; } -- cgit v1.2.3 From 83fe518a839e317480e50a138ef4acd73510d7ce Mon Sep 17 00:00:00 2001 From: George Shore Date: Thu, 21 Jan 2010 11:40:48 +0000 Subject: spi/dw_spi: enable platform specific chipselect. The driver core allows for a platform-specific chipselect assert/deassert function, however the chipselect function in the core doesn't take advantage of this fact. This enables the use of a custom function, should it be defined. 
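For illustration only (not part of this patch): a minimal sketch of what a board-specific hook could look like, assuming a hypothetical GPIO-driven, active-low chipselect line. The GPIO number, the gpio_set_value() wiring and the exact callback signature are assumptions; the only thing taken from the patch itself is that the core invokes dws->cs_control(1) when asserting the chipselect (see the hunk below).

	#include <linux/gpio.h>

	#define BOARD_SPI_CS_GPIO	42	/* hypothetical board wiring */

	/* Drive the (assumed active-low) chipselect: nonzero asserts it. */
	static void board_spi_cs_control(u32 command)
	{
		gpio_set_value(BOARD_SPI_CS_GPIO, command ? 0 : 1);
	}

Board setup code would then point the controller at this hook, e.g. dws->cs_control = board_spi_cs_control; before any transfers are started.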
Signed-off-by: George Shore Signed-off-by: Grant Likely --- include/linux/spi/dw_spi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/dw_spi.h b/include/linux/spi/dw_spi.h index 1a127a31e017..cc813f95a2f2 100644 --- a/include/linux/spi/dw_spi.h +++ b/include/linux/spi/dw_spi.h @@ -172,6 +172,10 @@ static inline void spi_chip_sel(struct dw_spi *dws, u16 cs) { if (cs > dws->num_cs) return; + + if (dws->cs_control) + dws->cs_control(1); + dw_writel(dws, ser, 1 << cs); } -- cgit v1.2.3 From ea87bb7853168434f4a82426dd1ea8421f9e604d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2010 20:58:57 +0000 Subject: sched: Extend enqueue_task to allow head queueing The ability to enqueue a task at the head of a SCHED_FIFO priority list is required to fix some violations of the POSIX scheduling policy. Extend the related functions with a "head" argument. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Tested-by: Carsten Emde Tested-by: Mathias Weber LKML-Reference: <20100120171629.734886007@linutronix.de> --- include/linux/sched.h | 3 ++- kernel/sched.c | 13 +++++++------ kernel/sched_fair.c | 3 ++- kernel/sched_rt.c | 3 ++- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8b079735ae5f..b35c0c7130c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,7 +1067,8 @@ struct sched_domain; struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, + bool head); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); diff --git a/kernel/sched.c b/kernel/sched.c index 41e76d325648..f47560ff3346 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1856,13 +1856,14 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; } @@ -1892,7 +1893,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); + enqueue_task(rq, p, wakeup, false); inc_nr_running(rq); } @@ -4236,7 +4237,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -4280,7 +4281,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -8230,7 +8231,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, 0, false); task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 22231ccb2f98..0e7a7af9cf8b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq) * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 502bb614e40a..38076dabb44a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -878,7 +878,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct sched_rt_entity *rt_se = &p->rt; -- cgit v1.2.3 From d0dd2de0d055f0ffb1e2ecdc21380de9d12a85e2 Mon Sep 17 00:00:00 2001 From: Andriy Tkachuk Date: Wed, 20 Jan 2010 13:55:06 +0200 Subject: mac80211: Account HT Control field in Data frame hdrlen according to 802.11n-2009 ieee80211_hdrlen() should account for the new HT Control field in the 802.11 data frame header introduced by the IEEE 802.11n standard. According to 802.11n-2009, the HT Control field is present in data frames when both of the following are met: 1. It is a QoS data frame. 2. The Order bit is set in the Frame Control field. The change should be fully compatible with legacy non-11n aware frames, because the 802.11-2007 standard states that "all QoS STAs set this subfield to 0". Signed-off-by: Andriy V. Tkachuk Acked-by: Benoit Papillault Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 2 ++ net/wireless/util.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 842701906ae9..19984958ab7b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -138,6 +138,8 @@ #define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 +#define IEEE80211_HT_CTL_LEN 4 + struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; diff --git a/net/wireless/util.c b/net/wireless/util.c index 23557c1d0a9c..be2ab8c59e3a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -227,8 +227,11 @@ unsigned int ieee80211_hdrlen(__le16 fc) if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; - if (ieee80211_is_data_qos(fc)) + if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; + if (ieee80211_has_order(fc)) + hdrlen += IEEE80211_HT_CTL_LEN; + } goto out; } -- cgit v1.2.3 From a271623f871dda970319ca15dfad3a8c8c36249f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 22 Jan 2010 10:13:10 +0000 Subject: netdev: remove certain HAVE_ macros After the netdev_ops compat code, the HAVE_* macros aren't needed; in fact, they _will_ result in compile breakage for out-of-tree drivers. Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 468a11dea58c..b5fb51d0b8b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -621,30 +621,21 @@ struct net_device_ops { struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); -#define HAVE_CHANGE_RX_FLAGS void (*ndo_change_rx_flags)(struct net_device *dev, int flags); -#define HAVE_SET_RX_MODE void (*ndo_set_rx_mode)(struct net_device *dev); -#define HAVE_MULTICAST void (*ndo_set_multicast_list)(struct net_device *dev); -#define HAVE_SET_MAC_ADDR int (*ndo_set_mac_address)(struct net_device *dev, void *addr); -#define HAVE_VALIDATE_ADDR int (*ndo_validate_addr)(struct net_device *dev); -#define HAVE_PRIVATE_IOCTL int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); -#define HAVE_SET_CONFIG int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); -#define HAVE_CHANGE_MTU int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); -#define HAVE_TX_TIMEOUT void (*ndo_tx_timeout) (struct net_device *dev); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); @@ -656,7 +647,6 @@ struct net_device_ops { void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); #ifdef CONFIG_NET_POLL_CONTROLLER -#define HAVE_NETDEV_POLL void (*ndo_poll_controller)(struct net_device *dev); #endif #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) -- cgit v1.2.3 From 9df5f74194871ebd0e51ef5ad2eca5084acaaaba Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 25 Jan 2010 11:42:20 -0600 Subject: mm: add coherence API for DMA to vmalloc/vmap areas On Virtually Indexed architectures (which don't do automatic alias resolution in their caches), we have to flush via the correct virtual address to prepare pages for DMA. On some architectures (like arm) we cannot prevent the CPU from doing data movein along the alias (and thus giving stale read data), so we not only have to introduce a flush API to push dirty cache lines out, but also an invalidate API to kill inconsistent cache lines that may have moved in before DMA changed the data Signed-off-by: James Bottomley --- Documentation/cachetlb.txt | 24 ++++++++++++++++++++++++ include/linux/highmem.h | 6 ++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index da42ab414c48..b231414bb8bc 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -377,3 +377,27 @@ maps this page at its virtual address. All the functionality of flush_icache_page can be implemented in flush_dcache_page and update_mmu_cache. In 2.7 the hope is to remove this interface completely. + +The final category of APIs is for I/O to deliberately aliased address +ranges inside the kernel. Such aliases are set up by use of the +vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O +subsystem assumes that the user mapping and kernel offset mapping are +the only aliases. This isn't true for vmap aliases, so anything in +the kernel trying to do I/O to vmap areas must manually manage +coherency. It must do this by flushing the vmap range before doing +I/O and invalidating it after the I/O returns. 
+ + void flush_kernel_vmap_range(void *vaddr, int size) + flushes the kernel cache for a given virtual address range in + the vmap area. This is to make sure that any data the kernel + modified in the vmap range is made visible to the physical + page. The design is to make this area safe to perform I/O on. + Note that this API does *not* also flush the offset map alias + of the area. + + void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates + the cache for a given virtual address range in the vmap area + which prevents the processor from making the cache stale by + speculatively reading data while the I/O was occurring to the + physical pages. This is only necessary for data reads into the + vmap area. diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 211ff4497269..adfe1013b2bd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -17,6 +17,12 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page static inline void flush_kernel_dcache_page(struct page *page) { } +static inline void flush_kernel_vmap_range(void *vaddr, int size) +{ +} +static inline void invalidate_kernel_vmap_range(void *vaddr, int size) +{ +} #endif #include -- cgit v1.2.3 From 32e7bfc41110bc8f29ec0f293c3bcee6645fef34 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Jan 2010 13:36:10 -0800 Subject: net: use helpers to access uc list V2 This patch introduces three macros to work with uc list from net drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 5 ++--- drivers/net/e1000/e1000_main.c | 4 ++-- drivers/net/igb/igb_main.c | 7 ++++--- drivers/net/ixgbe/ixgbe_common.c | 7 +++---- drivers/net/ixgbe/ixgbe_common.h | 2 +- drivers/net/ixgbe/ixgbe_main.c | 2 +- drivers/net/ixgbe/ixgbe_type.h | 4 ++-- drivers/net/mv643xx_eth.c | 3 +-- drivers/net/niu.c | 4 ++-- drivers/net/stmmac/dwmac1000_core.c | 10 +++++----- drivers/net/virtio_net.c | 12 +++++++----- drivers/s390/net/qeth_l2_main.c | 2 +- include/linux/netdevice.h | 5 +++++ net/core/dev.c | 4 ++-- 14 files changed, 38 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index d83512d3e02f..a7b6b12c1c05 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -48,7 +48,6 @@ #include #include #include -#include #if defined(CONFIG_CNIC) || defined(CONFIG_CNIC_MODULE) #define BCM_CNIC 1 @@ -3579,14 +3578,14 @@ bnx2_set_rx_mode(struct net_device *dev) sort_mode |= BNX2_RPM_SORT_USER0_MC_HSH_EN; } - if (dev->uc.count > BNX2_MAX_UNICAST_ADDRESSES) { + if (netdev_uc_count(dev) > BNX2_MAX_UNICAST_ADDRESSES) { rx_mode |= BNX2_EMAC_RX_MODE_PROMISCUOUS; sort_mode |= BNX2_RPM_SORT_USER0_PROM_EN | BNX2_RPM_SORT_USER0_PROM_VLAN; } else if (!(dev->flags & IFF_PROMISC)) { /* Add all entries into to the match filter list */ i = 0; - list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { bnx2_set_mac_addr(bp, ha->addr, i + BNX2_START_UNICAST_ADDRESS_INDEX); sort_mode |= (1 << diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 87f575ca427d..2ce88c5f75c5 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2139,7 +2139,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) rctl |= E1000_RCTL_VFE; } - if (netdev->uc.count > rar_entries - 1) { + if (netdev_uc_count(netdev) > rar_entries - 1) { rctl |= E1000_RCTL_UPE; } else if (!(netdev->flags & IFF_PROMISC)) { rctl &= ~E1000_RCTL_UPE; @@ -2162,7 +2162,7 @@ 
static void e1000_set_rx_mode(struct net_device *netdev) */ i = 1; if (use_uc) - list_for_each_entry(ha, &netdev->uc.list, list) { + netdev_for_each_uc_addr(ha, netdev) { if (i == rar_entries) break; e1000_rar_set(hw, ha->addr, i++); diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index d9679493c635..01cc29483e2f 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -2905,12 +2905,13 @@ static int igb_write_uc_addr_list(struct net_device *netdev) int count = 0; /* return ENOMEM indicating insufficient memory for addresses */ - if (netdev->uc.count > rar_entries) + if (netdev_uc_count(netdev) > rar_entries) return -ENOMEM; - if (netdev->uc.count && rar_entries) { + if (!netdev_uc_empty(netdev) && rar_entries) { struct netdev_hw_addr *ha; - list_for_each_entry(ha, &netdev->uc.list, list) { + + netdev_for_each_uc_addr(ha, netdev) { if (!rar_entries) break; igb_rar_set_qsel(adapter, ha->addr, diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c index 276c2aaa800b..eb49020903c1 100644 --- a/drivers/net/ixgbe/ixgbe_common.c +++ b/drivers/net/ixgbe/ixgbe_common.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include "ixgbe.h" @@ -1347,7 +1346,7 @@ static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq) /** * ixgbe_update_uc_addr_list_generic - Updates MAC list of secondary addresses * @hw: pointer to hardware structure - * @uc_list: the list of new addresses + * @netdev: pointer to net device structure * * The given list replaces any existing list. Clears the secondary addrs from * receive address registers. Uses unused receive address registers for the @@ -1357,7 +1356,7 @@ static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq) * manually putting the device into promiscuous mode. 
**/ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, - struct list_head *uc_list) + struct net_device *netdev) { u32 i; u32 old_promisc_setting = hw->addr_ctrl.overflow_promisc; @@ -1381,7 +1380,7 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, } /* Add the new addresses */ - list_for_each_entry(ha, uc_list, list) { + netdev_for_each_uc_addr(ha, netdev) { hw_dbg(hw, " Adding the secondary addresses:\n"); ixgbe_add_uc_addr(hw, ha->addr, 0); } diff --git a/drivers/net/ixgbe/ixgbe_common.h b/drivers/net/ixgbe/ixgbe_common.h index dfff0ffaa502..13606d4809c9 100644 --- a/drivers/net/ixgbe/ixgbe_common.h +++ b/drivers/net/ixgbe/ixgbe_common.h @@ -60,7 +60,7 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr func); s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, - struct list_head *uc_list); + struct net_device *netdev); s32 ixgbe_enable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_disable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_enable_rx_dma_generic(struct ixgbe_hw *hw, u32 regval); diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index ee41d331a35f..439645d2aeef 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -2568,7 +2568,7 @@ void ixgbe_set_rx_mode(struct net_device *netdev) IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctrl); /* reprogram secondary unicast list */ - hw->mac.ops.update_uc_addr_list(hw, &netdev->uc.list); + hw->mac.ops.update_uc_addr_list(hw, netdev); /* reprogram multicast list */ addr_count = netdev->mc_count; diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h index b4caa7011a2b..0db67c19b2c4 100644 --- a/drivers/net/ixgbe/ixgbe_type.h +++ b/drivers/net/ixgbe/ixgbe_type.h @@ -30,7 +30,7 @@ #include #include -#include +#include /* Vendor ID */ #define IXGBE_INTEL_VENDOR_ID 0x8086 @@ -2405,7 +2405,7 @@ struct ixgbe_mac_operations { s32 (*set_vmdq)(struct ixgbe_hw *, u32, u32); s32 (*clear_vmdq)(struct ixgbe_hw *, u32, u32); s32 (*init_rx_addrs)(struct ixgbe_hw *); - s32 (*update_uc_addr_list)(struct ixgbe_hw *, struct list_head *); + s32 (*update_uc_addr_list)(struct ixgbe_hw *, struct net_device *); s32 (*update_mc_addr_list)(struct ixgbe_hw *, u8 *, u32, ixgbe_mc_addr_itr); s32 (*enable_mc)(struct ixgbe_hw *); diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index af67af55efe7..e24072a9a979 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -55,7 +55,6 @@ #include #include #include -#include static char mv643xx_eth_driver_name[] = "mv643xx_eth"; static char mv643xx_eth_driver_version[] = "1.4"; @@ -1697,7 +1696,7 @@ static u32 uc_addr_filter_mask(struct net_device *dev) return 0; nibbles = 1 << (dev->dev_addr[5] & 0x0f); - list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { if (memcmp(dev->dev_addr, ha->addr, 5)) return 0; if ((dev->dev_addr[5] ^ ha->addr[5]) & 0xf0) diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 0e260cfbff7b..af9a8647c7e8 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -6372,7 +6372,7 @@ static void niu_set_rx_mode(struct net_device *dev) if ((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 0)) np->flags |= NIU_FLAGS_MCAST; - alt_cnt = dev->uc.count; + alt_cnt = netdev_uc_count(dev); if (alt_cnt > niu_num_alt_addr(np)) { alt_cnt = 0; np->flags |= NIU_FLAGS_PROMISC; @@ -6381,7 +6381,7 @@ static void niu_set_rx_mode(struct net_device *dev) if (alt_cnt) { int index = 0; - 
list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { err = niu_set_alt_mac(np, index, ha->addr); if (err) printk(KERN_WARNING PFX "%s: Error %d " diff --git a/drivers/net/stmmac/dwmac1000_core.c b/drivers/net/stmmac/dwmac1000_core.c index 928eac05b912..d812e9cdb3db 100644 --- a/drivers/net/stmmac/dwmac1000_core.c +++ b/drivers/net/stmmac/dwmac1000_core.c @@ -83,7 +83,7 @@ static void dwmac1000_set_filter(struct net_device *dev) unsigned int value = 0; DBG(KERN_INFO "%s: # mcasts %d, # unicast %d\n", - __func__, dev->mc_count, dev->uc.count); + __func__, dev->mc_count, netdev_uc_count(dev)); if (dev->flags & IFF_PROMISC) value = GMAC_FRAME_FILTER_PR; @@ -117,7 +117,7 @@ static void dwmac1000_set_filter(struct net_device *dev) } /* Handle multiple unicast addresses (perfect filtering)*/ - if (dev->uc.count > GMAC_MAX_UNICAST_ADDRESSES) + if (netdev_uc_count(dev) > GMAC_MAX_UNICAST_ADDRESSES) /* Switch to promiscuous mode is more than 16 addrs are required */ value |= GMAC_FRAME_FILTER_PR; @@ -125,9 +125,9 @@ static void dwmac1000_set_filter(struct net_device *dev) int reg = 1; struct netdev_hw_addr *ha; - list_for_each_entry(ha, &dev->uc.list, list) { - dwmac1000_set_umac_addr(ioaddr, ha->addr, reg); - reg++; + netdev_for_each_uc_addr(ha, dev) { + dwmac1000_set_umac_addr(ioaddr, ha->addr, reg); + reg++; } } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c708ecc3cb2e..088332a943f7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -675,6 +675,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) struct virtio_net_ctrl_mac *mac_data; struct dev_addr_list *addr; struct netdev_hw_addr *ha; + int uc_count; void *buf; int i; @@ -701,8 +702,9 @@ static void virtnet_set_rx_mode(struct net_device *dev) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? "en" : "dis"); + uc_count = netdev_uc_count(dev); /* MAC filter - use one buffer for both lists */ - mac_data = buf = kzalloc(((dev->uc.count + dev->mc_count) * ETH_ALEN) + + mac_data = buf = kzalloc(((uc_count + dev->mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); if (!buf) { dev_warn(&dev->dev, "No memory for MAC address buffer\n"); @@ -712,16 +714,16 @@ static void virtnet_set_rx_mode(struct net_device *dev) sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ - mac_data->entries = dev->uc.count; + mac_data->entries = uc_count; i = 0; - list_for_each_entry(ha, &dev->uc.list, list) + netdev_for_each_uc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, - sizeof(mac_data->entries) + (dev->uc.count * ETH_ALEN)); + sizeof(mac_data->entries) + (uc_count * ETH_ALEN)); /* multicast list and count fill the end */ - mac_data = (void *)&mac_data->macs[dev->uc.count][0]; + mac_data = (void *)&mac_data->macs[uc_count][0]; mac_data->entries = dev->mc_count; addr = dev->mc_list; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index c3258b0dd649..51fde6f2e0b8 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -622,7 +622,7 @@ static void qeth_l2_set_multicast_list(struct net_device *dev) for (dm = dev->mc_list; dm; dm = dm->next) qeth_l2_add_mc(card, dm->da_addr, 0); - list_for_each_entry(ha, &dev->uc.list, list) + netdev_for_each_uc_addr(ha, dev) qeth_l2_add_mc(card, ha->addr, 1); spin_unlock_bh(&card->mclock); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b5fb51d0b8b1..93a32a5ca74f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -263,6 +263,11 @@ struct netdev_hw_addr_list { int count; }; +#define netdev_uc_count(dev) ((dev)->uc.count) +#define netdev_uc_empty(dev) ((dev)->uc.count == 0) +#define netdev_for_each_uc_addr(ha, dev) \ + list_for_each_entry(ha, &dev->uc.list, list) + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ atomic_t hh_refcnt; /* number of users */ diff --git a/net/core/dev.c b/net/core/dev.c index 4fad9db417b1..2cba5c521e56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3665,10 +3665,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ - if (dev->uc.count > 0 && !dev->uc_promisc) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc.count == 0 && dev->uc_promisc) { + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } -- cgit v1.2.3 From abd50713944c8ea9e0af5b7bffa0aacae21cc91a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2010 18:50:16 +0100 Subject: perf: Reimplement frequency driven sampling There was a bug in the old period code that caused intel_pmu_enable_all() or native_write_msr_safe() to show up quite high in the profiles. Staring at that code made my head hurt, so I rewrote it in a hopefully simpler fashion. It's now fully symmetric between tick and overflow driven adjustments and uses less data to boot. The only complication is that it basically wants to do a u128 division. The code approximates that in a rather simple truncate-until-it-fits fashion, taking care to balance the terms while truncating. This version does not generate that sampling artefact.
Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +- kernel/perf_event.c | 132 +++++++++++++++++++++++++++++++-------------- 2 files changed, 94 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c6f812e4d058..72b2615600d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -498,9 +498,8 @@ struct hw_perf_event { atomic64_t period_left; u64 interrupts; - u64 freq_count; - u64 freq_interrupts; - u64 freq_stamp; + u64 freq_time_stamp; + u64 freq_count_stamp; #endif }; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index edc46b92b508..251fb9552492 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task) static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_event *event, u64 events) +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. 
+ */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (atomic64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + event->pmu->read(event); + now = atomic64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. 
- */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); } raw_spin_unlock(&ctx->lock); } @@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (event->attr.freq) { u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; + s64 delta = now - hwc->freq_time_stamp; - hwc->freq_stamp = now; + hwc->freq_time_stamp = now; - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); } /* -- cgit v1.2.3 From 6016a363f6b56b46b24655bcfc0499b715851cf3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 28 Jan 2010 14:06:53 -0700 Subject: of: unify phandle name in struct device_node In struct device_node, the phandle is named 'linux_phandle' for PowerPC and MicroBlaze, and 'node' for SPARC. There is no good reason for the difference, it is just an artifact of the code diverging over a couple of years. This patch renames both to simply .phandle. Note: the .node also existed in PowerPC/MicroBlaze, but the only user seems to be arch/powerpc/platforms/powermac/pfunc_core.c. It doesn't look like the assignment between .linux_phandle and .node is significantly different enough to warrant the separate code paths unless ibm,phandle properties actually appear in Apple device trees. I think it is safe to eliminate the old .node property and use phandle everywhere. Signed-off-by: Grant Likely Acked-by: David S. Miller Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/of_platform.c | 2 +- arch/microblaze/kernel/prom.c | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/prom.c | 6 +++--- arch/powerpc/platforms/cell/spu_manage.c | 6 +++--- arch/powerpc/platforms/powermac/pfunc_core.c | 2 +- arch/sparc/kernel/devices.c | 2 +- arch/sparc/kernel/of_device_32.c | 2 +- arch/sparc/kernel/of_device_64.c | 2 +- arch/sparc/kernel/prom_common.c | 8 ++++---- arch/sparc/kernel/smp_64.c | 2 +- drivers/of/fdt.c | 7 +++---- drivers/sbus/char/openprom.c | 10 +++++----- drivers/video/aty/atyfb_base.c | 2 +- include/linux/of.h | 5 +---- sound/aoa/fabrics/layout.c | 2 +- 16 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c index acf4574d0f18..1c6d684996d7 100644 --- a/arch/microblaze/kernel/of_platform.c +++ b/arch/microblaze/kernel/of_platform.c @@ -185,7 +185,7 @@ EXPORT_SYMBOL(of_find_device_by_node); static int of_dev_phandle_match(struct device *dev, void *data) { phandle *ph = data; - return to_of_device(dev)->node->linux_phandle == *ph; + return to_of_device(dev)->node->phandle == *ph; } struct of_device *of_find_device_by_phandle(phandle ph) diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 46407e643926..6eff83a71218 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -342,7 +342,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) read_lock(&devtree_lock); for (np = allnodes; np != NULL; np = np->allnext) - if (np->linux_phandle == handle) + if (np->phandle == handle) break; of_node_get(np); read_unlock(&devtree_lock); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 1a4fc0d11a03..666d08db319e 100644 --- 
a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -214,7 +214,7 @@ EXPORT_SYMBOL(of_find_device_by_node); static int of_dev_phandle_match(struct device *dev, void *data) { phandle *ph = data; - return to_of_device(dev)->node->linux_phandle == *ph; + return to_of_device(dev)->node->phandle == *ph; } struct of_device *of_find_device_by_phandle(phandle ph) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index deccd91d7e81..1ed2ec2ea05b 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -778,7 +778,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) read_lock(&devtree_lock); for (np = allnodes; np != 0; np = np->allnext) - if (np->linux_phandle == handle) + if (np->phandle == handle) break; of_node_get(np); read_unlock(&devtree_lock); @@ -907,9 +907,9 @@ static int of_finish_dynamic_node(struct device_node *node) if (machine_is(powermac)) return -ENODEV; - /* fix up new node's linux_phandle field */ + /* fix up new node's phandle field */ if ((ibm_phandle = of_get_property(node, "ibm,phandle", NULL))) - node->linux_phandle = *ibm_phandle; + node->phandle = *ibm_phandle; out: of_node_put(parent); diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 4c506c1463cd..891f18e337a2 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -457,7 +457,7 @@ neighbour_spu(int cbe, struct device_node *target, struct device_node *avoid) continue; vic_handles = of_get_property(spu_dn, "vicinity", &lenp); for (i=0; i < (lenp / sizeof(phandle)); i++) { - if (vic_handles[i] == target->linux_phandle) + if (vic_handles[i] == target->phandle) return spu; } } @@ -499,7 +499,7 @@ static void init_affinity_node(int cbe) if (strcmp(name, "spe") == 0) { spu = devnode_spu(cbe, vic_dn); - avoid_ph = last_spu_dn->linux_phandle; + avoid_ph = last_spu_dn->phandle; } else { /* * "mic-tm" and "bif0" nodes do not have @@ -514,7 +514,7 @@ static void init_affinity_node(int cbe) last_spu->has_mem_affinity = 1; spu->has_mem_affinity = 1; } - avoid_ph = vic_dn->linux_phandle; + avoid_ph = vic_dn->phandle; } list_add_tail(&spu->aff_list, &last_spu->aff_list); diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c index 96d5ce50364e..ede49e78a8da 100644 --- a/arch/powerpc/platforms/powermac/pfunc_core.c +++ b/arch/powerpc/platforms/powermac/pfunc_core.c @@ -842,7 +842,7 @@ struct pmf_function *__pmf_find_function(struct device_node *target, list_for_each_entry(func, &dev->functions, link) { if (name && strcmp(name, func->name)) continue; - if (func->phandle && target->node != func->phandle) + if (func->phandle && target->phandle != func->phandle) continue; if ((func->flags & flags) == 0) continue; diff --git a/arch/sparc/kernel/devices.c b/arch/sparc/kernel/devices.c index b171ae8de90d..b062de9424a4 100644 --- a/arch/sparc/kernel/devices.c +++ b/arch/sparc/kernel/devices.c @@ -59,7 +59,7 @@ static int __cpu_find_by(int (*compare)(int, int, void *), void *compare_arg, cur_inst = 0; for_each_node_by_type(dp, "cpu") { - int err = check_cpu_node(dp->node, &cur_inst, + int err = check_cpu_node(dp->phandle, &cur_inst, compare, compare_arg, prom_node, mid); if (!err) { diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c index 4c26eb59e742..09138d403c7f 100644 --- a/arch/sparc/kernel/of_device_32.c +++ b/arch/sparc/kernel/of_device_32.c @@ -433,7 +433,7 @@ build_resources: if 
(!parent) dev_set_name(&op->dev, "root"); else - dev_set_name(&op->dev, "%08x", dp->node); + dev_set_name(&op->dev, "%08x", dp->phandle); if (of_device_register(op)) { printk("%s: Could not register of device.\n", diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 881947e59e95..036f18ae59a6 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -666,7 +666,7 @@ static struct of_device * __init scan_one_device(struct device_node *dp, if (!parent) dev_set_name(&op->dev, "root"); else - dev_set_name(&op->dev, "%08x", dp->node); + dev_set_name(&op->dev, "%08x", dp->phandle); if (of_device_register(op)) { printk("%s: Could not register of device.\n", diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index d80a65d9e893..5832e13dfeeb 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -42,7 +42,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) struct device_node *np; for (np = allnodes; np; np = np->allnext) - if (np->node == handle) + if (np->phandle == handle) break; return np; @@ -89,7 +89,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len void *old_val = prop->value; int ret; - ret = prom_setprop(dp->node, name, val, len); + ret = prom_setprop(dp->phandle, name, val, len); err = -EINVAL; if (ret >= 0) { @@ -236,7 +236,7 @@ static struct device_node * __init prom_create_node(phandle node, dp->name = get_one_property(node, "name"); dp->type = get_one_property(node, "device_type"); - dp->node = node; + dp->phandle = node; dp->properties = build_prop_list(node); @@ -313,7 +313,7 @@ void __init prom_build_devicetree(void) nextp = &allnodes->allnext; allnodes->child = prom_build_tree(allnodes, - prom_getchild(allnodes->node), + prom_getchild(allnodes->phandle), &nextp); of_console_init(); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index aa36223497b9..eb14844a0021 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -370,7 +370,7 @@ static int __cpuinit smp_boot_one_cpu(unsigned int cpu) } else { struct device_node *dp = of_find_node_by_cpuid(cpu); - prom_startcpu(dp->node, entry, cookie); + prom_startcpu(dp->phandle, entry, cookie); } for (timeout = 0; timeout < 50000; timeout++) { diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 616a4767a950..7f8861121a31 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -310,12 +310,11 @@ unsigned long __init unflatten_dt_node(unsigned long mem, __alignof__(struct property)); if (allnextpp) { if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; + if (np->phandle == 0) + np->phandle = *((u32 *)*p); } if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); + np->phandle = *((u32 *)*p); pp->name = pname; pp->length = sz; pp->value = (void *)*p; diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 75ac19b1192f..fc2f676e984d 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -233,7 +233,7 @@ static int opromnext(void __user *argp, unsigned int cmd, struct device_node *dp ph = 0; if (dp) - ph = dp->node; + ph = dp->phandle; data->current_node = dp; *((int *) op->oprom_array) = ph; @@ -256,7 +256,7 @@ static int oprompci2node(void __user *argp, struct device_node *dp, struct openp dp = pci_device_to_OF_node(pdev); data->current_node = dp; - *((int *)op->oprom_array) = dp->node; + 
*((int *)op->oprom_array) = dp->phandle; op->oprom_size = sizeof(int); err = copyout(argp, op, bufsize + sizeof(int)); @@ -273,7 +273,7 @@ static int oprompath2node(void __user *argp, struct device_node *dp, struct open dp = of_find_node_by_path(op->oprom_array); if (dp) - ph = dp->node; + ph = dp->phandle; data->current_node = dp; *((int *)op->oprom_array) = ph; op->oprom_size = sizeof(int); @@ -540,7 +540,7 @@ static int opiocgetnext(unsigned int cmd, void __user *argp) } } if (dp) - nd = dp->node; + nd = dp->phandle; if (copy_to_user(argp, &nd, sizeof(phandle))) return -EFAULT; @@ -570,7 +570,7 @@ static int openprom_bsd_ioctl(struct inode * inode, struct file * file, case OPIOCGETOPTNODE: BUILD_BUG_ON(sizeof(phandle) != sizeof(int)); - if (copy_to_user(argp, &options_node->node, sizeof(phandle))) + if (copy_to_user(argp, &options_node->phandle, sizeof(phandle))) return -EFAULT; return 0; diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 913b4a47ae52..bb20987d58af 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -3104,7 +3104,7 @@ static int __devinit atyfb_setup_sparc(struct pci_dev *pdev, } dp = pci_device_to_OF_node(pdev); - if (node == dp->node) { + if (node == dp->phandle) { struct fb_var_screeninfo *var = &default_var; unsigned int N, P, Q, M, T, R; u32 v_total, h_total; diff --git a/include/linux/of.h b/include/linux/of.h index d4c014a35ea5..dbabf86e0b7a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -39,10 +39,7 @@ struct of_irq_controller; struct device_node { const char *name; const char *type; - phandle node; -#if !defined(CONFIG_SPARC) - phandle linux_phandle; -#endif + phandle phandle; char *full_name; struct property *properties; diff --git a/sound/aoa/fabrics/layout.c b/sound/aoa/fabrics/layout.c index 586965f9605f..7a437da05646 100644 --- a/sound/aoa/fabrics/layout.c +++ b/sound/aoa/fabrics/layout.c @@ -768,7 +768,7 @@ static int check_codec(struct aoa_codec *codec, "required property %s not present\n", propname); return -ENODEV; } - if (*ref != codec->node->linux_phandle) { + if (*ref != codec->node->phandle) { printk(KERN_INFO "snd-aoa-fabric-layout: " "%s doesn't match!\n", propname); return -ENODEV; -- cgit v1.2.3 From 430ad5a600a83956749307b13257c464c3826b55 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 28 Jan 2010 09:32:29 +0800 Subject: perf: Factorize trace events raw sample buffer operations Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to gather the common code that operates on raw events sampling buffer. This cleans up redundant code between regular trace events, syscall events and kprobe events. 
Changelog v1->v2: - Rename function name as per Masami and Frederic's suggestion - Add __kprobes for ftrace_perf_buf_prepare() and make ftrace_perf_buf_submit() inline as per Masami's suggestion - Export ftrace_perf_buf_prepare since modules will use it Signed-off-by: Xiao Guangrong Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <4B60E92D.9000808@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 18 ++++++-- include/trace/ftrace.h | 48 +++------------------ kernel/trace/trace_event_profile.c | 52 ++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 86 +++++--------------------------------- kernel/trace/trace_syscalls.c | 71 +++++-------------------------- 5 files changed, 88 insertions(+), 187 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0a09e758c7d3..cd95919d9ff3 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -5,6 +5,7 @@ #include #include #include +#include struct trace_array; struct tracer; @@ -138,9 +139,6 @@ struct ftrace_event_call { #define FTRACE_MAX_PROFILE_SIZE 2048 -extern char *perf_trace_buf; -extern char *perf_trace_buf_nmi; - #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ @@ -195,6 +193,20 @@ extern void ftrace_profile_disable(int event_id); extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); +extern void * +ftrace_perf_buf_prepare(int size, unsigned short type, int *rctxp, + unsigned long *irq_flags); + +static inline void +ftrace_perf_buf_submit(void *raw_data, int size, int rctx, u64 addr, + u64 count, unsigned long irq_flags) +{ + struct trace_entry *entry = raw_data; + + perf_tp_event(entry->type, addr, count, raw_data, size); + perf_swevent_put_recursion_context(rctx); + local_irq_restore(irq_flags); +} #endif #endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 4a46a60c2077..f2c09e4d656c 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -850,22 +850,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \ proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ - extern int perf_swevent_get_recursion_context(void); \ - extern void perf_swevent_put_recursion_context(int rctx); \ - extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ - struct trace_entry *ent; \ int __entry_size; \ int __data_size; \ - char *trace_buf; \ - char *raw_data; \ - int __cpu; \ int rctx; \ - int pc; \ - \ - pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ @@ -875,42 +865,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \ if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \ "profile buffer not large enough")) \ return; \ - \ - local_irq_save(irq_flags); \ - \ - rctx = perf_swevent_get_recursion_context(); \ - if (rctx < 0) \ - goto end_recursion; \ - \ - __cpu = smp_processor_id(); \ - \ - if (in_nmi()) \ - trace_buf = rcu_dereference(perf_trace_buf_nmi); \ - else \ - trace_buf = rcu_dereference(perf_trace_buf); \ - \ - if (!trace_buf) \ - goto end; \ - \ - raw_data = 
per_cpu_ptr(trace_buf, __cpu); \ - \ - *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ - entry = (struct ftrace_raw_##call *)raw_data; \ - ent = &entry->ent; \ - tracing_generic_entry_update(ent, irq_flags, pc); \ - ent->type = event_call->id; \ - \ + entry = (struct ftrace_raw_##call *)ftrace_perf_buf_prepare( \ + __entry_size, event_call->id, &rctx, &irq_flags); \ + if (!entry) \ + return; \ tstruct \ \ { assign; } \ \ - perf_tp_event(event_call->id, __addr, __count, entry, \ - __entry_size); \ - \ -end: \ - perf_swevent_put_recursion_context(rctx); \ -end_recursion: \ - local_irq_restore(irq_flags); \ + ftrace_perf_buf_submit(entry, __entry_size, rctx, __addr, \ + __count, irq_flags); \ } #undef DEFINE_EVENT diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 9e25573242cf..f0d693005075 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -6,14 +6,12 @@ */ #include +#include #include "trace.h" -char *perf_trace_buf; -EXPORT_SYMBOL_GPL(perf_trace_buf); - -char *perf_trace_buf_nmi; -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); +static char *perf_trace_buf; +static char *perf_trace_buf_nmi; typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; @@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id) } mutex_unlock(&event_mutex); } + +__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) +{ + struct trace_entry *entry; + char *trace_buf, *raw_data; + int pc, cpu; + + pc = preempt_count(); + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(*irq_flags); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + goto err_recursion; + + cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto err; + + raw_data = per_cpu_ptr(trace_buf, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + entry = (struct trace_entry *)raw_data; + tracing_generic_entry_update(entry, *irq_flags, pc); + entry->type = type; + + return raw_data; +err: + perf_swevent_put_recursion_context(*rctxp); +err_recursion: + local_irq_restore(*irq_flags); + return NULL; +} +EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d6266cad6953..2e28ee36646f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1243,14 +1243,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1258,45 +1254,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - 
trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); return 0; } @@ -1308,14 +1275,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1323,46 +1286,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kretprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ret_ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f694f66d75b0..4e332b9e449c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; - char *trace_buf; - char *raw_data; int syscall_nr; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) "profile buffer not large enough")) return; - /* Protect the per cpu buffer, begin the rcu read side 
*/ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + sys_data->enter_event->id, &rctx, &flags); + if (!rec) + return; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysenter_enable(struct ftrace_event_call *call) @@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; - char *trace_buf; - char *raw_data; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) "exit event has grown above profile buffer size")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - - rec = (struct syscall_trace_exit *)raw_data; + rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + sys_data->exit_event->id, &rctx, &flags); + if (!rec) + return; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysexit_enable(struct ftrace_event_call *call) -- cgit v1.2.3 From bb209c8287d2d55ec4a67e3933346e0a3ee0da76 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 26 Jan 2010 17:10:03 +0000 Subject: powerpc/pci: Add calls to set_pcie_port_type() and set_pcie_hotplug_bridge() We are missing these when building the pci_dev from scratch off the Open Firmware device-tree Signed-off-by: Benjamin Herrenschmidt Acked-by: Jesse Barnes --- arch/powerpc/kernel/pci_of_scan.c | 2 ++ drivers/pci/probe.c | 4 ++-- include/linux/pci.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 7311fdfb9bf8..693eb9a25bfa 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -140,6 +140,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, dev->devfn = devfn; dev->multifunction = 0; /* maybe a lie? 
*/ dev->needs_freset = 0; /* pcie fundamental reset required */ + set_pcie_port_type(dev); dev->vendor = get_int_prop(node, "vendor-id", 0xffff); dev->device = get_int_prop(node, "device-id", 0xffff); @@ -164,6 +165,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, /* a PCI-PCI bridge */ dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; dev->rom_base_reg = PCI_ROM_ADDRESS1; + set_pcie_hotplug_bridge(dev); } else if (!strcmp(type, "cardbus")) { dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; } else { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 98ffb2de22e9..446e4a94d7d3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -681,7 +681,7 @@ static void pci_read_irq(struct pci_dev *dev) dev->irq = irq; } -static void set_pcie_port_type(struct pci_dev *pdev) +void set_pcie_port_type(struct pci_dev *pdev) { int pos; u16 reg16; @@ -695,7 +695,7 @@ static void set_pcie_port_type(struct pci_dev *pdev) pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; } -static void set_pcie_hotplug_bridge(struct pci_dev *pdev) +void set_pcie_hotplug_bridge(struct pci_dev *pdev) { int pos; u16 reg16; diff --git a/include/linux/pci.h b/include/linux/pci.h index 174e5392e51e..c1968f464c38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -756,6 +756,10 @@ pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); +/* For use by arch with custom probe code */ +void set_pcie_port_type(struct pci_dev *pdev); +void set_pcie_hotplug_bridge(struct pci_dev *pdev); + /* Functions for PCI Hotplug drivers to use */ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); #ifdef CONFIG_HOTPLUG -- cgit v1.2.3 From 9f41699ed067fa695faff8e2e9981b2550abec62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 15:59:29 +0100 Subject: bitops: Provide compile time HWEIGHT{8,16,32,64} Provide compile time versions of hweight. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner LKML-Reference: <20100122155535.797688466@chello.nl> [ Remove some whitespace damage while we are at it ] Signed-off-by: Ingo Molnar --- include/linux/bitops.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index c05a29cb9bb2..ba0fd1eb4af7 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -25,7 +25,7 @@ static __inline__ int get_bitmask_order(unsigned int count) { int order; - + order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } @@ -33,7 +33,7 @@ static __inline__ int get_bitmask_order(unsigned int count) static __inline__ int get_count_order(unsigned int count) { int order; - + order = fls(count) - 1; if (count & (count - 1)) order++; @@ -45,6 +45,20 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); } +#define HWEIGHT8(w) \ + ( (!!((w) & (1ULL << 0))) + \ + (!!((w) & (1ULL << 1))) + \ + (!!((w) & (1ULL << 2))) + \ + (!!((w) & (1ULL << 3))) + \ + (!!((w) & (1ULL << 4))) + \ + (!!((w) & (1ULL << 5))) + \ + (!!((w) & (1ULL << 6))) + \ + (!!((w) & (1ULL << 7))) ) + +#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8(w >> 8)) +#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16(w >> 16)) +#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32(w >> 32)) + /** * rol32 - rotate a 32-bit value left * @word: value to rotate -- cgit v1.2.3 From 184f412c3341cd24fbd26604634a5800b83dbdc3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Jan 2010 08:39:39 +0100 Subject: perf, x86: Clean up event constraints code a bit - Remove stray debug code - Improve ugly macros a bit - Remove some whitespace damage - (Also fix up some accumulated damage in perf_event.h) Signed-off-by: Ingo Molnar Cc: Stephane Eranian Cc: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/cpu/perf_event.c | 37 ++++++++----------------------------- include/linux/perf_event.h | 24 +++++++++++------------- 2 files changed, 19 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66de282ad2fb..fdbe24842271 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,24 +93,19 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define EVENT_CONSTRAINT(c, n, m) { \ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) +#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->cmask; (e)++) +#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -1276,14 +1271,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; -#if 0 - pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - hwc->idx, - assign ? 'y' : 'n'); -#endif - set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; @@ -1333,14 +1320,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -#if 0 - pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - j, - assign ? 'y' : 'n'); -#endif - set_bit(j, used_mask); if (assign) @@ -2596,9 +2575,9 @@ static const struct pmu pmu = { * validate a single event group * * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 72b2615600d8..953c17731e0d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -290,7 +290,7 @@ struct perf_event_mmap_page { }; #define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) -#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) #define PERF_RECORD_MISC_USER (2 << 0) #define PERF_RECORD_MISC_HYPERVISOR (3 << 0) @@ -356,8 +356,8 @@ enum perf_event_type { * u64 stream_id; * }; */ - PERF_RECORD_THROTTLE = 5, - PERF_RECORD_UNTHROTTLE = 6, + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, /* * struct { @@ -371,10 +371,10 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, tid; + * struct perf_event_header header; + * u32 pid, tid; * - * struct read_format values; + * struct read_format values; * }; */ PERF_RECORD_READ = 8, @@ -412,7 +412,7 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * }; */ - PERF_RECORD_SAMPLE = 9, + PERF_RECORD_SAMPLE = 9, PERF_RECORD_MAX, /* non-ABI */ }; @@ -752,8 +752,7 @@ extern int perf_max_events; extern const struct pmu *hw_perf_event_init(struct perf_event *event); extern void perf_event_task_sched_in(struct task_struct *task); -extern void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next); +extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); extern void perf_event_task_tick(struct task_struct *task); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); @@ -853,8 +852,7 @@ extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern void perf_event_init(void); -extern void perf_tp_event(int event_id, u64 addr, u64 count, - void *record, int entry_size); +extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags @@ -895,13 +893,13 @@ static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } static inline void -perf_bp_event(struct perf_event *event, void *data) { } +perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } -static inline int perf_swevent_get_recursion_context(void) { return -1; } +static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } -- cgit v1.2.3 From 488991e28e55b4fbca8067edf0259f69d1a6f92c Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 29 Jan 2010 09:04:08 +0100 Subject: block: Added in stricter no merge semantics for block I/O Updated 'nomerges' tunable to accept a value of '2' - indicating that _no_ merges at all are to be attempted (not even the simple one-hit cache). The following table illustrates the additional benefit - 5 minute runs of a random I/O load were applied to a dozen devices on a 16-way x86_64 system. 
nomerges Throughput %System Improvement (tput / %sys) -------- ------------ ----------- ------------------------- 0 12.45 MB/sec 0.669365609 1 12.50 MB/sec 0.641519199 0.40% / 2.71% 2 12.52 MB/sec 0.639849750 0.56% / 2.96% Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 14 ++++++++++++++ Documentation/block/queue-sysfs.txt | 10 +++++----- block/blk-sysfs.c | 11 +++++++---- block/elevator.c | 11 ++++++++++- include/linux/blkdev.h | 3 +++ 5 files changed, 39 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index d2f90334bb93..4873c759d535 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -128,3 +128,17 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block//queue/nomerges +Date: January 2010 +Contact: +Description: + Standard I/O elevator operations include attempts to + merge contiguous I/Os. For known random I/O loads these + attempts will always fail and result in extra cycles + being spent in the kernel. This allows one to turn off + this behavior on one of two ways: When set to 1, complex + merge checks are disabled, but the simple one-shot merges + with the previous I/O request are enabled. When set to 2, + all merge tries are disabled. The default value is 0 - + which enables all types of merge tries. diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index e164403f60e1..f65274081c8d 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -25,11 +25,11 @@ size allowed by the hardware. nomerges (RW) ------------- -This enables the user to disable the lookup logic involved with IO merging -requests in the block layer. Merging may still occur through a direct -1-hit cache, since that comes for (almost) free. The IO scheduler will not -waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults -to 0, enabling all merges. +This enables the user to disable the lookup logic involved with IO +merging requests in the block layer. By default (0) all merges are +enabled. When set to 1 only simple one-hit merges will be tried. When +set to 2 no merge algorithms will be tried (including one-hit or more +complex tree/hash lookups). 
nr_requests (RW) ---------------- diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8606c9543fdd..e85442415db3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -189,7 +189,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nomerges(q), page); + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); } static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, @@ -199,10 +200,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&nm, page, count); spin_lock_irq(q->queue_lock); - if (nm) + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) queue_flag_set(QUEUE_FLAG_NOMERGES, q); - else - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + else if (nm) + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/elevator.c b/block/elevator.c index 9ad5ccc4c5ee..ee3a883840f2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -473,6 +473,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) struct request *__rq; int ret; + /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q)) + return ELEVATOR_NO_MERGE; + /* * First try one-hit cache. */ @@ -484,7 +493,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) } } - if (blk_queue_nomerges(q)) + if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ffb13ad35716..f71f5c58620c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -463,6 +463,7 @@ struct request_queue #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_CQ 16 /* hardware does queuing */ #define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 18 /* No extended merges */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ @@ -589,6 +590,8 @@ enum { #define blk_queue_queuing(q) test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) +#define blk_queue_noxmerges(q) \ + test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) -- cgit v1.2.3 From 1f5b8f8a2031ae9507eb67743cad4d424739bfff Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 28 Jan 2010 15:02:41 -0800 Subject: ntp: Make time_esterror and time_maxerror static Make time_esterror and time_maxerror static as no one uses them outside of ntp.c Signed-off-by: John Stultz Cc: richard@rsk.demon.co.uk LKML-Reference: <1264719761.3437.47.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 3 --- kernel/time/ntp.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 94f8faecdcbc..7a082b32d8e1 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -238,9 +238,6 @@ extern int 
tickadj; /* amount of adjustment per tick */ * phase-lock loop variables */ extern int time_status; /* clock synchronization status bits */ -extern long time_maxerror; /* maximum error */ -extern long time_esterror; /* estimated error */ - extern long time_adjust; /* The amount of adjtime left */ extern void ntp_init(void); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910e..74b1b37b1595 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -58,10 +58,10 @@ static s64 time_offset; static long time_constant = 2; /* maximum error (usecs): */ -long time_maxerror = NTP_PHASE_LIMIT; +static long time_maxerror = NTP_PHASE_LIMIT; /* estimated error (usecs): */ -long time_esterror = NTP_PHASE_LIMIT; +static long time_esterror = NTP_PHASE_LIMIT; /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; -- cgit v1.2.3 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hw_breakpoint.h | 2 ++ kernel/hw_breakpoint.c | 52 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. 
+ */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 41235c93e4e9..070ba0621738 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -75,6 +75,8 @@ extern int __register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); +extern int dbg_reserve_bp_slot(struct perf_event *bp); +extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..8a5c7d55ac9f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { -- cgit v1.2.3 From ef7995f4e46b1677f3eaaf547316e1a910b38dcb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 29 Jan 2010 23:59:12 -0800 Subject: Input: implement input filters Sometimes it is desirable to suppress certain events from reaching input handlers and thus user space. One such example is Mac mouse button emulation code which catches certain key presses and converts them into button clicks as if they were emitted by a virtual mouse. 
The original key press events should be completely suppressed, otherwise user space will be confused, and while keyboard driver does it on its own evdev is blissfully unaware of this arrangement. This patch adds notion of 'filter' to the standard input handlers, which may flag event as filtered thus preventing it from reaching other input handlers. Filters don't (nor will they ever) have a notion of priority relative to each other, input core will run all of them first and any one of them may mark event as filtered. This patch is inspired by similar patch by Matthew Garret but the implementation and intended usage are quite different. Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 41 ++++++++++++++++++++++++++++++++++------- include/linux/input.h | 8 ++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 6c161e220868..7080a9d4b840 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -86,12 +86,14 @@ static int input_defuzz_abs_event(int value, int old_val, int fuzz) } /* - * Pass event through all open handles. This function is called with + * Pass event first through all filters and then, if event has not been + * filtered out, through all open handles. This function is called with * dev->event_lock held and interrupts disabled. */ static void input_pass_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { + struct input_handler *handler; struct input_handle *handle; rcu_read_lock(); @@ -99,11 +101,25 @@ static void input_pass_event(struct input_dev *dev, handle = rcu_dereference(dev->grab); if (handle) handle->handler->event(handle, type, code, value); - else - list_for_each_entry_rcu(handle, &dev->h_list, d_node) - if (handle->open) - handle->handler->event(handle, - type, code, value); + else { + bool filtered = false; + + list_for_each_entry_rcu(handle, &dev->h_list, d_node) { + if (!handle->open) + continue; + + handler = handle->handler; + if (!handler->filter) { + if (filtered) + break; + + handler->event(handle, type, code, value); + + } else if (handler->filter(handle, type, code, value)) + filtered = true; + } + } + rcu_read_unlock(); } @@ -990,6 +1006,8 @@ static int input_handlers_seq_show(struct seq_file *seq, void *v) union input_seq_state *state = (union input_seq_state *)&seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); + if (handler->filter) + seq_puts(seq, " (filter)"); if (handler->fops) seq_printf(seq, " Minor=%d", handler->minor); seq_putc(seq, '\n'); @@ -1803,7 +1821,16 @@ int input_register_handle(struct input_handle *handle) error = mutex_lock_interruptible(&dev->mutex); if (error) return error; - list_add_tail_rcu(&handle->d_node, &dev->h_list); + + /* + * Filters go to the head of the list, normal handlers + * to the tail. + */ + if (handler->filter) + list_add_rcu(&handle->d_node, &dev->h_list); + else + list_add_tail_rcu(&handle->d_node, &dev->h_list); + mutex_unlock(&dev->mutex); /* diff --git a/include/linux/input.h b/include/linux/input.h index 7be8a6537b57..6c9d3d49fa91 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1198,6 +1198,8 @@ struct input_handle; * @event: event handler. This method is being called by input core with * interrupts disabled and dev->event_lock spinlock held and so * it may not sleep + * @filter: similar to @event; separates normal event handlers from + * "filters". 
* @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. This function is called by @@ -1219,6 +1221,11 @@ struct input_handle; * same time. All of them will get their copy of input event generated by * the device. * + * The very same structure is used to implement input filters. Input core + * allows filters to run first and will not pass event to regular handlers + * if any of the filters indicate that the event should be filtered (by + * returning %true from their filter() method). + * * Note that input core serializes calls to connect() and disconnect() * methods. */ @@ -1227,6 +1234,7 @@ struct input_handler { void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); + bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); -- cgit v1.2.3 From 99b089c3c38a83ebaeb1cc4584ddcde841626467 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 30 Jan 2010 00:53:29 -0800 Subject: Input: Mac button emulation - implement as an input filter Current implementation of Mac mouse button emulation plugs into legacy keyboard driver, converts certain keys into button events on a separate device, and suppresses the real events from reaching tty. This worked well enough until user space started using evdev which was completely unaware of this arrangement and kept sending original key presses to its users. Change the implementation to use newly added input filter framework so that original key presses are not transmitted to any handlers. As a bonus remove SYSCTL dependencies from the code and use Kconfig instead; also do not create the emulated mouse device until user activates emulation. 
Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 5 - drivers/macintosh/Kconfig | 1 + drivers/macintosh/mac_hid.c | 257 ++++++++++++++++++++++++++++++++------------ include/linux/kbd_kern.h | 3 - 4 files changed, 188 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index f706b1dffdb3..cbf64b985ef4 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -1185,11 +1185,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) rep = (down == 2); -#ifdef CONFIG_MAC_EMUMOUSEBTN - if (mac_hid_mouse_emulate_buttons(1, keycode, down)) - return; -#endif /* CONFIG_MAC_EMUMOUSEBTN */ - if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw) if (emulate_raw(vc, keycode, !down << 7)) if (keycode < BTN_MISC && printk_ratelimit()) diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index 3d906833948d..aa3c27e5255d 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -172,6 +172,7 @@ config INPUT_ADBHID config MAC_EMUMOUSEBTN bool "Support for mouse button 2+3 emulation" + depends on SYSCTL select INPUT help This provides generic support for emulating the 2nd and 3rd mouse diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7b4ef5bb556b..0b210a90aef5 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -13,17 +13,195 @@ #include #include #include -#include - -static struct input_dev *emumousebtn; -static int emumousebtn_input_register(void); static int mouse_emulate_buttons; static int mouse_button2_keycode = KEY_RIGHTCTRL; /* right control key */ static int mouse_button3_keycode = KEY_RIGHTALT; /* right option key */ -static int mouse_last_keycode; -#if defined(CONFIG_SYSCTL) +static struct input_dev *mac_hid_emumouse_dev; + +static int mac_hid_create_emumouse(void) +{ + static struct lock_class_key mac_hid_emumouse_dev_event_class; + static struct lock_class_key mac_hid_emumouse_dev_mutex_class; + int err; + + mac_hid_emumouse_dev = input_allocate_device(); + if (!mac_hid_emumouse_dev) + return -ENOMEM; + + lockdep_set_class(&mac_hid_emumouse_dev->event_lock, + &mac_hid_emumouse_dev_event_class); + lockdep_set_class(&mac_hid_emumouse_dev->mutex, + &mac_hid_emumouse_dev_mutex_class); + + mac_hid_emumouse_dev->name = "Macintosh mouse button emulation"; + mac_hid_emumouse_dev->id.bustype = BUS_ADB; + mac_hid_emumouse_dev->id.vendor = 0x0001; + mac_hid_emumouse_dev->id.product = 0x0001; + mac_hid_emumouse_dev->id.version = 0x0100; + + mac_hid_emumouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + mac_hid_emumouse_dev->keybit[BIT_WORD(BTN_MOUSE)] = + BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + mac_hid_emumouse_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + + err = input_register_device(mac_hid_emumouse_dev); + if (err) { + input_free_device(mac_hid_emumouse_dev); + mac_hid_emumouse_dev = NULL; + return err; + } + + return 0; +} + +static void mac_hid_destroy_emumouse(void) +{ + input_unregister_device(mac_hid_emumouse_dev); + mac_hid_emumouse_dev = NULL; +} + +static bool mac_hid_emumouse_filter(struct input_handle *handle, + unsigned int type, unsigned int code, + int value) +{ + unsigned int btn; + + if (type != EV_KEY) + return false; + + if (code == mouse_button2_keycode) + btn = BTN_MIDDLE; + else if (code == mouse_button3_keycode) + btn = BTN_RIGHT; + else + return false; + + input_report_key(mac_hid_emumouse_dev, btn, value); + input_sync(mac_hid_emumouse_dev); + + 
return true; +} + +static int mac_hid_emumouse_connect(struct input_handler *handler, + struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + /* Don't bind to ourselves */ + if (dev == mac_hid_emumouse_dev) + return -ENODEV; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "mac-button-emul"; + + error = input_register_handle(handle); + if (error) { + printk(KERN_ERR + "mac_hid: Failed to register button emulation handle, " + "error %d\n", error); + goto err_free; + } + + error = input_open_device(handle); + if (error) { + printk(KERN_ERR + "mac_hid: Failed to open input device, error %d\n", + error); + goto err_unregister; + } + + return 0; + + err_unregister: + input_unregister_handle(handle); + err_free: + kfree(handle); + return error; +} + +static void mac_hid_emumouse_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +static const struct input_device_id mac_hid_emumouse_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT, + .evbit = { BIT_MASK(EV_KEY) }, + }, + { }, +}; + +MODULE_DEVICE_TABLE(input, mac_hid_emumouse_ids); + +static struct input_handler mac_hid_emumouse_handler = { + .filter = mac_hid_emumouse_filter, + .connect = mac_hid_emumouse_connect, + .disconnect = mac_hid_emumouse_disconnect, + .name = "mac-button-emul", + .id_table = mac_hid_emumouse_ids, +}; + +static int mac_hid_start_emulation(void) +{ + int err; + + err = mac_hid_create_emumouse(); + if (err) + return err; + + err = input_register_handler(&mac_hid_emumouse_handler); + if (err) { + mac_hid_destroy_emumouse(); + return err; + } + + return 0; +} + +static void mac_hid_stop_emulation(void) +{ + input_unregister_handler(&mac_hid_emumouse_handler); + mac_hid_destroy_emumouse(); +} + +static int mac_hid_toggle_emumouse(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int *valp = table->data; + int old_val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + + if (rc == 0 && write && *valp != old_val) { + if (*valp == 1) + rc = mac_hid_start_emulation(); + else if (*valp == 0) + mac_hid_stop_emulation(); + else + rc = -EINVAL; + } + + /* Restore the old value in case of error */ + if (rc) + *valp = old_val; + + return rc; +} + /* file(s) in /proc/sys/dev/mac_hid */ static ctl_table mac_hid_files[] = { { @@ -31,7 +209,7 @@ static ctl_table mac_hid_files[] = { .data = &mouse_emulate_buttons, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = mac_hid_toggle_emumouse, }, { .procname = "mouse_button2_keycode", @@ -74,73 +252,12 @@ static ctl_table mac_hid_root_dir[] = { static struct ctl_table_header *mac_hid_sysctl_header; -#endif /* endif CONFIG_SYSCTL */ - -int mac_hid_mouse_emulate_buttons(int caller, unsigned int keycode, int down) -{ - switch (caller) { - case 1: - /* Called from keyboard.c */ - if (mouse_emulate_buttons - && (keycode == mouse_button2_keycode - || keycode == mouse_button3_keycode)) { - if (mouse_emulate_buttons == 1) { - input_report_key(emumousebtn, - keycode == mouse_button2_keycode ? BTN_MIDDLE : BTN_RIGHT, - down); - input_sync(emumousebtn); - return 1; - } - mouse_last_keycode = down ? 
keycode : 0; - } - break; - } - return 0; -} - -static struct lock_class_key emumousebtn_event_class; -static struct lock_class_key emumousebtn_mutex_class; - -static int emumousebtn_input_register(void) -{ - int ret; - - emumousebtn = input_allocate_device(); - if (!emumousebtn) - return -ENOMEM; - - lockdep_set_class(&emumousebtn->event_lock, &emumousebtn_event_class); - lockdep_set_class(&emumousebtn->mutex, &emumousebtn_mutex_class); - - emumousebtn->name = "Macintosh mouse button emulation"; - emumousebtn->id.bustype = BUS_ADB; - emumousebtn->id.vendor = 0x0001; - emumousebtn->id.product = 0x0001; - emumousebtn->id.version = 0x0100; - - emumousebtn->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); - emumousebtn->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | - BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); - emumousebtn->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); - - ret = input_register_device(emumousebtn); - if (ret) - input_free_device(emumousebtn); - - return ret; -} static int __init mac_hid_init(void) { - int err; - - err = emumousebtn_input_register(); - if (err) - return err; - -#if defined(CONFIG_SYSCTL) mac_hid_sysctl_header = register_sysctl_table(mac_hid_root_dir); -#endif /* CONFIG_SYSCTL */ + if (!mac_hid_sysctl_header) + return -ENOMEM; return 0; } diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index 8bdb16bfe5fb..506ad20c18f8 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -161,7 +161,4 @@ static inline void con_schedule_flip(struct tty_struct *t) schedule_delayed_work(&t->buf.work, 0); } -/* mac_hid.c */ -extern int mac_hid_mouse_emulate_buttons(int, unsigned int, int); - #endif -- cgit v1.2.3 From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 27 Jan 2010 16:25:22 -0600 Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? 
When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kgdb.c | 6 +++--- kernel/softlockup.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7bba93929b..89232151a9d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -310,6 +310,7 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); +extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void __user *buffer, @@ -323,6 +324,9 @@ static inline void softlockup_tick(void) static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) +{ +} static inline void touch_all_softlockup_watchdogs(void) { } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..87f2cc557553 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1450,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1550,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..0d4c7898ab80 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + 
__raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } -- cgit v1.2.3 From 61ef2489dbf587258526cfd4ebf4bba3b079f401 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 22 Jan 2010 16:16:19 +0800 Subject: resources: introduce generic page_is_ram() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's based on walk_system_ram_range(), for archs that don't have their own page_is_ram(). The static versions in MIPS and SCORE are also made global. v4: prefer plain 1 instead of PAGE_IS_RAM (H. Peter Anvin) v3: add comment (KAMEZAWA Hiroyuki) "AFAIK, this "System RAM" information has been used for kdump to grab valid memory area and seems good for the kernel itself." v2: add PAGE_IS_RAM macro (Américo Wang) Cc: Chen Liqin Cc: Lennox Wu Cc: Américo Wang Cc: linux-mips@linux-mips.org Cc: Yinghai Lu Acked-by: Ralf Baechle Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang LKML-Reference: <20100122081619.GA6431@localhost> Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/mips/mm/init.c | 2 +- arch/score/mm/init.c | 2 +- include/linux/ioport.h | 2 ++ kernel/resource.c | 13 +++++++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 15aa1902a788..4d72aabe8352 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -294,7 +294,7 @@ void __init fixrange_init(unsigned long start, unsigned long end, } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 4e3dcd0c4716..f684a590c21d 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -59,7 +59,7 @@ static unsigned long setup_zero_page(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { if (pagenr >= min_low_pfn && pagenr < max_low_pfn) return 1; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 83aa81297ea3..11ef7952b63a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,5 +188,7 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int page_is_ram(unsigned long pfn); + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index fb11a58b9594..b4d637a55256 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -297,6 +297,19 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, #endif +static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + return 1; +} +/* + * This generic page_is_ram() returns true if specified address is + * registered as "System RAM" in iomem_resource list. + */ +int __attribute__((weak)) page_is_ram(unsigned long pfn) +{ + return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; +} + /* * Find empty slot in the resource tree given range and alignment.
*/ -- cgit v1.2.3 From 53df8fdc15fb646b0219e43c989c2cdab1ab100c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Jan 2010 11:06:39 +0800 Subject: Move page_is_ram() declaration to mm.h Move page_is_ram() declaration to mm.h, it makes no sense in . Signed-off-by: Wu Fengguang LKML-Reference: <20100127030639.GD8132@localhost> Signed-off-by: H. Peter Anvin --- include/linux/ioport.h | 2 -- include/linux/mm.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 11ef7952b63a..83aa81297ea3 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,7 +188,5 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); -extern int page_is_ram(unsigned long pfn); - #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 24c395694f4d..bad433fdbfce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -265,6 +265,8 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } +extern int page_is_ram(unsigned long pfn); + /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); -- cgit v1.2.3 From b79c7adf82e8b8a6d6ad1dadf7e687a4a030cf8c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 2 Feb 2010 13:01:25 +0900 Subject: mtd: trivial sh_flctl changes This patch contains a few changes for the sh_flctl driver: - not sh7723-only driver - get rid of kconfig dependency - use dev_err() instead of printk() - use __devinit and __devexit for probe()/remove() - fix probe() return values Signed-off-by: Magnus Damm Acked-by: Yoshihiro Shimoda Signed-off-by: Paul Mundt --- drivers/mtd/nand/Kconfig | 4 ++-- drivers/mtd/nand/sh_flctl.c | 42 +++++++++++++++++++++++------------------- include/linux/mtd/sh_flctl.h | 1 + 3 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 677cd53f18c3..bb6465604235 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -457,10 +457,10 @@ config MTD_NAND_NOMADIK config MTD_NAND_SH_FLCTL tristate "Support for NAND on Renesas SuperH FLCTL" - depends on MTD_NAND && SUPERH && CPU_SUBTYPE_SH7723 + depends on MTD_NAND && SUPERH help Several Renesas SuperH CPU has FLCTL. This option enables support - for NAND Flash using FLCTL. This driver support SH7723. + for NAND Flash using FLCTL. config MTD_NAND_DAVINCI tristate "Support NAND on DaVinci SoC" diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index 02bef21f2e4b..ab068a503b29 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -1,10 +1,10 @@ /* * SuperH FLCTL nand controller * - * Copyright © 2008 Renesas Solutions Corp. - * Copyright © 2008 Atom Create Engineering Co., Ltd. + * Copyright (c) 2008 Renesas Solutions Corp. + * Copyright (c) 2008 Atom Create Engineering Co., Ltd. 
* - * Based on fsl_elbc_nand.c, Copyright © 2006-2007 Freescale Semiconductor + * Based on fsl_elbc_nand.c, Copyright (c) 2006-2007 Freescale Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -75,6 +75,11 @@ static void start_translation(struct sh_flctl *flctl) writeb(TRSTRT, FLTRCR(flctl)); } +static void timeout_error(struct sh_flctl *flctl, const char *str) +{ + dev_err(&flctl->pdev->dev, "Timeout occured in %s\n", str); +} + static void wait_completion(struct sh_flctl *flctl) { uint32_t timeout = LOOP_TIMEOUT_MAX; @@ -87,7 +92,7 @@ static void wait_completion(struct sh_flctl *flctl) udelay(1); } - printk(KERN_ERR "wait_completion(): Timeout occured \n"); + timeout_error(flctl, __func__); writeb(0x0, FLTRCR(flctl)); } @@ -132,7 +137,7 @@ static void wait_rfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_rfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static void wait_wfifo_ready(struct sh_flctl *flctl) @@ -146,7 +151,7 @@ static void wait_wfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static int wait_recfifo_ready(struct sh_flctl *flctl, int sector_number) @@ -198,7 +203,7 @@ static int wait_recfifo_ready(struct sh_flctl *flctl, int sector_number) writel(0, FL4ECCCR(flctl)); } - printk(KERN_ERR "wait_recfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); return 1; /* timeout */ } @@ -214,7 +219,7 @@ static void wait_wecfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_wecfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static void read_datareg(struct sh_flctl *flctl, int offset) @@ -769,38 +774,36 @@ static int flctl_chip_init_tail(struct mtd_info *mtd) return 0; } -static int __init flctl_probe(struct platform_device *pdev) +static int __devinit flctl_probe(struct platform_device *pdev) { struct resource *res; struct sh_flctl *flctl; struct mtd_info *flctl_mtd; struct nand_chip *nand; struct sh_flctl_platform_data *pdata; - int ret; + int ret = -ENXIO; pdata = pdev->dev.platform_data; if (pdata == NULL) { - printk(KERN_ERR "sh_flctl platform_data not found.\n"); - return -ENODEV; + dev_err(&pdev->dev, "no platform data defined\n"); + return -EINVAL; } flctl = kzalloc(sizeof(struct sh_flctl), GFP_KERNEL); if (!flctl) { - printk(KERN_ERR "Unable to allocate NAND MTD dev structure.\n"); + dev_err(&pdev->dev, "failed to allocate driver data\n"); return -ENOMEM; } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { - printk(KERN_ERR "%s: resource not found.\n", __func__); - ret = -ENODEV; + dev_err(&pdev->dev, "failed to get I/O memory\n"); goto err; } - flctl->reg = ioremap(res->start, res->end - res->start + 1); + flctl->reg = ioremap(res->start, resource_size(res)); if (flctl->reg == NULL) { - printk(KERN_ERR "%s: ioremap error.\n", __func__); - ret = -ENOMEM; + dev_err(&pdev->dev, "failed to remap I/O memory\n"); goto err; } @@ -808,6 +811,7 @@ static int __init flctl_probe(struct platform_device *pdev) flctl_mtd = &flctl->mtd; nand = &flctl->chip; flctl_mtd->priv = nand; + flctl->pdev = pdev; flctl->hwecc = pdata->has_hwecc; flctl_register_init(flctl, pdata->flcmncr_val); @@ -846,7 +850,7 @@ err: return ret; } -static int __exit flctl_remove(struct platform_device *pdev) +static int __devexit flctl_remove(struct platform_device 
*pdev) { struct sh_flctl *flctl = platform_get_drvdata(pdev); diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h index e77c1cea404d..164c9d4013c0 100644 --- a/include/linux/mtd/sh_flctl.h +++ b/include/linux/mtd/sh_flctl.h @@ -96,6 +96,7 @@ struct sh_flctl { struct mtd_info mtd; struct nand_chip chip; + struct platform_device *pdev; void __iomem *reg; uint8_t done_buff[2048 + 64]; /* max size 2048 + 64 */ -- cgit v1.2.3 From 010ab820582d03bcd3648416b5837107e8a9c5f3 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 27 Jan 2010 09:17:21 +0000 Subject: mtd: sh_flctl SHBUSSEL and SEL_16BIT support This patch extends the sh_flctl driver with support for 16-bit bus configuration using SEL_16BIT and support for multiplexed pins using SHBUSSEL. Signed-off-by: Magnus Damm Acked-by: Yoshihiro Shimoda Signed-off-by: Paul Mundt --- drivers/mtd/nand/sh_flctl.c | 27 ++++++++++++++++++++++++++- include/linux/mtd/sh_flctl.h | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index ab068a503b29..1842df8bdd93 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -105,6 +105,8 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr) addr = page_addr; /* ERASE1 */ } else if (page_addr != -1) { /* SEQIN, READ0, etc.. */ + if (flctl->chip.options & NAND_BUSWIDTH_16) + column >>= 1; if (flctl->page_size) { addr = column & 0x0FFF; addr |= (page_addr & 0xff) << 16; @@ -280,7 +282,7 @@ static void write_fiforeg(struct sh_flctl *flctl, int rlen, int offset) static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_val) { struct sh_flctl *flctl = mtd_to_flctl(mtd); - uint32_t flcmncr_val = readl(FLCMNCR(flctl)); + uint32_t flcmncr_val = readl(FLCMNCR(flctl)) & ~SEL_16BIT; uint32_t flcmdcr_val, addr_len_bytes = 0; /* Set SNAND bit if page size is 2048byte */ @@ -302,6 +304,8 @@ static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_va case NAND_CMD_READOOB: addr_len_bytes = flctl->rw_ADRCNT; flcmdcr_val |= CDSRC_E; + if (flctl->chip.options & NAND_BUSWIDTH_16) + flcmncr_val |= SEL_16BIT; break; case NAND_CMD_SEQIN: /* This case is that cmd is READ0 or READ1 or READ00 */ @@ -310,6 +314,8 @@ static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_va case NAND_CMD_PAGEPROG: addr_len_bytes = flctl->rw_ADRCNT; flcmdcr_val |= DOCMD2_E | CDSRC_E | SELRW; + if (flctl->chip.options & NAND_BUSWIDTH_16) + flcmncr_val |= SEL_16BIT; break; case NAND_CMD_READID: flcmncr_val &= ~SNAND_E; @@ -528,6 +534,8 @@ static void flctl_cmdfunc(struct mtd_info *mtd, unsigned int command, set_addr(mtd, 0, page_addr); flctl->read_bytes = mtd->writesize + mtd->oobsize; + if (flctl->chip.options & NAND_BUSWIDTH_16) + column >>= 1; flctl->index += column; goto read_normal_exit; @@ -691,6 +699,18 @@ static uint8_t flctl_read_byte(struct mtd_info *mtd) return data; } +static uint16_t flctl_read_word(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int index = flctl->index; + uint16_t data; + uint16_t *buf = (uint16_t *)&flctl->done_buff[index]; + + data = *buf; + flctl->index += 2; + return data; +} + static void flctl_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) { int i; @@ -829,6 +849,11 @@ static int __devinit flctl_probe(struct platform_device *pdev) nand->select_chip = flctl_select_chip; nand->cmdfunc = flctl_cmdfunc; + if (pdata->flcmncr_val & SEL_16BIT) { + nand->options |= 
NAND_BUSWIDTH_16; + nand->read_word = flctl_read_word; + } + ret = nand_scan_ident(flctl_mtd, 1); if (ret) goto err; diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h index 164c9d4013c0..ab77609ec337 100644 --- a/include/linux/mtd/sh_flctl.h +++ b/include/linux/mtd/sh_flctl.h @@ -51,6 +51,8 @@ #define _4ECCCNTEN (0x1 << 24) #define _4ECCEN (0x1 << 23) #define _4ECCCORRECT (0x1 << 22) +#define SHBUSSEL (0x1 << 20) +#define SEL_16BIT (0x1 << 19) #define SNAND_E (0x1 << 18) /* SNAND (0=512 1=2048)*/ #define QTSEL_E (0x1 << 17) #define ENDIAN (0x1 << 16) /* 1 = little endian */ -- cgit v1.2.3 From c30f540b63047437ffa894b5353216410c480d1a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 2 Feb 2010 15:03:24 +0100 Subject: netfilter: xtables: CONFIG_COMPAT redux Ifdef out struct nf_sockopt_ops::compat_set struct nf_sockopt_ops::compat_get struct xt_match::compat_from_user struct xt_match::compat_to_user struct xt_match::compatsize to make structures smaller on COMPAT=n kernels. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 9 ++++++--- include/linux/netfilter/x_tables.h | 12 ++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 48c54960773c..78f33d223680 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -114,15 +114,17 @@ struct nf_sockopt_ops { int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, void __user *user, unsigned int len); +#ifdef CONFIG_COMPAT int (*compat_set)(struct sock *sk, int optval, void __user *user, unsigned int len); - +#endif int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); +#ifdef CONFIG_COMPAT int (*compat_get)(struct sock *sk, int optval, void __user *user, int *len); - +#endif /* Use the module struct to lock set/get code in place */ struct module *owner; }; @@ -222,11 +224,12 @@ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); - +#ifdef CONFIG_COMPAT int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); +#endif /* Call this before modifying an existing packet: ensures it is modifiable and linear to the point you care about (writable_len). diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 3caf5e151102..026eb78ee83c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -283,11 +283,11 @@ struct xt_match { /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); - +#ifdef CONFIG_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); int (*compat_to_user)(void __user *dst, void *src); - +#endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -296,7 +296,9 @@ struct xt_match { const char *table; unsigned int matchsize; +#ifdef CONFIG_COMPAT unsigned int compatsize; +#endif unsigned int hooks; unsigned short proto; @@ -323,17 +325,19 @@ struct xt_target { /* Called when entry of this type deleted. 
*/ void (*destroy)(const struct xt_tgdtor_param *); - +#ifdef CONFIG_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); int (*compat_to_user)(void __user *dst, void *src); - +#endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; +#ifdef CONFIG_COMPAT unsigned int compatsize; +#endif unsigned int hooks; unsigned short proto; -- cgit v1.2.3 From c85bb41e93184bf5494dde6d8fe5a81b564c84c8 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 2 Feb 2010 07:32:29 -0800 Subject: igmp: fix ip_mc_sf_allow race [v5] Almost all igmp functions accessing inet->mc_list are protected by rtnl_lock(), but there is one exception, which is ip_mc_sf_allow(), so there is a chance that either ip_mc_drop_socket or ip_mc_leave_group removes an entry while ip_mc_sf_allow is running, causing a crash. Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ net/ipv4/igmp.c | 83 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 724c27e5d173..93fc2449af10 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -153,6 +153,7 @@ extern int sysctl_igmp_max_msf; struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; + struct rcu_head rcu; __be32 sl_addr[0]; }; @@ -170,6 +171,7 @@ struct ip_mc_socklist { struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist *sflist; + struct rcu_head rcu; }; struct ip_sf_list { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8f5468393f01..d28363998743 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1799,7 +1799,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - inet->mc_list = iml; + rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1807,24 +1807,46 @@ done: return err; } +static void ip_sf_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_sf_socklist *psf; + + psf = container_of(rp, struct ip_sf_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(psf); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { + struct ip_sf_socklist *psf = iml->sflist; int err; - if (iml->sflist == NULL) { + if (psf == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, - iml->sfmode, iml->sflist->sl_count, - iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + call_rcu(&psf->rcu, ip_sf_socklist_reclaim); return err; } + +static void ip_mc_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_mc_socklist *iml; + + iml = container_of(rp, struct ip_mc_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(iml); +} + + /* * Ask a socket to leave a group.
*/ @@ -1854,12 +1876,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) (void) ip_mc_leave_src(sk, iml, in_dev); - *imlp = iml->next; + rcu_assign_pointer(*imlp, iml->next); if (in_dev) ip_mc_dec_group(in_dev, group); rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); return 0; } if (!in_dev) @@ -1974,9 +1998,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) { for (i=0; i<psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } - pmc->sflist = psl = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { @@ -2072,11 +2099,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: @@ -2209,30 +2238,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; + int ret; + ret = 1; if (!ipv4_is_multicast(loc_addr)) - return 1; + goto out; - for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + rcu_read_lock(); + for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && pmc->multi.imr_ifindex == dif) break; } + ret = inet->mc_all; if (!pmc) - return inet->mc_all; + goto unlock; psl = pmc->sflist; + ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) - return pmc->sfmode == MCAST_EXCLUDE; + goto unlock; for (i=0; i<psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } + ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - return 0; + goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - return 0; - return 1; + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; } /* @@ -2251,7 +2290,7 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; - inet->mc_list = iml->next; + rcu_assign_pointer(inet->mc_list, iml->next); in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); @@ -2259,7 +2298,9 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); } rtnl_unlock(); } -- cgit v1.2.3 From f9bfbebf34eab707b065116cdc9699d25ba4252a Mon Sep 17 00:00:00 2001 From: Shirley Ma Date: Fri, 29 Jan 2010 03:19:05 +0000 Subject: virtio: Add ability to detach unused buffers from vrings There's currently no way for a virtio driver to ask for
unused buffers, so it has to keep a list itself to reclaim them at shutdown. This is redundant, since virtio_ring stores that information. So add a new hook to do this. Signed-off-by: Shirley Ma Signed-off-by: Amit Shah Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/virtio/virtio_ring.c | 25 +++++++++++++++++++++++++ include/linux/virtio.h | 4 ++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fbd2ecde93e4..71929ee00d69 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -334,6 +334,30 @@ static bool vring_enable_cb(struct virtqueue *_vq) return true; } +static void *vring_detach_unused_buf(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->vring.num; i++) { + if (!vq->data[i]) + continue; + /* detach_buf clears data, so grab it now. */ + buf = vq->data[i]; + detach_buf(vq, i); + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->num_free != vq->vring.num); + + END_USE(vq); + return NULL; +} + irqreturn_t vring_interrupt(int irq, void *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -360,6 +384,7 @@ static struct virtqueue_ops vring_vq_ops = { .kick = vring_kick, .disable_cb = vring_disable_cb, .enable_cb = vring_enable_cb, + .detach_unused_buf = vring_detach_unused_buf, }; struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 057a2e010758..f508c651e53d 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -51,6 +51,9 @@ struct virtqueue { * This re-enables callbacks; it returns "false" if there are pending * buffers in the queue, to detect a possible race between the driver * checking for more work, and enabling callbacks. + * @detach_unused_buf: detach first unused buffer + * vq: the struct virtqueue we're talking about. + * Returns NULL or the "data" token handed to add_buf * * Locking rules are straightforward: the driver is responsible for * locking. No two operations may be invoked simultaneously, with the exception @@ -71,6 +74,7 @@ struct virtqueue_ops { void (*disable_cb)(struct virtqueue *vq); bool (*enable_cb)(struct virtqueue *vq); + void *(*detach_unused_buf)(struct virtqueue *vq); }; /** -- cgit v1.2.3 From f98bfbd78c37c5946cc53089da32a5f741efdeb7 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Tue, 2 Feb 2010 15:58:48 -0800 Subject: connector: Delete buggy notification code. On Tue, Feb 02, 2010 at 02:57:14PM -0800, Greg KH (gregkh@suse.de) wrote: > > There are at least two ways to fix it: using a big cannon and a small > > one. The former way is to disable notification registration, since it is > > not used by anyone at all. Second way is to check whether calling > > process is root and its destination group is -1 (kind of privileged > > one) before command is dispatched to workqueue. > > Well if no one is using it, removing it makes the most sense, right? > > No objection from me, care to make up a patch either way for this? Given that it is not used, let's drop support for notifications about (un)registered events from connector. Another option was to check credentials on receiving; we can always restore it without bugs if needed, but genetlink has a wider code base and no one complained that userspace cannot get a notification when some other clients were (un)registered.
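For reference, the rejected "small cannon" fix, a privilege check before the control command is dispatched to the workqueue, would have amounted to something like the sketch below. This is hypothetical code, not part of this patch, and the helper name cn_ctl_permission() is invented for illustration:

/*
 * Hypothetical sketch of the rejected alternative fix: refuse
 * notification (de)registration requests from unprivileged senders
 * instead of removing the feature.  cn_ctl_permission() is an
 * invented name; this code is not part of the patch below.
 */
static int cn_ctl_permission(const struct cn_ctl_msg *ctl)
{
	/* The calling process must be root ... */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* ... and the destination group must be -1. */
	if (ctl->group != (__u32)-1)
		return -EPERM;

	return 0;
}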
Kudos to Sebastian Krahmer, who found a bug in the code. Signed-off-by: Evgeniy Polyakov Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/connector/connector.c | 175 ------------------------------------------ include/linux/connector.h | 32 -------- 2 files changed, 207 deletions(-) (limited to 'include/linux') diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index f06024668f99..537c29ac4487 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -36,17 +36,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); -static u32 cn_idx = CN_IDX_CONNECTOR; -static u32 cn_val = CN_VAL_CONNECTOR; - -module_param(cn_idx, uint, 0); -module_param(cn_val, uint, 0); -MODULE_PARM_DESC(cn_idx, "Connector's main device idx."); -MODULE_PARM_DESC(cn_val, "Connector's main device val."); - -static DEFINE_MUTEX(notify_lock); -static LIST_HEAD(notify_list); - static struct cn_dev cdev; static int cn_already_initialized; @@ -209,54 +198,6 @@ static void cn_rx_skb(struct sk_buff *__skb) } } -/* - * Notification routing. - * - * Gets id and checks if there are notification request for it's idx - * and val. If there are such requests notify the listeners with the - * given notify event. - * - */ -static void cn_notify(struct cb_id *id, u32 notify_event) -{ - struct cn_ctl_entry *ent; - - mutex_lock(&notify_lock); - list_for_each_entry(ent, &notify_list, notify_entry) { - int i; - struct cn_notify_req *req; - struct cn_ctl_msg *ctl = ent->msg; - int idx_found, val_found; - - idx_found = val_found = 0; - - req = (struct cn_notify_req *)ctl->data; - for (i = 0; i < ctl->idx_notify_num; ++i, ++req) { - if (id->idx >= req->first && - id->idx < req->first + req->range) { - idx_found = 1; - break; - } - } - - for (i = 0; i < ctl->val_notify_num; ++i, ++req) { - if (id->val >= req->first && - id->val < req->first + req->range) { - val_found = 1; - break; - } - } - - if (idx_found && val_found) { - struct cn_msg m = { .ack = notify_event, }; - - memcpy(&m.id, id, sizeof(m.id)); - cn_netlink_send(&m, ctl->group, GFP_KERNEL); - } - } - mutex_unlock(&notify_lock); -} - /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. @@ -276,8 +217,6 @@ int cn_add_callback(struct cb_id *id, char *name, if (err) return err; - cn_notify(id, 0); - return 0; } EXPORT_SYMBOL_GPL(cn_add_callback); @@ -295,111 +234,9 @@ void cn_del_callback(struct cb_id *id) struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); - cn_notify(id, 1); } EXPORT_SYMBOL_GPL(cn_del_callback); -/* - * Checks two connector's control messages to be the same. - * Returns 1 if they are the same or if the first one is corrupted.
- */ -static int cn_ctl_msg_equals(struct cn_ctl_msg *m1, struct cn_ctl_msg *m2) -{ - int i; - struct cn_notify_req *req1, *req2; - - if (m1->idx_notify_num != m2->idx_notify_num) - return 0; - - if (m1->val_notify_num != m2->val_notify_num) - return 0; - - if (m1->len != m2->len) - return 0; - - if ((m1->idx_notify_num + m1->val_notify_num) * sizeof(*req1) != - m1->len) - return 1; - - req1 = (struct cn_notify_req *)m1->data; - req2 = (struct cn_notify_req *)m2->data; - - for (i = 0; i < m1->idx_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - for (i = 0; i < m1->val_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - return 1; -} - -/* - * Main connector device's callback. - * - * Used for notification of a request's processing. - */ -static void cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) -{ - struct cn_ctl_msg *ctl; - struct cn_ctl_entry *ent; - u32 size; - - if (msg->len < sizeof(*ctl)) - return; - - ctl = (struct cn_ctl_msg *)msg->data; - - size = (sizeof(*ctl) + ((ctl->idx_notify_num + - ctl->val_notify_num) * - sizeof(struct cn_notify_req))); - - if (msg->len != size) - return; - - if (ctl->len + sizeof(*ctl) != msg->len) - return; - - /* - * Remove notification. - */ - if (ctl->group == 0) { - struct cn_ctl_entry *n; - - mutex_lock(&notify_lock); - list_for_each_entry_safe(ent, n, &notify_list, notify_entry) { - if (cn_ctl_msg_equals(ent->msg, ctl)) { - list_del(&ent->notify_entry); - kfree(ent); - } - } - mutex_unlock(&notify_lock); - - return; - } - - size += sizeof(*ent); - - ent = kzalloc(size, GFP_KERNEL); - if (!ent) - return; - - ent->msg = (struct cn_ctl_msg *)(ent + 1); - - memcpy(ent->msg, ctl, size - sizeof(*ent)); - - mutex_lock(&notify_lock); - list_add(&ent->notify_entry, &notify_list); - mutex_unlock(&notify_lock); -} - static int cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; @@ -437,11 +274,8 @@ static const struct file_operations cn_file_ops = { static int __devinit cn_init(void) { struct cn_dev *dev = &cdev; - int err; dev->input = cn_rx_skb; - dev->id.idx = cn_idx; - dev->id.val = cn_val; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, CN_NETLINK_USERS + 0xf, @@ -457,14 +291,6 @@ static int __devinit cn_init(void) cn_already_initialized = 1; - err = cn_add_callback(&dev->id, "connector", &cn_callback); - if (err) { - cn_already_initialized = 0; - cn_queue_free_dev(dev->cbdev); - netlink_kernel_release(dev->nls); - return -EINVAL; - } - proc_net_fops_create(&init_net, "connector", S_IRUGO, &cn_file_ops); return 0; @@ -478,7 +304,6 @@ static void __devexit cn_fini(void) proc_net_remove(&init_net, "connector"); - cn_del_callback(&dev->id); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } diff --git a/include/linux/connector.h b/include/linux/connector.h index 72ba63eb83c5..3a779ffba60b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -24,9 +24,6 @@ #include -#define CN_IDX_CONNECTOR 0xffffffff -#define CN_VAL_CONNECTOR 0xffffffff - /* * Process Events connector unique ids -- used for message routing */ @@ -75,30 +72,6 @@ struct cn_msg { __u8 data[0]; }; -/* - * Notify structure - requests notification about - * registering/unregistering idx/val in range [first, first+range].
- */ -struct cn_notify_req { - __u32 first; - __u32 range; -}; - -/* - * Main notification control message - * *_notify_num - number of appropriate cn_notify_req structures after - * this struct. - * group - notification receiver's idx. - * len - total length of the attached data. - */ -struct cn_ctl_msg { - __u32 idx_notify_num; - __u32 val_notify_num; - __u32 group; - __u32 len; - __u8 data[0]; -}; - #ifdef __KERNEL__ #include @@ -151,11 +124,6 @@ struct cn_callback_entry { u32 seq, group; }; -struct cn_ctl_entry { - struct list_head notify_entry; - struct cn_ctl_msg *msg; -}; - struct cn_dev { struct cb_id id; -- cgit v1.2.3 From 24551f64d47af9539a7f324343bffeea09d9dcfa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 12 Jan 2010 21:25:24 +0000 Subject: lmb: Add lmb_free() We can free memory allocated with lmb_alloc() by removing it from the list of reserved LMBs. Rework lmb_remove() to allow that possibility and add lmb_free() which exploits it. BenH: Removed some useless parenthesis Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- include/linux/lmb.h | 1 + lib/lmb.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lmb.h b/include/linux/lmb.h index ef82b8fcbddb..f3d14333ebed 100644 --- a/include/linux/lmb.h +++ b/include/linux/lmb.h @@ -42,6 +42,7 @@ extern void __init lmb_init(void); extern void __init lmb_analyze(void); extern long lmb_add(u64 base, u64 size); extern long lmb_remove(u64 base, u64 size); +extern long __init lmb_free(u64 base, u64 size); extern long __init lmb_reserve(u64 base, u64 size); extern u64 __init lmb_alloc_nid(u64 size, u64 align, int nid, u64 (*nid_range)(u64, u64, int *)); diff --git a/lib/lmb.c b/lib/lmb.c index 9cee17142b2c..b1fc52606524 100644 --- a/lib/lmb.c +++ b/lib/lmb.c @@ -205,9 +205,8 @@ long lmb_add(u64 base, u64 size) } -long lmb_remove(u64 base, u64 size) +static long __lmb_remove(struct lmb_region *rgn, u64 base, u64 size) { - struct lmb_region *rgn = &(lmb.memory); u64 rgnbegin, rgnend; u64 end = base + size; int i; @@ -254,6 +253,16 @@ long lmb_remove(u64 base, u64 size) return lmb_add_region(rgn, end, rgnend - end); } +long lmb_remove(u64 base, u64 size) +{ + return __lmb_remove(&lmb.memory, base, size); +} + +long __init lmb_free(u64 base, u64 size) +{ + return __lmb_remove(&lmb.reserved, base, size); +} + long __init lmb_reserve(u64 base, u64 size) { struct lmb_region *_rgn = &lmb.reserved; -- cgit v1.2.3 From add67461240c1dadc7c8d97e66f8f92b556ca523 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 13:45:12 +0100 Subject: netfilter: add struct net * to target parameters Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 2 ++ net/bridge/netfilter/ebtables.c | 10 ++++++---- net/ipv4/netfilter/ip_tables.c | 8 +++++--- net/ipv6/netfilter/ip6_tables.c | 8 +++++--- 4 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 026eb78ee83c..365fabe1b16e 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -249,6 +249,7 @@ struct xt_target_param { * Other fields see above. 
*/ struct xt_tgchk_param { + struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; @@ -259,6 +260,7 @@ struct xt_tgchk_param { /* Target destructor parameters */ struct xt_tgdtor_param { + struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 1aa0e4c1f52d..12beb580aa21 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -579,13 +579,14 @@ ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) } static inline int -ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) +ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.target = w->u.watcher; par.targinfo = w->data; par.family = NFPROTO_BRIDGE; @@ -606,10 +607,11 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) /* we're done */ if (cnt && (*cnt)-- == 0) return 1; - EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + par.net = net; par.target = t->u.target; par.targinfo = t->data; par.family = NFPROTO_BRIDGE; @@ -674,7 +676,7 @@ ebt_check_entry(struct ebt_entry *e, } i = 0; - mtpar.net = net; + mtpar.net = tgpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; mtpar.hook_mask = tgpar.hook_mask = hookmask; @@ -730,7 +732,7 @@ ebt_check_entry(struct ebt_entry *e, (*cnt)++; return 0; cleanup_watchers: - EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j); + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j); cleanup_matches: EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cfaba0e2e6fc..7fde8f6950d8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -638,10 +638,11 @@ err: return ret; } -static int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct ipt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -697,7 +698,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; @@ -788,6 +789,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net, unsigned int *i) IPT_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ipt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; @@ -1675,7 +1677,7 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) goto cleanup_matches; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 9f1d45f2ba8f..0376ed6d5594 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -669,10 +669,11 @@ err: return ret; } -static int check_target(struct ip6t_entry *e, const char *name) +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct 
ip6t_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -729,7 +730,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; @@ -820,6 +821,7 @@ cleanup_entry(struct ip6t_entry *e, struct net *net, unsigned int *i) IP6T_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ip6t_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; @@ -1710,7 +1712,7 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, if (ret) goto cleanup_matches; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; -- cgit v1.2.3 From 0cebe4b4163b6373c9d24c1a192939777bc27e55 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 13:51:51 +0100 Subject: netfilter: ctnetlink: support selective event delivery Add two masks for conntrack and expectation events to struct nf_conntrack_ecache and use them to filter events. Their default value is "all events" when the event sysctl is on and "no events" when it is off. A following patch will add specific initializations. Expectation events depend on the ecache struct of their master conntrack. Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 18 ++++++++ include/net/netfilter/nf_conntrack_ecache.h | 59 +++++++++++++-------------- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- 4 files changed, 48 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a374787ed9b0..ebfed90733f7 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -74,6 +74,24 @@ enum ip_conntrack_status { IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), }; +/* Connection tracking event types */ +enum ip_conntrack_events { + IPCT_NEW, /* new conntrack */ + IPCT_RELATED, /* related conntrack */ + IPCT_DESTROY, /* destroyed conntrack */ + IPCT_REPLY, /* connection has seen two-way traffic */ + IPCT_ASSURED, /* connection status has changed to assured */ + IPCT_PROTOINFO, /* protocol information has changed */ + IPCT_HELPER, /* new helper has been set */ + IPCT_MARK, /* new mark has been set */ + IPCT_NATSEQADJ, /* NAT is doing sequence adjustment
*/ - IPCT_SECMARK, /* new security mark has been set */ -}; - -enum ip_conntrack_expect_events { - IPEXP_NEW, /* new expectation */ -}; - struct nf_conntrack_ecache { - unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ - u32 pid; /* netlink pid of destroyer */ + unsigned long cache; /* bitops want long */ + unsigned long missed; /* missed events */ + u16 ctmask; /* bitmask of ct events to be delivered */ + u16 expmask; /* bitmask of expect events to be delivered */ + u32 pid; /* netlink pid of destroyer */ }; static inline struct nf_conntrack_ecache * @@ -43,14 +27,24 @@ nf_ct_ecache_find(const struct nf_conn *ct) } static inline struct nf_conntrack_ecache * -nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp) +nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; - if (!net->ct.sysctl_events) + if (!ctmask && !expmask && net->ct.sysctl_events) { + ctmask = ~0; + expmask = ~0; + } + if (!ctmask && !expmask) return NULL; - return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + if (e) { + e->ctmask = ctmask; + e->expmask = expmask; + } + return e; }; #ifdef CONFIG_NF_CONNTRACK_EVENTS @@ -83,6 +77,9 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; + if (!(e->ctmask & (1 << event))) + return; + set_bit(event, &e->cache); } @@ -93,7 +90,6 @@ nf_conntrack_eventmask_report(unsigned int eventmask, int report) { int ret = 0; - struct net *net = nf_ct_net(ct); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; @@ -102,9 +98,6 @@ nf_conntrack_eventmask_report(unsigned int eventmask, if (notify == NULL) goto out_unlock; - if (!net->ct.sysctl_events) - goto out_unlock; - e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; @@ -118,6 +111,9 @@ nf_conntrack_eventmask_report(unsigned int eventmask, /* This is a resent of a destroy event? If so, skip missed */ unsigned long missed = e->pid ? 
0 : e->missed; + if (!((eventmask | missed) & e->ctmask)) + goto out_unlock; + ret = notify->fcn(eventmask | missed, &item); if (unlikely(ret < 0 || missed)) { spin_lock_bh(&ct->lock); @@ -173,18 +169,19 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event, u32 pid, int report) { - struct net *net = nf_ct_exp_net(exp); struct nf_exp_event_notifier *notify; + struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(nf_expect_event_cb); if (notify == NULL) goto out_unlock; - if (!net->ct.sysctl_events) + e = nf_ct_ecache_find(exp->master); + if (e == NULL) goto out_unlock; - { + if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .pid = pid, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 091ff770eb7b..53b8da6ad6b7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -648,7 +648,7 @@ init_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ff594eb138c1..f5c0b09e12f1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1281,7 +1281,7 @@ ctnetlink_create_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) -- cgit v1.2.3 From b2a15a604d379af323645e330638e2cfcc696aff Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 14:13:03 +0100 Subject: netfilter: nf_conntrack: support conntrack templates Support initializing selected parameters of new conntrack entries from a "conntrack template", which is a specially marked conntrack entry attached to the skb. Currently the helper and the event delivery masks can be initialized this way. Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 4 +++ include/net/netfilter/nf_conntrack.h | 5 +++ include/net/netfilter/nf_conntrack_helper.h | 3 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 50 +++++++++++++++++--------- net/netfilter/nf_conntrack_helper.c | 17 ++++++--- net/netfilter/nf_conntrack_netlink.c | 2 +- 8 files changed, 61 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index ebfed90733f7..c608677dda60 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -72,6 +72,10 @@ enum ip_conntrack_status { /* Connection has fixed timeout. 
*/ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), + + /* Conntrack is a template */ + IPS_TEMPLATE_BIT = 11, + IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), }; /* Connection tracking event types */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a0904adfb8f7..5043d61c99a7 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -272,6 +272,11 @@ nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp); +static inline int nf_ct_is_template(const struct nf_conn *ct) +{ + return test_bit(IPS_TEMPLATE_BIT, &ct->status); +} + /* It's confirmed if it is, or has been in the hash table. */ static inline int nf_ct_is_confirmed(struct nf_conn *ct) { diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 86be7c4816d6..e17aaa3e19fd 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -47,7 +47,8 @@ extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); -extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags); +extern int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags); extern void nf_ct_helper_destroy(struct nf_conn *ct); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 331ead3ebd1b..77627fa80561 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -59,7 +59,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; #endif #endif diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 0956ebabbff2..55ce22e5de49 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -212,7 +212,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum, struct sk_buff *reasm; /* Previously seen (loopback)? */ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 53b8da6ad6b7..471e2a79d26f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -618,7 +618,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. 
*/ static struct nf_conntrack_tuple_hash * -init_conntrack(struct net *net, +init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, @@ -628,6 +628,7 @@ init_conntrack(struct net *net, struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -648,7 +649,11 @@ init_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); @@ -673,7 +678,7 @@ init_conntrack(struct net *net, nf_conntrack_get(&ct->master->ct_general); NF_CT_STAT_INC(net, expect_new); } else { - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } @@ -694,7 +699,7 @@ init_conntrack(struct net *net, /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct net *net, +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, @@ -718,7 +723,8 @@ resolve_normal_ct(struct net *net, /* look for tuple match */ h = nf_conntrack_find_get(net, &tuple); if (!h) { - h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -755,7 +761,7 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - struct nf_conn *ct; + struct nf_conn *ct, *tmpl = NULL; enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -764,10 +770,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, int set_reply = 0; int ret; - /* Previously seen (loopback or untracked)? Ignore. */ if (skb->nfct) { - NF_CT_STAT_INC_ATOMIC(net, ignore); - return NF_ACCEPT; + /* Previously seen (loopback or untracked)? Ignore. */ + tmpl = (struct nf_conn *)skb->nfct; + if (!nf_ct_is_template(tmpl)) { + NF_CT_STAT_INC_ATOMIC(net, ignore); + return NF_ACCEPT; + } + skb->nfct = NULL; } /* rcu_read_lock()ed by nf_hook_slow */ @@ -778,7 +788,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, pr_debug("not prepared to track yet or error occured\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } l4proto = __nf_ct_l4proto_find(pf, protonum); @@ -791,22 +802,25 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, if (ret <= 0) { NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } } - ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, + ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(net, invalid); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } if (IS_ERR(ct)) { /* Too stressed to deal. 
*/ NF_CT_STAT_INC_ATOMIC(net, drop); - return NF_DROP; + ret = NF_DROP; + goto out; } NF_CT_ASSERT(skb->nfct); @@ -821,11 +835,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, NF_CT_STAT_INC_ATOMIC(net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(net, drop); - return -ret; + ret = -ret; + goto out; } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); +out: + if (tmpl) + nf_ct_put(tmpl); return ret; } @@ -864,7 +882,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, return; rcu_read_lock(); - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index c0e461f466ae..8144b0da5515 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -96,13 +96,22 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags) +int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags) { + struct nf_conntrack_helper *helper = NULL; + struct nf_conn_help *help; int ret = 0; - struct nf_conntrack_helper *helper; - struct nf_conn_help *help = nfct_help(ct); - helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (tmpl != NULL) { + help = nfct_help(tmpl); + if (help != NULL) + helper = help->helper; + } + + help = nfct_help(ct); + if (helper == NULL) + helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (helper == NULL) { if (help) rcu_assign_pointer(help->helper, NULL); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f5c0b09e12f1..09044f9f4b2e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1249,7 +1249,7 @@ ctnetlink_create_conntrack(struct net *net, } } else { /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); if (err < 0) goto err2; } -- cgit v1.2.3 From d4bfa033ed84e0ae446eff445d107ffd5ee78df3 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 29 Jan 2010 15:03:36 +0100 Subject: HID: make raw reports possible for both feature and output reports In commit 2da31939a42 ("Bluetooth: Implement raw output support for HIDP layer"), support for Bluetooth hid_output_raw_report was added, but it pushes the data to the intr socket instead of the ctrl one. This has been fixed by 6bf8268f9a91f1 ("Bluetooth: Use the control channel for raw HID reports"). Still, it is necessary to distinguish whether the report in question is a FEATURE or an OUTPUT report. For this, we have to extend the generic HID API, so that the hid_output_raw_report() callback provides a means to specify this value, which can then be passed down to lower-level hardware drivers (currently Bluetooth and USB). 
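For illustration, here is a minimal sketch (not part of this patch) of how a caller selects the report type through the extended callback; the wrapper function and its error handling are assumptions:

#include <linux/errno.h>
#include <linux/hid.h>

/* Hedged sketch: send a FEATURE report through the extended callback.
 * "hdev" is an already-bound struct hid_device; buf[0] carries the
 * report number, followed by the payload. */
static int send_feature_report(struct hid_device *hdev, __u8 *buf, size_t count)
{
	if (!hdev->hid_output_raw_report)
		return -ENODEV;

	/* HID_FEATURE_REPORT maps to a SET_REPORT(feature) control
	 * transfer on USB, and to HIDP_TRANS_SET_REPORT |
	 * HIDP_DATA_RTYPE_FEATURE on Bluetooth. */
	return hdev->hid_output_raw_report(hdev, buf, count, HID_FEATURE_REPORT);
}
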
Based on original patch by Bastien Nocera Acked-by: Marcel Holtmann Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- drivers/hid/usbhid/hid-core.c | 5 +++-- include/linux/hid.h | 2 +- net/bluetooth/hidp/core.c | 17 ++++++++++++++--- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index cdd136942bca..d04476700b7b 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -134,7 +134,7 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t goto out; } - ret = dev->hid_output_raw_report(dev, buf, count); + ret = dev->hid_output_raw_report(dev, buf, count, HID_OUTPUT_REPORT); out: kfree(buf); return ret; diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index e2997a8d5e1b..caa16c057ce2 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -774,7 +774,8 @@ static int hid_alloc_buffers(struct usb_device *dev, struct hid_device *hid) return 0; } -static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t count) +static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t count, + unsigned char report_type) { struct usbhid_device *usbhid = hid->driver_data; struct usb_device *dev = hid_to_usb_dev(hid); @@ -785,7 +786,7 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), HID_REQ_SET_REPORT, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, - ((HID_OUTPUT_REPORT + 1) << 8) | *buf, + ((report_type + 1) << 8) | *buf, interface->desc.bInterfaceNumber, buf + 1, count - 1, USB_CTRL_SET_TIMEOUT); diff --git a/include/linux/hid.h b/include/linux/hid.h index 87093652dda8..3661a626941d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -501,7 +501,7 @@ struct hid_device { /* device report descriptor */ void (*hiddev_report_event) (struct hid_device *, struct hid_report *); /* handler for raw output data, used by hidraw */ - int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t); + int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t, unsigned char); /* debugging support via debugfs */ unsigned short debug; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6cf526d06e21..37ba153c4cd4 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -313,10 +313,21 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep return hidp_queue_report(session, buf, rsize); } -static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count) +static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count, + unsigned char report_type) { - if (hidp_send_ctrl_message(hid->driver_data, - HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE, + switch (report_type) { + case HID_FEATURE_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; + break; + case HID_OUTPUT_REPORT: + report_type = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + break; + default: + return -EINVAL; + } + + if (hidp_send_ctrl_message(hid->driver_data, report_type, data, count)) return -ENOMEM; return count; -- cgit v1.2.3 From 84f3bb9ae9db90f7fb15d98b55279a58ab1b2363 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 17:17:06 +0100 Subject: netfilter: xtables: add CT target Add a new target for the raw table, which can be used to specify conntrack parameters for specific 
connections, f.i. the conntrack helper. The target attaches a "template" connection tracking entry to the skb, which is used by the conntrack core when initializing a new conntrack. Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_CT.h | 17 +++ include/net/netfilter/nf_conntrack_helper.h | 3 + net/netfilter/Kconfig | 12 +++ net/netfilter/Makefile | 1 + net/netfilter/nf_conntrack_helper.c | 19 ++++ net/netfilter/xt_CT.c | 158 ++++++++++++++++++++++++++++ 7 files changed, 211 insertions(+) create mode 100644 include/linux/netfilter/xt_CT.h create mode 100644 net/netfilter/xt_CT.c (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 2aea50399c0b..a5a63e41b8af 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -6,6 +6,7 @@ header-y += nfnetlink_queue.h header-y += xt_CLASSIFY.h header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h +header-y += xt_CT.h header-y += xt_DSCP.h header-y += xt_LED.h header-y += xt_MARK.h diff --git a/include/linux/netfilter/xt_CT.h b/include/linux/netfilter/xt_CT.h new file mode 100644 index 000000000000..7fd0effe1316 --- /dev/null +++ b/include/linux/netfilter/xt_CT.h @@ -0,0 +1,17 @@ +#ifndef _XT_CT_H +#define _XT_CT_H + +#define XT_CT_NOTRACK 0x1 + +struct xt_ct_target_info { + u_int16_t flags; + u_int16_t __unused; + u_int32_t ct_events; + u_int32_t exp_events; + char helper[16]; + + /* Used internally by the kernel */ + struct nf_conn *ct __attribute__((aligned(8))); +}; + +#endif /* _XT_CT_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index e17aaa3e19fd..32c305dbdab6 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -42,6 +42,9 @@ struct nf_conntrack_helper { extern struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); +extern struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); + extern int nf_conntrack_helper_register(struct nf_conntrack_helper *); extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 634d14affc8d..4469d45261f4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -341,6 +341,18 @@ config NETFILTER_XT_TARGET_CONNSECMARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_CT + tristate '"CT" target support' + depends on NF_CONNTRACK + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + This options adds a `CT' target, which allows to specify initial + connection tracking parameters like events to be delivered and + the helper to be used. + + To compile it as a module, choose M here. If unsure, say N. 
+ config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 49f62ee4e9ff..f873644f02f6 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8144b0da5515..a74a5769877b 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -83,6 +83,25 @@ __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); +struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + + h = __nf_conntrack_helper_find(name, l3num, protonum); +#ifdef CONFIG_MODULES + if (h == NULL) { + if (request_module("nfct-helper-%s", name) == 0) + h = __nf_conntrack_helper_find(name, l3num, protonum); + } +#endif + if (h != NULL && !try_module_get(h->me)) + h = NULL; + + return h; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); + struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c new file mode 100644 index 000000000000..8183a054256f --- /dev/null +++ b/net/netfilter/xt_CT.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2010 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int xt_ct_target(struct sk_buff *skb, + const struct xt_target_param *par) +{ + const struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + + /* Previously seen (loopback)? Ignore. 
*/ + if (skb->nfct != NULL) + return XT_CONTINUE; + + atomic_inc(&ct->ct_general.use); + skb->nfct = &ct->ct_general; + skb->nfctinfo = IP_CT_NEW; + + return XT_CONTINUE; +} + +static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) +{ + if (par->family == AF_INET) { + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.invflags & IPT_INV_PROTO) + return 0; + return e->ip.proto; + } else if (par->family == AF_INET6) { + const struct ip6t_entry *e = par->entryinfo; + + if (e->ipv6.invflags & IP6T_INV_PROTO) + return 0; + return e->ipv6.proto; + } else + return 0; +} + +static bool xt_ct_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conntrack_tuple t; + struct nf_conn_help *help; + struct nf_conn *ct; + u8 proto; + + if (info->flags & ~XT_CT_NOTRACK) + return false; + + if (info->flags & XT_CT_NOTRACK) { + ct = &nf_conntrack_untracked; + atomic_inc(&ct->ct_general.use); + goto out; + } + + if (nf_ct_l3proto_try_module_get(par->family) < 0) + goto err1; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(par->net, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) + goto err2; + + if ((info->ct_events || info->exp_events) && + !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, + GFP_KERNEL)) + goto err3; + + if (info->helper[0]) { + proto = xt_ct_find_proto(par); + if (!proto) + goto err3; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (help == NULL) + goto err3; + + help->helper = nf_conntrack_helper_try_module_get(info->helper, + par->family, + proto); + if (help->helper == NULL) + goto err3; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); +out: + info->ct = ct; + return true; + +err3: + nf_conntrack_free(ct); +err2: + nf_ct_l3proto_module_put(par->family); +err1: + return false; +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + struct nf_conn_help *help; + + if (ct != &nf_conntrack_untracked) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + } + nf_ct_put(info->ct); +} + +static struct xt_target xt_ct_tg __read_mostly = { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = XT_ALIGN(sizeof(struct xt_ct_target_info)), + .checkentry = xt_ct_tg_check, + .destroy = xt_ct_tg_destroy, + .target = xt_ct_target, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init xt_ct_tg_init(void) +{ + return xt_register_target(&xt_ct_tg); +} + +static void __exit xt_ct_tg_exit(void) +{ + xt_unregister_target(&xt_ct_tg); +} + +module_init(xt_ct_tg_init); +module_exit(xt_ct_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: connection tracking target"); +MODULE_ALIAS("ipt_CT"); +MODULE_ALIAS("ip6t_CT"); -- cgit v1.2.3 From cd757645fbdc34a8343c04bb0e74e06fccc2cb10 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Sat, 30 Jan 2010 10:25:18 +0530 Subject: perf: Make bp_len type to u64 generic across the arch Change the 'bp_len' type to __u64 to make it work across archs, as the s390 architecture watchpoint length can be up to 2^64. reference: http://lkml.org/lkml/2010/1/25/212 This is an ABI change that is not backward compatible with the previous hardware breakpoint info layout integrated in this development cycle; a rebuild of perf tools is necessary for versions based on 2.6.33-rc1 - 2.6.33-rc6 to work with a kernel based on this patch. 
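As a hedged illustration (not part of the original patch), a userspace consumer of the new layout would fill the widened fields as follows; the watchpoint address and length values are examples only:

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: open a hardware write-watchpoint via perf_event_open(),
 * which has no glibc wrapper, hence the raw syscall. */
static int open_watchpoint(unsigned long long addr)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = addr;			/* __u64 */
	attr.bp_len = HW_BREAKPOINT_LEN_4;	/* __u64 after this patch */

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
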
Signed-off-by: Mahesh Salgaonkar Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: "K. Prasad" Cc: Maneesh Soni Cc: Heiko Carstens Cc: Martin LKML-Reference: <20100130045518.GA20776@in.ibm.com> Signed-off-by: Frederic Weisbecker --- include/linux/hw_breakpoint.h | 2 +- include/linux/perf_event.h | 6 ++---- kernel/hw_breakpoint.c | 2 +- kernel/perf_event.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 070ba0621738..5977b724f7c6 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -44,7 +44,7 @@ static inline int hw_breakpoint_type(struct perf_event *bp) return bp->attr.bp_type; } -static inline int hw_breakpoint_len(struct perf_event *bp) +static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8fa71874113f..a177698d95e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -211,11 +211,9 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - __u32 __reserved_2; - - __u64 bp_addr; __u32 bp_type; - __u32 bp_len; + __u64 bp_addr; + __u64 bp_len; }; /* diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8a5c7d55ac9f..967e66143e11 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -360,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746bd3a06..2b19297742cb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4580,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit v1.2.3 From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:36:43 -0800 Subject: syslog: distinguish between /proc/kmsg and syscalls This allows the LSM to distinguish between syslog functions originating from /proc/kmsg access and direct syscalls. By default, the commoncaps will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg file descriptor. For example the kernel syslog reader can now drop privileges after opening /proc/kmsg, instead of staying privileged with CAP_SYS_ADMIN. MAC systems that implement security_syslog have unchanged behavior. 
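A minimal sketch of the privilege-dropping reader this enables (the unprivileged uid/gid of 65534, "nobody", is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: open /proc/kmsg while still privileged, drop to an
 * unprivileged user, and keep reading; with this patch, reads on the
 * already-open descriptor no longer require CAP_SYS_ADMIN. */
int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/proc/kmsg", O_RDONLY);	/* open still needs privilege */

	if (fd < 0 || setgid(65534) < 0 || setuid(65534) < 0)
		return 1;

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	return 0;
}
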
Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: John Johansen Signed-off-by: James Morris --- fs/proc/kmsg.c | 14 +++++++------- include/linux/security.h | 11 ++++++----- include/linux/syslog.h | 29 +++++++++++++++++++++++++++++ kernel/printk.c | 7 ++++--- security/commoncap.c | 7 ++++++- security/security.c | 4 ++-- security/selinux/hooks.c | 5 +++-- security/smack/smack_lsm.c | 4 ++-- 8 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 include/linux/syslog.h (limited to 'include/linux') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 7ca78346d3f0..6a3d843a1088 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -12,37 +12,37 @@ #include #include #include +#include #include #include extern wait_queue_head_t log_wait; -extern int do_syslog(int type, char __user *bug, int count); - static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1,NULL,0); + return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0,NULL,0); + (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); return 0; } static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count); + return do_syslog(2, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0)) + if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/security.h b/include/linux/security.h index 26eca85b2417..a4dc74d86ac6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -76,7 +76,7 @@ extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); -extern int cap_syslog(int type); +extern int cap_syslog(int type, bool from_file); extern int cap_vm_enough_memory(struct mm_struct *mm, long pages); struct msghdr; @@ -1349,6 +1349,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * logging to the console. * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). * Return 0 if permission is granted. * @settime: * Check permission to change the system time. 
@@ -1463,7 +1464,7 @@ struct security_operations { int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); + int (*syslog) (int type, bool from_file); int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); @@ -1762,7 +1763,7 @@ int security_acct(struct file *file); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); -int security_syslog(int type); +int security_syslog(int type, bool from_file); int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); @@ -2008,9 +2009,9 @@ static inline int security_quota_on(struct dentry *dentry) return 0; } -static inline int security_syslog(int type) +static inline int security_syslog(int type, bool from_file) { - return cap_syslog(type); + return cap_syslog(type, from_file); } static inline int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/include/linux/syslog.h b/include/linux/syslog.h new file mode 100644 index 000000000000..5f02b1817be1 --- /dev/null +++ b/include/linux/syslog.h @@ -0,0 +1,29 @@ +/* Syslog internals + * + * Copyright 2010 Canonical, Ltd. + * Author: Kees Cook + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef _LINUX_SYSLOG_H +#define _LINUX_SYSLOG_H + +#define SYSLOG_FROM_CALL 0 +#define SYSLOG_FROM_FILE 1 + +int do_syslog(int type, char __user *buf, int count, bool from_file); + +#endif /* _LINUX_SYSLOG_H */ diff --git a/kernel/printk.c b/kernel/printk.c index 17463ca2e229..809cf9a258a0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -273,14 +274,14 @@ static inline void boot_delay_msec(void) * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user *buf, int len) +int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; - error = security_syslog(type); + error = security_syslog(type, from_file); if (error) return error; @@ -417,7 +418,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len); + return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } /* diff --git a/security/commoncap.c b/security/commoncap.c index f800fdb3de94..677fad9d5cba 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in @@ -888,12 +889,16 @@ error: /** * cap_syslog - Determine whether syslog function is permitted * @type: Function requested + * @from_file: Whether this request came from an open file (i.e. /proc) * * Determine whether the current process is permitted to use a particular * syslog function, returning 0 if permission is granted, -ve if not. */ -int cap_syslog(int type) +int cap_syslog(int type, bool from_file) { + /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ + if (type != 1 && from_file) + return 0; if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; diff --git a/security/security.c b/security/security.c index 440afe5eb54c..971092c06f31 100644 --- a/security/security.c +++ b/security/security.c @@ -203,9 +203,9 @@ int security_quota_on(struct dentry *dentry) return security_ops->quota_on(dentry); } -int security_syslog(int type) +int security_syslog(int type, bool from_file) { - return security_ops->syslog(type); + return security_ops->syslog(type, from_file); } int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a2ee845e9d4..a4862a0730fa 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -76,6 +76,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -2049,11 +2050,11 @@ static int selinux_quota_on(struct dentry *dentry) return dentry_has_perm(cred, NULL, dentry, FILE__QUOTAON); } -static int selinux_syslog(int type) +static int selinux_syslog(int type, bool from_file) { int rc; - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc) return rc; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 529c9ca65878..a5721b373f53 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -157,12 +157,12 @@ static int smack_ptrace_traceme(struct task_struct *ptp) * * Returns 0 on success, error code otherwise. 
*/ -static int smack_syslog(int type) +static int smack_syslog(int type, bool from_file) { int rc; char *sp = current_security(); - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc != 0) return rc; -- cgit v1.2.3 From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:37:13 -0800 Subject: syslog: use defined constants instead of raw numbers Right now the syslog "type" action are just raw numbers which makes the source difficult to follow. This patch replaces the raw numbers with defined constants for some level of sanity. Signed-off-by: Kees Cook Acked-by: John Johansen Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/kmsg.c | 10 +++++----- include/linux/syslog.h | 23 +++++++++++++++++++++++ kernel/printk.c | 45 +++++++++++++++++++-------------------------- security/commoncap.c | 5 +++-- security/selinux/hooks.c | 21 +++++++++++---------- 5 files changed, 61 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 6a3d843a1088..cfe90a48a6e8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 5f02b1817be1..38911391a139 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -21,6 +21,29 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +/* Close the log. Currently a NOP. */ +#define SYSLOG_ACTION_CLOSE 0 +/* Open the log. Currently a NOP. */ +#define SYSLOG_ACTION_OPEN 1 +/* Read from the log. */ +#define SYSLOG_ACTION_READ 2 +/* Read all messages remaining in the ring buffer. */ +#define SYSLOG_ACTION_READ_ALL 3 +/* Read and clear all messages remaining in the ring buffer */ +#define SYSLOG_ACTION_READ_CLEAR 4 +/* Clear ring buffer. 
*/ +#define SYSLOG_ACTION_CLEAR 5 +/* Disable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_OFF 6 +/* Enable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_ON 7 +/* Set level of messages printed to console */ +#define SYSLOG_ACTION_CONSOLE_LEVEL 8 +/* Return number of unread characters in the log buffer */ +#define SYSLOG_ACTION_SIZE_UNREAD 9 +/* Return size of the log buffer */ +#define SYSLOG_ACTION_SIZE_BUFFER 10 + #define SYSLOG_FROM_CALL 0 #define SYSLOG_FROM_FILE 1 diff --git a/kernel/printk.c b/kernel/printk.c index 809cf9a258a0..3e162d867098 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -259,21 +259,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. - * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 || len > 8) goto out; @@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: diff --git a/security/commoncap.c b/security/commoncap.c index 677fad9d5cba..cf01b2eebb60 100644 --- 
a/security/commoncap.c +++ b/security/commoncap.c @@ -897,9 +897,10 @@ error: int cap_syslog(int type, bool from_file) { /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ - if (type != 1 && from_file) + if (type != SYSLOG_ACTION_OPEN && from_file) return 0; - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a4862a0730fa..6b36ce2eef2e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2059,20 +2059,21 @@ static int selinux_syslog(int type, bool from_file) return rc; switch (type) { - case 3: /* Read last kernel messages */ - case 10: /* Return size of the log buffer */ + case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ + case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ rc = task_has_system(current, SYSTEM__SYSLOG_READ); break; - case 6: /* Disable logging to console */ - case 7: /* Enable logging to console */ - case 8: /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: rc = task_has_system(current, SYSTEM__SYSLOG_CONSOLE); break; - case 0: /* Close log */ - case 1: /* Open log */ - case 2: /* Read from log */ - case 4: /* Read/clear last kernel messages */ - case 5: /* Clear ring buffer */ + case SYSLOG_ACTION_CLOSE: /* Close log */ + case SYSLOG_ACTION_OPEN: /* Open log */ + case SYSLOG_ACTION_READ: /* Read from log */ + case SYSLOG_ACTION_READ_CLEAR: /* Read/clear last kernel messages */ + case SYSLOG_ACTION_CLEAR: /* Clear ring buffer */ default: rc = task_has_system(current, SYSTEM__SYSLOG_MOD); break; -- cgit v1.2.3 From 8a83a00b0735190384a348156837918271034144 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:23:03 +0000 Subject: net: maintain namespace isolation between vlan and real device In the vlan and macvlan drivers, the start_xmit function forwards data to the dev_queue_xmit function for another device, which may potentially belong to a different namespace. To make sure that classification stays within a single namespace, this resets the potentially critical fields. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 2 +- include/linux/netdevice.h | 9 +++++++++ net/8021q/vlan_dev.c | 2 +- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index fa0dc514dbaf..d32e0bdfc5e9 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) } xmit_world: - skb->dev = vlan->lowerdev; + skb_set_dev(skb, vlan->lowerdev); return dev_queue_xmit(skb); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93a32a5ca74f..622ba5aa93c4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev) return 0; } +#ifndef CONFIG_NET_NS +static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb->dev = dev; +} +#else /* CONFIG_NET_NS */ +void skb_set_dev(struct sk_buff *skb, struct net_device *dev); +#endif + static inline bool netdev_uses_trailer_tags(struct net_device *dev) { #ifdef CONFIG_NET_DSA_TAG_TRAILER diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a9e1f1785614..9e83272fc5b0 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, } - skb->dev = vlan_dev_info(dev)->real_dev; + skb_set_dev(skb, vlan_dev_info(dev)->real_dev); len = skb->len; ret = dev_queue_xmit(skb); diff --git a/net/core/dev.c b/net/core/dev.c index 2cba5c521e56..94c1eeed25e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) if (skb->len > (dev->mtu + dev->hard_header_len)) return NET_RX_DROP; - skb_dst_drop(skb); + skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) return false; } +/** + * skb_dev_set -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif + } + skb->dev = dev; +} +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. -- cgit v1.2.3 From fc0663d6b5e6d8e9b57f872a644c0aafd82361b7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:23:40 +0000 Subject: macvlan: allow multiple driver backends This makes it possible to hook into the macvlan driver from another kernel module. In particular, the goal is to extend it with the macvtap backend that provides a tun/tap compatible interface directly on the macvlan device. 
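As a hedged sketch of the intended hook-up (the "example_*" names are hypothetical; only the macvlan_* symbols come from this patch), a backend module supplies its own receive/forward hooks through macvlan_common_newlink():

#include <linux/if_macvlan.h>
#include <linux/netdevice.h>
#include <net/rtnetlink.h>

/* Sketch of a macvlan backend module; mirrors what macvtap does
 * later in this series. The "example_*" identifiers are illustrative. */
static int example_receive(struct sk_buff *skb)
{
	return netif_rx(skb);		/* or hand off to the backend */
}

static int example_forward(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb(dev, skb);
}

static int example_newlink(struct net *src_net, struct net_device *dev,
			   struct nlattr *tb[], struct nlattr *data[])
{
	return macvlan_common_newlink(src_net, dev, tb, data,
				      example_receive, example_forward);
}

static struct rtnl_link_ops example_link_ops = {
	.kind	 = "macvlan-example",
	.newlink = example_newlink,
	.dellink = macvlan_dellink,
};

/* macvlan_link_register() fills in the shared fields (priv_size, setup,
 * policy, ...) before calling rtnl_link_register():
 *	err = macvlan_link_register(&example_link_ops);
 */
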
Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 113 ++++++++++++++++++++------------------------- include/linux/if_macvlan.h | 70 ++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d32e0bdfc5e9..40faa368b07a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,31 +39,6 @@ struct macvlan_port { struct list_head vlans; }; -/** - * struct macvlan_rx_stats - MACVLAN percpu rx stats - * @rx_packets: number of received packets - * @rx_bytes: number of received bytes - * @multicast: number of received multicast packets - * @rx_errors: number of errors - */ -struct macvlan_rx_stats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long multicast; - unsigned long rx_errors; -}; - -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; - struct macvlan_rx_stats *rx_stats; - enum macvlan_mode mode; -}; - - static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { @@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port, return 0; } -static inline void macvlan_count_rx(const struct macvlan_dev *vlan, - unsigned int len, bool success, - bool multicast) -{ - struct macvlan_rx_stats *rx_stats; - - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); - if (likely(success)) { - rx_stats->rx_packets++;; - rx_stats->rx_bytes += len; - if (multicast) - rx_stats->multicast++; - } else { - rx_stats->rx_errors++; - } -} -static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, +static int macvlan_broadcast_one(struct sk_buff *skb, + const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { + struct net_device *dev = vlan->dev; if (!skb) return NET_RX_DROP; if (local) - return dev_forward_skb(dev, skb); + return vlan->forward(dev, skb); skb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, @@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, else skb->pkt_type = PACKET_MULTICAST; - return netif_rx(skb); + return vlan->receive(skb); } static void macvlan_broadcast(struct sk_buff *skb, @@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb, continue; nskb = skb_clone(skb, GFP_ATOMIC); - err = macvlan_broadcast_one(nskb, vlan->dev, eth, + err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, 1); @@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) skb->dev = dev; skb->pkt_type = PACKET_HOST; - netif_rx(skb); + vlan->receive(skb); return NULL; } @@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { unsigned int length = skb->len + ETH_HLEN; - int ret = dev_forward_skb(dest->dev, skb); + int ret = dest->forward(dest->dev, skb); macvlan_count_rx(dest, length, ret == NET_RX_SUCCESS, 0); @@ -273,8 +234,8 @@ xmit_world: return dev_queue_xmit(skb); } -static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, - struct net_device *dev) +netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev) { int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = 
netdev_get_tx_queue(dev, i); @@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, return ret; } +EXPORT_SYMBOL_GPL(macvlan_start_xmit); static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, @@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net, return 0; } -static int macvlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; + vlan->receive = receive; + vlan->forward = forward; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) @@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, netif_stacked_transfer_operstate(lowerdev, dev); return 0; } +EXPORT_SYMBOL_GPL(macvlan_common_newlink); -static void macvlan_dellink(struct net_device *dev, struct list_head *head) +static int macvlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + return macvlan_common_newlink(src_net, dev, tb, data, + netif_rx, + dev_forward_skb); +} + +void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; @@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head) if (list_empty(&port->vlans)) macvlan_port_destroy(port->dev); } +EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) @@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, }; -static struct rtnl_link_ops macvlan_link_ops __read_mostly = { +int macvlan_link_register(struct rtnl_link_ops *ops) +{ + /* common fields */ + ops->priv_size = sizeof(struct macvlan_dev); + ops->get_tx_queues = macvlan_get_tx_queues; + ops->setup = macvlan_setup; + ops->validate = macvlan_validate; + ops->maxtype = IFLA_MACVLAN_MAX; + ops->policy = macvlan_policy; + ops->changelink = macvlan_changelink; + ops->get_size = macvlan_get_size; + ops->fill_info = macvlan_fill_info; + + return rtnl_link_register(ops); +}; +EXPORT_SYMBOL_GPL(macvlan_link_register); + +static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", - .priv_size = sizeof(struct macvlan_dev), - .get_tx_queues = macvlan_get_tx_queues, - .setup = macvlan_setup, - .validate = macvlan_validate, .newlink = macvlan_newlink, .dellink = macvlan_dellink, - .maxtype = IFLA_MACVLAN_MAX, - .policy = macvlan_policy, - .changelink = macvlan_changelink, - .get_size = macvlan_get_size, - .fill_info = macvlan_fill_info, }; static int macvlan_device_event(struct notifier_block *unused, @@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_UNREGISTER: list_for_each_entry_safe(vlan, next, &port->vlans, list) - macvlan_dellink(vlan->dev, NULL); + vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL); break; } return NOTIFY_DONE; @@ -778,7 +763,7 @@ static int __init macvlan_init_module(void) 
register_netdevice_notifier(&macvlan_notifier_block); macvlan_handle_frame_hook = macvlan_handle_frame; - err = rtnl_link_register(&macvlan_link_ops); + err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 5f200bac3749..9a11544bb0b1 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -1,6 +1,76 @@ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H +#include +#include +#include +#include +#include + +struct macvlan_port; +struct macvtap_queue; + +/** + * struct macvlan_rx_stats - MACVLAN percpu rx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @multicast: number of received multicast packets + * @rx_errors: number of errors + */ +struct macvlan_rx_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long multicast; + unsigned long rx_errors; +}; + +struct macvlan_dev { + struct net_device *dev; + struct list_head list; + struct hlist_node hlist; + struct macvlan_port *port; + struct net_device *lowerdev; + struct macvlan_rx_stats *rx_stats; + enum macvlan_mode mode; + int (*receive)(struct sk_buff *skb); + int (*forward)(struct net_device *dev, struct sk_buff *skb); +}; + +static inline void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast) +{ + struct macvlan_rx_stats *rx_stats; + + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + if (likely(success)) { + rx_stats->rx_packets++;; + rx_stats->rx_bytes += len; + if (multicast) + rx_stats->multicast++; + } else { + rx_stats->rx_errors++; + } +} + +extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)); + +extern void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast); + +extern void macvlan_dellink(struct net_device *dev, struct list_head *head); + +extern int macvlan_link_register(struct rtnl_link_ops *ops); + +extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev); + + extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *); #endif /* _LINUX_IF_MACVLAN_H */ -- cgit v1.2.3 From 20d29d7a916a47bf533b5709437fe735b6b5b79e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:24:26 +0000 Subject: net: macvtap driver In order to use macvlan with qemu and other tools that require a tap file descriptor, the macvtap driver adds a small backend with a character device with the same interface as the tun driver, with a minimum set of features. Macvtap interfaces are created in the same way as macvlan interfaces using ip link, but the netif is just used as a handle for configuration and accounting, while the data goes through the chardev. Each macvtap interface has its own character device, simplifying permission management significantly over the generic tun/tap driver. Cc: Patrick McHardy Cc: Stephen Hemminger Cc: David S. Miller" Cc: "Michael S. Tsirkin" Cc: Herbert Xu Cc: Or Gerlitz Cc: netdev@vger.kernel.org Cc: bridge@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/Kconfig | 12 + drivers/net/Makefile | 1 + drivers/net/macvtap.c | 581 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_macvlan.h | 1 + 4 files changed, 595 insertions(+) create mode 100644 drivers/net/macvtap.c (limited to 'include/linux') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index cb0e534418e3..411e20703110 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -90,6 +90,18 @@ config MACVLAN To compile this driver as a module, choose M here: the module will be called macvlan. +config MACVTAP + tristate "MAC-VLAN based tap driver (EXPERIMENTAL)" + depends on MACVLAN + help + This adds a specialized tap character device driver that is based + on the MAC-VLAN network interface, called macvtap. A macvtap device + can be added in the same way as a macvlan device, using 'type + macvlan', and then be accessed through the tap user space interface. + + To compile this driver as a module, choose M here: the module + will be called macvtap. + config EQUALIZER tristate "EQL (serial line load balancing) support" ---help--- diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 0b763cbe9b1f..95958032cd31 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACVLAN) += macvlan.o +obj-$(CONFIG_MACVTAP) += macvtap.o obj-$(CONFIG_DE600) += de600.o obj-$(CONFIG_DE620) += de620.o obj-$(CONFIG_LANCE) += lance.o diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 000000000000..ad1f6ef89308 --- /dev/null +++ b/drivers/net/macvtap.c @@ -0,0 +1,581 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * A macvtap queue is the central object of this driver, it connects + * an open character device to a macvlan interface. There can be + * multiple queues on one interface, which map back to queues + * implemented in hardware on the underlying device. + * + * macvtap_proto is used to allocate queues through the sock allocation + * mechanism. + * + * TODO: multiqueue support is currently not implemented, even though + * macvtap is basically prepared for that. We will need to add this + * here as well as in virtio-net and qemu to get line rate on 10gbit + * adapters from a guest. + */ +struct macvtap_queue { + struct sock sk; + struct socket sock; + struct macvlan_dev *vlan; + struct file *file; +}; + +static struct proto macvtap_proto = { + .name = "macvtap", + .owner = THIS_MODULE, + .obj_size = sizeof (struct macvtap_queue), +}; + +/* + * Minor number matches netdev->ifindex, so need a potentially + * large value. This also makes it possible to split the + * tap functionality out again in the future by offering it + * from other drivers besides macvtap. As long as every device + * only has one tap, the interface numbers assure that the + * device nodes are unique. + */ +static unsigned int macvtap_major; +#define MACVTAP_NUM_DEVS 65536 +static struct class *macvtap_class; +static struct cdev macvtap_cdev; + +/* + * RCU usage: + * The macvtap_queue is referenced both from the chardev struct file + * and from the struct macvlan_dev using rcu_read_lock. 
+ * + * We never actually update the contents of a macvtap_queue atomically + * with RCU but it is used for race-free destruction of a queue when + * either the file or the macvlan_dev goes away. Pointers back to + * the dev and the file are implicitly valid as long as the queue + * exists. + * + * The callbacks from macvlan are always done with rcu_read_lock held + * already, while in the file_operations, we get it ourselves. + * + * When destroying a queue, we remove the pointers from the file and + * from the dev and then synchronize_rcu to make sure no thread is + * still using the queue. There may still be references to the struct + * sock inside of the queue from outbound SKBs, but these never + * reference back to the file or the dev. The data structure is freed + * through __sk_free when both our references and any pending SKBs + * are gone. + * + * macvtap_lock is only used to prevent multiple concurrent open() + * calls to assign a new vlan->tap pointer. It could be moved into + * the macvlan_dev itself but is extremely rarely used. + */ +static DEFINE_SPINLOCK(macvtap_lock); + +/* + * Choose the next free queue, for now there is only one + */ +static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + int err = -EBUSY; + + spin_lock(&macvtap_lock); + if (rcu_dereference(vlan->tap)) + goto out; + + err = 0; + q->vlan = vlan; + rcu_assign_pointer(vlan->tap, q); + + q->file = file; + rcu_assign_pointer(file->private_data, q); + +out: + spin_unlock(&macvtap_lock); + return err; +} + +/* + * We must destroy each queue exactly once, when either + * the netdev or the file go away. + * + * Using the spinlock makes sure that we don't get + * to the queue again after destroying it. + * + * synchronize_rcu serializes with the packet flow + * that uses rcu_read_lock. + */ +static void macvtap_del_queue(struct macvtap_queue **qp) +{ + struct macvtap_queue *q; + + spin_lock(&macvtap_lock); + q = rcu_dereference(*qp); + if (!q) { + spin_unlock(&macvtap_lock); + return; + } + + rcu_assign_pointer(q->vlan->tap, NULL); + rcu_assign_pointer(q->file->private_data, NULL); + spin_unlock(&macvtap_lock); + + synchronize_rcu(); + sock_put(&q->sk); +} + +/* + * Since we only support one queue, just dereference the pointer. + */ +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return rcu_dereference(vlan->tap); +} + +static void macvtap_del_queues(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + macvtap_del_queue(&vlan->tap); +} + +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) +{ + rcu_read_lock_bh(); + return rcu_dereference(file->private_data); +} + +static inline void macvtap_file_put_queue(void) +{ + rcu_read_unlock_bh(); +} + +/* + * Forward happens for data that gets sent from one macvlan + * endpoint to another one in bridge mode. We just take + * the skb and put it into the receive queue. + */ +static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) +{ + struct macvtap_queue *q = macvtap_get_queue(dev, skb); + if (!q) + return -ENOLINK; + + skb_queue_tail(&q->sk.sk_receive_queue, skb); + wake_up(q->sk.sk_sleep); + return 0; +} + +/* + * Receive is for data from the external interface (lowerdev), + * in case of macvtap, we can treat that the same way as + * forward, which macvlan cannot. 
+ */ +static int macvtap_receive(struct sk_buff *skb) +{ + skb_push(skb, ETH_HLEN); + return macvtap_forward(skb->dev, skb); +} + +static int macvtap_newlink(struct net *src_net, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + struct device *classdev; + dev_t devt; + int err; + + err = macvlan_common_newlink(src_net, dev, tb, data, + macvtap_receive, macvtap_forward); + if (err) + goto out; + + devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); + + classdev = device_create(macvtap_class, &dev->dev, devt, + dev, "tap%d", dev->ifindex); + if (IS_ERR(classdev)) { + err = PTR_ERR(classdev); + macvtap_del_queues(dev); + } + +out: + return err; +} + +static void macvtap_dellink(struct net_device *dev, + struct list_head *head) +{ + device_destroy(macvtap_class, + MKDEV(MAJOR(macvtap_major), dev->ifindex)); + + macvtap_del_queues(dev); + macvlan_dellink(dev, head); +} + +static struct rtnl_link_ops macvtap_link_ops __read_mostly = { + .kind = "macvtap", + .newlink = macvtap_newlink, + .dellink = macvtap_dellink, +}; + + +static void macvtap_sock_write_space(struct sock *sk) +{ + if (!sock_writeable(sk) || + !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + return; + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_sync(sk->sk_sleep); +} + +static int macvtap_open(struct inode *inode, struct file *file) +{ + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = dev_get_by_index(net, iminor(inode)); + struct macvtap_queue *q; + int err; + + err = -ENODEV; + if (!dev) + goto out; + + /* check if this is a macvtap device */ + err = -EINVAL; + if (dev->rtnl_link_ops != &macvtap_link_ops) + goto out; + + err = -ENOMEM; + q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &macvtap_proto); + if (!q) + goto out; + + init_waitqueue_head(&q->sock.wait); + q->sock.type = SOCK_RAW; + q->sock.state = SS_CONNECTED; + sock_init_data(&q->sock, &q->sk); + q->sk.sk_allocation = GFP_ATOMIC; /* for now */ + q->sk.sk_write_space = macvtap_sock_write_space; + + err = macvtap_set_queue(dev, file, q); + if (err) + sock_put(&q->sk); + +out: + if (dev) + dev_put(dev); + + return err; +} + +static int macvtap_release(struct inode *inode, struct file *file) +{ + macvtap_del_queue((struct macvtap_queue **)&file->private_data); + return 0; +} + +static unsigned int macvtap_poll(struct file *file, poll_table * wait) +{ + struct macvtap_queue *q = macvtap_file_get_queue(file); + unsigned int mask = POLLERR; + + if (!q) + goto out; + + mask = 0; + poll_wait(file, &q->sock.wait, wait); + + if (!skb_queue_empty(&q->sk.sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + if (sock_writeable(&q->sk) || + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + sock_writeable(&q->sk))) + mask |= POLLOUT | POLLWRNORM; + +out: + macvtap_file_put_queue(); + return mask; +} + +/* Get packet from user space buffer */ +static ssize_t macvtap_get_user(struct macvtap_queue *q, + const struct iovec *iv, size_t count, + int noblock) +{ + struct sk_buff *skb; + size_t len = count; + int err; + + if (unlikely(len < ETH_HLEN)) + return -EINVAL; + + skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); + + if (!skb) { + macvlan_count_rx(q->vlan, 0, false, false); + return err; + } + + skb_reserve(skb, NET_IP_ALIGN); + skb_put(skb, count); + + if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { + macvlan_count_rx(q->vlan, 0, false, false); + kfree_skb(skb); + return -EFAULT; + } + + skb_set_network_header(skb, ETH_HLEN); 
+ + macvlan_start_xmit(skb, q->vlan->dev); + + return count; +} + +static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t result = -ENOLINK; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + if (!q) + goto out; + + result = macvtap_get_user(q, iv, iov_length(iv, count), + file->f_flags & O_NONBLOCK); +out: + macvtap_file_put_queue(); + return result; +} + +/* Put packet to the user space buffer */ +static ssize_t macvtap_put_user(struct macvtap_queue *q, + const struct sk_buff *skb, + const struct iovec *iv, int len) +{ + struct macvlan_dev *vlan = q->vlan; + int ret; + + len = min_t(int, skb->len, len); + + ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); + + macvlan_count_rx(vlan, len, ret == 0, 0); + + return ret ? ret : len; +} + +static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb; + ssize_t len, ret = 0; + + if (!q) { + ret = -ENOLINK; + goto out; + } + + len = iov_length(iv, count); + if (len < 0) { + ret = -EINVAL; + goto out; + } + + add_wait_queue(q->sk.sk_sleep, &wait); + while (len) { + current->state = TASK_INTERRUPTIBLE; + + /* Read frames from the queue */ + skb = skb_dequeue(&q->sk.sk_receive_queue); + if (!skb) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + /* Nothing to read, let's sleep */ + schedule(); + continue; + } + ret = macvtap_put_user(q, skb, iv, len); + kfree_skb(skb); + break; + } + + current->state = TASK_RUNNING; + remove_wait_queue(q->sk.sk_sleep, &wait); + +out: + macvtap_file_put_queue(); + return ret; +} + +/* + * provide compatibility with generic tun/tap interface + */ +static long macvtap_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct macvtap_queue *q; + void __user *argp = (void __user *)arg; + struct ifreq __user *ifr = argp; + unsigned int __user *up = argp; + unsigned int u; + char devname[IFNAMSIZ]; + + switch (cmd) { + case TUNSETIFF: + /* ignore the name, just look at flags */ + if (get_user(u, &ifr->ifr_flags)) + return -EFAULT; + if (u != (IFF_TAP | IFF_NO_PI)) + return -EINVAL; + return 0; + + case TUNGETIFF: + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + memcpy(devname, q->vlan->dev->name, sizeof(devname)); + macvtap_file_put_queue(); + + /* q must not be used after the queue is put; copy from devname */ + if (copy_to_user(&ifr->ifr_name, devname, IFNAMSIZ) || + put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) + return -EFAULT; + return 0; + + case TUNGETFEATURES: + if (put_user((IFF_TAP | IFF_NO_PI), up)) + return -EFAULT; + return 0; + + case TUNSETSNDBUF: + if (get_user(u, up)) + return -EFAULT; + + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + q->sk.sk_sndbuf = u; + macvtap_file_put_queue(); + return 0; + + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + /* TODO: add support for these, so far we don't + support any offload */ + if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + return 0; + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return
macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations macvtap_fops = { + .owner = THIS_MODULE, + .open = macvtap_open, + .release = macvtap_release, + .aio_read = macvtap_aio_read, + .aio_write = macvtap_aio_write, + .poll = macvtap_poll, + .llseek = no_llseek, + .unlocked_ioctl = macvtap_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = macvtap_compat_ioctl, +#endif +}; + +static int macvtap_init(void) +{ + int err; + + err = alloc_chrdev_region(&macvtap_major, 0, + MACVTAP_NUM_DEVS, "macvtap"); + if (err) + goto out1; + + cdev_init(&macvtap_cdev, &macvtap_fops); + err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); + if (err) + goto out2; + + macvtap_class = class_create(THIS_MODULE, "macvtap"); + if (IS_ERR(macvtap_class)) { + err = PTR_ERR(macvtap_class); + goto out3; + } + + err = macvlan_link_register(&macvtap_link_ops); + if (err) + goto out4; + + return 0; + +out4: + class_unregister(macvtap_class); +out3: + cdev_del(&macvtap_cdev); +out2: + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +out1: + return err; +} +module_init(macvtap_init); + +static void macvtap_exit(void) +{ + rtnl_link_unregister(&macvtap_link_ops); + class_unregister(macvtap_class); + cdev_del(&macvtap_cdev); + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +} +module_exit(macvtap_exit); + +MODULE_ALIAS_RTNL_LINK("macvtap"); +MODULE_AUTHOR("Arnd Bergmann "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9a11544bb0b1..51f1512045e9 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -34,6 +34,7 @@ struct macvlan_dev { enum macvlan_mode mode; int (*receive)(struct sk_buff *skb); int (*forward)(struct net_device *dev, struct sk_buff *skb); + struct macvtap_queue *tap; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- cgit v1.2.3
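Putting the tun/tap-compatible pieces above together, here is a minimal userspace sketch of how such a tap node is meant to be driven. The device path is a hypothetical example of the tapN nodes created by device_create() above (the actual node name depends on the interface's ifindex and on udev), and error handling is trimmed:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	int main(void)
	{
		struct ifreq ifr;
		char frame[2048];
		/* hypothetical node; minor number is the macvtap ifindex */
		int fd = open("/dev/tap3", O_RDWR);

		if (fd < 0)
			return 1;

		/* the driver only accepts IFF_TAP | IFF_NO_PI (see TUNSETIFF) */
		memset(&ifr, 0, sizeof(ifr));
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		if (ioctl(fd, TUNSETIFF, &ifr) < 0)
			return 1;

		/* each read()/write() now moves one ethernet frame */
		if (read(fd, frame, sizeof(frame)) < 0)
			return 1;
		close(fd);
		return 0;
	}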
From 1621e0940294c20e302faf401f41204de7252e22 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 1 Feb 2010 09:44:19 +0000 Subject: net: CONFIG_COMPAT redux Ifdef out struct proto_ops::compat_ioctl struct proto_ops::compat_setsockopt struct proto_ops::compat_getsockopt to make structures smaller on COMPAT=n kernels. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 5e8083cacc8b..4157b5d42bd6 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -174,18 +174,22 @@ struct proto_ops { struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); +#endif int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); +#ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*compat_getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); +#endif int (*sendmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len); int (*recvmsg) (struct kiocb *iocb, struct socket *sock, -- cgit v1.2.3 From f7acede65d6b65919aee5b6a360a17cedb11f2f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2010 13:30:11 +0100 Subject: libata: fix ata_id_logical_per_physical_sectors The value we get from the low byte of the ATA_ID_SECTOR_SIZE word is not a plain multiple, but the log of it, so fix the helper to give the correct answer (a value of 3, for example, means 2^3 = 8 logical sectors per physical sector). Without this we'll get an incorrect minimal I/O size in the block limits VPD page for 4k sector drives. Also change the return value of ata_id_logical_per_physical_sectors to u16 for the unlikely case of very large physical sectors. Signed-off-by: Christoph Hellwig Signed-off-by: Jeff Garzik --- include/linux/ata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 38a6948ce0c2..20f31567ccee 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -647,9 +647,9 @@ static inline int ata_id_has_large_logical_sectors(const u16 *id) return id[ATA_ID_SECTOR_SIZE] & (1 << 13); } -static inline u8 ata_id_logical_per_physical_sectors(const u16 *id) +static inline u16 ata_id_logical_per_physical_sectors(const u16 *id) { - return id[ATA_ID_SECTOR_SIZE] & 0xf; + return 1 << (id[ATA_ID_SECTOR_SIZE] & 0xf); } static inline int ata_id_has_lba48(const u16 *id) -- cgit v1.2.3 From 0b7024ac4df5821347141c18e680b7166bc1cb20 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 2 Feb 2010 21:08:26 -0800 Subject: Input: add match() method to input handlers Get rid of blacklist in input handler structure and instead allow handlers to define their own match() method to perform fine-grained filtering of supported devices.
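As a sketch of what the new hook enables, a handler that only wants devices with absolute axes could now express that directly. The handler name below is hypothetical and not part of this patch; the real implementations added by it (kbd_match, joydev_match) appear in the diff that follows:

	#include <linux/input.h>

	/* hypothetical handler: accept only devices reporting absolute axes */
	static bool abs_only_match(struct input_handler *handler,
				   struct input_dev *dev)
	{
		return test_bit(EV_ABS, dev->evbit);
	}

	static struct input_handler abs_only_handler = {
		.match	= abs_only_match,
		/* .event, .connect, .disconnect, .name, .id_table as usual */
	};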
Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 24 ++++++++++++++++-------- drivers/input/input.c | 13 ++++++------- drivers/input/joydev.c | 32 +++++++++++++++----------------- include/linux/input.h | 6 +++--- 4 files changed, 40 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index cbf64b985ef4..ada25bb8941e 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -1323,6 +1323,21 @@ static void kbd_event(struct input_handle *handle, unsigned int event_type, schedule_console_callback(); } +static bool kbd_match(struct input_handler *handler, struct input_dev *dev) +{ + int i; + + if (test_bit(EV_SND, dev->evbit)) + return true; + + if (test_bit(EV_KEY, dev->evbit)) + for (i = KEY_RESERVED; i < BTN_MISC; i++) + if (test_bit(i, dev->keybit)) + return true; + + return false; +} + /* * When a keyboard (or other input device) is found, the kbd_connect * function is called. The function then looks at the device, and if it @@ -1334,14 +1349,6 @@ static int kbd_connect(struct input_handler *handler, struct input_dev *dev, { struct input_handle *handle; int error; - int i; - - for (i = KEY_RESERVED; i < BTN_MISC; i++) - if (test_bit(i, dev->keybit)) - break; - - if (i == BTN_MISC && !test_bit(EV_SND, dev->evbit)) - return -ENODEV; handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); if (!handle) @@ -1407,6 +1414,7 @@ MODULE_DEVICE_TABLE(input, kbd_ids); static struct input_handler kbd_handler = { .event = kbd_event, + .match = kbd_match, .connect = kbd_connect, .disconnect = kbd_disconnect, .start = kbd_start, diff --git a/drivers/input/input.c b/drivers/input/input.c index 7080a9d4b840..dae49eba6ccd 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -723,12 +723,13 @@ EXPORT_SYMBOL(input_set_keycode); if (i != BITS_TO_LONGS(max)) \ continue; -static const struct input_device_id *input_match_device(const struct input_device_id *id, +static const struct input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { + const struct input_device_id *id; int i; - for (; id->flags || id->driver_info; id++) { + for (id = handler->id_table; id->flags || id->driver_info; id++) { if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) if (id->bustype != dev->id.bustype) @@ -756,7 +757,8 @@ static const struct input_device_id *input_match_device(const struct input_devic MATCH_BIT(ffbit, FF_MAX); MATCH_BIT(swbit, SW_MAX); - return id; + if (!handler->match || handler->match(handler, dev)) + return id; } return NULL; @@ -767,10 +769,7 @@ static int input_attach_handler(struct input_dev *dev, struct input_handler *han const struct input_device_id *id; int error; - if (handler->blacklist && input_match_device(handler->blacklist, dev)) - return -ENODEV; - - id = input_match_device(handler->id_table, dev); + id = input_match_device(handler, dev); if (!id) return -ENODEV; diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index b1bd6dd32286..63e71f2a7acc 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -775,6 +775,20 @@ static void joydev_cleanup(struct joydev *joydev) input_close_device(handle); } + +static bool joydev_match(struct input_handler *handler, struct input_dev *dev) +{ + /* Avoid touchpads and touchscreens */ + if (test_bit(EV_KEY, dev->evbit) && test_bit(BTN_TOUCH, dev->keybit)) + return false; + + /* Avoid tablets, digitisers and similar devices */ + if (test_bit(EV_KEY, dev->evbit) && test_bit(BTN_DIGI, dev->keybit)) + return false; + 
+ return true; +} + static int joydev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { @@ -894,22 +908,6 @@ static void joydev_disconnect(struct input_handle *handle) put_device(&joydev->dev); } -static const struct input_device_id joydev_blacklist[] = { - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | - INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, - }, /* Avoid itouchpads and touchscreens */ - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | - INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_DIGI) }, - }, /* Avoid tablets, digitisers and similar devices */ - { } /* Terminating entry */ -}; - static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | @@ -936,13 +934,13 @@ MODULE_DEVICE_TABLE(input, joydev_ids); static struct input_handler joydev_handler = { .event = joydev_event, + .match = joydev_match, .connect = joydev_connect, .disconnect = joydev_disconnect, .fops = &joydev_fops, .minor = JOYDEV_MINOR_BASE, .name = "joydev", .id_table = joydev_ids, - .blacklist = joydev_blacklist, }; static int __init joydev_init(void) diff --git a/include/linux/input.h b/include/linux/input.h index 6c9d3d49fa91..8dc5d724c703 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1200,6 +1200,8 @@ struct input_handle; * it may not sleep * @filter: similar to @event; separates normal event handlers from * "filters". + * @match: called after comparing device's id with handler's id_table + * to perform fine-grained matching between device and handler * @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. This function is called by @@ -1211,8 +1213,6 @@ struct input_handle; * @name: name of the handler, to be shown in /proc/bus/input/handlers * @id_table: pointer to a table of input_device_ids this driver can * handle - * @blacklist: pointer to a table of input_device_ids this driver should - * ignore even if they match @id_table * @h_list: list of input handles associated with the handler * @node: for placing the driver onto input_handler_list * @@ -1235,6 +1235,7 @@ struct input_handler { void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); + bool (*match)(struct input_handler *handler, struct input_dev *dev); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); @@ -1244,7 +1245,6 @@ struct input_handler { const char *name; const struct input_device_id *id_table; - const struct input_device_id *blacklist; struct list_head h_list; struct list_head node; -- cgit v1.2.3 From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:11 -0500 Subject: ftrace/alternatives: Introducing *_text_reserved functions Introducing *_text_reserved functions for checking whether the text address range is partially reserved. This patch provides checking routines for x86 smp alternatives and dynamic ftrace. Since both functions modify fixed pieces of kernel text, they should reserve and protect those from other dynamic text modifiers, such as kprobes; the sketch below illustrates the intended check.
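A sketch of the intended use: before patching an address, a dynamic text modifier such as kprobes would reject ranges claimed by either subsystem. The helper names come from this patch; the surrounding probe-preparation code is assumed:

	/* reject text owned by smp-alternatives or dynamic ftrace */
	static int check_text_free(void *addr, size_t len)
	{
		if (alternatives_text_reserved(addr, addr + len) ||
		    ftrace_text_reserved(addr, addr + len))
			return -EBUSY;
		return 0;
	}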
This will also be extended when introducing other subsystems which modify fixed pieces of kernel text. Dynamic text modifiers should avoid those. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/alternative.c | 16 ++++++++++++++++ include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 15 +++++++++++++++ 4 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877f..ac80b7d70014 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..3c13284ff86d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -390,6 +390,22 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > end || mod->text_end < start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (start <= *ptr && end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b4f97d24d7f..9d127efed43c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -134,6 +134,8 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); +extern int ftrace_text_reserved(void *start, void *end); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -250,6 +252,10 @@ static inline int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } +static inline int ftrace_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f80454..3d90661a5f40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1025,6 +1025,21 @@ static void ftrace_bug(int failed, unsigned long ip) } +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + 
rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit v1.2.3 From f24bb999d2b9f2950e5cac5b69bffedf73c24ea4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:25 -0500 Subject: ftrace: Remove record freezing Remove record freezing. Because kprobes never puts a probe on ftrace's mcount call anymore, ftrace no longer needs to check whether kprobes is on it. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker LKML-Reference: <20100202214925.4694.73469.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 39 --------------------------------------- 2 files changed, 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9d127efed43c..eb054ae95605 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -143,7 +143,6 @@ enum { FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), FTRACE_FL_CONVERTED = (1 << 5), - FTRACE_FL_FROZEN = (1 << 6), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3d90661a5f40..1904797f4a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_KPROBES - -static int frozen_record_count; - -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1091,14 +1060,6 @@ static void ftrace_replace_code(int enable) !(rec->flags & FTRACE_FL_CONVERTED)) continue; - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; -- cgit v1.2.3 From 9e3af04f8787315f63f55b191bb9a06741dbf183 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 4 Feb 2010 00:48:00 -0800 Subject: Input: gpio-keys - add support for disabling gpios through sysfs Now the gpio-keys input driver exports 4 new attributes to userland through sysfs: /sys/devices/platform/gpio-keys/keys [ro] /sys/devices/platform/gpio-keys/switches [ro] /sys/devices/platform/gpio-keys/disabled_keys [rw] /sys/devices/platform/gpio-keys/disabled_switches [rw] With these attributes, a userland program can read which keys and switches can be disabled and then disable/enable them as needed. Keys and switches are exported as a stringified bitmap of codes (keycodes or switch codes). For example, keys 15, 89, 100, 101, 102 are exported as: '15,89,100-102'.
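For instance, a small userspace sketch that disables keys 15 and 89 through the new attribute (the path comes from this commit message; error handling trimmed):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* write a stringified bitmap, e.g. "15,89", to disable those keys */
		int fd = open("/sys/devices/platform/gpio-keys/disabled_keys",
			      O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "15,89", strlen("15,89")) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}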
Description of the attributes: keys - bitmap of keys which can be disabled switches - bitmap of switches which can be disabled disabled_keys - bitmap of currently disabled keys (bit 1 means disabled, 0 enabled) disabled_switches - bitmap of currently disabled switches (bit 1 means disabled, 0 enabled) Signed-off-by: Mika Westerberg Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 318 +++++++++++++++++++++++++++++++++++-- include/linux/gpio_keys.h | 1 + 2 files changed, 308 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 1aff3b76effd..2b708aa85553 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -30,13 +30,289 @@ struct gpio_button_data { struct input_dev *input; struct timer_list timer; struct work_struct work; + bool disabled; }; struct gpio_keys_drvdata { struct input_dev *input; + struct mutex disable_lock; + unsigned int n_buttons; struct gpio_button_data data[0]; }; +/* + * SYSFS interface for enabling/disabling keys and switches: + * + * There are 4 attributes under /sys/devices/platform/gpio-keys/ + * keys [ro] - bitmap of keys (EV_KEY) which can be + * disabled + * switches [ro] - bitmap of switches (EV_SW) which can be + * disabled + * disabled_keys [rw] - bitmap of keys currently disabled + * disabled_switches [rw] - bitmap of switches currently disabled + * + * Userland can change these values and hence disable event generation + * for each key (or switch). Disabling a key means its interrupt line + * is disabled. + * + * For example, if we have the following switches set up as gpio-keys: + * SW_DOCK = 5 + * SW_CAMERA_LENS_COVER = 9 + * SW_KEYPAD_SLIDE = 10 + * SW_FRONT_PROXIMITY = 11 + * This is read from switches: + * 11-9,5 + * Next, to disable proximity (11) and dock (5), we write: + * 11,5 + * to file disabled_switches. Now proximity and dock IRQs are disabled. + * This can be verified by reading the file disabled_switches: + * 11,5 + * If we now want to enable the proximity (11) switch, we write: + * 5 + * to disabled_switches. + * + * We can disable only those keys which don't allow sharing the irq. + */ + +/** + * get_n_events_by_type() - returns maximum number of events per @type + * @type: type of button (%EV_KEY, %EV_SW) + * + * Return value of this function can be used to allocate bitmap + * large enough to hold all bits for given type. + */ +static inline int get_n_events_by_type(int type) +{ + BUG_ON(type != EV_SW && type != EV_KEY); + + return (type == EV_KEY) ? KEY_CNT : SW_CNT; +} + +/** + * gpio_keys_disable_button() - disables given GPIO button + * @bdata: button data for button to be disabled + * + * Disables the button pointed to by @bdata. This is done by masking + * the IRQ line. After this function is called, the button won't generate + * input events anymore. Note that one can only disable buttons + * that don't share IRQs. + * + * Make sure that @bdata->disable_lock is locked when entering + * this function to avoid races when concurrent threads are + * disabling buttons at the same time. + */ +static void gpio_keys_disable_button(struct gpio_button_data *bdata) +{ + if (!bdata->disabled) { + /* + * Disable IRQ and possible debouncing timer.
+ */ + disable_irq(gpio_to_irq(bdata->button->gpio)); + if (bdata->button->debounce_interval) + del_timer_sync(&bdata->timer); + + bdata->disabled = true; + } +} + +/** + * gpio_keys_enable_button() - enables given GPIO button + * @bdata: button data for button to be enabled + * + * Enables the button pointed to by @bdata. + * + * Make sure that @bdata->disable_lock is locked when entering + * this function to avoid races with concurrent threads trying + * to enable the same button at the same time. + */ +static void gpio_keys_enable_button(struct gpio_button_data *bdata) +{ + if (bdata->disabled) { + enable_irq(gpio_to_irq(bdata->button->gpio)); + bdata->disabled = false; + } +} + +/** + * gpio_keys_attr_show_helper() - fill in stringified bitmap of buttons + * @ddata: pointer to drvdata + * @buf: buffer where stringified bitmap is written + * @type: button type (%EV_KEY, %EV_SW) + * @only_disabled: does caller want only those buttons that are + * currently disabled or all buttons that can be + * disabled + * + * This function writes buttons that can be disabled to @buf. If + * @only_disabled is true, then @buf contains only those buttons + * that are currently disabled. Returns the number of characters + * written to @buf, or negative errno on failure. + */ +static ssize_t gpio_keys_attr_show_helper(struct gpio_keys_drvdata *ddata, + char *buf, unsigned int type, + bool only_disabled) +{ + int n_events = get_n_events_by_type(type); + unsigned long *bits; + ssize_t ret; + int i; + + bits = kcalloc(BITS_TO_LONGS(n_events), sizeof(*bits), GFP_KERNEL); + if (!bits) + return -ENOMEM; + + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (only_disabled && !bdata->disabled) + continue; + + __set_bit(bdata->button->code, bits); + } + + ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2, bits, n_events); + buf[ret++] = '\n'; + buf[ret] = '\0'; + + kfree(bits); + + return ret; +} + +/** + * gpio_keys_attr_store_helper() - enable/disable buttons based on given bitmap + * @ddata: pointer to drvdata + * @buf: buffer from userspace that contains stringified bitmap + * @type: button type (%EV_KEY, %EV_SW) + * + * This function parses the stringified bitmap from @buf and disables/enables + * GPIO buttons accordingly. Returns 0 on success or a negative error + * on failure.
+ */ +static ssize_t gpio_keys_attr_store_helper(struct gpio_keys_drvdata *ddata, + const char *buf, unsigned int type) +{ + int n_events = get_n_events_by_type(type); + unsigned long *bits; + ssize_t error; + int i; + + bits = kcalloc(BITS_TO_LONGS(n_events), sizeof(*bits), GFP_KERNEL); + if (!bits) + return -ENOMEM; + + error = bitmap_parselist(buf, bits, n_events); + if (error) + goto out; + + /* First validate */ + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (test_bit(bdata->button->code, bits) && + !bdata->button->can_disable) { + error = -EINVAL; + goto out; + } + } + + mutex_lock(&ddata->disable_lock); + + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (test_bit(bdata->button->code, bits)) + gpio_keys_disable_button(bdata); + else + gpio_keys_enable_button(bdata); + } + + mutex_unlock(&ddata->disable_lock); + +out: + kfree(bits); + return error; +} + +#define ATTR_SHOW_FN(name, type, only_disabled) \ +static ssize_t gpio_keys_show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct platform_device *pdev = to_platform_device(dev); \ + struct gpio_keys_drvdata *ddata = platform_get_drvdata(pdev); \ + \ + return gpio_keys_attr_show_helper(ddata, buf, \ + type, only_disabled); \ +} + +ATTR_SHOW_FN(keys, EV_KEY, false); +ATTR_SHOW_FN(switches, EV_SW, false); +ATTR_SHOW_FN(disabled_keys, EV_KEY, true); +ATTR_SHOW_FN(disabled_switches, EV_SW, true); + +/* + * ATTRIBUTES: + * + * /sys/devices/platform/gpio-keys/keys [ro] + * /sys/devices/platform/gpio-keys/switches [ro] + */ +static DEVICE_ATTR(keys, S_IRUGO, gpio_keys_show_keys, NULL); +static DEVICE_ATTR(switches, S_IRUGO, gpio_keys_show_switches, NULL); + +#define ATTR_STORE_FN(name, type) \ +static ssize_t gpio_keys_store_##name(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, \ + size_t count) \ +{ \ + struct platform_device *pdev = to_platform_device(dev); \ + struct gpio_keys_drvdata *ddata = platform_get_drvdata(pdev); \ + ssize_t error; \ + \ + error = gpio_keys_attr_store_helper(ddata, buf, type); \ + if (error) \ + return error; \ + \ + return count; \ +} + +ATTR_STORE_FN(disabled_keys, EV_KEY); +ATTR_STORE_FN(disabled_switches, EV_SW); + +/* + * ATTRIBUTES: + * + * /sys/devices/platform/gpio-keys/disabled_keys [rw] + * /sys/devices/platform/gpio-keys/disabled_switches [rw] + */ +static DEVICE_ATTR(disabled_keys, S_IWUSR | S_IRUGO, + gpio_keys_show_disabled_keys, + gpio_keys_store_disabled_keys); +static DEVICE_ATTR(disabled_switches, S_IWUSR | S_IRUGO, + gpio_keys_show_disabled_switches, + gpio_keys_store_disabled_switches); + +static struct attribute *gpio_keys_attrs[] = { + &dev_attr_keys.attr, + &dev_attr_switches.attr, + &dev_attr_disabled_keys.attr, + &dev_attr_disabled_switches.attr, + NULL, +}; + +static struct attribute_group gpio_keys_attr_group = { + .attrs = gpio_keys_attrs, +}; + static void gpio_keys_report_event(struct gpio_button_data *bdata) { struct gpio_keys_button *button = bdata->button; @@ -79,11 +355,13 @@ static irqreturn_t gpio_keys_isr(int irq, void *dev_id) return IRQ_HANDLED; } -static int __devinit gpio_keys_setup_key(struct device *dev, +static int __devinit gpio_keys_setup_key(struct platform_device *pdev, struct gpio_button_data *bdata, struct gpio_keys_button *button) { char *desc = button->desc ?
button->desc : "gpio_keys"; + struct device *dev = &pdev->dev; + unsigned long irqflags; int irq, error; setup_timer(&bdata->timer, gpio_keys_timer, (unsigned long)bdata); @@ -112,10 +390,15 @@ static int __devinit gpio_keys_setup_key(struct device *dev, goto fail3; } - error = request_irq(irq, gpio_keys_isr, - IRQF_SHARED | - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - desc, bdata); + irqflags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING; + /* + * If platform has specified that the button can be disabled, + * we don't want it to share the interrupt line. + */ + if (!button->can_disable) + irqflags |= IRQF_SHARED; + + error = request_irq(irq, gpio_keys_isr, irqflags, desc, bdata); if (error) { dev_err(dev, "Unable to claim irq %d; error %d\n", irq, error); @@ -149,6 +432,10 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) goto fail1; } + ddata->input = input; + ddata->n_buttons = pdata->nbuttons; + mutex_init(&ddata->disable_lock); + platform_set_drvdata(pdev, ddata); input->name = pdev->name; @@ -164,8 +451,6 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) if (pdata->rep) __set_bit(EV_REP, input->evbit); - ddata->input = input; - for (i = 0; i < pdata->nbuttons; i++) { struct gpio_keys_button *button = &pdata->buttons[i]; struct gpio_button_data *bdata = &ddata->data[i]; @@ -174,7 +459,7 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) bdata->input = input; bdata->button = button; - error = gpio_keys_setup_key(dev, bdata, button); + error = gpio_keys_setup_key(pdev, bdata, button); if (error) goto fail2; @@ -184,13 +469,20 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) input_set_capability(input, type, button->code); } - error = input_register_device(input); + error = sysfs_create_group(&pdev->dev.kobj, &gpio_keys_attr_group); if (error) { - dev_err(dev, "Unable to register input device, " - "error: %d\n", error); + dev_err(dev, "Unable to export keys/switches, error: %d\n", + error); goto fail2; } + error = input_register_device(input); + if (error) { + dev_err(dev, "Unable to register input device, error: %d\n", + error); + goto fail3; + } + /* get current state of buttons */ for (i = 0; i < pdata->nbuttons; i++) gpio_keys_report_event(&ddata->data[i]); @@ -200,6 +492,8 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) return 0; + fail3: + sysfs_remove_group(&pdev->dev.kobj, &gpio_keys_attr_group); fail2: while (--i >= 0) { free_irq(gpio_to_irq(pdata->buttons[i].gpio), &ddata->data[i]); @@ -224,6 +518,8 @@ static int __devexit gpio_keys_remove(struct platform_device *pdev) struct input_dev *input = ddata->input; int i; + sysfs_remove_group(&pdev->dev.kobj, &gpio_keys_attr_group); + device_init_wakeup(&pdev->dev, 0); for (i = 0; i < pdata->nbuttons; i++) { diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h index 1289fa7623ca..cd0b3f30f48e 100644 --- a/include/linux/gpio_keys.h +++ b/include/linux/gpio_keys.h @@ -10,6 +10,7 @@ struct gpio_keys_button { int type; /* input event type (EV_KEY, EV_SW) */ int wakeup; /* configure the button as a wake-up source */ int debounce_interval; /* debounce ticks interval in msecs */ + bool can_disable; }; struct gpio_keys_platform_data { -- cgit v1.2.3 From fce877e3a429940a986e085a41e8b57f2d922e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 13:25:12 +0100 Subject: bitops: Ensure the compile time HWEIGHT is only used for such Avoid accidental misuse by failing to compile things
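A sketch of what the constraint buys (the HWEIGHT macros come from the patch below; the runtime variant appears only in a comment because it now fails to build):

	#include <linux/bitops.h>

	#define EXAMPLE_MASK 0x00f3ULL

	/* fine: the argument is a compile-time constant, so HWEIGHT()
	 * folds to 6 at build time */
	static unsigned int example_weight = HWEIGHT(EXAMPLE_MASK);

	/* with a runtime value the new BUILD_BUG_ON_ZERO() check trips:
	 *
	 *	unsigned int n = HWEIGHT(jiffies);	// compile error
	 *
	 * forcing callers to use hweight_long() and friends instead */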
Suggested-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- include/linux/bitops.h | 33 ++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b91992b6b25..96cfc1a4fe9f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,13 +93,16 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define __EVENT_CONSTRAINT(c, n, m, w) {\ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ - .weight = HWEIGHT64((u64)(n)), \ + .weight = (w), \ } +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) @@ -2622,7 +2625,8 @@ void __init init_hw_perf_events(void) register_die_notifier(&perf_event_nmi_notifier); unconstrained = (struct event_constraint) - EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index ba0fd1eb4af7..25b8b2f33ae9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -45,19 +45,30 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } -#define HWEIGHT8(w) \ - ( (!!((w) & (1ULL << 0))) + \ - (!!((w) & (1ULL << 1))) + \ - (!!((w) & (1ULL << 2))) + \ - (!!((w) & (1ULL << 3))) + \ - (!!((w) & (1ULL << 4))) + \ - (!!((w) & (1ULL << 5))) + \ - (!!((w) & (1ULL << 6))) + \ +/* + * Clearly slow versions of the hweightN() functions, their benefit is + * of course compile time evaluation of constant arguments. + */ +#define HWEIGHT8(w) \ + ( BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + \ + (!!((w) & (1ULL << 0))) + \ + (!!((w) & (1ULL << 1))) + \ + (!!((w) & (1ULL << 2))) + \ + (!!((w) & (1ULL << 3))) + \ + (!!((w) & (1ULL << 4))) + \ + (!!((w) & (1ULL << 5))) + \ + (!!((w) & (1ULL << 6))) + \ (!!((w) & (1ULL << 7))) ) -#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8(w >> 8)) -#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16(w >> 16)) -#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32(w >> 32)) +#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8((w) >> 8)) +#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16((w) >> 16)) +#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32((w) >> 32)) + +/* + * Type invariant version that simply casts things to the + * largest type. + */ +#define HWEIGHT(w) HWEIGHT64((u64)(w)) /** * rol32 - rotate a 32-bit value left -- cgit v1.2.3 From 447a194b393f32699607fd99617a40abd6a95114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 1 Feb 2010 14:50:01 +0200 Subject: perf_events, x86: Fix bug in hw_perf_enable() We cannot assume that because hwc->idx == assign[i], we can avoid reprogramming the counter in hw_perf_enable(). The event may have been scheduled out and another event may have been programmed into this counter. Thus, we need a more robust way of verifying if the counter still contains config/data related to an event. This patch adds a generation number to each counter on each cpu. Using this mechanism we can reliably verify whether the content of a counter corresponds to an event.
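The idea can be sketched independently of the PMU details (a minimal model, not the patch code; the real check is match_prev_assignment() in the diff below):

	#include <stdbool.h>
	#include <stdint.h>

	struct counter { uint64_t gen; };		/* one per hw counter */
	struct event   { int cpu, idx; uint64_t gen; };

	/* programming an event bumps the counter's generation and records it */
	static void program(struct event *e, struct counter *c, int cpu, int idx)
	{
		e->cpu = cpu;
		e->idx = idx;
		e->gen = ++c->gen;
	}

	/* reprogramming may be skipped only if cpu, index and generation all
	 * still match, i.e. nobody touched the counter in between */
	static bool still_valid(const struct event *e, const struct counter *c,
				int cpu, int idx)
	{
		return e->cpu == cpu && e->idx == idx && e->gen == c->gen;
	}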
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4b66dc67.0b38560a.1635.ffffae18@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 34 ++++++++++++++++++++++++++++------ include/linux/perf_event.h | 2 ++ 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 96cfc1a4fe9f..a920f173a220 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -90,6 +90,7 @@ struct cpu_hw_events { int n_events; int n_added; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; @@ -1142,6 +1143,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1457,11 +1460,14 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, return n; } - static inline void x86_assign_hw_event(struct perf_event *event, - struct hw_perf_event *hwc, int idx) + struct cpu_hw_events *cpuc, int i) { - hwc->idx = idx; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; @@ -1480,6 +1486,15 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); void hw_perf_enable(void) @@ -1508,7 +1523,14 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) continue; __x86_pmu_disable(event, cpuc); @@ -1522,12 +1544,12 @@ void hw_perf_enable(void) hwc = &event->hw; if (hwc->idx == -1) { - x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_assign_hw_event(event, cpuc, i); x86_perf_event_set_period(event, hwc, hwc->idx); } /* * need to mark as active because x86_pmu_disable() - * clear active_mask and eventsp[] yet it preserves + * clear active_mask and events[] yet it preserves * idx */ set_bit(hwc->idx, cpuc->active_mask); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 556b0f4a668e..071a7db52549 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -478,9 +478,11 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 last_tag; unsigned long config_base; unsigned long event_base; int idx; + int last_cpu; }; struct { /* software */ s64 remaining; -- cgit v1.2.3 From 2a61aa401638529cd4231f6106980d307fba98fa Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:40 -0500 Subject: Fix misspellings of "invocation" in comments. Some comments misspell "invocation"; this fixes them. 
No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- fs/buffer.c | 2 +- fs/mpage.c | 2 +- include/linux/mmzone.h | 2 +- kernel/sched_cpupri.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 6fa530256bfd..1d920bab5e70 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped + * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/fs/mpage.c b/fs/mpage.c index 42381bd6543b..598d54e200eb 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -561,7 +561,7 @@ page_is_mapped: if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each - * and every writepage invokation because it may be mmapped. + * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c2542..e60a340fe890 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -349,7 +349,7 @@ struct zone { * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim * target at the previous try_to_free_pages() or balance_pgdat() - * invokation. + * invocation. * * We use prev_priority as a measure of how much stress page reclaim is * under - it drives the swappiness decision: whether to unmap mapped diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..3db4b1a0e921 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -58,7 +58,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current -- cgit v1.2.3 From 6683ece36e3531fc8c75f69e7165c5f20930be88 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Feb 2010 10:22:25 -0800 Subject: net: use helpers to access mc list V2 This patch introduces similar helpers to those already added for the uc list. However, multicast lists are not list_head lists but are "made manually". The three macros added by this patch will make the transition of mc_list to list_head smooth in two steps: 1) convert all drivers to use these macros (with the original iterator of type "struct dev_mc_list") 2) once all drivers are converted, convert list type and iterators to "struct netdev_hw_addr" in one patch. From now on, drivers can (and should) use "netdev_for_each_mc_addr" to iterate over the addresses with iterator of type "struct netdev_hw_addr".
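A sketch of step 1 in a driver (hypothetical function name; the iterator keeps the old struct dev_mc_list type until step 2, and the macros are defined in the diff below):

	#include <linux/netdevice.h>

	/* hypothetical driver rx-mode handler converted to the new helpers */
	static void example_set_rx_mode(struct net_device *dev)
	{
		struct dev_mc_list *mclist;

		if (netdev_mc_empty(dev))
			return;		/* nothing to program */

		netdev_for_each_mc_addr(mclist, dev) {
			/* program mclist->dmi_addr into the hardware filter */
		}
	}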
It also adds macros "netdev_mc_count" and "netdev_mc_empty" to read the list's length. This is the state which should be reached in all drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 622ba5aa93c4..e535700a3b72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -268,6 +268,12 @@ struct netdev_hw_addr_list { #define netdev_for_each_uc_addr(ha, dev) \ list_for_each_entry(ha, &dev->uc.list, list) +#define netdev_mc_count(dev) ((dev)->mc_count) +#define netdev_mc_empty(dev) (netdev_mc_count(dev) == 0) + +#define netdev_for_each_mc_addr(mclist, dev) \ + for (mclist = dev->mc_list; mclist; mclist = mclist->next) + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ atomic_t hh_refcnt; /* number of users */ -- cgit v1.2.3 From f8f76db1db369f3a130ac3fd33e2eee5f1610d9c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Feb 2010 10:23:02 -0800 Subject: libphy: add phy_find_first function Many drivers open-code this scan manually. Now they can use this function. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 16 ++++++++++++++++ include/linux/phy.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index adbc0fded130..db1794546c56 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -276,6 +276,22 @@ int phy_device_register(struct phy_device *phydev) } EXPORT_SYMBOL(phy_device_register); +/** + * phy_find_first - finds the first PHY device on the bus + * @bus: the target MII bus + */ +struct phy_device *phy_find_first(struct mii_bus *bus) +{ + int addr; + + for (addr = 0; addr < PHY_MAX_ADDR; addr++) { + if (bus->phy_map[addr]) + return bus->phy_map[addr]; + } + return NULL; +} +EXPORT_SYMBOL(phy_find_first); + /** * phy_prepare_link - prepares the PHY layer to monitor link status * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 6a7eb402165d..14d7fdf6a90a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -452,6 +452,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); struct phy_device * phy_attach(struct net_device *dev, const char *bus_id, u32 flags, phy_interface_t interface); +struct phy_device *phy_find_first(struct mii_bus *bus); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), u32 flags, phy_interface_t interface); -- cgit v1.2.3
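A sketch of the intended replacement in a driver probe path (hypothetical function names; phy_connect_direct() is used as declared in phy.h above):

	#include <linux/phy.h>

	/* hypothetical: attach to the first PHY instead of scanning by hand */
	static int example_mii_probe(struct net_device *dev, struct mii_bus *bus,
				     void (*adjust_link)(struct net_device *))
	{
		struct phy_device *phydev = phy_find_first(bus);

		if (!phydev)
			return -ENODEV;

		return phy_connect_direct(dev, phydev, adjust_link, 0,
					  PHY_INTERFACE_MODE_MII);
	}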
From 8ee2bf9ab792d0c02b13ca3acbd036debb7745d9 Mon Sep 17 00:00:00 2001 From: Sriramakrishnan Date: Thu, 19 Nov 2009 15:58:25 +0530 Subject: TI Davinci EMAC : Re-use driver for other platforms. The davinci EMAC peripheral is also available on other TI platforms - notably the TI AM3517 SoC. This patch modifies the config option and the platform structure header files so that the driver can be reused on non-davinci platforms as well. Signed-off-by: Sriramakrishnan Acked-by: Chaithrika U S Acked-by: David S. Miller Signed-off-by: Kevin Hilman --- arch/arm/mach-davinci/common.c | 2 +- arch/arm/mach-davinci/include/mach/da8xx.h | 2 +- arch/arm/mach-davinci/include/mach/dm365.h | 2 +- arch/arm/mach-davinci/include/mach/dm644x.h | 2 +- arch/arm/mach-davinci/include/mach/dm646x.h | 2 +- arch/arm/mach-davinci/include/mach/emac.h | 36 ----------------------------- drivers/net/Kconfig | 2 +- drivers/net/davinci_emac.c | 3 +-- include/linux/davinci_emac.h | 36 +++++++++++++++++++++++++++++ 9 files changed, 43 insertions(+), 44 deletions(-) delete mode 100644 arch/arm/mach-davinci/include/mach/emac.h create mode 100644 include/linux/davinci_emac.h (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/common.c b/arch/arm/mach-davinci/common.c index c2de94cde56a..94f27cbcd55a 100644 --- a/arch/arm/mach-davinci/common.c +++ b/arch/arm/mach-davinci/common.c @@ -11,13 +11,13 @@ #include #include #include +#include #include #include #include #include -#include #include "clock.h" diff --git a/arch/arm/mach-davinci/include/mach/da8xx.h b/arch/arm/mach-davinci/include/mach/da8xx.h index d43a4b6b6d76..d9a7f11894c4 100644 --- a/arch/arm/mach-davinci/include/mach/da8xx.h +++ b/arch/arm/mach-davinci/include/mach/da8xx.h @@ -13,10 +13,10 @@ #include