From 3871f2ffe53db3cef4fe0c18993ad9e6e0f69408 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 24 Dec 2008 16:06:57 -0800 Subject: sysrq: fix ftrace help msg & doc. Impact: update documentation and help messages We have a conventional method of explicitly stating the sysrq action key in a sysrq help message, so change dump-ftrace-buffer to use that method and add it to Documentation/sysrq.txt. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- Documentation/sysrq.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 10a0263ebb3f..56b53e005d18 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -114,6 +114,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'x' - Used by xmon interface on ppc/powerpc platforms. +'z' - Dump the ftrace buffer + '0'-'9' - Sets the console log level, controlling which kernel messages will be printed to your console. ('0', for example would make it so that only emergency messages like PANICs or OOPSes would -- cgit v1.2.3 From b9ce08c01020eb28bfbfa6faf1c740281c5f418e Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:03 +0300 Subject: kmemtrace: Core implementation. kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug and profile kernel code. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/kernel-parameters.txt | 10 ++ MAINTAINERS | 6 + include/linux/kmemtrace.h | 85 +++++++++ init/main.c | 2 + lib/Kconfig.debug | 28 +++ mm/Makefile | 1 + mm/kmemtrace.c | 335 ++++++++++++++++++++++++++++++++++++ 7 files changed, 467 insertions(+) create mode 100644 include/linux/kmemtrace.h create mode 100644 mm/kmemtrace.c (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201ed..542c2d8843db 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1018,6 +1019,15 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/MAINTAINERS b/MAINTAINERS index 618c1ef4a397..e2b3c8555051 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2565,6 +2565,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..2c332010cb4e --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", + type_id, call_site, (unsigned long) ptr, + bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", + type_id, call_site, (unsigned long) ptr); +} + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 7e117a231af1..be1fe2242a55 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -653,6 +654,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0f239e443bc..78d669b461d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,6 +803,34 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. +config KMEMTRACE + bool "Kernel memory tracer (kmemtrace)" + depends on RELAY && DEBUG_FS && MARKERS + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + +config KMEMTRACE_DEFAULT_ENABLED + bool "Enabled by default at boot" + depends on KMEMTRACE + help + Say Y here to enable kmemtrace at boot-time by default. Whatever + the choice, the behavior can be overridden by a kernel parameter, + as described in documentation. + menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..83ad1cc71a92 --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; +#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED +static unsigned int kmemtrace_enabled = 1; +#else +static unsigned int kmemtrace_enabled = 0; +#endif + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. + */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. + */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_enabled) + return; + + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (unlikely(!kmemtrace_chan)) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (unlikely(kmemtrace_start_probes())) + goto probe_fail; + + printk(KERN_INFO "kmemtrace: early init successful.\n"); + + return; + +probe_fail: + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); +} + -- cgit v1.2.3 From aa46a7e0228c0477708ce44a0c5621902b3c157c Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:04 +0300 Subject: kmemtrace: Additional documentation. Documented kmemtrace's ABI, purpose and design. Also includes a short usage guide, FAQ, as well as a link to the userspace application's Git repository, which is currently hosted at repo.or.cz. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/ABI/testing/debugfs-kmemtrace | 71 ++++++++++++++++ Documentation/vm/kmemtrace.txt | 126 ++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-kmemtrace create mode 100644 Documentation/vm/kmemtrace.txt (limited to 'Documentation') diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace new file mode 100644 index 000000000000..a5ff9a64c9ee --- /dev/null +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -0,0 +1,71 @@ +What: /sys/kernel/debug/kmemtrace/ +Date: July 2008 +Contact: Eduard - Gabriel Munteanu +Description: + +In kmemtrace-enabled kernels, the following files are created: + +/sys/kernel/debug/kmemtrace/ + cpu (0400) Per-CPU tracing data, see below. (binary) + total_overruns (0400) Total number of bytes which were dropped from + cpu files because of full buffer condition, + non-binary. (text) + abi_version (0400) Kernel's kmemtrace ABI version. (text) + +Each per-CPU file should be read according to the relay interface. That is, +the reader should set affinity to that specific CPU and, as currently done by +the userspace application (though there are other methods), use poll() with +an infinite timeout before every read(). Otherwise, erroneous data may be +read. The binary data has the following _core_ format: + + Event ID (1 byte) Unsigned integer, one of: + 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) + 1 - represents a freeing of previously allocated memory + (KMEMTRACE_EVENT_FREE) + Type ID (1 byte) Unsigned integer, one of: + 0 - this is a kmalloc() / kfree() + 1 - this is a kmem_cache_alloc() / kmem_cache_free() + 2 - this is a __get_free_pages() et al. + Event size (2 bytes) Unsigned integer representing the + size of this event. Used to extend + kmemtrace. Discard the bytes you + don't know about. + Sequence number (4 bytes) Signed integer used to reorder data + logged on SMP machines. Wraparound + must be taken into account, although + it is unlikely. + Caller address (8 bytes) Return address to the caller. + Pointer to mem (8 bytes) Pointer to target memory area. Can be + NULL, but not all such calls might be + recorded. + +In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: + + Requested bytes (8 bytes) Total number of requested bytes, + unsigned, must not be zero. + Allocated bytes (8 bytes) Total number of actually allocated + bytes, unsigned, must not be lower + than requested bytes. + Requested flags (4 bytes) GFP flags supplied by the caller. + Target CPU (4 bytes) Signed integer, valid for event id 1. + If equal to -1, target CPU is the same + as origin CPU, but the reverse might + not be true. + +The data is made available in the same endianness the machine has. + +Other event ids and type ids may be defined and added. Other fields may be +added by increasing event size, but see below for details. +Every modification to the ABI, including new id definitions, are followed +by bumping the ABI version by one. + +Adding new data to the packet (features) is done at the end of the mandatory +data: + Feature size (2 byte) + Feature ID (1 byte) + Feature data (Feature size - 4 bytes) + + +Users: + kmemtrace-user - git://repo.or.cz/kmemtrace-user.git + diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt new file mode 100644 index 000000000000..75360b142ba4 --- /dev/null +++ b/Documentation/vm/kmemtrace.txt @@ -0,0 +1,126 @@ + kmemtrace - Kernel Memory Tracer + + by Eduard - Gabriel Munteanu + + +I. Introduction +=============== + +kmemtrace helps kernel developers figure out two things: +1) how different allocators (SLAB, SLUB etc.) perform +2) how kernel code allocates memory and how much + +To do this, we trace every allocation and export information to the userspace +through the relay interface. We export things such as the number of requested +bytes, the number of bytes actually allocated (i.e. including internal +fragmentation), whether this is a slab allocation or a plain kmalloc() and so +on. + +The actual analysis is performed by a userspace tool (see section III for +details on where to get it from). It logs the data exported by the kernel, +processes it and (as of writing this) can provide the following information: +- the total amount of memory allocated and fragmentation per call-site +- the amount of memory allocated and fragmentation per allocation +- total memory allocated and fragmentation in the collected dataset +- number of cross-CPU allocation and frees (makes sense in NUMA environments) + +Moreover, it can potentially find inconsistent and erroneous behavior in +kernel code, such as using slab free functions on kmalloc'ed memory or +allocating less memory than requested (but not truly failed allocations). + +kmemtrace also makes provisions for tracing on some arch and analysing the +data on another. + +II. Design and goals +==================== + +kmemtrace was designed to handle rather large amounts of data. Thus, it uses +the relay interface to export whatever is logged to userspace, which then +stores it. Analysis and reporting is done asynchronously, that is, after the +data is collected and stored. By design, it allows one to log and analyse +on different machines and different arches. + +As of writing this, the ABI is not considered stable, though it might not +change much. However, no guarantees are made about compatibility yet. When +deemed stable, the ABI should still allow easy extension while maintaining +backward compatibility. This is described further in Documentation/ABI. + +Summary of design goals: + - allow logging and analysis to be done across different machines + - be fast and anticipate usage in high-load environments (*) + - be reasonably extensible + - make it possible for GNU/Linux distributions to have kmemtrace + included in their repositories + +(*) - one of the reasons Pekka Enberg's original userspace data analysis + tool's code was rewritten from Perl to C (although this is more than a + simple conversion) + + +III. Quick usage guide +====================== + +1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable +CONFIG_KMEMTRACE and CONFIG_DEFAULT_ENABLED). + +2) Get the userspace tool and build it: +$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository +$ cd kmemtrace-user/ +$ ./autogen.sh +$ ./configure +$ make + +3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the +'single' runlevel (so that relay buffers don't fill up easily), and run +kmemtrace: +# '$' does not mean user, but root here. +$ mount -t debugfs none /sys/kernel/debug +$ mount -t proc none /proc +$ cd path/to/kmemtrace-user/ +$ ./kmemtraced +Wait a bit, then stop it with CTRL+C. +$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't + # overrun, should + # be zero. +$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to + check its correctness] +$ ./kmemtrace-report + +Now you should have a nice and short summary of how the allocator performs. + +IV. FAQ and known issues +======================== + +Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix +this? Should I worry? +A: If it's non-zero, this affects kmemtrace's accuracy, depending on how +large the number is. You can fix it by supplying a higher +'kmemtrace.subbufs=N' kernel parameter. +--- + +Q: kmemtrace_check reports errors, how do I fix this? Should I worry? +A: This is a bug and should be reported. It can occur for a variety of +reasons: + - possible bugs in relay code + - possible misuse of relay by kmemtrace + - timestamps being collected unorderly +Or you may fix it yourself and send us a patch. +--- + +Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? +A: This is a known issue and I'm working on it. These might be true errors +in kernel code, which may have inconsistent behavior (e.g. allocating memory +with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed +out this behavior may work with SLAB, but may fail with other allocators. + +It may also be due to lack of tracing in some unusual allocator functions. + +We don't want bug reports regarding this issue yet. +--- + +V. See also +=========== + +Documentation/kernel-parameters.txt +Documentation/ABI/testing/debugfs-kmemtrace + -- cgit v1.2.3 From 4a80b24bb2ec66a5cb7fa5ff8335907f09288200 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:27 +0300 Subject: kmemtrace: Fix typos in documentation. Corrected the ABI description and the kmemtrace usage guide. Thanks to Randy Dunlap for noticing these errors. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/ABI/testing/debugfs-kmemtrace | 2 +- Documentation/vm/kmemtrace.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace index a5ff9a64c9ee..5e6a92a02d85 100644 --- a/Documentation/ABI/testing/debugfs-kmemtrace +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -63,7 +63,7 @@ Adding new data to the packet (features) is done at the end of the mandatory data: Feature size (2 byte) Feature ID (1 byte) - Feature data (Feature size - 4 bytes) + Feature data (Feature size - 3 bytes) Users: diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt index 75360b142ba4..f656cac3558c 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/vm/kmemtrace.txt @@ -61,7 +61,7 @@ III. Quick usage guide ====================== 1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE and CONFIG_DEFAULT_ENABLED). +CONFIG_KMEMTRACE and CONFIG_KMEMTRACE_DEFAULT_ENABLED). 2) Get the userspace tool and build it: $ git-clone git://repo.or.cz/kmemtrace-user.git # current repository -- cgit v1.2.3 From bf6803d6fd654d9a73cd90308b5225d78655d027 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 10 Oct 2008 11:02:59 +0300 Subject: kmemtrace: remove config option for enabling tracing at boot Users can pass kmemtrace.enabled=yes as a kernel parameter to enable kmemtrace at boot so remove the useless CONFIG_KMEMTRACE_DEFAULT_ENABLED config option. Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/vm/kmemtrace.txt | 2 +- lib/Kconfig.debug | 8 -------- mm/kmemtrace.c | 8 +++----- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt index f656cac3558c..a956d9b7f943 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/vm/kmemtrace.txt @@ -61,7 +61,7 @@ III. Quick usage guide ====================== 1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE and CONFIG_KMEMTRACE_DEFAULT_ENABLED). +CONFIG_KMEMTRACE). 2) Get the userspace tool and build it: $ git-clone git://repo.or.cz/kmemtrace-user.git # current repository diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 78d669b461d2..b5417e23ba94 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -823,14 +823,6 @@ config KMEMTRACE If unsure, say N. -config KMEMTRACE_DEFAULT_ENABLED - bool "Enabled by default at boot" - depends on KMEMTRACE - help - Say Y here to enable kmemtrace at boot-time by default. Whatever - the choice, the behavior can be overridden by a kernel parameter, - as described in documentation. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index f7a49c077df2..f7704f52c4b1 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -19,11 +19,9 @@ static struct rchan *kmemtrace_chan; static u32 kmemtrace_buf_overruns; static unsigned int kmemtrace_n_subbufs; -#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED -static unsigned int kmemtrace_enabled = 1; -#else -static unsigned int kmemtrace_enabled = 0; -#endif + +/* disabled by default */ +static unsigned int kmemtrace_enabled; /* * The sequence number is used for reordering kmemtrace packets -- cgit v1.2.3 From 5d2ad3316e29ad218f98d66b9c0ce6d4bcd05b77 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:09:27 +0200 Subject: doc: mmiotrace.txt, buffer size control change Impact: prevents confusing the user when buffer size is inadequate The tracing framework offers a resizeable buffer, which mmiotrace uses to record events. If the buffer is full, the following events will be lost. Events should not be lost, so the documentation instructs the user to increase the buffer size. The buffer size is set via a debugfs file. Mmiotrace documentation was not updated the same time the debugfs file was changed. The old file was tracing/trace_entries and first contained the number of entries the buffer had space for, per cpu. Nowadays this file is replaced with the file tracing/buffer_size_kb, which tells the amount of memory reserved for the buffer, per cpu, in kilobytes. Previously, a flag had to be toggled via the debugfs file tracing/tracing_enabled when the buffer size was changed. This is no longer necessary. The mmiotrace documentation is updated to reflect the current state of the tracing framework. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/tracers/mmiotrace.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index cde23b4a12a1..5731c67abc55 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -78,12 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If events were lost, the trace is incomplete. You should enlarge the buffers and try again. Buffers are enlarged by first seeing how large the current buffers are: -$ cat /debug/tracing/trace_entries +$ cat /debug/tracing/buffer_size_kb gives you a number. Approximately double this number and write it back, for instance: -$ echo 0 > /debug/tracing/tracing_enabled -$ echo 128000 > /debug/tracing/trace_entries -$ echo 1 > /debug/tracing/tracing_enabled +$ echo 128000 > /debug/tracing/buffer_size_kb Then start again from the top. If you are doing a trace for a driver project, e.g. Nouveau, you should also -- cgit v1.2.3 From e2ea5399bb4fb7aaafb08f846db453f4eec55160 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:35:58 +0100 Subject: x86, ftrace, hw-branch-tracer: documentation Document the hw-branch-tracer in the ftrace documentation. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 803b1318b13d..758fb42a1b68 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -165,6 +165,8 @@ Here is the list of current tracers that may be configured. nop - This is not a tracer. To remove all tracers from tracing simply echo "nop" into current_tracer. + hw-branch-tracer - traces branches on all cpu's in a circular buffer. + Examples of using the tracer ---------------------------- @@ -1152,6 +1154,78 @@ int main (int argc, char **argv) return 0; } + +hw-branch-tracer (x86 only) +--------------------------- + +This tracer uses the x86 last branch tracing hardware feature to +collect a branch trace on all cpus with relatively low overhead. + +The tracer uses a fixed-size circular buffer per cpu and only +traces ring 0 branches. The trace file dumps that buffer in the +following format: + +# tracer: hw-branch-tracer +# +# CPU# TO <- FROM + 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6 + 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a + 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf + 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf + 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a + 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf + + +The tracer may be used to dump the trace for the oops'ing cpu on a +kernel oops into the system log. To enable this, ftrace_dump_on_oops +must be set. To set ftrace_dump_on_oops, one can either use the sysctl +function or set it via the proc system interface. + + sysctl kernel.ftrace_dump_on_oops=1 + +or + + echo 1 > /proc/sys/kernel/ftrace_dump_on_oops + + +Here's an example of such a dump after a null pointer dereference in a +kernel module: + +[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 +[57848.106019] IP: [] open+0x6/0x14 [oops] +[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0 +[57848.106019] Oops: 0002 [#1] SMP +[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus +[57848.106019] Dumping ftrace buffer: +[57848.106019] --------------------------------- +[...] +[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24 +[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165 +[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165 +[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165 +[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165 +[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops] +[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30 +[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b +[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31 +[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1 +[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30 +[...] +[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2 +[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881 +[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881 +[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96 +[...] +[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3 +[57848.106019] --------------------------------- +[57848.106019] CPU 0 +[57848.106019] Modules linked in: oops +[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23 +[57848.106019] RIP: 0010:[] [] open+0x6/0x14 [oops] +[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246 +[...] + + dynamic ftrace -------------- -- cgit v1.2.3 From f510b233cfc7bfd57b6007071c52aa42e3d16b06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jan 2009 17:53:47 +0100 Subject: lockdep: get_user_chars() redo Generic, states independent, get_user_chars(). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/lockdep-design.txt | 30 +++++++++++++++++------------- kernel/lockdep.c | 24 ++++++++++++------------ kernel/lockdep_internals.h | 7 ++++--- kernel/lockdep_proc.c | 6 +++--- 4 files changed, 36 insertions(+), 31 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt index 488773018152..938ea22f2cc0 100644 --- a/Documentation/lockdep-design.txt +++ b/Documentation/lockdep-design.txt @@ -27,33 +27,37 @@ lock-class. State ----- -The validator tracks lock-class usage history into 5 separate state bits: +The validator tracks lock-class usage history into 4n + 1 separate state bits: -- 'ever held in hardirq context' [ == hardirq-safe ] -- 'ever held in softirq context' [ == softirq-safe ] -- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] -- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] +- 'ever held in STATE context' +- 'ever head as readlock in STATE context' +- 'ever head with STATE enabled' +- 'ever head as readlock with STATE enabled' + +Where STATE can be either one of (kernel/lockdep_states.h) + - hardirq + - softirq + - reclaim_fs - 'ever used' [ == !unused ] -When locking rules are violated, these 4 state bits are presented in the -locking error messages, inside curlies. A contrived example: +When locking rules are violated, these state bits are presented in the +locking error messages, inside curlies. A contrived example: modprobe/2287 is trying to acquire lock: - (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 but task is already holding lock: - (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 -The bit position indicates hardirq, softirq, hardirq-read, -softirq-read respectively, and the character displayed in each -indicates: +The bit position indicates STATE, STATE-read, for each of the states listed +above, and the character displayed in each indicates: '.' acquired while irqs disabled '+' acquired in irq context '-' acquired with irqs enabled - '?' read acquired in irq context with irqs enabled. + '?' acquired in irq context with irqs enabled. Unused mutexes cannot be part of the cause of an error. diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1b4ee3c0b789..22ced8d4912f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -487,25 +487,25 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) return c; } -void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, - char *c4, char *c5, char *c6) +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) { - *c1 = get_usage_char(class, LOCK_USED_IN_HARDIRQ); - *c2 = get_usage_char(class, LOCK_USED_IN_SOFTITQ); - *c3 = get_usage_char(class, LOCK_USED_IN_HARDIRQ_READ); - *c4 = get_usage_char(class, LOCK_USED_IN_SOFTITQ_READ); + int i = 0; - *c5 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS); - *c6 = get_usage_char(class, LOCK_USED_IN_RECLAIM_FS_READ); +#define LOCKDEP_STATE(__STATE) \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + usage[i] = '\0'; } static void print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN], c1, c2, c3, c4, c5, c6; + char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; const char *name; - get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); + get_usage_chars(class, usage); name = class->name; if (!name) { @@ -518,7 +518,7 @@ static void print_lock_name(struct lock_class *class) if (class->subclass) printk("/%d", class->subclass); } - printk("){%c%c%c%c%c%c}", c1, c2, c3, c4, c5, c6); + printk("){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 7e653e66ce5a..a2cc7e9a6e84 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -70,9 +70,10 @@ enum { extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -extern void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, - char *c4, char *c5, char *c6); +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, + char usage[LOCK_USAGE_CHARS]); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index bd474fd9df9d..b51064ce564a 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v) { struct lock_class *class = v; struct lock_list *entry; - char c1, c2, c3, c4, c5, c6; + char usage[LOCK_USAGE_CHARS]; if (v == SEQ_START_TOKEN) { seq_printf(m, "all lock classes:\n"); @@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v) seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); #endif - get_usage_chars(class, &c1, &c2, &c3, &c4, &c5, &c6); - seq_printf(m, " %c%c%c%c%c%c", c1, c2, c3, c4, c5, c6); + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); seq_printf(m, ": "); print_name(m, class); -- cgit v1.2.3 From 985ec20ad531f2641ab9d5193e37891fe959fc7d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 18 Feb 2009 06:35:34 +0100 Subject: tracing/function-graph-tracer: provide documentation for the function graph tracer Update documentation for the function graph tracer. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 226 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 758fb42a1b68..055bcd2992da 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -129,6 +129,10 @@ of ftrace. Here is a list of some of the key files: set_ftrace_pid: Have the function tracer only trace a single thread. + set_graph_function: Select the function where the trace have to start + with the function graph tracer (See the section + "dynamic ftrace" for more details). + available_filter_functions: This lists the functions that ftrace has processed and can trace. These are the function names that you can pass to "set_ftrace_filter" or @@ -143,6 +147,12 @@ Here is the list of current tracers that may be configured. function - function tracer that uses mcount to trace all functions. + function_graph_tracer - similar to the function tracer except that the + function tracer probes the functions on their entry whereas the + function graph tracer traces on both entry and exit of the + functions. It then provides the ability to draw a graph of + function calls like a primitive C code source. + sched_switch - traces the context switches between tasks. irqsoff - traces the areas that disable interrupts and saves @@ -1226,6 +1236,163 @@ kernel module: [...] +function graph tracer +--------------------------- + +This tracer is similar to the function tracer except that it probes +a function on its entry and its exit. +This is done by setting a dynamically allocated stack of return addresses on each +task_struct. Then the tracer overwrites the return address of each function traced +to set a custom probe. Thus the original return address is stored on the stack of return +address in the task_struct. + +Probing on both extremities of a function leads to special features such as + +_ measure of function's time execution +_ having a reliable call stack to draw function calls graph + +This tracer is useful in several situations: + +_ you want to find the reason of a strange kernel behavior and need to see + what happens in detail on any areas (or specific ones). +_ you are experiencing weird latencies but it's difficult to find its origin. +_ you want to find quickly which path is taken by a specific function +_ you just want to see what happens inside your kernel + +# tracer: function_graph +# +# CPU DURATION FUNCTION CALLS +# | | | | | | | + + 0) | sys_open() { + 0) | do_sys_open() { + 0) | getname() { + 0) | kmem_cache_alloc() { + 0) 1.382 us | __might_sleep(); + 0) 2.478 us | } + 0) | strncpy_from_user() { + 0) | might_fault() { + 0) 1.389 us | __might_sleep(); + 0) 2.553 us | } + 0) 3.807 us | } + 0) 7.876 us | } + 0) | alloc_fd() { + 0) 0.668 us | _spin_lock(); + 0) 0.570 us | expand_files(); + 0) 0.586 us | _spin_unlock(); + + +There are several columns that can be dynamically enabled/disabled. +You can use every combination of options you want, depending on your needs. + +_ The cpu number on which the function executed is default enabled. + It is sometimes better to only trace one cpu (see tracing_cpu_mask file) + or you might sometimes see unordered function calls while cpu tracing switch. + + hide: echo nofuncgraph-cpu > /debug/tracing/trace_options + show: echo funcgraph-cpu > /debug/tracing/trace_options + +_ The duration (function's time of execution) is displayed on the closing bracket + line of a function or on the same line than the current function in case of a leaf + one. It is default enabled. + + hide: echo nofuncgraph-duration > /debug/tracing/trace_options + show: echo funcgraph-duration > /debug/tracing/trace_options + +_ The overhead field precedes the duration one in case of reached duration thresholds. + + hide: echo nofuncgraph-overhead > /debug/tracing/trace_options + show: echo funcgraph-overhead > /debug/tracing/trace_options + depends on: funcgraph-duration + + ie: + + 0) | up_write() { + 0) 0.646 us | _spin_lock_irqsave(); + 0) 0.684 us | _spin_unlock_irqrestore(); + 0) 3.123 us | } + 0) 0.548 us | fput(); + 0) + 58.628 us | } + + [...] + + 0) | putname() { + 0) | kmem_cache_free() { + 0) 0.518 us | __phys_addr(); + 0) 1.757 us | } + 0) 2.861 us | } + 0) ! 115.305 us | } + 0) ! 116.402 us | } + + + means that the function exceeded 10 usecs. + ! means that the function exceeded 100 usecs. + + +_ The task/pid field displays the thread cmdline and pid which executed the function. + It is default disabled. + + hide: echo nofuncgraph-proc > /debug/tracing/trace_options + show: echo funcgraph-proc > /debug/tracing/trace_options + + ie: + + # tracer: function_graph + # + # CPU TASK/PID DURATION FUNCTION CALLS + # | | | | | | | | | + 0) sh-4802 | | d_free() { + 0) sh-4802 | | call_rcu() { + 0) sh-4802 | | __call_rcu() { + 0) sh-4802 | 0.616 us | rcu_process_gp_end(); + 0) sh-4802 | 0.586 us | check_for_new_grace_period(); + 0) sh-4802 | 2.899 us | } + 0) sh-4802 | 4.040 us | } + 0) sh-4802 | 5.151 us | } + 0) sh-4802 | + 49.370 us | } + + +_ The absolute time field is an absolute timestamp given by the clock since + it started. A snapshot of this time is given on each entry/exit of functions + + hide: echo nofuncgraph-abstime > /debug/tracing/trace_options + show: echo funcgraph-abstime > /debug/tracing/trace_options + + ie: + + # + # TIME CPU DURATION FUNCTION CALLS + # | | | | | | | | + 360.774522 | 1) 0.541 us | } + 360.774522 | 1) 4.663 us | } + 360.774523 | 1) 0.541 us | __wake_up_bit(); + 360.774524 | 1) 6.796 us | } + 360.774524 | 1) 7.952 us | } + 360.774525 | 1) 9.063 us | } + 360.774525 | 1) 0.615 us | journal_mark_dirty(); + 360.774527 | 1) 0.578 us | __brelse(); + 360.774528 | 1) | reiserfs_prepare_for_journal() { + 360.774528 | 1) | unlock_buffer() { + 360.774529 | 1) | wake_up_bit() { + 360.774529 | 1) | bit_waitqueue() { + 360.774530 | 1) 0.594 us | __phys_addr(); + + +You can put some comments on specific functions by using ftrace_printk() +For example, if you want to put a comment inside the __might_sleep() function, +you just have to include and call ftrace_printk() inside __might_sleep() + +ftrace_printk("I'm a comment!\n") + +will produce: + + 1) | __might_sleep() { + 1) | /* I'm a comment! */ + 1) 1.449 us | } + + +You might find other useful features for this tracer on the "dynamic ftrace" +section such as tracing only specific functions or tasks. + dynamic ftrace -------------- @@ -1427,6 +1594,65 @@ Produces: We can see that there's no more lock or preempt tracing. + +* Dynamic ftrace with the function graph tracer * + + +Although what has been explained above concerns both the function tracer and +the function_graph_tracer, the following concerns only the latter. + +If you want to trace only one function and all of its childs, you just have +to echo its name on set_graph_function: + +echo __do_fault > set_graph_function + +will produce the following: + + 0) | __do_fault() { + 0) | filemap_fault() { + 0) | find_lock_page() { + 0) 0.804 us | find_get_page(); + 0) | __might_sleep() { + 0) 1.329 us | } + 0) 3.904 us | } + 0) 4.979 us | } + 0) 0.653 us | _spin_lock(); + 0) 0.578 us | page_add_file_rmap(); + 0) 0.525 us | native_set_pte_at(); + 0) 0.585 us | _spin_unlock(); + 0) | unlock_page() { + 0) 0.541 us | page_waitqueue(); + 0) 0.639 us | __wake_up_bit(); + 0) 2.786 us | } + 0) + 14.237 us | } + 0) | __do_fault() { + 0) | filemap_fault() { + 0) | find_lock_page() { + 0) 0.698 us | find_get_page(); + 0) | __might_sleep() { + 0) 1.412 us | } + 0) 3.950 us | } + 0) 5.098 us | } + 0) 0.631 us | _spin_lock(); + 0) 0.571 us | page_add_file_rmap(); + 0) 0.526 us | native_set_pte_at(); + 0) 0.586 us | _spin_unlock(); + 0) | unlock_page() { + 0) 0.533 us | page_waitqueue(); + 0) 0.638 us | __wake_up_bit(); + 0) 2.793 us | } + 0) + 14.012 us | } + +You can also select several functions: + +echo sys_open > set_graph_function +echo sys_close >> set_graph_function + +Now if you want to go back to trace all functions + +echo > set_graph_function + + trace_pipe ---------- -- cgit v1.2.3 From 5752674e140db5bce08c6bc60021a9bc3b960800 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Feb 2009 12:54:10 +0100 Subject: Documentation/ftrace.txt: update - fix typos/grammos and clarify the text - prettify the document some more Cc: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 956 ++++++++++++++++++++++++++--------------------- 1 file changed, 530 insertions(+), 426 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 055bcd2992da..2041ee951c1a 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -15,31 +15,31 @@ Introduction Ftrace is an internal tracer designed to help out developers and designers of systems to find what is going on inside the kernel. -It can be used for debugging or analyzing latencies and performance -issues that take place outside of user-space. +It can be used for debugging or analyzing latencies and +performance issues that take place outside of user-space. Although ftrace is the function tracer, it also includes an -infrastructure that allows for other types of tracing. Some of the -tracers that are currently in ftrace include a tracer to trace -context switches, the time it takes for a high priority task to -run after it was woken up, the time interrupts are disabled, and -more (ftrace allows for tracer plugins, which means that the list of -tracers can always grow). +infrastructure that allows for other types of tracing. Some of +the tracers that are currently in ftrace include a tracer to +trace context switches, the time it takes for a high priority +task to run after it was woken up, the time interrupts are +disabled, and more (ftrace allows for tracer plugins, which +means that the list of tracers can always grow). The File System --------------- -Ftrace uses the debugfs file system to hold the control files as well -as the files to display output. +Ftrace uses the debugfs file system to hold the control files as +well as the files to display output. To mount the debugfs system: # mkdir /debug # mount -t debugfs nodev /debug -(Note: it is more common to mount at /sys/kernel/debug, but for simplicity - this document will use /debug) +( Note: it is more common to mount at /sys/kernel/debug, but for + simplicity this document will use /debug) That's it! (assuming that you have ftrace configured into your kernel) @@ -50,94 +50,124 @@ of ftrace. Here is a list of some of the key files: Note: all time values are in microseconds. - current_tracer: This is used to set or display the current tracer - that is configured. - - available_tracers: This holds the different types of tracers that - have been compiled into the kernel. The tracers - listed here can be configured by echoing their name - into current_tracer. - - tracing_enabled: This sets or displays whether the current_tracer - is activated and tracing or not. Echo 0 into this - file to disable the tracer or 1 to enable it. - - trace: This file holds the output of the trace in a human readable - format (described below). - - latency_trace: This file shows the same trace but the information - is organized more to display possible latencies - in the system (described below). - - trace_pipe: The output is the same as the "trace" file but this - file is meant to be streamed with live tracing. - Reads from this file will block until new data - is retrieved. Unlike the "trace" and "latency_trace" - files, this file is a consumer. This means reading - from this file causes sequential reads to display - more current data. Once data is read from this - file, it is consumed, and will not be read - again with a sequential read. The "trace" and - "latency_trace" files are static, and if the - tracer is not adding more data, they will display - the same information every time they are read. - - trace_options: This file lets the user control the amount of data - that is displayed in one of the above output - files. - - trace_max_latency: Some of the tracers record the max latency. - For example, the time interrupts are disabled. - This time is saved in this file. The max trace - will also be stored, and displayed by either - "trace" or "latency_trace". A new max trace will - only be recorded if the latency is greater than - the value in this file. (in microseconds) - - buffer_size_kb: This sets or displays the number of kilobytes each CPU - buffer can hold. The tracer buffers are the same size - for each CPU. The displayed number is the size of the - CPU buffer and not total size of all buffers. The - trace buffers are allocated in pages (blocks of memory - that the kernel uses for allocation, usually 4 KB in size). - If the last page allocated has room for more bytes - than requested, the rest of the page will be used, - making the actual allocation bigger than requested. - (Note, the size may not be a multiple of the page size due - to buffer managment overhead.) - - This can only be updated when the current_tracer - is set to "nop". - - tracing_cpumask: This is a mask that lets the user only trace - on specified CPUS. The format is a hex string - representing the CPUS. - - set_ftrace_filter: When dynamic ftrace is configured in (see the - section below "dynamic ftrace"), the code is dynamically - modified (code text rewrite) to disable calling of the - function profiler (mcount). This lets tracing be configured - in with practically no overhead in performance. This also - has a side effect of enabling or disabling specific functions - to be traced. Echoing names of functions into this file - will limit the trace to only those functions. - - set_ftrace_notrace: This has an effect opposite to that of - set_ftrace_filter. Any function that is added here will not - be traced. If a function exists in both set_ftrace_filter - and set_ftrace_notrace, the function will _not_ be traced. - - set_ftrace_pid: Have the function tracer only trace a single thread. - - set_graph_function: Select the function where the trace have to start - with the function graph tracer (See the section - "dynamic ftrace" for more details). - - available_filter_functions: This lists the functions that ftrace - has processed and can trace. These are the function - names that you can pass to "set_ftrace_filter" or - "set_ftrace_notrace". (See the section "dynamic ftrace" - below for more details.) + current_tracer: + + This is used to set or display the current tracer + that is configured. + + available_tracers: + + This holds the different types of tracers that + have been compiled into the kernel. The + tracers listed here can be configured by + echoing their name into current_tracer. + + tracing_enabled: + + This sets or displays whether the current_tracer + is activated and tracing or not. Echo 0 into this + file to disable the tracer or 1 to enable it. + + trace: + + This file holds the output of the trace in a human + readable format (described below). + + latency_trace: + + This file shows the same trace but the information + is organized more to display possible latencies + in the system (described below). + + trace_pipe: + + The output is the same as the "trace" file but this + file is meant to be streamed with live tracing. + Reads from this file will block until new data + is retrieved. Unlike the "trace" and "latency_trace" + files, this file is a consumer. This means reading + from this file causes sequential reads to display + more current data. Once data is read from this + file, it is consumed, and will not be read + again with a sequential read. The "trace" and + "latency_trace" files are static, and if the + tracer is not adding more data, they will display + the same information every time they are read. + + trace_options: + + This file lets the user control the amount of data + that is displayed in one of the above output + files. + + trace_max_latency: + + Some of the tracers record the max latency. + For example, the time interrupts are disabled. + This time is saved in this file. The max trace + will also be stored, and displayed by either + "trace" or "latency_trace". A new max trace will + only be recorded if the latency is greater than + the value in this file. (in microseconds) + + buffer_size_kb: + + This sets or displays the number of kilobytes each CPU + buffer can hold. The tracer buffers are the same size + for each CPU. The displayed number is the size of the + CPU buffer and not total size of all buffers. The + trace buffers are allocated in pages (blocks of memory + that the kernel uses for allocation, usually 4 KB in size). + If the last page allocated has room for more bytes + than requested, the rest of the page will be used, + making the actual allocation bigger than requested. + ( Note, the size may not be a multiple of the page size + due to buffer managment overhead. ) + + This can only be updated when the current_tracer + is set to "nop". + + tracing_cpumask: + + This is a mask that lets the user only trace + on specified CPUS. The format is a hex string + representing the CPUS. + + set_ftrace_filter: + + When dynamic ftrace is configured in (see the + section below "dynamic ftrace"), the code is dynamically + modified (code text rewrite) to disable calling of the + function profiler (mcount). This lets tracing be configured + in with practically no overhead in performance. This also + has a side effect of enabling or disabling specific functions + to be traced. Echoing names of functions into this file + will limit the trace to only those functions. + + set_ftrace_notrace: + + This has an effect opposite to that of + set_ftrace_filter. Any function that is added here will not + be traced. If a function exists in both set_ftrace_filter + and set_ftrace_notrace, the function will _not_ be traced. + + set_ftrace_pid: + + Have the function tracer only trace a single thread. + + set_graph_function: + + Set a "trigger" function where tracing should start + with the function graph tracer (See the section + "dynamic ftrace" for more details). + + available_filter_functions: + + This lists the functions that ftrace + has processed and can trace. These are the function + names that you can pass to "set_ftrace_filter" or + "set_ftrace_notrace". (See the section "dynamic ftrace" + below for more details.) The Tracers @@ -145,44 +175,66 @@ The Tracers Here is the list of current tracers that may be configured. - function - function tracer that uses mcount to trace all functions. + "function" + + Function call tracer to trace all kernel functions. + + "function_graph_tracer" + + Similar to the function tracer except that the + function tracer probes the functions on their entry + whereas the function graph tracer traces on both entry + and exit of the functions. It then provides the ability + to draw a graph of function calls similar to C code + source. + + "sched_switch" + + Traces the context switches and wakeups between tasks. + + "irqsoff" + + Traces the areas that disable interrupts and saves + the trace with the longest max latency. + See tracing_max_latency. When a new max is recorded, + it replaces the old trace. It is best to view this + trace via the latency_trace file. - function_graph_tracer - similar to the function tracer except that the - function tracer probes the functions on their entry whereas the - function graph tracer traces on both entry and exit of the - functions. It then provides the ability to draw a graph of - function calls like a primitive C code source. + "preemptoff" - sched_switch - traces the context switches between tasks. + Similar to irqsoff but traces and records the amount of + time for which preemption is disabled. - irqsoff - traces the areas that disable interrupts and saves - the trace with the longest max latency. - See tracing_max_latency. When a new max is recorded, - it replaces the old trace. It is best to view this - trace via the latency_trace file. + "preemptirqsoff" - preemptoff - Similar to irqsoff but traces and records the amount of - time for which preemption is disabled. + Similar to irqsoff and preemptoff, but traces and + records the largest time for which irqs and/or preemption + is disabled. - preemptirqsoff - Similar to irqsoff and preemptoff, but traces and - records the largest time for which irqs and/or preemption - is disabled. + "wakeup" - wakeup - Traces and records the max latency that it takes for - the highest priority task to get scheduled after - it has been woken up. + Traces and records the max latency that it takes for + the highest priority task to get scheduled after + it has been woken up. - nop - This is not a tracer. To remove all tracers from tracing - simply echo "nop" into current_tracer. + "hw-branch-tracer" - hw-branch-tracer - traces branches on all cpu's in a circular buffer. + Uses the BTS CPU feature on x86 CPUs to traces all + branches executed. + + "nop" + + This is the "trace nothing" tracer. To remove all + tracers from tracing simply echo "nop" into + current_tracer. Examples of using the tracer ---------------------------- -Here are typical examples of using the tracers when controlling them only -with the debugfs interface (without using any user-land utilities). +Here are typical examples of using the tracers when controlling +them only with the debugfs interface (without using any +user-land utilities). Output format: -------------- @@ -199,16 +251,16 @@ Here is an example of the output format of the file "trace" bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput -------- -A header is printed with the tracer name that is represented by the trace. -In this case the tracer is "function". Then a header showing the format. Task -name "bash", the task PID "4251", the CPU that it was running on -"01", the timestamp in . format, the function name that was -traced "path_put" and the parent function that called this function -"path_walk". The timestamp is the time at which the function was -entered. +A header is printed with the tracer name that is represented by +the trace. In this case the tracer is "function". Then a header +showing the format. Task name "bash", the task PID "4251", the +CPU that it was running on "01", the timestamp in . +format, the function name that was traced "path_put" and the +parent function that called this function "path_walk". The +timestamp is the time at which the function was entered. -The sched_switch tracer also includes tracing of task wakeups and -context switches. +The sched_switch tracer also includes tracing of task wakeups +and context switches. ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S @@ -217,8 +269,8 @@ context switches. kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R -Wake ups are represented by a "+" and the context switches are shown as -"==>". The format is: +Wake ups are represented by a "+" and the context switches are +shown as "==>". The format is: Context switches: @@ -232,19 +284,20 @@ Wake ups are represented by a "+" and the context switches are shown as :: + :: -The prio is the internal kernel priority, which is the inverse of the -priority that is usually displayed by user-space tools. Zero represents -the highest priority (99). Prio 100 starts the "nice" priorities with -100 being equal to nice -20 and 139 being nice 19. The prio "140" is -reserved for the idle task which is the lowest priority thread (pid 0). +The prio is the internal kernel priority, which is the inverse +of the priority that is usually displayed by user-space tools. +Zero represents the highest priority (99). Prio 100 starts the +"nice" priorities with 100 being equal to nice -20 and 139 being +nice 19. The prio "140" is reserved for the idle task which is +the lowest priority thread (pid 0). Latency trace format -------------------- -For traces that display latency times, the latency_trace file gives -somewhat more information to see why a latency happened. Here is a typical -trace. +For traces that display latency times, the latency_trace file +gives somewhat more information to see why a latency happened. +Here is a typical trace. # tracer: irqsoff # @@ -271,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8 -0 0d.s1 98us : trace_hardirqs_on (do_softirq) +This shows that the current tracer is "irqsoff" tracing the time +for which interrupts were disabled. It gives the trace version +and the version of the kernel upon which this was executed on +(2.6.26-rc8). Then it displays the max latency in microsecs (97 +us). The number of trace entries displayed and the total number +recorded (both are three: #3/3). The type of preemption that was +used (PREEMPT). VP, KP, SP, and HP are always zero and are +reserved for later use. #P is the number of online CPUS (#P:2). -This shows that the current tracer is "irqsoff" tracing the time for which -interrupts were disabled. It gives the trace version and the version -of the kernel upon which this was executed on (2.6.26-rc8). Then it displays -the max latency in microsecs (97 us). The number of trace entries displayed -and the total number recorded (both are three: #3/3). The type of -preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero -and are reserved for later use. #P is the number of online CPUS (#P:2). - -The task is the process that was running when the latency occurred. -(swapper pid: 0). +The task is the process that was running when the latency +occurred. (swapper pid: 0). -The start and stop (the functions in which the interrupts were disabled and -enabled respectively) that caused the latencies: +The start and stop (the functions in which the interrupts were +disabled and enabled respectively) that caused the latencies: apic_timer_interrupt is where the interrupts were disabled. do_softirq is where they were enabled again. @@ -320,12 +373,12 @@ The above is mostly meaningful for kernel developers. latency_trace file is relative to the start of the trace. delay: This is just to help catch your eye a bit better. And - needs to be fixed to be only relative to the same CPU. - The marks are determined by the difference between this - current trace and the next trace. - '!' - greater than preempt_mark_thresh (default 100) - '+' - greater than 1 microsecond - ' ' - less than or equal to 1 microsecond. + needs to be fixed to be only relative to the same CPU. + The marks are determined by the difference between this + current trace and the next trace. + '!' - greater than preempt_mark_thresh (default 100) + '+' - greater than 1 microsecond + ' ' - less than or equal to 1 microsecond. The rest is the same as the 'trace' file. @@ -333,14 +386,15 @@ The above is mostly meaningful for kernel developers. trace_options ------------- -The trace_options file is used to control what gets printed in the trace -output. To see what is available, simply cat the file: +The trace_options file is used to control what gets printed in +the trace output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj -To disable one of the options, echo in the option prepended with "no". +To disable one of the options, echo in the option prepended with +"no". echo noprint-parent > /debug/tracing/trace_options @@ -350,8 +404,8 @@ To enable an option, leave off the "no". Here are the available options: - print-parent - On function traces, display the calling function - as well as the function being traced. + print-parent - On function traces, display the calling (parent) + function as well as the function being traced. print-parent: bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul @@ -360,15 +414,16 @@ Here are the available options: bash-4000 [01] 1477.606694: simple_strtoul - sym-offset - Display not only the function name, but also the offset - in the function. For example, instead of seeing just - "ktime_get", you will see "ktime_get+0xb/0x20". + sym-offset - Display not only the function name, but also the + offset in the function. For example, instead of + seeing just "ktime_get", you will see + "ktime_get+0xb/0x20". sym-offset: bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 - sym-addr - this will also display the function address as well as - the function name. + sym-addr - this will also display the function address as well + as the function name. sym-addr: bash-4000 [01] 1477.606694: simple_strtoul @@ -378,35 +433,41 @@ Here are the available options: bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ (+0.000ms): simple_strtoul (strict_strtoul) - raw - This will display raw numbers. This option is best for use with - user applications that can translate the raw numbers better than - having it done in the kernel. + raw - This will display raw numbers. This option is best for + use with user applications that can translate the raw + numbers better than having it done in the kernel. - hex - Similar to raw, but the numbers will be in a hexadecimal format. + hex - Similar to raw, but the numbers will be in a hexadecimal + format. bin - This will print out the formats in raw binary. block - TBD (needs update) - stacktrace - This is one of the options that changes the trace itself. - When a trace is recorded, so is the stack of functions. - This allows for back traces of trace sites. + stacktrace - This is one of the options that changes the trace + itself. When a trace is recorded, so is the stack + of functions. This allows for back traces of + trace sites. - userstacktrace - This option changes the trace. - It records a stacktrace of the current userspace thread. + userstacktrace - This option changes the trace. It records a + stacktrace of the current userspace thread. - sym-userobj - when user stacktrace are enabled, look up which object the - address belongs to, and print a relative address - This is especially useful when ASLR is on, otherwise you don't - get a chance to resolve the address to object/file/line after the app is no - longer running + sym-userobj - when user stacktrace are enabled, look up which + object the address belongs to, and print a + relative address. This is especially useful when + ASLR is on, otherwise you don't get a chance to + resolve the address to object/file/line after + the app is no longer running - The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + The lookup is performed when you read + trace,trace_pipe,latency_trace. Example: a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] - sched-tree - TBD (any users??) + sched-tree - trace all tasks that are on the runqueue, at + every scheduling event. Will add overhead if + there's a lot of tasks running at once. sched_switch @@ -443,18 +504,19 @@ of how to use it. [...] -As we have discussed previously about this format, the header shows -the name of the trace and points to the options. The "FUNCTION" -is a misnomer since here it represents the wake ups and context -switches. +As we have discussed previously about this format, the header +shows the name of the trace and points to the options. The +"FUNCTION" is a misnomer since here it represents the wake ups +and context switches. -The sched_switch file only lists the wake ups (represented with '+') -and context switches ('==>') with the previous task or current task -first followed by the next task or task waking up. The format for both -of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO -is the inverse of the actual priority with zero (0) being the highest -priority and the nice values starting at 100 (nice -20). Below is -a quick chart to map the kernel priority to user land priorities. +The sched_switch file only lists the wake ups (represented with +'+') and context switches ('==>') with the previous task or +current task first followed by the next task or task waking up. +The format for both of these is PID:KERNEL-PRIO:TASK-STATE. +Remember that the KERNEL-PRIO is the inverse of the actual +priority with zero (0) being the highest priority and the nice +values starting at 100 (nice -20). Below is a quick chart to map +the kernel priority to user land priorities. Kernel priority: 0 to 99 ==> user RT priority 99 to 0 Kernel priority: 100 to 139 ==> user nice -20 to 19 @@ -475,10 +537,10 @@ The task states are: ftrace_enabled -------------- -The following tracers (listed below) give different output depending -on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, -one can either use the sysctl function or set it via the proc -file system interface. +The following tracers (listed below) give different output +depending on whether or not the sysctl ftrace_enabled is set. To +set ftrace_enabled, one can either use the sysctl function or +set it via the proc file system interface. sysctl kernel.ftrace_enabled=1 @@ -486,12 +548,12 @@ file system interface. echo 1 > /proc/sys/kernel/ftrace_enabled -To disable ftrace_enabled simply replace the '1' with '0' in -the above commands. +To disable ftrace_enabled simply replace the '1' with '0' in the +above commands. -When ftrace_enabled is set the tracers will also record the functions -that are within the trace. The descriptions of the tracers -will also show an example with ftrace enabled. +When ftrace_enabled is set the tracers will also record the +functions that are within the trace. The descriptions of the +tracers will also show an example with ftrace enabled. irqsoff @@ -499,17 +561,18 @@ irqsoff When interrupts are disabled, the CPU can not react to any other external event (besides NMIs and SMIs). This prevents the timer -interrupt from triggering or the mouse interrupt from letting the -kernel know of a new mouse event. The result is a latency with the -reaction time. +interrupt from triggering or the mouse interrupt from letting +the kernel know of a new mouse event. The result is a latency +with the reaction time. -The irqsoff tracer tracks the time for which interrupts are disabled. -When a new maximum latency is hit, the tracer saves the trace leading up -to that latency point so that every time a new maximum is reached, the old -saved trace is discarded and the new trace is saved. +The irqsoff tracer tracks the time for which interrupts are +disabled. When a new maximum latency is hit, the tracer saves +the trace leading up to that latency point so that every time a +new maximum is reached, the old saved trace is discarded and the +new trace is saved. -To reset the maximum, echo 0 into tracing_max_latency. Here is an -example: +To reset the maximum, echo 0 into tracing_max_latency. Here is +an example: # echo irqsoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -544,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26 Here we see that that we had a latency of 12 microsecs (which is -very good). The _write_lock_irq in sys_setpgid disabled interrupts. -The difference between the 12 and the displayed timestamp 14us occurred -because the clock was incremented between the time of recording the max -latency and the time of recording the function that had that latency. +very good). The _write_lock_irq in sys_setpgid disabled +interrupts. The difference between the 12 and the displayed +timestamp 14us occurred because the clock was incremented +between the time of recording the max latency and the time of +recording the function that had that latency. Note the above example had ftrace_enabled not set. If we set the ftrace_enabled, we get a much larger output: @@ -598,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8 Here we traced a 50 microsecond latency. But we also see all the -functions that were called during that time. Note that by enabling -function tracing, we incur an added overhead. This overhead may -extend the latency times. But nevertheless, this trace has provided -some very helpful debugging information. +functions that were called during that time. Note that by +enabling function tracing, we incur an added overhead. This +overhead may extend the latency times. But nevertheless, this +trace has provided some very helpful debugging information. preemptoff ---------- -When preemption is disabled, we may be able to receive interrupts but -the task cannot be preempted and a higher priority task must wait -for preemption to be enabled again before it can preempt a lower -priority task. +When preemption is disabled, we may be able to receive +interrupts but the task cannot be preempted and a higher +priority task must wait for preemption to be enabled again +before it can preempt a lower priority task. The preemptoff tracer traces the places that disable preemption. -Like the irqsoff tracer, it records the maximum latency for which preemption -was disabled. The control of preemptoff tracer is much like the irqsoff -tracer. +Like the irqsoff tracer, it records the maximum latency for +which preemption was disabled. The control of preemptoff tracer +is much like the irqsoff tracer. # echo preemptoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -649,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) -This has some more changes. Preemption was disabled when an interrupt -came in (notice the 'h'), and was enabled while doing a softirq. -(notice the 's'). But we also see that interrupts have been disabled -when entering the preempt off section and leaving it (the 'd'). -We do not know if interrupts were enabled in the mean time. +This has some more changes. Preemption was disabled when an +interrupt came in (notice the 'h'), and was enabled while doing +a softirq. (notice the 's'). But we also see that interrupts +have been disabled when entering the preempt off section and +leaving it (the 'd'). We do not know if interrupts were enabled +in the mean time. # tracer: preemptoff # @@ -712,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) -The above is an example of the preemptoff trace with ftrace_enabled -set. Here we see that interrupts were disabled the entire time. -The irq_enter code lets us know that we entered an interrupt 'h'. -Before that, the functions being traced still show that it is not -in an interrupt, but we can see from the functions themselves that -this is not the case. +The above is an example of the preemptoff trace with +ftrace_enabled set. Here we see that interrupts were disabled +the entire time. The irq_enter code lets us know that we entered +an interrupt 'h'. Before that, the functions being traced still +show that it is not in an interrupt, but we can see from the +functions themselves that this is not the case. -Notice that __do_softirq when called does not have a preempt_count. -It may seem that we missed a preempt enabling. What really happened -is that the preempt count is held on the thread's stack and we -switched to the softirq stack (4K stacks in effect). The code -does not copy the preempt count, but because interrupts are disabled, -we do not need to worry about it. Having a tracer like this is good -for letting people know what really happens inside the kernel. +Notice that __do_softirq when called does not have a +preempt_count. It may seem that we missed a preempt enabling. +What really happened is that the preempt count is held on the +thread's stack and we switched to the softirq stack (4K stacks +in effect). The code does not copy the preempt count, but +because interrupts are disabled, we do not need to worry about +it. Having a tracer like this is good for letting people know +what really happens inside the kernel. preemptirqsoff -------------- -Knowing the locations that have interrupts disabled or preemption -disabled for the longest times is helpful. But sometimes we would -like to know when either preemption and/or interrupts are disabled. +Knowing the locations that have interrupts disabled or +preemption disabled for the longest times is helpful. But +sometimes we would like to know when either preemption and/or +interrupts are disabled. Consider the following code: @@ -753,11 +820,13 @@ The preemptoff tracer will record the total length of call_function_with_irqs_and_preemption_off() and call_function_with_preemption_off(). -But neither will trace the time that interrupts and/or preemption -is disabled. This total time is the time that we can not schedule. -To record this time, use the preemptirqsoff tracer. +But neither will trace the time that interrupts and/or +preemption is disabled. This total time is the time that we can +not schedule. To record this time, use the preemptirqsoff +tracer. -Again, using this trace is much like the irqsoff and preemptoff tracers. +Again, using this trace is much like the irqsoff and preemptoff +tracers. # echo preemptirqsoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -793,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 The trace_hardirqs_off_thunk is called from assembly on x86 when -interrupts are disabled in the assembly code. Without the function -tracing, we do not know if interrupts were enabled within the preemption -points. We do see that it started with preemption enabled. +interrupts are disabled in the assembly code. Without the +function tracing, we do not know if interrupts were enabled +within the preemption points. We do see that it started with +preemption enabled. Here is a trace with ftrace_enabled set: @@ -883,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) -This is a very interesting trace. It started with the preemption of -the ls task. We see that the task had the "need_resched" bit set -via the 'N' in the trace. Interrupts were disabled before the spin_lock -at the beginning of the trace. We see that a schedule took place to run -sshd. When the interrupts were enabled, we took an interrupt. -On return from the interrupt handler, the softirq ran. We took another -interrupt while running the softirq as we see from the capital 'H'. +This is a very interesting trace. It started with the preemption +of the ls task. We see that the task had the "need_resched" bit +set via the 'N' in the trace. Interrupts were disabled before +the spin_lock at the beginning of the trace. We see that a +schedule took place to run sshd. When the interrupts were +enabled, we took an interrupt. On return from the interrupt +handler, the softirq ran. We took another interrupt while +running the softirq as we see from the capital 'H'. wakeup ------ -In a Real-Time environment it is very important to know the wakeup -time it takes for the highest priority task that is woken up to the -time that it executes. This is also known as "schedule latency". -I stress the point that this is about RT tasks. It is also important -to know the scheduling latency of non-RT tasks, but the average -schedule latency is better for non-RT tasks. Tools like -LatencyTop are more appropriate for such measurements. +In a Real-Time environment it is very important to know the +wakeup time it takes for the highest priority task that is woken +up to the time that it executes. This is also known as "schedule +latency". I stress the point that this is about RT tasks. It is +also important to know the scheduling latency of non-RT tasks, +but the average schedule latency is better for non-RT tasks. +Tools like LatencyTop are more appropriate for such +measurements. Real-Time environments are interested in the worst case latency. -That is the longest latency it takes for something to happen, and -not the average. We can have a very fast scheduler that may only -have a large latency once in a while, but that would not work well -with Real-Time tasks. The wakeup tracer was designed to record -the worst case wakeups of RT tasks. Non-RT tasks are not recorded -because the tracer only records one worst case and tracing non-RT -tasks that are unpredictable will overwrite the worst case latency -of RT tasks. - -Since this tracer only deals with RT tasks, we will run this slightly -differently than we did with the previous tracers. Instead of performing -an 'ls', we will run 'sleep 1' under 'chrt' which changes the -priority of the task. +That is the longest latency it takes for something to happen, +and not the average. We can have a very fast scheduler that may +only have a large latency once in a while, but that would not +work well with Real-Time tasks. The wakeup tracer was designed +to record the worst case wakeups of RT tasks. Non-RT tasks are +not recorded because the tracer only records one worst case and +tracing non-RT tasks that are unpredictable will overwrite the +worst case latency of RT tasks. + +Since this tracer only deals with RT tasks, we will run this +slightly differently than we did with the previous tracers. +Instead of performing an 'ls', we will run 'sleep 1' under +'chrt' which changes the priority of the task. # echo wakeup > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -946,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc8 -0 1d..4 4us : schedule (cpu_idle) +Running this on an idle system, we see that it only took 4 +microseconds to perform the task switch. Note, since the trace +marker in the schedule is before the actual "switch", we stop +the tracing when the recorded task is about to schedule in. This +may change if we add a new marker at the end of the scheduler. -Running this on an idle system, we see that it only took 4 microseconds -to perform the task switch. Note, since the trace marker in the -schedule is before the actual "switch", we stop the tracing when -the recorded task is about to schedule in. This may change if -we add a new marker at the end of the scheduler. - -Notice that the recorded task is 'sleep' with the PID of 4901 and it -has an rt_prio of 5. This priority is user-space priority and not -the internal kernel priority. The policy is 1 for SCHED_FIFO and 2 -for SCHED_RR. +Notice that the recorded task is 'sleep' with the PID of 4901 +and it has an rt_prio of 5. This priority is user-space priority +and not the internal kernel priority. The policy is 1 for +SCHED_FIFO and 2 for SCHED_RR. Doing the same with chrt -r 5 and ftrace_enabled set. @@ -1013,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline) ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) ksoftirq-7 1d..4 50us : schedule (__cond_resched) -The interrupt went off while running ksoftirqd. This task runs at -SCHED_OTHER. Why did not we see the 'N' set early? This may be -a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks -configured, the interrupt and softirq run with their own stack. -Some information is held on the top of the task's stack (need_resched -and preempt_count are both stored there). The setting of the NEED_RESCHED -bit is done directly to the task's stack, but the reading of the -NEED_RESCHED is done by looking at the current stack, which in this case -is the stack for the hard interrupt. This hides the fact that NEED_RESCHED -has been set. We do not see the 'N' until we switch back to the task's +The interrupt went off while running ksoftirqd. This task runs +at SCHED_OTHER. Why did not we see the 'N' set early? This may +be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K +stacks configured, the interrupt and softirq run with their own +stack. Some information is held on the top of the task's stack +(need_resched and preempt_count are both stored there). The +setting of the NEED_RESCHED bit is done directly to the task's +stack, but the reading of the NEED_RESCHED is done by looking at +the current stack, which in this case is the stack for the hard +interrupt. This hides the fact that NEED_RESCHED has been set. +We do not see the 'N' until we switch back to the task's assigned stack. function -------- This tracer is the function tracer. Enabling the function tracer -can be done from the debug file system. Make sure the ftrace_enabled is -set; otherwise this tracer is a nop. +can be done from the debug file system. Make sure the +ftrace_enabled is set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 # echo function > /debug/tracing/current_tracer @@ -1060,14 +1132,15 @@ set; otherwise this tracer is a nop. [...] -Note: function tracer uses ring buffers to store the above entries. -The newest data may overwrite the oldest data. Sometimes using echo to -stop the trace is not sufficient because the tracing could have overwritten -the data that you wanted to record. For this reason, it is sometimes better to -disable tracing directly from a program. This allows you to stop the -tracing at the point that you hit the part that you are interested in. -To disable the tracing directly from a C program, something like following -code snippet can be used: +Note: function tracer uses ring buffers to store the above +entries. The newest data may overwrite the oldest data. +Sometimes using echo to stop the trace is not sufficient because +the tracing could have overwritten the data that you wanted to +record. For this reason, it is sometimes better to disable +tracing directly from a program. This allows you to stop the +tracing at the point that you hit the part that you are +interested in. To disable the tracing directly from a C program, +something like following code snippet can be used: int trace_fd; [...] @@ -1082,10 +1155,10 @@ int main(int argc, char *argv[]) { } Note: Here we hard coded the path name. The debugfs mount is not -guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). -For simple one time traces, the above is sufficent. For anything else, -a search through /proc/mounts may be needed to find where the debugfs -file-system is mounted. +guaranteed to be at /debug (and is more commonly at +/sys/kernel/debug). For simple one time traces, the above is +sufficent. For anything else, a search through /proc/mounts may +be needed to find where the debugfs file-system is mounted. Single thread tracing @@ -1186,10 +1259,11 @@ following format: 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf -The tracer may be used to dump the trace for the oops'ing cpu on a -kernel oops into the system log. To enable this, ftrace_dump_on_oops -must be set. To set ftrace_dump_on_oops, one can either use the sysctl -function or set it via the proc system interface. +The tracer may be used to dump the trace for the oops'ing cpu on +a kernel oops into the system log. To enable this, +ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one +can either use the sysctl function or set it via the proc system +interface. sysctl kernel.ftrace_dump_on_oops=1 @@ -1198,8 +1272,8 @@ or echo 1 > /proc/sys/kernel/ftrace_dump_on_oops -Here's an example of such a dump after a null pointer dereference in a -kernel module: +Here's an example of such a dump after a null pointer +dereference in a kernel module: [57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [57848.106019] IP: [] open+0x6/0x14 [oops] @@ -1239,25 +1313,34 @@ kernel module: function graph tracer --------------------------- -This tracer is similar to the function tracer except that it probes -a function on its entry and its exit. -This is done by setting a dynamically allocated stack of return addresses on each -task_struct. Then the tracer overwrites the return address of each function traced -to set a custom probe. Thus the original return address is stored on the stack of return -address in the task_struct. +This tracer is similar to the function tracer except that it +probes a function on its entry and its exit. This is done by +using a dynamically allocated stack of return addresses in each +task_struct. On function entry the tracer overwrites the return +address of each function traced to set a custom probe. Thus the +original return address is stored on the stack of return address +in the task_struct. -Probing on both extremities of a function leads to special features such as +Probing on both ends of a function leads to special features +such as: -_ measure of function's time execution -_ having a reliable call stack to draw function calls graph +- measure of a function's time execution +- having a reliable call stack to draw function calls graph This tracer is useful in several situations: -_ you want to find the reason of a strange kernel behavior and need to see - what happens in detail on any areas (or specific ones). -_ you are experiencing weird latencies but it's difficult to find its origin. -_ you want to find quickly which path is taken by a specific function -_ you just want to see what happens inside your kernel +- you want to find the reason of a strange kernel behavior and + need to see what happens in detail on any areas (or specific + ones). + +- you are experiencing weird latencies but it's difficult to + find its origin. + +- you want to find quickly which path is taken by a specific + function + +- you just want to peek inside a working kernel and want to see + what happens there. # tracer: function_graph # @@ -1282,24 +1365,28 @@ _ you just want to see what happens inside your kernel 0) 0.586 us | _spin_unlock(); -There are several columns that can be dynamically enabled/disabled. -You can use every combination of options you want, depending on your needs. +There are several columns that can be dynamically +enabled/disabled. You can use every combination of options you +want, depending on your needs. -_ The cpu number on which the function executed is default enabled. - It is sometimes better to only trace one cpu (see tracing_cpu_mask file) - or you might sometimes see unordered function calls while cpu tracing switch. +- The cpu number on which the function executed is default + enabled. It is sometimes better to only trace one cpu (see + tracing_cpu_mask file) or you might sometimes see unordered + function calls while cpu tracing switch. hide: echo nofuncgraph-cpu > /debug/tracing/trace_options show: echo funcgraph-cpu > /debug/tracing/trace_options -_ The duration (function's time of execution) is displayed on the closing bracket - line of a function or on the same line than the current function in case of a leaf - one. It is default enabled. +- The duration (function's time of execution) is displayed on + the closing bracket line of a function or on the same line + than the current function in case of a leaf one. It is default + enabled. hide: echo nofuncgraph-duration > /debug/tracing/trace_options show: echo funcgraph-duration > /debug/tracing/trace_options -_ The overhead field precedes the duration one in case of reached duration thresholds. +- The overhead field precedes the duration field in case of + reached duration thresholds. hide: echo nofuncgraph-overhead > /debug/tracing/trace_options show: echo funcgraph-overhead > /debug/tracing/trace_options @@ -1328,8 +1415,8 @@ _ The overhead field precedes the duration one in case of reached duration thres ! means that the function exceeded 100 usecs. -_ The task/pid field displays the thread cmdline and pid which executed the function. - It is default disabled. +- The task/pid field displays the thread cmdline and pid which + executed the function. It is default disabled. hide: echo nofuncgraph-proc > /debug/tracing/trace_options show: echo funcgraph-proc > /debug/tracing/trace_options @@ -1351,8 +1438,9 @@ _ The task/pid field displays the thread cmdline and pid which executed the func 0) sh-4802 | + 49.370 us | } -_ The absolute time field is an absolute timestamp given by the clock since - it started. A snapshot of this time is given on each entry/exit of functions +- The absolute time field is an absolute timestamp given by the + system clock since it started. A snapshot of this time is + given on each entry/exit of functions hide: echo nofuncgraph-abstime > /debug/tracing/trace_options show: echo funcgraph-abstime > /debug/tracing/trace_options @@ -1377,9 +1465,10 @@ _ The absolute time field is an absolute timestamp given by the clock since 360.774530 | 1) 0.594 us | __phys_addr(); -You can put some comments on specific functions by using ftrace_printk() -For example, if you want to put a comment inside the __might_sleep() function, -you just have to include and call ftrace_printk() inside __might_sleep() +You can put some comments on specific functions by using +ftrace_printk() For example, if you want to put a comment inside +the __might_sleep() function, you just have to include + and call ftrace_printk() inside __might_sleep() ftrace_printk("I'm a comment!\n") @@ -1390,8 +1479,9 @@ will produce: 1) 1.449 us | } -You might find other useful features for this tracer on the "dynamic ftrace" -section such as tracing only specific functions or tasks. +You might find other useful features for this tracer in the +following "dynamic ftrace" section such as tracing only specific +functions or tasks. dynamic ftrace -------------- @@ -1399,43 +1489,45 @@ dynamic ftrace If CONFIG_DYNAMIC_FTRACE is set, the system will run with virtually no overhead when function tracing is disabled. The way this works is the mcount function call (placed at the start of -every kernel function, produced by the -pg switch in gcc), starts -of pointing to a simple return. (Enabling FTRACE will include the --pg switch in the compiling of the kernel.) +every kernel function, produced by the -pg switch in gcc), +starts of pointing to a simple return. (Enabling FTRACE will +include the -pg switch in the compiling of the kernel.) At compile time every C file object is run through the recordmcount.pl script (located in the scripts directory). This script will process the C object using objdump to find all the -locations in the .text section that call mcount. (Note, only -the .text section is processed, since processing other sections -like .init.text may cause races due to those sections being freed). +locations in the .text section that call mcount. (Note, only the +.text section is processed, since processing other sections like +.init.text may cause races due to those sections being freed). -A new section called "__mcount_loc" is created that holds references -to all the mcount call sites in the .text section. This section is -compiled back into the original object. The final linker will add -all these references into a single table. +A new section called "__mcount_loc" is created that holds +references to all the mcount call sites in the .text section. +This section is compiled back into the original object. The +final linker will add all these references into a single table. On boot up, before SMP is initialized, the dynamic ftrace code -scans this table and updates all the locations into nops. It also -records the locations, which are added to the available_filter_functions -list. Modules are processed as they are loaded and before they are -executed. When a module is unloaded, it also removes its functions from -the ftrace function list. This is automatic in the module unload -code, and the module author does not need to worry about it. - -When tracing is enabled, kstop_machine is called to prevent races -with the CPUS executing code being modified (which can cause the -CPU to do undesireable things), and the nops are patched back -to calls. But this time, they do not call mcount (which is just -a function stub). They now call into the ftrace infrastructure. +scans this table and updates all the locations into nops. It +also records the locations, which are added to the +available_filter_functions list. Modules are processed as they +are loaded and before they are executed. When a module is +unloaded, it also removes its functions from the ftrace function +list. This is automatic in the module unload code, and the +module author does not need to worry about it. + +When tracing is enabled, kstop_machine is called to prevent +races with the CPUS executing code being modified (which can +cause the CPU to do undesireable things), and the nops are +patched back to calls. But this time, they do not call mcount +(which is just a function stub). They now call into the ftrace +infrastructure. One special side-effect to the recording of the functions being traced is that we can now selectively choose which functions we -wish to trace and which ones we want the mcount calls to remain as -nops. +wish to trace and which ones we want the mcount calls to remain +as nops. -Two files are used, one for enabling and one for disabling the tracing -of specified functions. They are: +Two files are used, one for enabling and one for disabling the +tracing of specified functions. They are: set_ftrace_filter @@ -1443,8 +1535,8 @@ and set_ftrace_notrace -A list of available functions that you can add to these files is listed -in: +A list of available functions that you can add to these files is +listed in: available_filter_functions @@ -1481,8 +1573,8 @@ hrtimer_interrupt sys_nanosleep -Perhaps this is not enough. The filters also allow simple wild cards. -Only the following are currently available +Perhaps this is not enough. The filters also allow simple wild +cards. Only the following are currently available * - will match functions that begin with * - will match functions that end with @@ -1492,9 +1584,9 @@ These are the only wild cards which are supported. * will not work. -Note: It is better to use quotes to enclose the wild cards, otherwise - the shell may expand the parameters into names of files in the local - directory. +Note: It is better to use quotes to enclose the wild cards, + otherwise the shell may expand the parameters into names + of files in the local directory. # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter @@ -1540,7 +1632,8 @@ This is because the '>' and '>>' act just like they do in bash. To rewrite the filters, use '>' To append to the filters, use '>>' -To clear out a filter so that all functions will be recorded again: +To clear out a filter so that all functions will be recorded +again: # echo > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter @@ -1572,7 +1665,8 @@ hrtimer_get_res hrtimer_init_sleeper -The set_ftrace_notrace prevents those functions from being traced. +The set_ftrace_notrace prevents those functions from being +traced. # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace @@ -1595,18 +1689,20 @@ Produces: We can see that there's no more lock or preempt tracing. -* Dynamic ftrace with the function graph tracer * +Dynamic ftrace with the function graph tracer +--------------------------------------------- +Although what has been explained above concerns both the +function tracer and the function-graph-tracer, there are some +special features only available in the function-graph tracer. -Although what has been explained above concerns both the function tracer and -the function_graph_tracer, the following concerns only the latter. +If you want to trace only one function and all of its children, +you just have to echo its name into set_graph_function: -If you want to trace only one function and all of its childs, you just have -to echo its name on set_graph_function: + echo __do_fault > set_graph_function -echo __do_fault > set_graph_function - -will produce the following: +will produce the following "expanded" trace of the __do_fault() +function: 0) | __do_fault() { 0) | filemap_fault() { @@ -1643,23 +1739,24 @@ will produce the following: 0) 2.793 us | } 0) + 14.012 us | } -You can also select several functions: +You can also expand several functions at once: -echo sys_open > set_graph_function -echo sys_close >> set_graph_function + echo sys_open > set_graph_function + echo sys_close >> set_graph_function -Now if you want to go back to trace all functions +Now if you want to go back to trace all functions you can clear +this special filter via: -echo > set_graph_function + echo > set_graph_function trace_pipe ---------- -The trace_pipe outputs the same content as the trace file, but the effect -on the tracing is different. Every read from trace_pipe is consumed. -This means that subsequent reads will be different. The trace -is live. +The trace_pipe outputs the same content as the trace file, but +the effect on the tracing is different. Every read from +trace_pipe is consumed. This means that subsequent reads will be +different. The trace is live. # echo function > /debug/tracing/current_tracer # cat /debug/tracing/trace_pipe > /tmp/trace.out & @@ -1687,38 +1784,45 @@ is live. bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up -Note, reading the trace_pipe file will block until more input is added. -By changing the tracer, trace_pipe will issue an EOF. We needed -to set the function tracer _before_ we "cat" the trace_pipe file. +Note, reading the trace_pipe file will block until more input is +added. By changing the tracer, trace_pipe will issue an EOF. We +needed to set the function tracer _before_ we "cat" the +trace_pipe file. trace entries ------------- -Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file buffer_size_kb is used to modify -the size of the internal trace buffers. The number listed -is the number of entries that can be recorded per CPU. To know -the full size, multiply the number of possible CPUS with the -number of entries. +Having too much or not enough data can be troublesome in +diagnosing an issue in the kernel. The file buffer_size_kb is +used to modify the size of the internal trace buffers. The +number listed is the number of entries that can be recorded per +CPU. To know the full size, multiply the number of possible CPUS +with the number of entries. # cat /debug/tracing/buffer_size_kb 1408 (units kilobytes) -Note, to modify this, you must have tracing completely disabled. To do that, -echo "nop" into the current_tracer. If the current_tracer is not set -to "nop", an EINVAL error will be returned. +Note, to modify this, you must have tracing completely disabled. +To do that, echo "nop" into the current_tracer. If the +current_tracer is not set to "nop", an EINVAL error will be +returned. # echo nop > /debug/tracing/current_tracer # echo 10000 > /debug/tracing/buffer_size_kb # cat /debug/tracing/buffer_size_kb 10000 (units kilobytes) -The number of pages which will be allocated is limited to a percentage -of available memory. Allocating too much will produce an error. +The number of pages which will be allocated is limited to a +percentage of available memory. Allocating too much will produce +an error. # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory # cat /debug/tracing/buffer_size_kb 85 +----------- + +More details can be found in the source code, in the +kernel/tracing/*.c files. -- cgit v1.2.3 From 722bde6875bfb49a0c84e5601eb82dd7ac02d27c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Feb 2009 00:51:57 -0500 Subject: ext4: Add fine print for the 32000 subdirectory limit Some poeple are reading the ext4 feature list too literally and create dubious test cases involving very long filenames and 1k blocksize and then complain when they run into an htree-imposed limit. So add fine print to the "fix 32000 subdirectory limit" ext4 feature. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index cec829bc7291..5c484aec2bab 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* fix 32000 subdirectory limit +* lift 32000 subdirectory limit imposed by i_links_count[1] * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) -- cgit v1.2.3 From 3e8ebb5c433f016dff5824587436642d87fc2e6c Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Sun, 1 Mar 2009 20:41:41 -0500 Subject: debug_objects: add boot-parameter toggle to turn object debugging off again While trying to debug why my Atom netbook is falling over booting rawhide debug-enabled kernels, I stumbled across the fact that we've been enabling object debugging by default. However, once you default it to on, you've got no way to turn it back off again at runtime. Add a boolean toggle to turn it off. I would just make it an int module_param, however people may already expect the boolean enable behaviour, so just add an analogue for disabling. Signed-off-by: Kyle McMartin Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 +++ lib/debugobjects.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 54f21a5c262b..1a328f33e1df 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -604,6 +604,9 @@ and is between 256 and 4096 characters. It is defined in the file debug_objects [KNL] Enable object debugging + no_debug_objects + [KNL] Disable object debugging + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 5d99be1fd988..90e46fa12721 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -55,7 +55,15 @@ static int __init enable_object_debug(char *str) debug_objects_enabled = 1; return 0; } + +static int __init disable_object_debug(char *str) +{ + debug_objects_enabled = 0; + return 0; +} + early_param("debug_objects", enable_object_debug); +early_param("no_debug_objects", disable_object_debug); static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NONE] = "none", -- cgit v1.2.3 From 3197ebdb130473a92760100cbfe0d7e671838f48 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 31 Mar 2009 09:10:09 -0400 Subject: ext4: Add sysfs support Add basic sysfs support so that information about the mounted filesystem and various tuning parameters can be accessed via /sys/fs/ext4//*. Signed-off-by: "Theodore Ts'o" --- Documentation/ABI/testing/sysfs-fs-ext4 | 81 +++++++++++++ fs/ext4/ext4_sb.h | 2 + fs/ext4/super.c | 207 ++++++++++++++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-fs-ext4 (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 new file mode 100644 index 000000000000..4e79074de282 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -0,0 +1,81 @@ +What: /sys/fs/ext4//mb_stats +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Controls whether the multiblock allocator should + collect statistics, which are shown during the unmount. + 1 means to collect statistics, 0 means not to collect + statistics + +What: /sys/fs/ext4//mb_group_prealloc +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + +What: /sys/fs/ext4//mb_max_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The maximum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4//mb_min_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + The minimum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4//mb_order2_req +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + +What: /sys/fs/ext4//mb_stream_req +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. + +What: /sys/fs/ext4//inode_readahead +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache + +What: /sys/fs/ext4//delayed_allocation_blocks +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of blocks + that are dirty in the page cache, but which do not + have their location in the filesystem allocated yet. + +What: /sys/fs/ext4//lifetime_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of kilobytes + of data that have been written to this filesystem since it was + created. + +What: /sys/fs/ext4//session_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" +Description: + This file is read-only and shows the number of + kilobytes of data that have been written to this + filesystem since it was mounted. diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 50ab1169c378..57b71fefbccf 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -64,6 +64,8 @@ struct ext4_sb_info { struct percpu_counter s_dirtyblocks_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; /* Journaling */ struct inode *s_journal_inode; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 30fc27cdf8fc..2883d4318c22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ #include "group.h" struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -580,6 +582,7 @@ static void ext4_put_super(struct super_block *sb) remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } + kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -615,6 +618,16 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_kernel(); + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + lock_super(sb); + lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); return; @@ -1464,6 +1477,11 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; + if (option & (option - 1)) { + printk(KERN_ERR "EXT4-fs: inode_readahead_blks" + " must be a power of 2\n"); + return 0; + } sbi->s_inode_readahead_blks = option; break; case Opt_journal_ioprio: @@ -1992,6 +2010,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } +/* sysfs supprt */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + while (*buf && isspace(*buf)) + buf++; + *value = simple_strtoul(buf, &endp, 0); + while (*endp && isspace(*endp)) + endp++; + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + /* inode_readahead_blks must be a power of 2 */ + if (t & (t-1)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + + +static struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2575,6 +2768,16 @@ no_journal: goto failed_mount4; } + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) { + ext4_mb_release(sb); + ext4_ext_release(sb); + goto failed_mount4; + }; + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -3734,6 +3937,9 @@ static int __init init_ext4_fs(void) { int err; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + return -ENOMEM; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) @@ -3775,6 +3981,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -- cgit v1.2.3 From b713a5ec55bf73c833f9883cdd761b20ee61a1ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 31 Mar 2009 09:11:14 -0400 Subject: ext4: remove /proc tuning knobs Remove tuning knobs in /proc/fs/ext4//*. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/proc.txt | 21 ------- fs/ext4/ext4.h | 16 ----- fs/ext4/inode.c | 7 +-- fs/ext4/mballoc.c | 117 +++++++++---------------------------- fs/ext4/super.c | 46 --------------- 5 files changed, 30 insertions(+), 177 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 830bad7cce0f..efc4fd9f40ce 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/ File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history - stats controls whether the multiblock allocator should start - collecting statistics, which are shown during the unmount - group_prealloc the multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if the - stripe size is not set in the ext4 superblock - max_to_scan The maximum number of extents the multiblock allocator - will search to find the best extent - min_to_scan The minimum number of extents the multiblock allocator - will search to find the best extent - order2_req Tuning parameter which controls the minimum size for - requests (as a power of 2) where the buddy cache is - used - stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out of a - block group specific preallocation pool, so that small - files are packed closely together. Each large file - will have its blocks allocated out of its own unique - preallocation pool. -inode_readahead Tuning parameter which controls the maximum number of - inode table blocks that ext4's inode table readahead - algorithm will pre-read into the buffer cache .............................................................................. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0bd39188531c..e5c273ff928b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -976,22 +976,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; -#ifdef CONFIG_PROC_FS -extern const struct file_operations ext4_ui_proc_fops; - -#define EXT4_PROC_HANDLER(name, var) \ -do { \ - proc = proc_create_data(name, mode, sbi->s_proc, \ - &ext4_ui_proc_fops, &sbi->s_##var); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ - goto err_out; \ - } \ -} while (0) -#else -#define EXT4_PROC_HANDLER(name, var) -#endif - /* * Function prototypes */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7dcac9d7e491..d3118d1acc39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4153,12 +4153,7 @@ make_io: unsigned num; table = ext4_inode_table(sb, gdp); - /* Make sure s_inode_readahead_blks is a power of 2 */ - while (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)) - EXT4_SB(sb)->s_inode_readahead_blks = - (EXT4_SB(sb)->s_inode_readahead_blks & - (EXT4_SB(sb)->s_inode_readahead_blks-1)); + /* s_inode_readahead_blks is always a power of 2 */ b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); if (table > b) b = table; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b0d6022eaa67..c4c430977622 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -46,22 +46,23 @@ * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * - * During initialization phase of the allocator we decide to use the group - * preallocation or inode preallocation depending on the size file. The - * size of the file could be the resulting file size we would have after - * allocation or the current file size which ever is larger. If the size is - * less that sbi->s_mb_stream_request we select the group - * preallocation. The default value of s_mb_stream_request is 16 - * blocks. This can also be tuned via - * /proc/fs/ext4//stream_req. The value is represented in terms - * of number of blocks. + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4//mb_stream_req. The value is represented in + * terms of number of blocks. * * The main motivation for having small file use group preallocation is to - * ensure that we have small file closer in the disk. + * ensure that we have small files closer together on the disk. * - * First stage the allocator looks at the inode prealloc list - * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for - * this particular inode. The inode prealloc space is represented as: + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space @@ -121,29 +122,29 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /proc/fs/ext4/ option the group prealloc request is normalized to the * stripe value (sbi->s_stripe) * - * The regular allocator(using the buddy cache) support few tunables. + * The regular allocator(using the buddy cache) supports few tunables. * - * /proc/fs/ext4//min_to_scan - * /proc/fs/ext4//max_to_scan - * /proc/fs/ext4//order2_req + * /sys/fs/ext4//mb_min_to_scan + * /sys/fs/ext4//mb_max_to_scan + * /sys/fs/ext4//mb_order2_req * - * The regular allocator use buddy scan only if the request len is power of + * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /proc/fs/ext4//order2_req. If the request len is equal to + * /sys/fs/ext4//mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contigous block in - * stripe size. This should result in better allocation on RAID setup. If - * not we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan controll the behaviour here. + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best - * extent and max_to_scanindicate how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it @@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -1978,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /proc/fs/ext4//order2_req + * You can tune it via /sys/fs/ext4//mb_order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2753,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); if (sbi->s_journal) @@ -2836,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb) free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); - ext4_mb_destroy_per_dev_proc(sb); return 0; } @@ -2897,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug("freed %u blocks in %u structures\n", count, count2); } -#define EXT4_MB_STATS_NAME "stats" -#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" -#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" -#define EXT4_MB_ORDER2_REQ "order2_req" -#define EXT4_MB_STREAM_REQ "stream_req" -#define EXT4_MB_GROUP_PREALLOC "group_prealloc" - -static int ext4_mb_init_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct proc_dir_entry *proc; - - if (sbi->s_proc == NULL) - return -EINVAL; - - EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); - EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); - EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); - EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); - EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); - return 0; - -err_out: - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - return -ENOMEM; -#else - return 0; -#endif -} - -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) -{ -#ifdef CONFIG_PROC_FS - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc == NULL) - return -EINVAL; - - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); -#endif - return 0; -} - int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -3123,7 +3064,7 @@ out_err: * here we normalize request for locality group * Group request are normalized to s_strip size if we set the same via mount * option. If not we set it to s_mb_group_prealloc which can be configured via - * /proc/fs/ext4//group_prealloc + * /sys/fs/ext4//mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -4239,7 +4180,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * allocation which ever is larger * - * One can tune this size via /proc/fs/ext4//stream_req + * One can tune this size via /sys/fs/ext4//mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2883d4318c22..1ec554cc107a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -579,7 +579,6 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb, es, 1); } if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -2529,11 +2528,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, - &ext4_ui_proc_fops, - &sbi->s_inode_readahead_blks); #endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -2832,7 +2826,6 @@ failed_mount2: kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { - remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -3865,45 +3858,6 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } -#ifdef CONFIG_PROC_FS -static int ext4_ui_proc_show(struct seq_file *m, void *v) -{ - unsigned int *p = m->private; - - seq_printf(m, "%u\n", *p); - return 0; -} - -static int ext4_ui_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, ext4_ui_proc_show, PDE(inode)->data); -} - -static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; - char str[32]; - - if (cnt >= sizeof(str)) - return -EINVAL; - if (copy_from_user(str, buf, cnt)) - return -EFAULT; - - *p = simple_strtoul(str, NULL, 0); - return cnt; -} - -const struct file_operations ext4_ui_proc_fops = { - .owner = THIS_MODULE, - .open = ext4_ui_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = ext4_ui_proc_write, -}; -#endif - static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", -- cgit v1.2.3 From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:24:48 +0100 Subject: tracing: rename ftrace_printk() => trace_printk() Impact: cleanup Use a more generic name - this also allows the prototype to move to kernel.h and be generally available to kernel developers who want to do some quick tracing. Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 6 +++--- include/linux/ftrace.h | 18 +++++++++--------- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 2041ee951c1a..22614bef6359 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1466,11 +1466,11 @@ want, depending on your needs. You can put some comments on specific functions by using -ftrace_printk() For example, if you want to put a comment inside +trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include - and call ftrace_printk() inside __might_sleep() + and call trace_printk() inside __might_sleep() -ftrace_printk("I'm a comment!\n") +trace_printk("I'm a comment!\n") will produce: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..fbb9c364e166 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -329,11 +329,11 @@ extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); /** - * ftrace_printk - printf formatting in the ftrace buffer + * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * where problems are occurring. * * This is intended as a debugging tool for the developer only. - * Please refrain from leaving ftrace_printks scattered around in + * Please refrain from leaving trace_printks scattered around in * your code. */ -# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) extern int -__ftrace_printk(unsigned long ip, const char *fmt, ...) +__trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); @@ -356,13 +356,13 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } static inline int -ftrace_printk(const char *fmt, ...) +trace_printk(const char *fmt, ...) { return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1ef43999d9e..c0e9c1263393 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -48,7 +48,7 @@ unsigned long __read_mostly tracing_thresh; * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as ftrace_printk could occurred + * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; @@ -291,7 +291,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", - "ftrace_printk", + "trace_printk", "ftrace_preempt", "branch", "annotate", @@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int __trace_printk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__ftrace_printk); +EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 12cd119cca32..8beff03fda68 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -115,7 +115,7 @@ struct userstack_entry { }; /* - * ftrace_printk entry: + * trace_printk entry: */ struct print_entry { struct trace_entry ent; -- cgit v1.2.3 From 42b40b3d55f5782b00b74d9105c3565fbfa5cb80 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sat, 7 Mar 2009 23:55:09 +0900 Subject: ftrace: fix documentation typo s/trace_max_latency/tracing_max_latency/ There isn't a trace_max_latency file, there is tracing_max_latency. Fix it. Signed-off-by: KOSAKI Motohiro Acked-by: Steven Rostedt LKML-Reference: <20090307235409.5A87.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 22614bef6359..fd9a3e693813 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -100,7 +100,7 @@ of ftrace. Here is a list of some of the key files: that is displayed in one of the above output files. - trace_max_latency: + tracing_max_latency: Some of the tracers record the max latency. For example, the time interrupts are disabled. -- cgit v1.2.3 From 73969ff0eda233f140bcbed1251431387b43f383 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 4 Mar 2009 23:27:14 -0800 Subject: Input: generic driver for rotary encoders on GPIOs This patch adds a generic driver for rotary encoders connected to GPIO pins of a system. It relies on gpiolib and generic hardware irqs. The documentation that also comes with this patch explains the concept and how to use the driver. Signed-off-by: Daniel Mack Tested-by: H Hartley Sweeten Signed-off-by: Dmitry Torokhov --- Documentation/input/rotary-encoder.txt | 101 +++++++++++++++ drivers/input/misc/Kconfig | 11 ++ drivers/input/misc/Makefile | 2 + drivers/input/misc/rotary_encoder.c | 221 +++++++++++++++++++++++++++++++++ include/linux/rotary_encoder.h | 13 ++ 5 files changed, 348 insertions(+) create mode 100644 Documentation/input/rotary-encoder.txt create mode 100644 drivers/input/misc/rotary_encoder.c create mode 100644 include/linux/rotary_encoder.h (limited to 'Documentation') diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt new file mode 100644 index 000000000000..435102a26d96 --- /dev/null +++ b/Documentation/input/rotary-encoder.txt @@ -0,0 +1,101 @@ +rotary-encoder - a generic driver for GPIO connected devices +Daniel Mack , Feb 2009 + +0. Function +----------- + +Rotary encoders are devices which are connected to the CPU or other +peripherals with two wires. The outputs are phase-shifted by 90 degrees +and by triggering on falling and rising edges, the turn direction can +be determined. + +The phase diagram of these two outputs look like this: + + _____ _____ _____ + | | | | | | + Channel A ____| |_____| |_____| |____ + + : : : : : : : : : : : : + __ _____ _____ _____ + | | | | | | | + Channel B |_____| |_____| |_____| |__ + + : : : : : : : : : : : : + Event a b c d a b c d a b c d + + |<-------->| + one step + + +For more information, please see + http://en.wikipedia.org/wiki/Rotary_encoder + + +1. Events / state machine +------------------------- + +a) Rising edge on channel A, channel B in low state + This state is used to recognize a clockwise turn + +b) Rising edge on channel B, channel A in high state + When entering this state, the encoder is put into 'armed' state, + meaning that there it has seen half the way of a one-step transition. + +c) Falling edge on channel A, channel B in high state + This state is used to recognize a counter-clockwise turn + +d) Falling edge on channel B, channel A in low state + Parking position. If the encoder enters this state, a full transition + should have happend, unless it flipped back on half the way. The + 'armed' state tells us about that. + +2. Platform requirements +------------------------ + +As there is no hardware dependent call in this driver, the platform it is +used with must support gpiolib. Another requirement is that IRQs must be +able to fire on both edges. + + +3. Board integration +-------------------- + +To use this driver in your system, register a platform_device with the +name 'rotary-encoder' and associate the IRQs and some specific platform +data with it. + +struct rotary_encoder_platform_data is declared in +include/linux/rotary-encoder.h and needs to be filled with the number of +steps the encoder has and can carry information about externally inverted +signals (because of used invertig buffer or other reasons). + +Because GPIO to IRQ mapping is platform specific, this information must +be given in seperately to the driver. See the example below. + +------------------ + +/* board support file example */ + +#include +#include + +#define GPIO_ROTARY_A 1 +#define GPIO_ROTARY_B 2 + +static struct rotary_encoder_platform_data my_rotary_encoder_info = { + .steps = 24, + .axis = ABS_X, + .gpio_a = GPIO_ROTARY_A, + .gpio_b = GPIO_ROTARY_B, + .inverted_a = 0, + .inverted_b = 0, +}; + +static struct platform_device rotary_encoder_device = { + .name = "rotary-encoder", + .id = 0, + .dev = { + .platform_data = &my_rotary_encoder_info, + } +}; + diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 67e5553f699a..806d2e66d249 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -227,4 +227,15 @@ config INPUT_PCF50633_PMU Say Y to include support for delivering PMU events via input layer on NXP PCF50633. +config INPUT_GPIO_ROTARY_ENCODER + tristate "Rotary encoders connected to GPIO pins" + depends on GPIOLIB && GENERIC_GPIO + help + Say Y here to add support for rotary encoders connected to GPIO lines. + Check file:Documentation/incput/rotary_encoder.txt for more + information. + + To compile this driver as a module, choose M here: the + module will be called rotary_encoder. + endif diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index bb62e6efacf3..e86cee66c914 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_INPUT_UINPUT) += uinput.o obj-$(CONFIG_INPUT_APANEL) += apanel.o obj-$(CONFIG_INPUT_SGI_BTNS) += sgi_btns.o obj-$(CONFIG_INPUT_PCF50633_PMU) += pcf50633-input.o +obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER) += rotary_encoder.o + diff --git a/drivers/input/misc/rotary_encoder.c b/drivers/input/misc/rotary_encoder.c new file mode 100644 index 000000000000..5bb3ab51b8c6 --- /dev/null +++ b/drivers/input/misc/rotary_encoder.c @@ -0,0 +1,221 @@ +/* + * rotary_encoder.c + * + * (c) 2009 Daniel Mack + * + * state machine code inspired by code from Tim Ruetz + * + * A generic driver for rotary encoders connected to GPIO lines. + * See file:Documentation/input/rotary_encoder.txt for more information + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "rotary-encoder" + +struct rotary_encoder { + unsigned int irq_a; + unsigned int irq_b; + unsigned int pos; + unsigned int armed; + unsigned int dir; + struct input_dev *input; + struct rotary_encoder_platform_data *pdata; +}; + +static irqreturn_t rotary_encoder_irq(int irq, void *dev_id) +{ + struct rotary_encoder *encoder = dev_id; + struct rotary_encoder_platform_data *pdata = encoder->pdata; + int a = !!gpio_get_value(pdata->gpio_a); + int b = !!gpio_get_value(pdata->gpio_b); + int state; + + a ^= pdata->inverted_a; + b ^= pdata->inverted_b; + state = (a << 1) | b; + + switch (state) { + + case 0x0: + if (!encoder->armed) + break; + + if (encoder->dir) { + /* turning counter-clockwise */ + encoder->pos += pdata->steps; + encoder->pos--; + encoder->pos %= pdata->steps; + } else { + /* turning clockwise */ + encoder->pos++; + encoder->pos %= pdata->steps; + } + + input_report_abs(encoder->input, pdata->axis, encoder->pos); + input_sync(encoder->input); + + encoder->armed = 0; + break; + + case 0x1: + case 0x2: + if (encoder->armed) + encoder->dir = state - 1; + break; + + case 0x3: + encoder->armed = 1; + break; + } + + return IRQ_HANDLED; +} + +static int __devinit rotary_encoder_probe(struct platform_device *pdev) +{ + struct rotary_encoder_platform_data *pdata = pdev->dev.platform_data; + struct rotary_encoder *encoder; + struct input_dev *input; + int err; + + if (!pdata || !pdata->steps) { + dev_err(&pdev->dev, "invalid platform data\n"); + return -ENOENT; + } + + encoder = kzalloc(sizeof(struct rotary_encoder), GFP_KERNEL); + input = input_allocate_device(); + if (!encoder || !input) { + dev_err(&pdev->dev, "failed to allocate memory for device\n"); + err = -ENOMEM; + goto exit_free_mem; + } + + encoder->input = input; + encoder->pdata = pdata; + encoder->irq_a = gpio_to_irq(pdata->gpio_a); + encoder->irq_b = gpio_to_irq(pdata->gpio_b); + + /* create and register the input driver */ + input->name = pdev->name; + input->id.bustype = BUS_HOST; + input->dev.parent = &pdev->dev; + input->evbit[0] = BIT_MASK(EV_ABS); + input_set_abs_params(encoder->input, + pdata->axis, 0, pdata->steps, 0, 1); + + err = input_register_device(input); + if (err) { + dev_err(&pdev->dev, "failed to register input device\n"); + goto exit_free_mem; + } + + /* request the GPIOs */ + err = gpio_request(pdata->gpio_a, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "unable to request GPIO %d\n", + pdata->gpio_a); + goto exit_unregister_input; + } + + err = gpio_request(pdata->gpio_b, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "unable to request GPIO %d\n", + pdata->gpio_b); + goto exit_free_gpio_a; + } + + /* request the IRQs */ + err = request_irq(encoder->irq_a, &rotary_encoder_irq, + IORESOURCE_IRQ_HIGHEDGE | IORESOURCE_IRQ_LOWEDGE, + DRV_NAME, encoder); + if (err) { + dev_err(&pdev->dev, "unable to request IRQ %d\n", + encoder->irq_a); + goto exit_free_gpio_b; + } + + err = request_irq(encoder->irq_b, &rotary_encoder_irq, + IORESOURCE_IRQ_HIGHEDGE | IORESOURCE_IRQ_LOWEDGE, + DRV_NAME, encoder); + if (err) { + dev_err(&pdev->dev, "unable to request IRQ %d\n", + encoder->irq_b); + goto exit_free_irq_a; + } + + platform_set_drvdata(pdev, encoder); + + return 0; + +exit_free_irq_a: + free_irq(encoder->irq_a, encoder); +exit_free_gpio_b: + gpio_free(pdata->gpio_b); +exit_free_gpio_a: + gpio_free(pdata->gpio_a); +exit_unregister_input: + input_unregister_device(input); + input = NULL; /* so we don't try to free it */ +exit_free_mem: + input_free_device(input); + kfree(encoder); + return err; +} + +static int __devexit rotary_encoder_remove(struct platform_device *pdev) +{ + struct rotary_encoder *encoder = platform_get_drvdata(pdev); + struct rotary_encoder_platform_data *pdata = pdev->dev.platform_data; + + free_irq(encoder->irq_a, encoder); + free_irq(encoder->irq_b, encoder); + gpio_free(pdata->gpio_a); + gpio_free(pdata->gpio_b); + input_unregister_device(encoder->input); + platform_set_drvdata(pdev, NULL); + kfree(encoder); + + return 0; +} + +static struct platform_driver rotary_encoder_driver = { + .probe = rotary_encoder_probe, + .remove = __devexit_p(rotary_encoder_remove), + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + } +}; + +static int __init rotary_encoder_init(void) +{ + return platform_driver_register(&rotary_encoder_driver); +} + +static void __exit rotary_encoder_exit(void) +{ + platform_driver_unregister(&rotary_encoder_driver); +} + +module_init(rotary_encoder_init); +module_exit(rotary_encoder_exit); + +MODULE_ALIAS("platform:" DRV_NAME); +MODULE_DESCRIPTION("GPIO rotary encoder driver"); +MODULE_AUTHOR("Daniel Mack "); +MODULE_LICENSE("GPL v2"); + diff --git a/include/linux/rotary_encoder.h b/include/linux/rotary_encoder.h new file mode 100644 index 000000000000..12d63a30c347 --- /dev/null +++ b/include/linux/rotary_encoder.h @@ -0,0 +1,13 @@ +#ifndef __ROTARY_ENCODER_H__ +#define __ROTARY_ENCODER_H__ + +struct rotary_encoder_platform_data { + unsigned int steps; + unsigned int axis; + unsigned int gpio_a; + unsigned int gpio_b; + unsigned int inverted_a; + unsigned int inverted_b; +}; + +#endif /* __ROTARY_ENCODER_H__ */ -- cgit v1.2.3 From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 15:47:18 -0400 Subject: tracing: replace TP with TP_ Impact: clean up The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit ugly. This patch adds an underscore to their names. Signed-off-by: Steven Rostedt --- Documentation/tracepoints.txt | 8 +-- include/linux/tracepoint.h | 6 +-- include/trace/block.h | 70 ++++++++++++------------ include/trace/irq_event_types.h | 16 +++--- include/trace/lockdep_event_types.h | 24 ++++----- include/trace/power.h | 12 ++--- include/trace/sched_event_types.h | 98 +++++++++++++++++----------------- include/trace/workqueue.h | 16 +++--- kernel/trace/trace_event_types.h | 28 +++++----- kernel/trace/trace_events_stage_2.h | 6 +-- kernel/trace/trace_events_stage_3.h | 8 +-- kernel/trace/trace_export.c | 8 +-- kernel/trace/trace_format.h | 55 ------------------- samples/tracepoints/tp-samples-trace.h | 8 +-- 14 files changed, 154 insertions(+), 209 deletions(-) delete mode 100644 kernel/trace/trace_format.h (limited to 'Documentation') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 6f0a044f5b5e..4ff43c6de299 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,8 +45,8 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPROTO(int firstarg, struct task_struct *p), - TPARGS(firstarg, p)); + TP_PROTO(int firstarg, struct task_struct *p), + TP_ARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,10 +66,10 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the +- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the +- TP_ARGS(firstarg, p) are the parameters names, same as found in the prototype. Connecting a function (probe) to a tracepoint is done by providing a diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 152b2f03fb86..3bcc3e171443 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,8 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ -#define TPPROTO(args...) args -#define TPARGS(args...) args +#define TP_PROTO(args...) args +#define TP_ARGS(args...) args #ifdef CONFIG_TRACEPOINTS @@ -65,7 +65,7 @@ struct tracepoint { { \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ - TPPROTO(proto), TPARGS(args)); \ + TP_PROTO(proto), TP_ARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ diff --git a/include/trace/block.h b/include/trace/block.h index 25c6a1fd5b77..25b7068b819e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -5,72 +5,72 @@ #include DECLARE_TRACE(block_rq_abort, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_insert, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_issue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_requeue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_complete, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_bio_bounce, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_complete, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_backmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_frontmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_queue, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_getrq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_sleeprq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_plug, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_timer, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_io, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_split, - TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); + TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TP_ARGS(q, bio, pdu)); DECLARE_TRACE(block_remap, - TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + TP_ARGS(q, bio, dev, from, to)); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 65850bc5ea06..0147d9eef5f4 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -9,25 +9,25 @@ #define TRACE_SYSTEM irq TRACE_EVENT_FORMAT(irq_handler_entry, - TPPROTO(int irq, struct irqaction *action), - TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name), + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) ), - TPRAWFMT("irq %d") + TP_RAW_FMT("irq %d") ); TRACE_EVENT_FORMAT(irq_handler_exit, - TPPROTO(int irq, struct irqaction *action, int ret), - TPARGS(irq, action, ret), - TPFMT("irq=%d handler=%s return=%s", + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), + TP_FMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled"), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("irq %d ret %d") + TP_RAW_FMT("irq %d ret %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index f713d74a82b4..1f00e8b3543e 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -10,32 +10,32 @@ #ifdef CONFIG_LOCKDEP TRACE_FORMAT(lock_acquire, - TPPROTO(struct lockdep_map *lock, unsigned int subclass, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TPARGS(lock, subclass, trylock, read, check, next_lock, ip), - TPFMT("%s%s%s", trylock ? "try " : "", + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", read ? "read " : "", lock->name) ); TRACE_FORMAT(lock_release, - TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TPARGS(lock, nested, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) ); #ifdef CONFIG_LOCK_STAT TRACE_FORMAT(lock_contended, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); TRACE_FORMAT(lock_acquired, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); #endif diff --git a/include/trace/power.h b/include/trace/power.h index 38aca537e497..ef204666e983 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -18,15 +18,15 @@ struct power_trace { }; DECLARE_TRACE(power_start, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_mark, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_end, - TPPROTO(struct power_trace *it), - TPARGS(it)); + TP_PROTO(struct power_trace *it), + TP_ARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a6de5c1601a0..71b14828a957 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -9,143 +9,143 @@ #define TRACE_SYSTEM sched TRACE_EVENT_FORMAT(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid), + TP_PROTO(struct task_struct *t), + TP_ARGS(t), + TP_FMT("task %s:%d", t->comm, t->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, t->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret), - TPFMT("ret=%d", ret), + TP_PROTO(int ret), + TP_ARGS(ret), + TP_FMT("ret=%d", ret), TRACE_STRUCT( TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("ret=%d") + TP_RAW_FMT("ret=%d") ); TRACE_EVENT_FORMAT(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d %s", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d %s", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), - TPARGS(rq, prev, next), - TPFMT("task %s:%d ==> %s:%d", + TP_ARGS(rq, prev, next), + TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, - TPCMD(memcpy(TRACE_ENTRY->next_comm, + TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %s:%d:%d") + TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu), - TPFMT("task %s:%d from: %d to: %d", + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), + TP_FMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, orig_cpu, orig_cpu) TRACE_FIELD(int, dest_cpu, dest_cpu) ), - TPRAWFMT("task %d from: %d to: %d") + TP_RAW_FMT("task %d from: %d to: %d") ); TRACE_EVENT_FORMAT(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid), - TPFMT("pid %d", pid_nr(pid)), + TP_PROTO(struct pid *pid), + TP_ARGS(pid), + TP_FMT("pid %d", pid_nr(pid)), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, pid_nr(pid)) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child), - TPFMT("parent %s:%d child %s:%d", + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), + TP_FMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, parent, parent->pid) TRACE_FIELD(pid_t, child, child->pid) ), - TPRAWFMT("parent %d child %d") + TP_RAW_FMT("parent %d child %d") ); TRACE_EVENT_FORMAT(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), + TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(int, sig, sig) TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("sig: %d task %d") + TP_RAW_FMT("sig: %d task %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h index 867829df4571..7626523deeba 100644 --- a/include/trace/workqueue.h +++ b/include/trace/workqueue.h @@ -6,20 +6,20 @@ #include DECLARE_TRACE(workqueue_insertion, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); DECLARE_TRACE(workqueue_execution, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); /* Trace the creation of one workqueue thread on a cpu */ DECLARE_TRACE(workqueue_creation, - TPPROTO(struct task_struct *wq_thread, int cpu), - TPARGS(wq_thread, cpu)); + TP_PROTO(struct task_struct *wq_thread, int cpu), + TP_ARGS(wq_thread, cpu)); DECLARE_TRACE(workqueue_destruction, - TPPROTO(struct task_struct *wq_thread), - TPARGS(wq_thread)); + TP_PROTO(struct task_struct *wq_thread), + TP_ARGS(wq_thread)); #endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fb4eba166433..d94179aa1fc2 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned long, parent_ip, parent_ip) ), - TPRAWFMT(" %lx <-- %lx") + TP_RAW_FMT(" %lx <-- %lx") ); TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, @@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, TRACE_FIELD(unsigned long, graph_ent.func, func) TRACE_FIELD(int, graph_ent.depth, depth) ), - TPRAWFMT("--> %lx (%d)") + TP_RAW_FMT("--> %lx (%d)") ); TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, @@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, TRACE_FIELD(unsigned long, ret.func, func) TRACE_FIELD(int, ret.depth, depth) ), - TPRAWFMT("<-- %lx (%d)") + TP_RAW_FMT("<-- %lx (%d)") ); TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, @@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, @@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, @@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, TRACE_FIELD(unsigned long, arg2, arg2) TRACE_FIELD(unsigned long, arg3, arg3) ), - TPRAWFMT("(%08lx) (%08lx) (%08lx)") + TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ); /* @@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), - TPRAWFMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, @@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) TRACE_FIELD(char, correct, correct) ), - TPRAWFMT("%u:%s:%s (%u)") + TP_RAW_FMT("%u:%s:%s (%u)") ); TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, @@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_FIELD(u64, from, from) TRACE_FIELD(u64, to, to) ), - TPRAWFMT("from: %llx to: %llx") + TP_RAW_FMT("from: %llx to: %llx") ); TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, @@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), - TPRAWFMT("%llx->%llx type:%u state:%u") + TP_RAW_FMT("%llx->%llx type:%u state:%u") ); TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, @@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) TRACE_FIELD(int, node, node) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" " flags:%x node:%d") ); @@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TRACE_FIELD(unsigned long, call_site, call_site) TRACE_FIELD(const void *, ptr, ptr) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p") + TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d24a97e74aea..8e2e0f56c2a8 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "%s", "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -44,8 +44,8 @@ field->item, -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2c8d76c7dbed..557ca52bcdcd 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -106,8 +106,8 @@ * */ -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args +#undef TP_FMT +#define TP_FMT(fmt, args...) fmt "\n", ##args #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ @@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7162ab49d05d..e62bc10f8103 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,8 +26,8 @@ return 0; -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ @@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h deleted file mode 100644 index 97e59a9c82ea..000000000000 --- a/kernel/trace/trace_format.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 01724e04c556..dffdc49878af 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -5,9 +5,9 @@ #include DECLARE_TRACE(subsys_event, - TPPROTO(struct inode *inode, struct file *file), - TPARGS(inode, file)); + TP_PROTO(struct inode *inode, struct file *file), + TP_ARGS(inode, file)); DECLARE_TRACE(subsys_eventb, - TPPROTO(void), - TPARGS()); + TP_PROTO(void), + TP_ARGS()); #endif -- cgit v1.2.3 From 631595fbf4aeac260e664a8a002897e4db6a50dd Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Mar 2009 13:57:10 +0900 Subject: doc: add trace_buf_size description to kernel-parameters.txt from early boot tracing view, trace_buf_size parameter is important. it should be documented. Signed-off-by: KOSAKI Motohiro LKML-Reference: <20090310135200.A48B.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 454f42b21f16..7643483bdd6a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2344,6 +2344,8 @@ and is between 256 and 4096 characters. It is defined in the file tp720= [HW,PS2] + trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size. + trix= [HW,OSS] MediaTrix AudioTrix Pro Format: ,,,,,,,, -- cgit v1.2.3 From 1cc0ca26c57221f5aa0f49a311a8fc83413dfe97 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 14 Jan 2009 10:04:36 -0700 Subject: PCI/x86: document pci=earlydump argument Document the "pci=earlydump" argument. This currently only works on x86. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 54f21a5c262b..c7c441e7930e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1659,6 +1659,8 @@ and is between 256 and 4096 characters. It is defined in the file See also Documentation/blockdev/paride.txt. pci=option[,option...] [PCI] various PCI subsystem options: + earlydump [X86] dump PCI config space before the kernel + changes anything off [X86] don't probe for the PCI bus bios [X86-32] force use of PCI BIOS, don't access the hardware directly. Use this if your machine -- cgit v1.2.3 From 0994375e9614f78657031e04e30019b9cdb62795 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 23 Feb 2009 21:52:23 -0800 Subject: PCI: add remove_id sysfs entry This adds a remove_id sysfs entry to allow users of new_id to later remove the added dynid. One use case is management tools that want to dynamically bind/unbind devices to pci-stub driver while devices are assigned to KVM guests. Rather than having to track which driver was originally bound to the driver, a mangement tool can simply: Guest uses device Signed-off-by: Chris Wright Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 16 +++++++ drivers/pci/pci-driver.c | 80 ++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index e638e15a8895..3d29793a0ea2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -41,6 +41,22 @@ Description: for the device and attempt to bind to it. For example: # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id +What: /sys/bus/pci/drivers/.../remove_id +Date: February 2009 +Contact: Chris Wright +Description: + Writing a device ID to this file will remove an ID + that was dynamically added via the new_id sysfs entry. + The format for the device ID is: + VVVV DDDD SVVV SDDD CCCC MMMM. That is Vendor ID, Device + ID, Subsystem Vendor ID, Subsystem Device ID, Class, + and Class Mask. The Vendor ID and Device ID fields are + required, the rest are optional. After successfully + removing an ID, the driver will no longer support the + device. This is useful to ensure auto probing won't + match the driver to the device. For example: + # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 93eac1423585..87a5ddbb324f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -99,6 +99,52 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count) } static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id); +/** + * store_remove_id - remove a PCI device ID from this driver + * @driver: target device driver + * @buf: buffer for scanning device ID data + * @count: input size + * + * Removes a dynamic pci device ID to this driver. + */ +static ssize_t +store_remove_id(struct device_driver *driver, const char *buf, size_t count) +{ + struct pci_dynid *dynid, *n; + struct pci_driver *pdrv = to_pci_driver(driver); + __u32 vendor, device, subvendor = PCI_ANY_ID, + subdevice = PCI_ANY_ID, class = 0, class_mask = 0; + int fields = 0; + int retval = -ENODEV; + + fields = sscanf(buf, "%x %x %x %x %x %x", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask); + if (fields < 2) + return -EINVAL; + + spin_lock(&pdrv->dynids.lock); + list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) { + struct pci_device_id *id = &dynid->id; + if ((id->vendor == vendor) && + (id->device == device) && + (subvendor == PCI_ANY_ID || id->subvendor == subvendor) && + (subdevice == PCI_ANY_ID || id->subdevice == subdevice) && + !((id->class ^ class) & class_mask)) { + list_del(&dynid->node); + kfree(dynid); + retval = 0; + break; + } + } + spin_unlock(&pdrv->dynids.lock); + + if (retval) + return retval; + return count; +} +static DRIVER_ATTR(remove_id, S_IWUSR, NULL, store_remove_id); + static void pci_free_dynids(struct pci_driver *drv) { @@ -125,6 +171,20 @@ static void pci_remove_newid_file(struct pci_driver *drv) { driver_remove_file(&drv->driver, &driver_attr_new_id); } + +static int +pci_create_removeid_file(struct pci_driver *drv) +{ + int error = 0; + if (drv->probe != NULL) + error = driver_create_file(&drv->driver,&driver_attr_remove_id); + return error; +} + +static void pci_remove_removeid_file(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, &driver_attr_remove_id); +} #else /* !CONFIG_HOTPLUG */ static inline void pci_free_dynids(struct pci_driver *drv) {} static inline int pci_create_newid_file(struct pci_driver *drv) @@ -132,6 +192,11 @@ static inline int pci_create_newid_file(struct pci_driver *drv) return 0; } static inline void pci_remove_newid_file(struct pci_driver *drv) {} +static inline int pci_create_removeid_file(struct pci_driver *drv) +{ + return 0; +} +static inline void pci_remove_removeid_file(struct pci_driver *drv) {} #endif /** @@ -852,13 +917,23 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, /* register with core */ error = driver_register(&drv->driver); if (error) - return error; + goto out; error = pci_create_newid_file(drv); if (error) - driver_unregister(&drv->driver); + goto out_newid; + error = pci_create_removeid_file(drv); + if (error) + goto out_removeid; +out: return error; + +out_removeid: + pci_remove_newid_file(drv); +out_newid: + driver_unregister(&drv->driver); + goto out; } /** @@ -874,6 +949,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, void pci_unregister_driver(struct pci_driver *drv) { + pci_remove_removeid_file(drv); pci_remove_newid_file(drv); driver_unregister(&drv->driver); pci_free_dynids(drv); -- cgit v1.2.3 From c41ade2ee1dc146d2de2ee470a87cd6b878a08f4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:05 -0400 Subject: Rewrite MSI-HOWTO I didn't find the previous version very useful, so I rewrote it. Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Reviewed-by: Grant Grundler Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 758 +++++++++++++++------------------------- 1 file changed, 277 insertions(+), 481 deletions(-) (limited to 'Documentation') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 256defd7e174..1c02431f1d1a 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -4,506 +4,302 @@ Revised Feb 12, 2004 by Martine Silbermann email: Martine.Silbermann@hp.com Revised Jun 25, 2004 by Tom L Nguyen + Revised Jul 9, 2008 by Matthew Wilcox + Copyright 2003, 2008 Intel Corporation 1. About this guide -This guide describes the basics of Message Signaled Interrupts (MSI), -the advantages of using MSI over traditional interrupt mechanisms, -and how to enable your driver to use MSI or MSI-X. Also included is -a Frequently Asked Questions (FAQ) section. - -1.1 Terminology - -PCI devices can be single-function or multi-function. In either case, -when this text talks about enabling or disabling MSI on a "device -function," it is referring to one specific PCI device and function and -not to all functions on a PCI device (unless the PCI device has only -one function). - -2. Copyright 2003 Intel Corporation - -3. What is MSI/MSI-X? - -Message Signaled Interrupt (MSI), as described in the PCI Local Bus -Specification Revision 2.3 or later, is an optional feature, and a -required feature for PCI Express devices. MSI enables a device function -to request service by sending an Inbound Memory Write on its PCI bus to -the FSB as a Message Signal Interrupt transaction. Because MSI is -generated in the form of a Memory Write, all transaction conditions, -such as a Retry, Master-Abort, Target-Abort or normal completion, are -supported. - -A PCI device that supports MSI must also support pin IRQ assertion -interrupt mechanism to provide backward compatibility for systems that -do not support MSI. In systems which support MSI, the bus driver is -responsible for initializing the message address and message data of -the device function's MSI/MSI-X capability structure during device -initial configuration. - -An MSI capable device function indicates MSI support by implementing -the MSI/MSI-X capability structure in its PCI capability list. The -device function may implement both the MSI capability structure and -the MSI-X capability structure; however, the bus driver should not -enable both. - -The MSI capability structure contains Message Control register, -Message Address register and Message Data register. These registers -provide the bus driver control over MSI. The Message Control register -indicates the MSI capability supported by the device. The Message -Address register specifies the target address and the Message Data -register specifies the characteristics of the message. To request -service, the device function writes the content of the Message Data -register to the target address. The device and its software driver -are prohibited from writing to these registers. - -The MSI-X capability structure is an optional extension to MSI. It -uses an independent and separate capability structure. There are -some key advantages to implementing the MSI-X capability structure -over the MSI capability structure as described below. - - - Support a larger maximum number of vectors per function. - - - Provide the ability for system software to configure - each vector with an independent message address and message - data, specified by a table that resides in Memory Space. - - - MSI and MSI-X both support per-vector masking. Per-vector - masking is an optional extension of MSI but a required - feature for MSI-X. Per-vector masking provides the kernel the - ability to mask/unmask a single MSI while running its - interrupt service routine. If per-vector masking is - not supported, then the device driver should provide the - hardware/software synchronization to ensure that the device - generates MSI when the driver wants it to do so. - -4. Why use MSI? - -As a benefit to the simplification of board design, MSI allows board -designers to remove out-of-band interrupt routing. MSI is another -step towards a legacy-free environment. - -Due to increasing pressure on chipset and processor packages to -reduce pin count, the need for interrupt pins is expected to -diminish over time. Devices, due to pin constraints, may implement -messages to increase performance. - -PCI Express endpoints uses INTx emulation (in-band messages) instead -of IRQ pin assertion. Using INTx emulation requires interrupt -sharing among devices connected to the same node (PCI bridge) while -MSI is unique (non-shared) and does not require BIOS configuration -support. As a result, the PCI Express technology requires MSI -support for better interrupt performance. - -Using MSI enables the device functions to support two or more -vectors, which can be configured to target different CPUs to -increase scalability. - -5. Configuring a driver to use MSI/MSI-X - -By default, the kernel will not enable MSI/MSI-X on all devices that -support this capability. The CONFIG_PCI_MSI kernel option -must be selected to enable MSI/MSI-X support. - -5.1 Including MSI/MSI-X support into the kernel - -To allow MSI/MSI-X capable device drivers to selectively enable -MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described -below), the VECTOR based scheme needs to be enabled by setting -CONFIG_PCI_MSI during kernel config. - -Since the target of the inbound message is the local APIC, providing -CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI. - -5.2 Configuring for MSI support - -Due to the non-contiguous fashion in vector assignment of the -existing Linux kernel, this version does not support multiple -messages regardless of a device function is capable of supporting -more than one vector. To enable MSI on a device function's MSI -capability structure requires a device driver to call the function -pci_enable_msi() explicitly. - -5.2.1 API pci_enable_msi +This guide describes the basics of Message Signaled Interrupts (MSIs), +the advantages of using MSI over traditional interrupt mechanisms, how +to change your driver to use MSI or MSI-X and some basic diagnostics to +try if a device doesn't support MSIs. + + +2. What are MSIs? + +A Message Signaled Interrupt is a write from the device to a special +address which causes an interrupt to be received by the CPU. + +The MSI capability was first specified in PCI 2.2 and was later enhanced +in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X +capability was also introduced with PCI 3.0. It supports more interrupts +per device than MSI and allows interrupts to be independently configured. + +Devices may support both MSI and MSI-X, but only one can be enabled at +a time. + + +3. Why use MSIs? + +There are three reasons why using MSIs can give an advantage over +traditional pin-based interrupts. + +Pin-based PCI interrupts are often shared amongst several devices. +To support this, the kernel must call each interrupt handler associated +with an interrupt, which leads to reduced performance for the system as +a whole. MSIs are never shared, so this problem cannot arise. + +When a device writes data to memory, then raises a pin-based interrupt, +it is possible that the interrupt may arrive before all the data has +arrived in memory (this becomes more likely with devices behind PCI-PCI +bridges). In order to ensure that all the data has arrived in memory, +the interrupt handler must read a register on the device which raised +the interrupt. PCI transaction ordering rules require that all the data +arrives in memory before the value can be returned from the register. +Using MSIs avoids this problem as the interrupt-generating write cannot +pass the data writes, so by the time the interrupt is raised, the driver +knows that all the data has arrived in memory. + +PCI devices can only support a single pin-based interrupt per function. +Often drivers have to query the device to find out what event has +occurred, slowing down interrupt handling for the common case. With +MSIs, a device can support more interrupts, allowing each interrupt +to be specialised to a different purpose. One possible design gives +infrequent conditions (such as errors) their own interrupt which allows +the driver to handle the normal interrupt handling path more efficiently. +Other possible designs include giving one interrupt to each packet queue +in a network card or each port in a storage controller. + + +4. How to use MSIs + +PCI devices are initialised to use pin-based interrupts. The device +driver has to set up the device to use MSI or MSI-X. Not all machines +support MSIs correctly, and for those machines, the APIs described below +will simply fail and the device will continue to use pin-based interrupts. + +4.1 Include kernel support for MSIs + +To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI +option enabled. This option is only available on some architectures, +and it may depend on some other options also being set. For example, +on x86, you must also enable X86_UP_APIC or SMP in order to see the +CONFIG_PCI_MSI option. + +4.2 Using MSI + +Most of the hard work is done for the driver in the PCI layer. It simply +has to request that the PCI layer set up the MSI capability for this +device. + +4.2.1 pci_enable_msi int pci_enable_msi(struct pci_dev *dev) -With this new API, a device driver that wants to have MSI -enabled on its device function must call this API to enable MSI. -A successful call will initialize the MSI capability structure -with ONE vector, regardless of whether a device function is -capable of supporting multiple messages. This vector replaces the -pre-assigned dev->irq with a new MSI vector. To avoid a conflict -of the new assigned vector with existing pre-assigned vector requires -a device driver to call this API before calling request_irq(). +A successful call will allocate ONE interrupt to the device, regardless +of how many MSIs the device supports. The device will be switched from +pin-based interrupt mode to MSI mode. The dev->irq number is changed +to a new number which represents the message signaled interrupt. +This function should be called before the driver calls request_irq() +since enabling MSIs disables the pin-based IRQ and the driver will not +receive interrupts on the old interrupt. -5.2.2 API pci_disable_msi +4.2.2 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This API should always be used to undo the effect of pci_enable_msi() -when a device driver is unloading. This API restores dev->irq with -the pre-assigned IOAPIC vector and switches a device's interrupt -mode to PCI pin-irq assertion/INTx emulation mode. - -Note that a device driver should always call free_irq() on the MSI vector -that it has done request_irq() on before calling this API. Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. - -5.2.3 MSI mode vs. legacy mode diagram - -The below diagram shows the events which switch the interrupt -mode on the MSI-capable device function between MSI mode and -PIN-IRQ assertion mode. - - ------------ pci_enable_msi ------------------------ - | | <=============== | | - | MSI MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msi ------------------------ - - -Figure 1. MSI Mode vs. Legacy Mode - -In Figure 1, a device operates by default in legacy mode. Legacy -in this context means PCI pin-irq assertion or PCI-Express INTx -emulation. A successful MSI request (using pci_enable_msi()) switches -a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem and a new -assigned MSI vector will replace dev->irq. - -To return back to its default mode, a device driver should always call -pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a -device driver should always call free_irq() on the MSI vector it has -done request_irq() on before calling pci_disable_msi(). Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. Otherwise, the PCI subsystem restores a device's -dev->irq with a pre-assigned IOAPIC vector and marks the released -MSI vector as unused. - -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve this MSI vector for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, this MSI may be re-assigned. - -For the case where the PCI subsystem re-assigns this MSI vector to -another driver, a request to switch back to MSI mode may result -in being assigned a different MSI vector or a failure if no more -vectors are available. - -5.3 Configuring for MSI-X support - -Due to the ability of the system software to configure each vector of -the MSI-X capability structure with an independent message address -and message data, the non-contiguous fashion in vector assignment of -the existing Linux kernel has no impact on supporting multiple -messages on an MSI-X capable device functions. To enable MSI-X on -a device function's MSI-X capability structure requires its device -driver to call the function pci_enable_msix() explicitly. - -The function pci_enable_msix(), once invoked, enables either -all or nothing, depending on the current availability of PCI vector -resources. If the PCI vector resources are available for the number -of vectors requested by a device driver, this function will configure -the MSI-X table of the MSI-X capability structure of a device with -requested messages. To emphasize this reason, for example, a device -may be capable for supporting the maximum of 32 vectors while its -software driver usually may request 4 vectors. It is recommended -that the device driver should call this function once during the -initialization phase of the device driver. - -Unlike the function pci_enable_msi(), the function pci_enable_msix() -does not replace the pre-assigned IOAPIC dev->irq with a new MSI -vector because the PCI subsystem writes the 1:1 vector-to-entry mapping -into the field vector of each element contained in a second argument. -Note that the pre-assigned IOAPIC dev->irq is valid only if the device -operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at -using dev->irq by the device driver to request for interrupt service -may result in unpredictable behavior. - -For each MSI-X vector granted, a device driver is responsible for calling -other functions like request_irq(), enable_irq(), etc. to enable -this vector with its corresponding interrupt service handler. It is -a device driver's choice to assign all vectors with the same -interrupt service handler or each vector with a unique interrupt -service handler. - -5.3.1 Handling MMIO address space of MSI-X Table - -The PCI 3.0 specification has implementation notes that MMIO address -space for a device's MSI-X structure should be isolated so that the -software system can set different pages for controlling accesses to the -MSI-X structure. The implementation of MSI support requires the PCI -subsystem, not a device driver, to maintain full control of the MSI-X -table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X -table/MSI-X PBA. A device driver should not access the MMIO address -space of the MSI-X table/MSI-X PBA. - -5.3.2 API pci_enable_msix - -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) +This function should be used to undo the effect of pci_enable_msi(). +Calling it restores dev->irq to the pin-based interrupt number and frees +the previously allocated message signaled interrupt(s). The interrupt +may subsequently be assigned to another device, so drivers should not +cache the value of dev->irq. -This API enables a device driver to request the PCI subsystem -to enable MSI-X messages on its hardware device. Depending on -the availability of PCI vectors resources, the PCI subsystem enables -either all or none of the requested vectors. +A device driver must always call free_irq() on the interrupt(s) +for which it has called request_irq() before calling this function. +Failure to do so will result in a BUG_ON(), the device will be left with +MSI enabled and will leak its vector. -Argument 'dev' points to the device (pci_dev) structure. +4.3 Using MSI-X -Argument 'entries' is a pointer to an array of msix_entry structs. -The number of entries is indicated in argument 'nvec'. -struct msix_entry is defined in /driver/pci/msi.h: +The MSI-X capability is much more flexible than the MSI capability. +It supports up to 2048 interrupts, each of which can be controlled +independently. To support this flexibility, drivers must use an array of +`struct msix_entry': struct msix_entry { u16 vector; /* kernel uses to write alloc vector */ u16 entry; /* driver uses to specify entry */ }; -A device driver is responsible for initializing the field 'entry' of -each element with a unique entry supported by MSI-X table. Otherwise, --EINVAL will be returned as a result. A successful return of zero -indicates the PCI subsystem completed initializing each of the requested -entries of the MSI-X table with message address and message data. -Last but not least, the PCI subsystem will write the 1:1 -vector-to-entry mapping into the field 'vector' of each element. A -device driver is responsible for keeping track of allocated MSI-X -vectors in its internal data structure. - -A return of zero indicates that the number of MSI-X vectors was -successfully allocated. A return of greater than zero indicates -MSI-X vector shortage. Or a return of less than zero indicates -a failure. This failure may be a result of duplicate entries -specified in second argument, or a result of no available vector, -or a result of failing to initialize MSI-X table entries. - -5.3.3 API pci_disable_msix +This allows for the device to use these interrupts in a sparse fashion; +for example it could use interrupts 3 and 1027 and allocate only a +two-element array. The driver is expected to fill in the 'entry' value +in each element of the array to indicate which entries it wants the kernel +to assign interrupts for. It is invalid to fill in two entries with the +same number. + +4.3.1 pci_enable_msix + +int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) + +Calling this function asks the PCI subsystem to allocate 'nvec' MSIs. +The 'entries' argument is a pointer to an array of msix_entry structs +which should be at least 'nvec' entries in size. On success, the +function will return 0 and the device will have been switched into +MSI-X interrupt mode. The 'vector' elements in each entry will have +been filled in with the interrupt number. The driver should then call +request_irq() for each 'vector' that it decides to use. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to allocate any more MSI-X interrupts for +this device. If it returns a positive number, it indicates the maximum +number of interrupt vectors that could have been allocated. + +This function, in contrast with pci_enable_msi(), does not adjust +dev->irq. The device will not generate interrupts for this interrupt +number once MSI-X is enabled. The device driver is responsible for +keeping track of the interrupts assigned to the MSI-X vectors so it can +free them again later. + +Device drivers should normally call this function once per device +during the initialization phase. + +4.3.2 pci_disable_msix void pci_disable_msix(struct pci_dev *dev) -This API should always be used to undo the effect of pci_enable_msix() -when a device driver is unloading. Note that a device driver should -always call free_irq() on all MSI-X vectors it has done request_irq() -on before calling this API. Failure to do so results in a BUG_ON() and -a device will be left with MSI-X enabled and leaks its vectors. - -5.3.4 MSI-X mode vs. legacy mode diagram - -The below diagram shows the events which switch the interrupt -mode on the MSI-X capable device function between MSI-X mode and -PIN-IRQ assertion mode (legacy). - - ------------ pci_enable_msix(,,n) ------------------------ - | | <=============== | | - | MSI-X MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msix ------------------------ - -Figure 2. MSI-X Mode vs. Legacy Mode - -In Figure 2, a device operates by default in legacy mode. A -successful MSI-X request (using pci_enable_msix()) switches a -device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem; however, -unlike MSI mode, the PCI subsystem will not replace dev->irq with -assigned MSI-X vector because the PCI subsystem already writes the 1:1 -vector-to-entry mapping into the field 'vector' of each element -specified in second argument. - -To return back to its default mode, a device driver should always call -pci_disable_msix() to undo the effect of pci_enable_msix(). Note that -a device driver should always call free_irq() on all MSI-X vectors it -has done request_irq() on before calling pci_disable_msix(). Failure -to do so results in a BUG_ON() and a device will be left with MSI-X -enabled and leaks its vectors. Otherwise, the PCI subsystem switches a -device function's interrupt mode from MSI-X mode to legacy mode and -marks all allocated MSI-X vectors as unused. - -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve these MSI-X vectors for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, these MSI-X vectors may be -re-assigned. - -For the case where the PCI subsystem re-assigned these MSI-X vectors -to other drivers, a request to switch back to MSI-X mode may result -being assigned with another set of MSI-X vectors or a failure if no -more vectors are available. - -5.4 Handling function implementing both MSI and MSI-X capabilities - -For the case where a function implements both MSI and MSI-X -capabilities, the PCI subsystem enables a device to run either in MSI -mode or MSI-X mode but not both. A device driver determines whether it -wants MSI or MSI-X enabled on its hardware device. Once a device -driver requests for MSI, for example, it is prohibited from requesting -MSI-X; in other words, a device driver is not permitted to ping-pong -between MSI mod MSI-X mode during a run-time. - -5.5 Hardware requirements for MSI/MSI-X support - -MSI/MSI-X support requires support from both system hardware and -individual hardware device functions. - -5.5.1 Required x86 hardware support - -Since the target of MSI address is the local APIC CPU, enabling -MSI/MSI-X support in the Linux kernel is dependent on whether existing -system hardware supports local APIC. Users should verify that their -system supports local APIC operation by testing that it runs when -CONFIG_X86_LOCAL_APIC=y. - -In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; -however, in UP environment, users must manually set -CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting -CONFIG_PCI_MSI enables the VECTOR based scheme and the option for -MSI-capable device drivers to selectively enable MSI/MSI-X. - -Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X -vector is allocated new during runtime and MSI/MSI-X support does not -depend on BIOS support. This key independency enables MSI/MSI-X -support on future IOxAPIC free platforms. - -5.5.2 Device hardware support - -The hardware device function supports MSI by indicating the -MSI/MSI-X capability structure on its PCI capability list. By -default, this capability structure will not be initialized by -the kernel to enable MSI during the system boot. In other words, -the device function is running on its default pin assertion mode. -Note that in many cases the hardware supporting MSI have bugs, -which may result in system hangs. The software driver of specific -MSI-capable hardware is responsible for deciding whether to call -pci_enable_msi or not. A return of zero indicates the kernel -successfully initialized the MSI/MSI-X capability structure of the -device function. The device function is now running on MSI/MSI-X mode. - -5.6 How to tell whether MSI/MSI-X is enabled on device function - -At the driver level, a return of zero from the function call of -pci_enable_msi()/pci_enable_msix() indicates to a device driver that -its device function is initialized successfully and ready to run in -MSI/MSI-X mode. - -At the user level, users can use the command 'cat /proc/interrupts' -to display the vectors allocated for devices and their interrupt -MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is -enabled on a SCSI Adaptec 39320D Ultra320 controller. - - CPU0 CPU1 - 0: 324639 0 IO-APIC-edge timer - 1: 1186 0 IO-APIC-edge i8042 - 2: 0 0 XT-PIC cascade - 12: 2797 0 IO-APIC-edge i8042 - 14: 6543 0 IO-APIC-edge ide0 - 15: 1 0 IO-APIC-edge ide1 -169: 0 0 IO-APIC-level uhci-hcd -185: 0 0 IO-APIC-level uhci-hcd -193: 138 10 PCI-MSI aic79xx -201: 30 0 PCI-MSI aic79xx -225: 30 0 IO-APIC-level aic7xxx -233: 30 0 IO-APIC-level aic7xxx -NMI: 0 0 -LOC: 324553 325068 -ERR: 0 -MIS: 0 - -6. MSI quirks - -Several PCI chipsets or devices are known to not support MSI. -The PCI stack provides 3 possible levels of MSI disabling: -* on a single device -* on all devices behind a specific bridge -* globally - -6.1. Disabling MSI on a single device - -Under some circumstances it might be required to disable MSI on a -single device. This may be achieved by either not calling pci_enable_msi() -or all, or setting the pci_dev->no_msi flag before (most of the time -in a quirk). - -6.2. Disabling MSI below a bridge - -The vast majority of MSI quirks are required by PCI bridges not -being able to route MSI between busses. In this case, MSI have to be -disabled on all devices behind this bridge. It is achieves by setting -the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge -subordinate bus. There is no need to set the same flag on bridges that -are below the broken bridge. When pci_enable_msi() is called to enable -MSI on a device, pci_msi_supported() takes care of checking the NO_MSI -flag in all parent busses of the device. - -Some bridges actually support dynamic MSI support enabling/disabling -by changing some bits in their PCI configuration space (especially -the Hypertransport chipsets such as the nVidia nForce and Serverworks -HT2000). It may then be required to update the NO_MSI flag on the -corresponding devices in the sysfs hierarchy. To enable MSI support -on device "0000:00:0e", do: - - echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus - -To disable MSI support, echo 0 instead of 1. Note that it should be -used with caution since changing this value might break interrupts. - -6.3. Disabling MSI globally - -Some extreme cases may require to disable MSI globally on the system. -For now, the only known case is a Serverworks PCI-X chipsets (MSI are -not supported on several busses that are not all connected to the -chipset in the Linux PCI hierarchy). In the vast majority of other -cases, disabling only behind a specific bridge is enough. - -For debugging purpose, the user may also pass pci=nomsi on the kernel -command-line to explicitly disable MSI globally. But, once the appro- -priate quirks are added to the kernel, this option should not be -required anymore. - -6.4. Finding why MSI cannot be enabled on a device - -Assuming that MSI are not enabled on a device, you should look at -dmesg to find messages that quirks may output when disabling MSI -on some devices, some bridges or even globally. -Then, lspci -t gives the list of bridges above a device. Reading -/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI -are enabled (1) or disabled (0). In 0 is found in a single bridge -msi_bus file above the device, MSI cannot be enabled. - -7. FAQ - -Q1. Are there any limitations on using the MSI? - -A1. If the PCI device supports MSI and conforms to the -specification and the platform supports the APIC local bus, -then using MSI should work. - -Q2. Will it work on all the Pentium processors (P3, P4, Xeon, -AMD processors)? In P3 IPI's are transmitted on the APIC local -bus and in P4 and Xeon they are transmitted on the system -bus. Are there any implications with this? - -A2. MSI support enables a PCI device sending an inbound -memory write (0xfeexxxxx as target address) on its PCI bus -directly to the FSB. Since the message address has a -redirection hint bit cleared, it should work. - -Q3. The target address 0xfeexxxxx will be translated by the -Host Bridge into an interrupt message. Are there any -limitations on the chipsets such as Intel 8xx, Intel e7xxx, -or VIA? - -A3. If these chipsets support an inbound memory write with -target address set as 0xfeexxxxx, as conformed to PCI -specification 2.3 or latest, then it should work. - -Q4. From the driver point of view, if the MSI is lost because -of errors occurring during inbound memory write, then it may -wait forever. Is there a mechanism for it to recover? - -A4. Since the target of the transaction is an inbound memory -write, all transaction termination conditions (Retry, -Master-Abort, Target-Abort, or normal completion) are -supported. A device sending an MSI must abide by all the PCI -rules and conditions regarding that inbound memory write. So, -if a retry is signaled it must retry, etc... We believe that -the recommendation for Abort is also a retry (refer to PCI -specification 2.3 or latest). +This API should be used to undo the effect of pci_enable_msix(). It frees +the previously allocated message signaled interrupts. The interrupts may +subsequently be assigned to another device, so drivers should not cache +the value of the 'vector' elements over a call to pci_disable_msix(). + +A device driver must always call free_irq() on the interrupt(s) +for which it has called request_irq() before calling this function. +Failure to do so will result in a BUG_ON(), the device will be left with +MSI enabled and will leak its vector. + +4.3.3 The MSI-X Table + +The MSI-X capability specifies a BAR and offset within that BAR for the +MSI-X Table. This address is mapped by the PCI subsystem, and should not +be accessed directly by the device driver. If the driver wishes to +mask or unmask an interrupt, it should call disable_irq() / enable_irq(). + +4.4 Handling devices implementing both MSI and MSI-X capabilities + +If a device implements both MSI and MSI-X capabilities, it can +run in either MSI mode or MSI-X mode but not both simultaneously. +This is a requirement of the PCI spec, and it is enforced by the +PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or +pci_enable_msix() when MSI is already enabled will result in an error. +If a device driver wishes to switch between MSI and MSI-X at runtime, +it must first quiesce the device, then switch it back to pin-interrupt +mode, before calling pci_enable_msi() or pci_enable_msix() and resuming +operation. This is not expected to be a common operation but may be +useful for debugging or testing during development. + +4.5 Considerations when using MSIs + +4.5.1 Choosing between MSI-X and MSI + +If your device supports both MSI-X and MSI capabilities, you should use +the MSI-X facilities in preference to the MSI facilities. As mentioned +above, MSI-X supports any number of interrupts between 1 and 2048. +In constrast, MSI is restricted to a maximum of 32 interrupts (and +must be a power of two). In addition, the MSI interrupt vectors must +be allocated consecutively, so the system may not be able to allocate +as many vectors for MSI as it could for MSI-X. On some platforms, MSI +interrupts must all be targetted at the same set of CPUs whereas MSI-X +interrupts can all be targetted at different CPUs. + +4.5.2 Spinlocks + +Most device drivers have a per-device spinlock which is taken in the +interrupt handler. With pin-based interrupts or a single MSI, it is not +necessary to disable interrupts (Linux guarantees the same interrupt will +not be re-entered). If a device uses multiple interrupts, the driver +must disable interrupts while the lock is held. If the device sends +a different interrupt, the driver will deadlock trying to recursively +acquire the spinlock. + +There are two solutions. The first is to take the lock with +spin_lock_irqsave() or spin_lock_irq() (see +Documentation/DocBook/kernel-locking). The second is to specify +IRQF_DISABLED to request_irq() so that the kernel runs the entire +interrupt routine with interrupts disabled. + +If your MSI interrupt routine does not hold the lock for the whole time +it is running, the first solution may be best. The second solution is +normally preferred as it avoids making two transitions from interrupt +disabled to enabled and back again. + +4.6 How to tell whether MSI/MSI-X is enabled on a device + +Using 'lspci -v' (as root) may show some devices with "MSI", "Message +Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities +has an 'Enable' flag which will be followed with either "+" (enabled) +or "-" (disabled). + + +5. MSI quirks + +Several PCI chipsets or devices are known not to support MSIs. +The PCI stack provides three ways to disable MSIs: + +1. globally +2. on all devices behind a specific bridge +3. on a single device + +5.1. Disabling MSIs globally + +Some host chipsets simply don't support MSIs properly. If we're +lucky, the manufacturer knows this and has indicated it in the ACPI +FADT table. In this case, Linux will automatically disable MSIs. +Some boards don't include this information in the table and so we have +to detect them ourselves. The complete list of these is found near the +quirk_disable_all_msi() function in drivers/pci/quirks.c. + +If you have a board which has problems with MSIs, you can pass pci=nomsi +on the kernel command line to disable MSIs on all devices. It would be +in your best interests to report the problem to linux-pci@vger.kernel.org +including a full 'lspci -v' so we can add the quirks to the kernel. + +5.2. Disabling MSIs below a bridge + +Some PCI bridges are not able to route MSIs between busses properly. +In this case, MSIs must be disabled on all devices behind the bridge. + +Some bridges allow you to enable MSIs by changing some bits in their +PCI configuration space (especially the Hypertransport chipsets such +as the nVidia nForce and Serverworks HT2000). As with host chipsets, +Linux mostly knows about them and automatically enables MSIs if it can. +If you have a bridge which Linux doesn't yet know about, you can enable +MSIs in configuration space using whatever method you know works, then +enable MSIs on that bridge by doing: + + echo 1 > /sys/bus/pci/devices/$bridge/msi_bus + +where $bridge is the PCI address of the bridge you've enabled (eg +0000:00:0e.0). + +To disable MSIs, echo 0 instead of 1. Changing this value should be +done with caution as it can break interrupt handling for all devices +below this bridge. + +Again, please notify linux-pci@vger.kernel.org of any bridges that need +special handling. + +5.3. Disabling MSIs on a single device + +Some devices are known to have faulty MSI implementations. Usually this +is handled in the individual device driver but occasionally it's necessary +to handle this with a quirk. Some drivers have an option to disable use +of MSI. While this is a convenient workaround for the driver author, +it is not good practise, and should not be emulated. + +5.4. Finding why MSIs are disabled on a device + +From the above three sections, you can see that there are many reasons +why MSIs may not be enabled for a given device. Your first step should +be to examine your dmesg carefully to determine whether MSIs are enabled +for your machine. You should also check your .config to be sure you +have enabled CONFIG_PCI_MSI. + +Then, 'lspci -t' gives the list of bridges above a device. Reading +/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1) +or disabled (0). If 0 is found in any of the msi_bus files belonging +to bridges between the PCI root and the device, MSIs are disabled. + +It is also worth checking the device driver to see whether it supports MSIs. +For example, it may contain calls to pci_enable_msi(), pci_enable_msix() or +pci_enable_msi_block(). -- cgit v1.2.3 From 1c8d7b0a562da06d3ebe83f01b1ed553205d1ae4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 17 Mar 2009 08:54:10 -0400 Subject: PCI MSI: Add support for multiple MSI Add the new API pci_enable_msi_block() to allow drivers to request multiple MSI and reimplement pci_enable_msi in terms of pci_enable_msi_block. Ensure that the architecture back ends don't have to know about multiple MSI. Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 45 +++++++++++++++++--- arch/powerpc/kernel/msi.c | 4 ++ arch/x86/kernel/io_apic.c | 4 ++ drivers/pci/msi.c | 91 +++++++++++++++++++++++++++++------------ drivers/pci/msi.h | 6 --- include/linux/msi.h | 1 + include/linux/pci.h | 6 ++- 7 files changed, 116 insertions(+), 41 deletions(-) (limited to 'Documentation') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1c02431f1d1a..9494f6dc38eb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -94,15 +94,48 @@ This function should be called before the driver calls request_irq() since enabling MSIs disables the pin-based IRQ and the driver will not receive interrupts on the old interrupt. -4.2.2 pci_disable_msi +4.2.2 pci_enable_msi_block + +int pci_enable_msi_block(struct pci_dev *dev, int count) + +This variation on the above call allows a device driver to request multiple +MSIs. The MSI specification only allows interrupts to be allocated in +powers of two, up to a maximum of 2^5 (32). + +If this function returns 0, it has succeeded in allocating at least as many +interrupts as the driver requested (it may have allocated more in order +to satisfy the power-of-two requirement). In this case, the function +enables MSI on this device and updates dev->irq to be the lowest of +the new interrupts assigned to it. The other interrupts assigned to +the device are in the range dev->irq to dev->irq + count - 1. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to request any more MSI interrupts for +this device. If this function returns a positive number, it will be +less than 'count' and indicate the number of interrupts that could have +been allocated. In neither case will the irq value have been +updated, nor will the device have been switched into MSI mode. + +The device driver must decide what action to take if +pci_enable_msi_block() returns a value less than the number asked for. +Some devices can make use of fewer interrupts than the maximum they +request; in this case the driver should call pci_enable_msi_block() +again. Note that it is not guaranteed to succeed, even when the +'count' has been reduced to the value returned from a previous call to +pci_enable_msi_block(). This is because there are multiple constraints +on the number of vectors that can be allocated; pci_enable_msi_block() +will return as soon as it finds any constraint that doesn't allow the +call to succeed. + +4.2.3 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) -This function should be used to undo the effect of pci_enable_msi(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated message signaled interrupt(s). The interrupt -may subsequently be assigned to another device, so drivers should not -cache the value of dev->irq. +This function should be used to undo the effect of pci_enable_msi() or +pci_enable_msi_block(). Calling it restores dev->irq to the pin-based +interrupt number and frees the previously allocated message signaled +interrupt(s). The interrupt may subsequently be assigned to another +device, so drivers should not cache the value of dev->irq. A device driver must always call free_irq() on the interrupt(s) for which it has called request_irq() before calling this function. diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c index 3bb7d3dd28be..0c16e2a854e5 100644 --- a/arch/powerpc/kernel/msi.c +++ b/arch/powerpc/kernel/msi.c @@ -19,6 +19,10 @@ int arch_msi_check_device(struct pci_dev* dev, int nvec, int type) return -ENOSYS; } + /* PowerPC doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + if (ppc_md.msi_check_device) { pr_debug("msi: Using platform check routine.\n"); return ppc_md.msi_check_device(dev, nvec, type); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bc7ac4da90d7..a09549a6321b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3510,6 +3510,10 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index adcc78242571..6f2e6295e773 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -40,6 +40,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *entry; int ret; + /* + * If an architecture wants to support multiple MSI, it needs to + * override arch_setup_msi_irqs() + */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); if (ret < 0) @@ -58,8 +65,12 @@ void arch_teardown_msi_irqs(struct pci_dev *dev) struct msi_desc *entry; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); + int i, nvec; + if (entry->irq == 0) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + arch_teardown_msi_irq(entry->irq + i); } } #endif @@ -163,7 +174,8 @@ static void msi_set_mask_bit(unsigned irq, u32 flag) msix_mask_irq(desc, flag); readl(desc->mask_base); /* Flush write to device */ } else { - msi_mask_irq(desc, 1, flag); + unsigned offset = irq - desc->dev->irq; + msi_mask_irq(desc, 1 << offset, flag << offset); } } @@ -229,6 +241,12 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) } else { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; + u16 msgctl; + + pci_read_config_word(dev, msi_control_reg(pos), &msgctl); + msgctl &= ~PCI_MSI_FLAGS_QSIZE; + msgctl |= entry->msi_attrib.multiple << 4; + pci_write_config_word(dev, msi_control_reg(pos), msgctl); pci_write_config_dword(dev, msi_lower_address_reg(pos), msg->address_lo); @@ -291,7 +309,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); msi_mask_irq(entry, msi_capable_mask(control), entry->masked); control &= ~PCI_MSI_FLAGS_QSIZE; - control |= PCI_MSI_FLAGS_ENABLE; + control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); } @@ -332,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state); /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate * - * Setup the MSI capability structure of device function with a single - * MSI irq, regardless of device function is capable of handling - * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI irq or non-zero for otherwise. - **/ -static int msi_capability_init(struct pci_dev *dev) + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI irq. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec) { struct msi_desc *entry; int pos, ret; @@ -371,7 +391,7 @@ static int msi_capability_init(struct pci_dev *dev) list_add_tail(&entry->list, &dev->msi_list); /* Configure MSI capability structure */ - ret = arch_setup_msi_irqs(dev, 1, PCI_CAP_ID_MSI); + ret = arch_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { msi_free_irqs(dev); return ret; @@ -524,35 +544,48 @@ static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) } /** - * pci_enable_msi - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function + * pci_enable_msi_block - configure device's MSI capability structure + * @dev: device to configure + * @nvec: number of interrupts to configure * - * Setup the MSI capability structure of device function with - * a single MSI irq upon its software driver call to request for - * MSI mode enabled on its hardware device function. A return of zero - * indicates the successful setup of an entry zero with the new MSI - * irq or non-zero for otherwise. - **/ -int pci_enable_msi(struct pci_dev* dev) + * Allocate IRQs for a device with the MSI capability. + * This function returns a negative errno if an error occurs. If it + * is unable to allocate the number of interrupts requested, it returns + * the number of interrupts it might be able to allocate. If it successfully + * allocates at least the number of interrupts requested, it returns 0 and + * updates the @dev's irq member to the lowest new interrupt number; the + * other interrupt numbers allocated to this device are consecutive. + */ +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { - int status; + int status, pos, maxvec; + u16 msgctl; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1); + if (nvec > maxvec) + return maxvec; - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI); if (status) return status; WARN_ON(!!dev->msi_enabled); - /* Check whether driver already requested for MSI-X irqs */ + /* Check whether driver already requested MSI-X irqs */ if (dev->msix_enabled) { dev_info(&dev->dev, "can't enable MSI " "(MSI-X already enabled)\n"); return -EINVAL; } - status = msi_capability_init(dev); + + status = msi_capability_init(dev, nvec); return status; } -EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_enable_msi_block); void pci_msi_shutdown(struct pci_dev *dev) { @@ -599,8 +632,12 @@ static int msi_free_irqs(struct pci_dev* dev) struct msi_desc *entry, *tmp; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq) - BUG_ON(irq_has_action(entry->irq)); + int i, nvec; + if (!entry->irq) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + BUG_ON(irq_has_action(entry->irq + i)); } arch_teardown_msi_irqs(dev); diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 3898f5237144..71f4df2ef654 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -20,14 +20,8 @@ #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE -#define multi_msi_capable(control) \ - (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) -#define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) -#define msi_enable(control, num) multi_msi_enable(control, num); \ - control |= PCI_MSI_FLAGS_ENABLE #define msix_table_offset_reg(base) (base + 0x04) #define msix_pba_offset_reg(base) (base + 0x08) diff --git a/include/linux/msi.h b/include/linux/msi.h index 37c1bbe546e5..6991ab5b24d1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -21,6 +21,7 @@ extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { struct { __u8 is_msix : 1; + __u8 multiple: 3; /* log2 number of messages */ __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 7baf2a5db12a..1f6c5ddaae36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -789,7 +789,7 @@ struct msix_entry { #ifndef CONFIG_PCI_MSI -static inline int pci_enable_msi(struct pci_dev *dev) +static inline int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { return -1; } @@ -824,7 +824,7 @@ static inline int pci_msi_enabled(void) return 0; } #else -extern int pci_enable_msi(struct pci_dev *dev); +extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); extern void pci_disable_msi(struct pci_dev *dev); extern int pci_msix_table_size(struct pci_dev *dev); @@ -846,6 +846,8 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1) + #ifdef CONFIG_HT_IRQ /* The functions a driver should call */ int ht_create_irq(struct pci_dev *dev, int idx); -- cgit v1.2.3 From 32a9a682bef2f6fce7026bd94d1ce20028b0e52d Mon Sep 17 00:00:00 2001 From: Yuji Shimada Date: Mon, 16 Mar 2009 17:13:39 +0900 Subject: PCI: allow assignment of memory resources with a specified alignment This patch allows memory resources to be assigned with a specified alignment at boot-time or run-time. The patch is useful when we use PCI pass-through, because page-aligned memory resources are required to securely share PCI resources with guest drivers. If you want to assign the resource at boot time, please set "pci=resource_alignment=" boot parameter. This is format of "pci=resource_alignment=" boot parameter: [@][:]:.[; ...] Specifies alignment and device to reassign aligned memory resources. If is not specified, PAGE_SIZE is used as alignment. PCI-PCI bridge can be specified, if resource windows need to be expanded. This is example: pci=resource_alignment=20@07:00.0;18@0f:00.0;00:1d.7 If you want to assign the resource at run-time, please set "/sys/bus/pci/resource_alignment" file, and hot-remove the device and hot-add the device. For this purpose, fakephp or PCI hotplug interfaces can be used. The format of "/sys/bus/pci/resource_alignment" file is the same with boot parameter. You can use "," instead of ";". For example: # cd /sys/bus/pci # echo -n 20@12:00.0 > resource_alignment # echo 1 > devices/0000:12:00.0/remove # echo 1 > rescan Reviewed-by: Alex Chiang Reviewed-by: Yu Zhao Signed-off-by: Yuji Shimada Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 9 +++ drivers/pci/pci.c | 120 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 6 ++ drivers/pci/quirks.c | 60 ++++++++++++++++++ drivers/pci/setup-res.c | 15 +++++ 5 files changed, 210 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c7c441e7930e..1754fedc531c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1760,6 +1760,15 @@ and is between 256 and 4096 characters. It is defined in the file cbmemsize=nn[KMG] The fixed amount of bus space which is reserved for the CardBus bridge's memory window. The default value is 64 megabytes. + resource_alignment= + Format: + [@][:]:.[; ...] + Specifies alignment and device to reassign + aligned memory resources. + If is not specified, + PAGE_SIZE is used as alignment. + PCI-PCI bridge can be specified, if resource + windows need to be expanded. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8310dc2f943b..a35a8b2ba631 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -20,6 +20,8 @@ #include #include #include /* isa_dma_bridge_buggy */ +#include +#include #include "pci.h" unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; @@ -2370,6 +2372,121 @@ int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) return 0; } +#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE +static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; +spinlock_t resource_alignment_lock = SPIN_LOCK_UNLOCKED; + +/** + * pci_specified_resource_alignment - get resource alignment specified by user. + * @dev: the PCI device to get + * + * RETURNS: Resource alignment if it is specified. + * Zero if it is not specified. + */ +resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) +{ + int seg, bus, slot, func, align_order, count; + resource_size_t align = 0; + char *p; + + spin_lock(&resource_alignment_lock); + p = resource_alignment_param; + while (*p) { + count = 0; + if (sscanf(p, "%d%n", &align_order, &count) == 1 && + p[count] == '@') { + p += count + 1; + } else { + align_order = -1; + } + if (sscanf(p, "%x:%x:%x.%x%n", + &seg, &bus, &slot, &func, &count) != 4) { + seg = 0; + if (sscanf(p, "%x:%x.%x%n", + &bus, &slot, &func, &count) != 3) { + /* Invalid format */ + printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n", + p); + break; + } + } + p += count; + if (seg == pci_domain_nr(dev->bus) && + bus == dev->bus->number && + slot == PCI_SLOT(dev->devfn) && + func == PCI_FUNC(dev->devfn)) { + if (align_order == -1) { + align = PAGE_SIZE; + } else { + align = 1 << align_order; + } + /* Found */ + break; + } + if (*p != ';' && *p != ',') { + /* End of param or invalid format */ + break; + } + p++; + } + spin_unlock(&resource_alignment_lock); + return align; +} + +/** + * pci_is_reassigndev - check if specified PCI is target device to reassign + * @dev: the PCI device to check + * + * RETURNS: non-zero for PCI device is a target device to reassign, + * or zero is not. + */ +int pci_is_reassigndev(struct pci_dev *dev) +{ + return (pci_specified_resource_alignment(dev) != 0); +} + +ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) +{ + if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1) + count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1; + spin_lock(&resource_alignment_lock); + strncpy(resource_alignment_param, buf, count); + resource_alignment_param[count] = '\0'; + spin_unlock(&resource_alignment_lock); + return count; +} + +ssize_t pci_get_resource_alignment_param(char *buf, size_t size) +{ + size_t count; + spin_lock(&resource_alignment_lock); + count = snprintf(buf, size, "%s", resource_alignment_param); + spin_unlock(&resource_alignment_lock); + return count; +} + +static ssize_t pci_resource_alignment_show(struct bus_type *bus, char *buf) +{ + return pci_get_resource_alignment_param(buf, PAGE_SIZE); +} + +static ssize_t pci_resource_alignment_store(struct bus_type *bus, + const char *buf, size_t count) +{ + return pci_set_resource_alignment_param(buf, count); +} + +BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show, + pci_resource_alignment_store); + +static int __init pci_resource_alignment_sysfs_init(void) +{ + return bus_create_file(&pci_bus_type, + &bus_attr_resource_alignment); +} + +late_initcall(pci_resource_alignment_sysfs_init); + static void __devinit pci_no_domains(void) { #ifdef CONFIG_PCI_DOMAINS @@ -2418,6 +2535,9 @@ static int __init pci_setup(char *str) pci_cardbus_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "resource_alignment=", 19)) { + pci_set_resource_alignment_param(str + 19, + strlen(str + 19)); } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 07c0aa5275e6..2cd1cba7236f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -195,4 +195,10 @@ static inline int pci_ari_enabled(struct pci_bus *bus) return bus->self && bus->self->ari_enabled; } +#ifdef CONFIG_PCI_QUIRKS +extern int pci_is_reassigndev(struct pci_dev *dev); +resource_size_t pci_specified_resource_alignment(struct pci_dev *dev); +extern void pci_disable_bridge_window(struct pci_dev *dev); +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 5aa2afb23ef9..50233818a763 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "pci.h" int isa_dma_bridge_buggy; @@ -34,6 +35,65 @@ int pcie_mch_quirk; EXPORT_SYMBOL(pcie_mch_quirk); #ifdef CONFIG_PCI_QUIRKS +/* + * This quirk function disables the device and releases resources + * which is specified by kernel's boot parameter 'pci=resource_alignment='. + * It also rounds up size to specified alignment. + * Later on, the kernel will assign page-aligned memory resource back + * to that device. + */ +static void __devinit quirk_resource_alignment(struct pci_dev *dev) +{ + int i; + struct resource *r; + resource_size_t align, size; + + if (!pci_is_reassigndev(dev)) + return; + + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL && + (dev->class >> 8) == PCI_CLASS_BRIDGE_HOST) { + dev_warn(&dev->dev, + "Can't reassign resources to host bridge.\n"); + return; + } + + dev_info(&dev->dev, "Disabling device and release resources.\n"); + pci_disable_device(dev); + + align = pci_specified_resource_alignment(dev); + for (i=0; i < PCI_BRIDGE_RESOURCES; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_MEM)) + continue; + size = resource_size(r); + if (size < align) { + size = align; + dev_info(&dev->dev, + "Rounding up size of resource #%d to %#llx.\n", + i, (unsigned long long)size); + } + r->end = size - 1; + r->start = 0; + } + /* Need to disable bridge's resource window, + * to enable the kernel to reassign new resource + * window later on. + */ + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE && + (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { + r = &dev->resource[i]; + if (!(r->flags & IORESOURCE_MEM)) + continue; + r->end = resource_size(r) - 1; + r->start = 0; + } + pci_disable_bridge_window(dev); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_resource_alignment); + /* The Mellanox Tavor device gives false positive parity errors * Mark this device with a broken_parity_status, to allow * PCI scanning code to "skip" this now blacklisted device. diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 32e8d88a4619..3039fcb86afc 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -120,6 +120,21 @@ int pci_claim_resource(struct pci_dev *dev, int resource) return err; } +#ifdef CONFIG_PCI_QUIRKS +void pci_disable_bridge_window(struct pci_dev *dev) +{ + dev_dbg(&dev->dev, "Disabling bridge window.\n"); + + /* MMIO Base/Limit */ + pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0); + + /* Prefetchable MMIO Base/Limit */ + pci_write_config_dword(dev, PCI_PREF_LIMIT_UPPER32, 0); + pci_write_config_dword(dev, PCI_PREF_MEMORY_BASE, 0x0000fff0); + pci_write_config_dword(dev, PCI_PREF_BASE_UPPER32, 0xffffffff); +} +#endif /* CONFIG_PCI_QUIRKS */ + int pci_assign_resource(struct pci_dev *dev, int resno) { struct pci_bus *bus = dev->bus; -- cgit v1.2.3 From 01db4957179c92fda7d9a06e49b7ae56fb7c925b Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:17 +0800 Subject: PCI: document SR-IOV sysfs entries Reviewed-by: Randy Dunlap Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 3d29793a0ea2..d175a2a5fac2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -68,3 +68,30 @@ Description: that some devices may have malformatted data. If the underlying VPD has a writable section then the corresponding section of this file will be writable. + +What: /sys/bus/pci/devices/.../virtfnN +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when hardware supports the SR-IOV + capability and the Physical Function driver has enabled it. + The symbolic link points to the PCI device sysfs entry of the + Virtual Function whose index is N (0...MaxVFs-1). + +What: /sys/bus/pci/devices/.../dep_link +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when hardware supports the SR-IOV + capability and the Physical Function driver has enabled it, + and this device has vendor specific dependencies with others. + The symbolic link points to the PCI device sysfs entry of + Physical Function this device depends on. + +What: /sys/bus/pci/devices/.../physfn +Date: March 2009 +Contact: Yu Zhao +Description: + This symbolic link appears when a device is a Virtual Function. + The symbolic link points to the PCI device sysfs entry of the + Physical Function this device associates with. -- cgit v1.2.3 From 15b49bee3a2b228370194f1b3ebc3db427cc9c94 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 Mar 2009 11:25:18 +0800 Subject: PCI: manual for SR-IOV user and driver developer Reviewed-by: Randy Dunlap Reviewed-by: Matthew Wilcox Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- Documentation/DocBook/kernel-api.tmpl | 1 + Documentation/PCI/pci-iov-howto.txt | 99 +++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Documentation/PCI/pci-iov-howto.txt (limited to 'Documentation') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index bc962cda6504..58c194572c76 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -199,6 +199,7 @@ X!Edrivers/pci/hotplug.c --> !Edrivers/pci/probe.c !Edrivers/pci/rom.c +!Edrivers/pci/iov.c PCI Hotplug Support Library !Edrivers/pci/hotplug/pci_hotplug_core.c diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt new file mode 100644 index 000000000000..fc73ef5d65b8 --- /dev/null +++ b/Documentation/PCI/pci-iov-howto.txt @@ -0,0 +1,99 @@ + PCI Express I/O Virtualization Howto + Copyright (C) 2009 Intel Corporation + Yu Zhao + + +1. Overview + +1.1 What is SR-IOV + +Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended +capability which makes one physical device appear as multiple virtual +devices. The physical device is referred to as Physical Function (PF) +while the virtual devices are referred to as Virtual Functions (VF). +Allocation of the VF can be dynamically controlled by the PF via +registers encapsulated in the capability. By default, this feature is +not enabled and the PF behaves as traditional PCIe device. Once it's +turned on, each VF's PCI configuration space can be accessed by its own +Bus, Device and Function Number (Routing ID). And each VF also has PCI +Memory Space, which is used to map its register set. VF device driver +operates on the register set so it can be functional and appear as a +real existing PCI device. + +2. User Guide + +2.1 How can I enable SR-IOV capability + +The device driver (PF driver) will control the enabling and disabling +of the capability via API provided by SR-IOV core. If the hardware +has SR-IOV capability, loading its PF driver would enable it and all +VFs associated with the PF. + +2.2 How can I use the Virtual Functions + +The VF is treated as hot-plugged PCI devices in the kernel, so they +should be able to work in the same way as real PCI devices. The VF +requires device driver that is same as a normal PCI device's. + +3. Developer Guide + +3.1 SR-IOV API + +To enable SR-IOV capability: + int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); + 'nr_virtfn' is number of VFs to be enabled. + +To disable SR-IOV capability: + void pci_disable_sriov(struct pci_dev *dev); + +To notify SR-IOV core of Virtual Function Migration: + irqreturn_t pci_sriov_migration(struct pci_dev *dev); + +3.2 Usage example + +Following piece of code illustrates the usage of the SR-IOV API. + +static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + pci_enable_sriov(dev, NR_VIRTFN); + + ... + + return 0; +} + +static void __devexit dev_remove(struct pci_dev *dev) +{ + pci_disable_sriov(dev); + + ... +} + +static int dev_suspend(struct pci_dev *dev, pm_message_t state) +{ + ... + + return 0; +} + +static int dev_resume(struct pci_dev *dev) +{ + ... + + return 0; +} + +static void dev_shutdown(struct pci_dev *dev) +{ + ... +} + +static struct pci_driver dev_driver = { + .name = "SR-IOV Physical Function driver", + .id_table = dev_id_table, + .probe = dev_probe, + .remove = __devexit_p(dev_remove), + .suspend = dev_suspend, + .resume = dev_resume, + .shutdown = dev_shutdown, +}; -- cgit v1.2.3 From fafad5bf06c3a3bb8b24b28b6f065367e7411872 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 20 Mar 2009 15:22:12 +1100 Subject: PCI MSI: Add example request loop to MSI-HOWTO.txt Encourage driver writers to think about supporting a variable number of MSI-X interrupts, and give an example of how to do such a request. Acked-by: Matthew Wilcox Signed-off-by: Michael Ellerman Signed-off-by: Jesse Barnes --- Documentation/PCI/MSI-HOWTO.txt | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 9494f6dc38eb..dcf7acc720e1 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -176,7 +176,8 @@ request_irq() for each 'vector' that it decides to use. If this function returns a negative number, it indicates an error and the driver should not attempt to allocate any more MSI-X interrupts for this device. If it returns a positive number, it indicates the maximum -number of interrupt vectors that could have been allocated. +number of interrupt vectors that could have been allocated. See example +below. This function, in contrast with pci_enable_msi(), does not adjust dev->irq. The device will not generate interrupts for this interrupt @@ -187,6 +188,26 @@ free them again later. Device drivers should normally call this function once per device during the initialization phase. +It is ideal if drivers can cope with a variable number of MSI-X interrupts, +there are many reasons why the platform may not be able to provide the +exact number a driver asks for. + +A request loop to achieve that might look like: + +static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) +{ + while (nvec >= FOO_DRIVER_MINIMUM_NVEC) { + rc = pci_enable_msix(adapter->pdev, + adapter->msix_entries, nvec); + if (rc > 0) + nvec = rc; + else + return rc; + } + + return -ENOSPC; +} + 4.3.2 pci_disable_msix void pci_disable_msix(struct pci_dev *dev) -- cgit v1.2.3 From 705b1aaa823e800490f157cd9366ad8cff385f5f Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:31 -0600 Subject: PCI: Introduce /sys/bus/pci/rescan This interface allows the user to force a rescan of all PCI buses in system, and rediscover devices that have been removed earlier. pci_bus_attrs implementation from Trent Piepho. Thanks to Vegard Nossum for discovering locking issues with the sysfs interface. Cc: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 9 +++++++++ drivers/pci/pci-driver.c | 1 + drivers/pci/pci-sysfs.c | 26 ++++++++++++++++++++++++++ drivers/pci/pci.h | 6 ++++++ drivers/pci/probe.c | 4 ++-- 5 files changed, 44 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index d175a2a5fac2..e6ad047cb3c2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -57,6 +57,15 @@ Description: match the driver to the device. For example: # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id +What: /sys/bus/pci/rescan +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + force a rescan of all PCI buses in the system, and + re-discover previously removed devices. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 87a5ddbb324f..95d198570290 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1049,6 +1049,7 @@ struct bus_type pci_bus_type = { .remove = pci_device_remove, .shutdown = pci_device_shutdown, .dev_attrs = pci_dev_attrs, + .bus_attrs = pci_bus_attrs, .pm = PCI_PM_OPS_PTR, }; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index ec7a175dcbaf..be7468a5eb72 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -219,6 +219,32 @@ msi_bus_store(struct device *dev, struct device_attribute *attr, return count; } +#ifdef CONFIG_HOTPLUG +static DEFINE_MUTEX(pci_remove_rescan_mutex); +static ssize_t bus_rescan_store(struct bus_type *bus, const char *buf, + size_t count) +{ + unsigned long val; + struct pci_bus *b = NULL; + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val) { + mutex_lock(&pci_remove_rescan_mutex); + while ((b = pci_find_next_bus(b)) != NULL) + pci_rescan_bus(b); + mutex_unlock(&pci_remove_rescan_mutex); + } + return count; +} + +struct bus_attribute pci_bus_attrs[] = { + __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store), + __ATTR_NULL +}; +#endif + struct device_attribute pci_dev_attrs[] = { __ATTR_RO(resource), __ATTR_RO(vendor), diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 22dcfdb75d91..45833a5bca61 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -138,6 +138,12 @@ extern int pcie_mch_quirk; extern struct device_attribute pci_dev_attrs[]; extern struct device_attribute dev_attr_cpuaffinity; extern struct device_attribute dev_attr_cpulistaffinity; +#ifdef CONFIG_HOTPLUG +extern struct bus_attribute pci_bus_attrs[]; +#else +#define pci_bus_attrs NULL +#endif + /** * pci_match_one_device - Tell if a PCI device structure has a matching diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 60a8e5fec6c5..56c71e585f3d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1228,13 +1228,13 @@ unsigned int __devinit pci_rescan_bus(struct pci_bus *bus) max = pci_scan_child_bus(bus); - up_read(&pci_bus_sem); + down_read(&pci_bus_sem); list_for_each_entry(dev, &bus->devices, bus_list) if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) if (dev->subordinate) pci_bus_size_bridges(dev->subordinate); - down_read(&pci_bus_sem); + up_read(&pci_bus_sem); pci_bus_assign_resources(bus); pci_enable_bridges(bus); -- cgit v1.2.3 From 77c27c7b49d69d45ccb94e481653f024f1ac6650 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:36 -0600 Subject: PCI: Introduce /sys/bus/pci/devices/.../remove This patch adds an attribute named "remove" to a PCI device's sysfs directory. Writing a non-zero value to this attribute will remove the PCI device and any children of it. Trent Piepho wrote the original implementation and documentation. Thanks to Vegard Nossum for testing under kmemcheck and finding locking issues with the sysfs interface. Cc: Trent Piepho Tested-by: Vegard Nossum Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 8 ++++++++ Documentation/filesystems/sysfs-pci.txt | 10 +++++++++ drivers/pci/pci-sysfs.c | 36 +++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index e6ad047cb3c2..daa791a9e85e 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -66,6 +66,14 @@ Description: re-discover previously removed devices. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../remove +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + hot-remove the PCI device and any of its children. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 9f8740ca3f3b..26e4b8bc53ee 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -12,6 +12,7 @@ that support it. For example, a given bus might look like this: | |-- enable | |-- irq | |-- local_cpus + | |-- remove | |-- resource | |-- resource0 | |-- resource1 @@ -36,6 +37,7 @@ files, each with their own function. enable Whether the device is enabled (ascii, rw) irq IRQ number (ascii, ro) local_cpus nearby CPU mask (cpumask, ro) + remove remove device from kernel's list (ascii, wo) resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) @@ -46,6 +48,7 @@ files, each with their own function. ro - read only file rw - file is readable and writable + wo - write only file mmap - file is mmapable ascii - file contains ascii text binary - file contains binary data @@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully. In the event a driver is not bound to the device, it can be enabled using the 'enable' file, documented above. +The 'remove' file is used to remove the PCI device, by writing a non-zero +integer to the file. This does not involve any kind of hot-plug functionality, +e.g. powering off the device. The device is removed from the kernel's list of +PCI devices, the sysfs directory for it is removed, and the device will be +removed from any drivers attached to it. Removal of PCI root buses is +disallowed. + Accessing legacy resources through sysfs ---------------------------------------- diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index be7468a5eb72..e16990ecc024 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = { __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store), __ATTR_NULL }; + +static void remove_callback(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + mutex_lock(&pci_remove_rescan_mutex); + pci_remove_bus_device(pdev); + mutex_unlock(&pci_remove_rescan_mutex); +} + +static ssize_t +remove_store(struct device *dev, struct device_attribute *dummy, + const char *buf, size_t count) +{ + int ret = 0; + unsigned long val; + struct pci_dev *pdev = to_pci_dev(dev); + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (pci_is_root_bus(pdev->bus)) + return -EBUSY; + + /* An attribute cannot be unregistered by one of its own methods, + * so we have to use this roundabout approach. + */ + if (val) + ret = device_schedule_callback(dev, remove_callback); + if (ret) + count = ret; + return count; +} #endif struct device_attribute pci_dev_attrs[] = { @@ -263,6 +296,9 @@ struct device_attribute pci_dev_attrs[] = { __ATTR(broken_parity_status,(S_IRUGO|S_IWUSR), broken_parity_status_show,broken_parity_status_store), __ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store), +#ifdef CONFIG_HOTPLUG + __ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store), +#endif __ATTR_NULL, }; -- cgit v1.2.3 From 738a6396c223b486304dda778119dbbca563f019 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:41 -0600 Subject: PCI: Introduce /sys/bus/pci/devices/.../rescan This interface allows the user to force a rescan of the device's parent bus and all subordinate buses, and rediscover devices removed earlier from this part of the device tree. Cc: Trent Piepho Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 10 ++++++++++ drivers/pci/pci-sysfs.c | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index daa791a9e85e..97ad190e13af 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -74,6 +74,16 @@ Description: hot-remove the PCI device and any of its children. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../rescan +Date: January 2009 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + force a rescan of the device's parent bus and all + child buses, and re-discover devices removed earlier + from this part of the device tree. + Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index e16990ecc024..e9a8706a6401 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -244,6 +244,24 @@ struct bus_attribute pci_bus_attrs[] = { __ATTR_NULL }; +static ssize_t +dev_rescan_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + struct pci_dev *pdev = to_pci_dev(dev); + + if (strict_strtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val) { + mutex_lock(&pci_remove_rescan_mutex); + pci_rescan_bus(pdev->bus); + mutex_unlock(&pci_remove_rescan_mutex); + } + return count; +} + static void remove_callback(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -298,6 +316,7 @@ struct device_attribute pci_dev_attrs[] = { __ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store), #ifdef CONFIG_HOTPLUG __ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store), + __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_rescan_store), #endif __ATTR_NULL, }; -- cgit v1.2.3 From f110ca489c9b7cf3f6c9656e383e787f3aee217f Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 20 Mar 2009 14:56:56 -0600 Subject: PCI Hotplug: schedule fakephp for feature removal Now that the PCI core is capable of function-level remove and rescan as well as bus-level rescan, there's no functional need to keep fakephp anymore. We keep it around for userspace compatibility reasons, schedule removal in three years. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/feature-removal-schedule.txt | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 20d3b94703a4..8851eea06a02 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -335,6 +335,7 @@ Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net" Secmark, it is time to deprecate the older mechanism and start the process of removing the old code. Who: Paul Moore + --------------------------- What: sysfs ui for changing p4-clockmod parameters @@ -344,3 +345,35 @@ Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and Removal is subject to fixing any remaining bugs in ACPI which may cause the thermal throttling not to happen at the right time. Who: Dave Jones , Matthew Garrett + +--------------------------- + +What: fakephp and associated sysfs files in /sys/bus/pci/slots/ +When: 2011 +Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to + represent a machine's physical PCI slots. The change in semantics + had userspace implications, as the hotplug core no longer allowed + drivers to create multiple sysfs files per physical slot (required + for multi-function devices, e.g.). fakephp was seen as a developer's + tool only, and its interface changed. Too late, we learned that + there were some users of the fakephp interface. + + In 2.6.30, the original fakephp interface was restored. At the same + time, the PCI core gained the ability that fakephp provided, namely + function-level hot-remove and hot-add. + + Since the PCI core now provides the same functionality, exposed in: + + /sys/bus/pci/rescan + /sys/bus/pci/devices/.../remove + /sys/bus/pci/devices/.../rescan + + there is no functional reason to maintain fakephp as well. + + We will keep the existing module so that 'modprobe fakephp' will + present the old /sys/bus/pci/slots/... interface for compatibility, + but users are urged to migrate their applications to the API above. + + After a reasonable transition period, we will remove the legacy + fakephp interface. +Who: Alex Chiang -- cgit v1.2.3 From 0a5d649018b151cb9331c213a843ac4a3e7e44ab Mon Sep 17 00:00:00 2001 From: Jody McIntyre Date: Tue, 24 Mar 2009 16:00:28 -0400 Subject: tracing: Documentation / sample code fixes for tracepoints Fix the tracepoint documentation to refer to "tracepoint-sample" instead of "tracepoint-example" to match what actually exists; fix the directory, and clarify how to compile. Change every instance of "example" in the sample tracepoint code to "sample" for consistency. Signed-off-by: Jody McIntyre Acked-by: Mathieu Desnoyers Cc: torvalds@linux-foundation.org LKML-Reference: <20090324200027.GH8294@clouds> Signed-off-by: Ingo Molnar --- Documentation/tracepoints.txt | 13 +++++++------ samples/tracepoints/tracepoint-sample.c | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 4ff43c6de299..c0e1ceed75a4 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -103,13 +103,14 @@ used to export the defined tracepoints. * Probe / tracepoint example -See the example provided in samples/tracepoints/src +See the example provided in samples/tracepoints -Compile them with your kernel. +Compile them with your kernel. They are built during 'make' (not +'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m. Run, as root : -modprobe tracepoint-example (insmod order is not important) -modprobe tracepoint-probe-example -cat /proc/tracepoint-example (returns an expected error) -rmmod tracepoint-example tracepoint-probe-example +modprobe tracepoint-sample (insmod order is not important) +modprobe tracepoint-probe-sample +cat /proc/tracepoint-sample (returns an expected error) +rmmod tracepoint-sample tracepoint-probe-sample dmesg diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 68d5dc0310e4..9cf80a11e8b6 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -1,6 +1,6 @@ /* tracepoint-sample.c * - * Executes a tracepoint when /proc/tracepoint-example is opened. + * Executes a tracepoint when /proc/tracepoint-sample is opened. * * (C) Copyright 2007 Mathieu Desnoyers * @@ -16,7 +16,7 @@ DEFINE_TRACE(subsys_event); DEFINE_TRACE(subsys_eventb); -struct proc_dir_entry *pentry_example; +struct proc_dir_entry *pentry_sample; static int my_open(struct inode *inode, struct file *file) { @@ -32,25 +32,25 @@ static struct file_operations mark_ops = { .open = my_open, }; -static int __init example_init(void) +static int __init sample_init(void) { - printk(KERN_ALERT "example init\n"); - pentry_example = proc_create("tracepoint-example", 0444, NULL, + printk(KERN_ALERT "sample init\n"); + pentry_sample = proc_create("tracepoint-sample", 0444, NULL, &mark_ops); - if (!pentry_example) + if (!pentry_sample) return -EPERM; return 0; } -static void __exit example_exit(void) +static void __exit sample_exit(void) { - printk(KERN_ALERT "example exit\n"); - remove_proc_entry("tracepoint-example", NULL); + printk(KERN_ALERT "sample exit\n"); + remove_proc_entry("tracepoint-sample", NULL); } -module_init(example_init) -module_exit(example_exit) +module_init(sample_init) +module_exit(sample_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mathieu Desnoyers"); -MODULE_DESCRIPTION("Tracepoint example"); +MODULE_DESCRIPTION("Tracepoint sample"); -- cgit v1.2.3 From b5cbc369db39d9080f4932db8607aea1e1654d4d Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Thu, 26 Mar 2009 17:45:27 +1100 Subject: Document /proc/fs/nfsd/pool_stats Document the format and semantics of the /proc/fs/nfsd/pool_stats file. Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- Documentation/filesystems/knfsd-stats.txt | 159 ++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 Documentation/filesystems/knfsd-stats.txt (limited to 'Documentation') diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt new file mode 100644 index 000000000000..64ced5149d37 --- /dev/null +++ b/Documentation/filesystems/knfsd-stats.txt @@ -0,0 +1,159 @@ + +Kernel NFS Server Statistics +============================ + +This document describes the format and semantics of the statistics +which the kernel NFS server makes available to userspace. These +statistics are available in several text form pseudo files, each of +which is described separately below. + +In most cases you don't need to know these formats, as the nfsstat(8) +program from the nfs-utils distribution provides a helpful command-line +interface for extracting and printing them. + +All the files described here are formatted as a sequence of text lines, +separated by newline '\n' characters. Lines beginning with a hash +'#' character are comments intended for humans and should be ignored +by parsing routines. All other lines contain a sequence of fields +separated by whitespace. + +/proc/fs/nfsd/pool_stats +------------------------ + +This file is available in kernels from 2.6.30 onwards, if the +/proc/fs/nfsd filesystem is mounted (it almost always should be). + +The first line is a comment which describes the fields present in +all the other lines. The other lines present the following data as +a sequence of unsigned decimal numeric fields. One line is shown +for each NFS thread pool. + +All counters are 64 bits wide and wrap naturally. There is no way +to zero these counters, instead applications should do their own +rate conversion. + +pool + The id number of the NFS thread pool to which this line applies. + This number does not change. + + Thread pool ids are a contiguous set of small integers starting + at zero. The maximum value depends on the thread pool mode, but + currently cannot be larger than the number of CPUs in the system. + Note that in the default case there will be a single thread pool + which contains all the nfsd threads and all the CPUs in the system, + and thus this file will have a single line with a pool id of "0". + +packets-arrived + Counts how many NFS packets have arrived. More precisely, this + is the number of times that the network stack has notified the + sunrpc server layer that new data may be available on a transport + (e.g. an NFS or UDP socket or an NFS/RDMA endpoint). + + Depending on the NFS workload patterns and various network stack + effects (such as Large Receive Offload) which can combine packets + on the wire, this may be either more or less than the number + of NFS calls received (which statistic is available elsewhere). + However this is a more accurate and less workload-dependent measure + of how much CPU load is being placed on the sunrpc server layer + due to NFS network traffic. + +sockets-enqueued + Counts how many times an NFS transport is enqueued to wait for + an nfsd thread to service it, i.e. no nfsd thread was considered + available. + + The circumstance this statistic tracks indicates that there was NFS + network-facing work to be done but it couldn't be done immediately, + thus introducing a small delay in servicing NFS calls. The ideal + rate of change for this counter is zero; significantly non-zero + values may indicate a performance limitation. + + This can happen either because there are too few nfsd threads in the + thread pool for the NFS workload (the workload is thread-limited), + or because the NFS workload needs more CPU time than is available in + the thread pool (the workload is CPU-limited). In the former case, + configuring more nfsd threads will probably improve the performance + of the NFS workload. In the latter case, the sunrpc server layer is + already choosing not to wake idle nfsd threads because there are too + many nfsd threads which want to run but cannot, so configuring more + nfsd threads will make no difference whatsoever. The overloads-avoided + statistic (see below) can be used to distinguish these cases. + +threads-woken + Counts how many times an idle nfsd thread is woken to try to + receive some data from an NFS transport. + + This statistic tracks the circumstance where incoming + network-facing NFS work is being handled quickly, which is a good + thing. The ideal rate of change for this counter will be close + to but less than the rate of change of the packets-arrived counter. + +overloads-avoided + Counts how many times the sunrpc server layer chose not to wake an + nfsd thread, despite the presence of idle nfsd threads, because + too many nfsd threads had been recently woken but could not get + enough CPU time to actually run. + + This statistic counts a circumstance where the sunrpc layer + heuristically avoids overloading the CPU scheduler with too many + runnable nfsd threads. The ideal rate of change for this counter + is zero. Significant non-zero values indicate that the workload + is CPU limited. Usually this is associated with heavy CPU usage + on all the CPUs in the nfsd thread pool. + + If a sustained large overloads-avoided rate is detected on a pool, + the top(1) utility should be used to check for the following + pattern of CPU usage on all the CPUs associated with the given + nfsd thread pool. + + - %us ~= 0 (as you're *NOT* running applications on your NFS server) + + - %wa ~= 0 + + - %id ~= 0 + + - %sy + %hi + %si ~= 100 + + If this pattern is seen, configuring more nfsd threads will *not* + improve the performance of the workload. If this patten is not + seen, then something more subtle is wrong. + +threads-timedout + Counts how many times an nfsd thread triggered an idle timeout, + i.e. was not woken to handle any incoming network packets for + some time. + + This statistic counts a circumstance where there are more nfsd + threads configured than can be used by the NFS workload. This is + a clue that the number of nfsd threads can be reduced without + affecting performance. Unfortunately, it's only a clue and not + a strong indication, for a couple of reasons: + + - Currently the rate at which the counter is incremented is quite + slow; the idle timeout is 60 minutes. Unless the NFS workload + remains constant for hours at a time, this counter is unlikely + to be providing information that is still useful. + + - It is usually a wise policy to provide some slack, + i.e. configure a few more nfsds than are currently needed, + to allow for future spikes in load. + + +Note that incoming packets on NFS transports will be dealt with in +one of three ways. An nfsd thread can be woken (threads-woken counts +this case), or the transport can be enqueued for later attention +(sockets-enqueued counts this case), or the packet can be temporarily +deferred because the transport is currently being used by an nfsd +thread. This last case is not very interesting and is not explicitly +counted, but can be inferred from the other counters thus: + +packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken ) + + +More +---- +Descriptions of the other statistics file should go here. + + +Greg Banks +26 Mar 2009 -- cgit v1.2.3 From 06705bff9114531a997a7d0c2520bea0f2927410 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Mar 2009 10:59:57 -0400 Subject: ext4: Regularize mount options Add support for using the mount options "barrier" and "nobarrier", and "auto_da_alloc" and "noauto_da_alloc", which is more consistent than "barrier=<0|1>" or "auto_da_alloc=<0|1>". Most other ext3/ext4 mount options use the foo/nofoo naming convention. We allow the old forms of these mount options for backwards compatibility. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 25 +++++++++++++++++++++++-- fs/ext4/super.c | 30 ++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 5c484aec2bab..97882df04865 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -183,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in - the jbd code. barrier=0 disables, barrier=1 enables. - This also requires an IO stack which can support +barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. +nobarrier This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -192,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. + The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for + consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -313,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. +auto_da_alloc(*) Many broken applications don't use fsync() when +noauto_da_alloc replacing existing files via patterns such as + fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, + fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). + If auto_da_alloc is enabled, ext4 will detect + the replace-via-rename and replace-via-truncate + patterns and force that any delayed allocation + blocks are allocated such that at the next + journal commit, in the default data=ordered + mode, the data blocks of the new file are forced + to disk before the rename() operation is + commited. This provides roughly the same level + of guarantees as ext3, and avoids the + "zero-length" problem that can happen when a + system crashes before the delayed allocation + blocks are forced to disk. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45d07cf9df34..9987bba99db3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -867,7 +867,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",data_err=abort"); if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",auto_da_alloc=0"); + seq_puts(seq, ",noauto_da_alloc"); ext4_show_quota_options(seq, sb); return 0; @@ -1018,7 +1018,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1026,8 +1026,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_i_version, + Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1080,6 +1080,8 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1088,6 +1090,8 @@ static const match_table_t tokens = { {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_err, NULL}, }; @@ -1422,9 +1426,14 @@ set_qf_format: case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_nobarrier: + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; + if (match_int(&args[0], &option)) { + set_opt(sbi->s_mount_opt, BARRIER); + break; + } if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1485,9 +1494,14 @@ set_qf_format: *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; + case Opt_noauto_da_alloc: + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) - return 0; + if (match_int(&args[0], &option)) { + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + break; + } if (option) clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); else -- cgit v1.2.3 From d1881d3192a3d3e8dc4f255b03187f4c36cb0617 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 21:55:25 -0600 Subject: lguest: barrier me harder Impact: barrier correctness in example launcher I doubt either lguest user will complain about performance. Reported-by: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index f2dbbf3bdeab..d36fcc0f2715 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1630,6 +1630,13 @@ static bool service_io(struct device *dev) } } + /* OK, so we noted that it was pretty poor to use an fdatasync as a + * barrier. But Christoph Hellwig points out that we need a sync + * *afterwards* as well: "Barriers specify no reordering to the front + * or the back." And Jens Axboe confirmed it, so here we are: */ + if (out->type & VIRTIO_BLK_T_BARRIER) + fdatasync(vblk->fd); + /* We can't trigger an IRQ, because we're not the Launcher. It does * that when we tell it we're done. */ add_used(dev->vq, head, wlen); -- cgit v1.2.3 From afa5eb7c68689ced4284f01c96feed44a2d0a127 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 18 Mar 2009 09:13:37 +0100 Subject: HID: remove compat stuff This removal was scheduled and there is no problem with later distros to adapt for the new bus, thanks to aliases. module-init-tools map files are deprecated nowadays, so that the patch which introduced hid ones into the m-i-t won't be accepted and hence there is no reason for leaving compat stuff in. Signed-off-by: Jiri Slaby Cc: Jiri Kosina Signed-off-by: Jiri Kosina --- Documentation/feature-removal-schedule.txt | 7 --- drivers/hid/Kconfig | 12 ----- drivers/hid/Makefile | 4 -- drivers/hid/hid-a4tech.c | 2 - drivers/hid/hid-apple.c | 2 - drivers/hid/hid-belkin.c | 2 - drivers/hid/hid-cherry.c | 2 - drivers/hid/hid-chicony.c | 2 - drivers/hid/hid-core.c | 21 -------- drivers/hid/hid-cypress.c | 2 - drivers/hid/hid-drff.c | 2 - drivers/hid/hid-dummy.c | 87 ------------------------------ drivers/hid/hid-ezkey.c | 2 - drivers/hid/hid-gaff.c | 2 - drivers/hid/hid-gyration.c | 2 - drivers/hid/hid-kensington.c | 2 - drivers/hid/hid-kye.c | 2 - drivers/hid/hid-lg.c | 2 - drivers/hid/hid-microsoft.c | 2 - drivers/hid/hid-monterey.c | 2 - drivers/hid/hid-ntrig.c | 2 - drivers/hid/hid-petalynx.c | 2 - drivers/hid/hid-pl.c | 2 - drivers/hid/hid-samsung.c | 2 - drivers/hid/hid-sony.c | 2 - drivers/hid/hid-sunplus.c | 2 - drivers/hid/hid-tmff.c | 2 - drivers/hid/hid-topseed.c | 2 - drivers/hid/hid-zpff.c | 2 - include/linux/hid.h | 16 ------ 30 files changed, 195 deletions(-) delete mode 100644 drivers/hid/hid-dummy.c (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 1135996bec8b..fc5e85a5901c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -273,13 +273,6 @@ Who: Glauber Costa --------------------------- -What: remove HID compat support -When: 2.6.29 -Why: needed only as a temporary solution until distros fix themselves up -Who: Jiri Slaby - ---------------------------- - What: print_fn_descriptor_symbol() When: October 2009 Why: The %pF vsprintf format provides the same functionality in a diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 88e16ef93247..63a2564f0f81 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -70,18 +70,6 @@ source "drivers/hid/usbhid/Kconfig" menu "Special HID drivers" depends on HID -config HID_COMPAT - bool "Load all HID drivers on hid core load" - default y - ---help--- - Compatible option for older userspace. If you have system without udev - support of module loading through aliases and also old - module-init-tools which can't handle hid bus, choose Y here. Otherwise - say N. If you say N and your userspace is old enough, the only - functionality you lose is modules autoloading. - - If unsure, say Y. - config HID_A4TECH tristate "A4 tech" if EMBEDDED depends on USB_HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index e6b72ed0d70a..1f7cb0fd4505 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -8,10 +8,6 @@ obj-$(CONFIG_HID) += hid.o hid-$(CONFIG_HID_DEBUG) += hid-debug.o hid-$(CONFIG_HIDRAW) += hidraw.o -ifdef CONFIG_HID_COMPAT -obj-m += hid-dummy.o -endif - hid-logitech-objs := hid-lg.o ifdef CONFIG_LOGITECH_FF hid-logitech-objs += hid-lgff.o diff --git a/drivers/hid/hid-a4tech.c b/drivers/hid/hid-a4tech.c index ebca00e6c103..42ea359e94cf 100644 --- a/drivers/hid/hid-a4tech.c +++ b/drivers/hid/hid-a4tech.c @@ -158,5 +158,3 @@ static void a4_exit(void) module_init(a4_init); module_exit(a4_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(a4tech); diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index cab3be7ef0ab..7359d9d88e46 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -474,5 +474,3 @@ static void apple_exit(void) module_init(apple_init); module_exit(apple_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(apple); diff --git a/drivers/hid/hid-belkin.c b/drivers/hid/hid-belkin.c index 12c8a9ba6ed6..2f6723133a4b 100644 --- a/drivers/hid/hid-belkin.c +++ b/drivers/hid/hid-belkin.c @@ -101,5 +101,3 @@ static void belkin_exit(void) module_init(belkin_init); module_exit(belkin_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(belkin); diff --git a/drivers/hid/hid-cherry.c b/drivers/hid/hid-cherry.c index b833b9769aba..ab8209e7e45c 100644 --- a/drivers/hid/hid-cherry.c +++ b/drivers/hid/hid-cherry.c @@ -83,5 +83,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cherry); diff --git a/drivers/hid/hid-chicony.c b/drivers/hid/hid-chicony.c index a54d4096e0f7..7f91076d8493 100644 --- a/drivers/hid/hid-chicony.c +++ b/drivers/hid/hid-chicony.c @@ -76,5 +76,3 @@ static void ch_exit(void) module_init(ch_init); module_exit(ch_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(chicony); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b96fbd5dab55..e56f8d5d3a50 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1819,15 +1819,6 @@ void hid_unregister_driver(struct hid_driver *hdrv) } EXPORT_SYMBOL_GPL(hid_unregister_driver); -#ifdef CONFIG_HID_COMPAT -static void hid_compat_load(struct work_struct *ws) -{ - request_module("hid-dummy"); -} -static DECLARE_WORK(hid_compat_work, hid_compat_load); -static struct workqueue_struct *hid_compat_wq; -#endif - static int __init hid_init(void) { int ret; @@ -1842,15 +1833,6 @@ static int __init hid_init(void) if (ret) goto err_bus; -#ifdef CONFIG_HID_COMPAT - hid_compat_wq = create_singlethread_workqueue("hid_compat"); - if (!hid_compat_wq) { - hidraw_exit(); - goto err; - } - queue_work(hid_compat_wq, &hid_compat_work); -#endif - return 0; err_bus: bus_unregister(&hid_bus_type); @@ -1860,9 +1842,6 @@ err: static void __exit hid_exit(void) { -#ifdef CONFIG_HID_COMPAT - destroy_workqueue(hid_compat_wq); -#endif hidraw_exit(); bus_unregister(&hid_bus_type); } diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c index 5d69d27b935d..9d6d3b91773b 100644 --- a/drivers/hid/hid-cypress.c +++ b/drivers/hid/hid-cypress.c @@ -154,5 +154,3 @@ static void cp_exit(void) module_init(cp_init); module_exit(cp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(cypress); diff --git a/drivers/hid/hid-drff.c b/drivers/hid/hid-drff.c index 785d2492b5ef..34f3eb65100b 100644 --- a/drivers/hid/hid-drff.c +++ b/drivers/hid/hid-drff.c @@ -186,5 +186,3 @@ static void __exit dr_exit(void) module_init(dr_init); module_exit(dr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(dragonrise); diff --git a/drivers/hid/hid-dummy.c b/drivers/hid/hid-dummy.c deleted file mode 100644 index 74d765f38624..000000000000 --- a/drivers/hid/hid-dummy.c +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include - -static int __init hid_dummy_init(void) -{ -#ifdef CONFIG_HID_A4TECH_MODULE - HID_COMPAT_CALL_DRIVER(a4tech); -#endif -#ifdef CONFIG_HID_APPLE_MODULE - HID_COMPAT_CALL_DRIVER(apple); -#endif -#ifdef CONFIG_HID_BELKIN_MODULE - HID_COMPAT_CALL_DRIVER(belkin); -#endif -#ifdef CONFIG_HID_BRIGHT_MODULE - HID_COMPAT_CALL_DRIVER(bright); -#endif -#ifdef CONFIG_HID_CHERRY_MODULE - HID_COMPAT_CALL_DRIVER(cherry); -#endif -#ifdef CONFIG_HID_CHICONY_MODULE - HID_COMPAT_CALL_DRIVER(chicony); -#endif -#ifdef CONFIG_HID_CYPRESS_MODULE - HID_COMPAT_CALL_DRIVER(cypress); -#endif -#ifdef CONFIG_HID_DELL_MODULE - HID_COMPAT_CALL_DRIVER(dell); -#endif -#ifdef CONFIG_DRAGONRISE_FF_MODULE - HID_COMPAT_CALL_DRIVER(dragonrise); -#endif -#ifdef CONFIG_HID_EZKEY_MODULE - HID_COMPAT_CALL_DRIVER(ezkey); -#endif -#ifdef CONFIG_HID_KYE_MODULE - HID_COMPAT_CALL_DRIVER(kye); -#endif -#ifdef CONFIG_HID_GYRATION_MODULE - HID_COMPAT_CALL_DRIVER(gyration); -#endif -#ifdef CONFIG_HID_KENSINGTON_MODULE - HID_COMPAT_CALL_DRIVER(kensington); -#endif -#ifdef CONFIG_HID_LOGITECH_MODULE - HID_COMPAT_CALL_DRIVER(logitech); -#endif -#ifdef CONFIG_HID_MICROSOFT_MODULE - HID_COMPAT_CALL_DRIVER(microsoft); -#endif -#ifdef CONFIG_HID_MONTEREY_MODULE - HID_COMPAT_CALL_DRIVER(monterey); -#endif -#ifdef CONFIG_HID_NTRIG_MODULE - HID_COMPAT_CALL_DRIVER(ntrig); -#endif -#ifdef CONFIG_HID_PANTHERLORD_MODULE - HID_COMPAT_CALL_DRIVER(pantherlord); -#endif -#ifdef CONFIG_HID_PETALYNX_MODULE - HID_COMPAT_CALL_DRIVER(petalynx); -#endif -#ifdef CONFIG_HID_SAMSUNG_MODULE - HID_COMPAT_CALL_DRIVER(samsung); -#endif -#ifdef CONFIG_HID_SONY_MODULE - HID_COMPAT_CALL_DRIVER(sony); -#endif -#ifdef CONFIG_HID_SUNPLUS_MODULE - HID_COMPAT_CALL_DRIVER(sunplus); -#endif -#ifdef CONFIG_GREENASIA_FF_MODULE - HID_COMPAT_CALL_DRIVER(greenasia); -#endif -#ifdef CONFIG_THRUSTMASTER_FF_MODULE - HID_COMPAT_CALL_DRIVER(thrustmaster); -#endif -#ifdef CONFIG_ZEROPLUS_FF_MODULE - HID_COMPAT_CALL_DRIVER(zeroplus); -#endif - - return -EIO; -} -module_init(hid_dummy_init); - -MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-ezkey.c b/drivers/hid/hid-ezkey.c index deb42f931b7e..0a1fe054799b 100644 --- a/drivers/hid/hid-ezkey.c +++ b/drivers/hid/hid-ezkey.c @@ -91,5 +91,3 @@ static void ez_exit(void) module_init(ez_init); module_exit(ez_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ezkey); diff --git a/drivers/hid/hid-gaff.c b/drivers/hid/hid-gaff.c index 71211f6a4f02..510ad3ab8d33 100644 --- a/drivers/hid/hid-gaff.c +++ b/drivers/hid/hid-gaff.c @@ -181,5 +181,3 @@ static void __exit ga_exit(void) module_init(ga_init); module_exit(ga_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(greenasia); diff --git a/drivers/hid/hid-gyration.c b/drivers/hid/hid-gyration.c index 04a0afec52ac..d42d222097a8 100644 --- a/drivers/hid/hid-gyration.c +++ b/drivers/hid/hid-gyration.c @@ -94,5 +94,3 @@ static void gyration_exit(void) module_init(gyration_init); module_exit(gyration_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(gyration); diff --git a/drivers/hid/hid-kensington.c b/drivers/hid/hid-kensington.c index 747fee5b2a73..7353bd79cbe9 100644 --- a/drivers/hid/hid-kensington.c +++ b/drivers/hid/hid-kensington.c @@ -61,5 +61,3 @@ static void ks_exit(void) module_init(ks_init); module_exit(ks_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kensington); diff --git a/drivers/hid/hid-kye.c b/drivers/hid/hid-kye.c index ea7f412e31a9..72ee3fec56d9 100644 --- a/drivers/hid/hid-kye.c +++ b/drivers/hid/hid-kye.c @@ -67,5 +67,3 @@ static void kye_exit(void) module_init(kye_init); module_exit(kye_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(kye); diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index 83e07c9f4144..7b80cb694982 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -326,5 +326,3 @@ static void lg_exit(void) module_init(lg_init); module_exit(lg_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(logitech); diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c index 25b10dcad90d..5e9e37a0506d 100644 --- a/drivers/hid/hid-microsoft.c +++ b/drivers/hid/hid-microsoft.c @@ -210,5 +210,3 @@ static void ms_exit(void) module_init(ms_init); module_exit(ms_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(microsoft); diff --git a/drivers/hid/hid-monterey.c b/drivers/hid/hid-monterey.c index f3a85a065f18..240f87618be6 100644 --- a/drivers/hid/hid-monterey.c +++ b/drivers/hid/hid-monterey.c @@ -78,5 +78,3 @@ static void mr_exit(void) module_init(mr_init); module_exit(mr_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(monterey); diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index db44fbd7bdf6..c5b252be9c21 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -78,5 +78,3 @@ static void ntrig_exit(void) module_init(ntrig_init); module_exit(ntrig_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(ntrig); diff --git a/drivers/hid/hid-petalynx.c b/drivers/hid/hid-petalynx.c index 10945fe12d50..2e83e8ff891a 100644 --- a/drivers/hid/hid-petalynx.c +++ b/drivers/hid/hid-petalynx.c @@ -118,5 +118,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(petalynx); diff --git a/drivers/hid/hid-pl.c b/drivers/hid/hid-pl.c index 9ad76bf71186..4db9a3483760 100644 --- a/drivers/hid/hid-pl.c +++ b/drivers/hid/hid-pl.c @@ -230,5 +230,3 @@ static void pl_exit(void) module_init(pl_init); module_exit(pl_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(pantherlord); diff --git a/drivers/hid/hid-samsung.c b/drivers/hid/hid-samsung.c index 15f3c0492450..07083aa6c19a 100644 --- a/drivers/hid/hid-samsung.c +++ b/drivers/hid/hid-samsung.c @@ -96,5 +96,3 @@ static void samsung_exit(void) module_init(samsung_init); module_exit(samsung_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(samsung); diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index dd5a3979a4de..c2599388a350 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -148,5 +148,3 @@ static void sony_exit(void) module_init(sony_init); module_exit(sony_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sony); diff --git a/drivers/hid/hid-sunplus.c b/drivers/hid/hid-sunplus.c index 5ba68f7dbb78..e0a8fd36a85b 100644 --- a/drivers/hid/hid-sunplus.c +++ b/drivers/hid/hid-sunplus.c @@ -78,5 +78,3 @@ static void sp_exit(void) module_init(sp_init); module_exit(sp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(sunplus); diff --git a/drivers/hid/hid-tmff.c b/drivers/hid/hid-tmff.c index 1b7cba0f7e1f..7c1f7b50330c 100644 --- a/drivers/hid/hid-tmff.c +++ b/drivers/hid/hid-tmff.c @@ -265,5 +265,3 @@ static void tm_exit(void) module_init(tm_init); module_exit(tm_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(thrustmaster); diff --git a/drivers/hid/hid-topseed.c b/drivers/hid/hid-topseed.c index cca64a0564a9..152ccfabeba5 100644 --- a/drivers/hid/hid-topseed.c +++ b/drivers/hid/hid-topseed.c @@ -73,5 +73,3 @@ static void ts_exit(void) module_init(ts_init); module_exit(ts_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(topseed); diff --git a/drivers/hid/hid-zpff.c b/drivers/hid/hid-zpff.c index ea82f3718b21..85a198a18537 100644 --- a/drivers/hid/hid-zpff.c +++ b/drivers/hid/hid-zpff.c @@ -158,5 +158,3 @@ static void zp_exit(void) module_init(zp_init); module_exit(zp_exit); MODULE_LICENSE("GPL"); - -HID_COMPAT_LOAD_DRIVER(zeroplus); diff --git a/include/linux/hid.h b/include/linux/hid.h index a46cda488695..a46cbea71d65 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -792,21 +792,5 @@ dbg_hid(const char *fmt, ...) __FILE__ , ## arg) #endif /* HID_FF */ -#ifdef __KERNEL__ -#ifdef CONFIG_HID_COMPAT -#define HID_COMPAT_LOAD_DRIVER(name) \ -/* prototype to avoid sparse warning */ \ -extern void hid_compat_##name(void); \ -void hid_compat_##name(void) { } \ -EXPORT_SYMBOL(hid_compat_##name) -#else -#define HID_COMPAT_LOAD_DRIVER(name) -#endif /* HID_COMPAT */ -#define HID_COMPAT_CALL_DRIVER(name) do { \ - extern void hid_compat_##name(void); \ - hid_compat_##name(); \ -} while (0) -#endif /* __KERNEL__ */ - #endif -- cgit v1.2.3 From e3375ac767b847127df57d186a26abf83d055982 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 10 Jan 2009 00:47:21 +0100 Subject: trivial: document ext3 semantics of 'ro' option a bit better ext3 has quite unexpected semantics or "ro" and defaults are not what they are documented to be, due to mkfs override. Signed-off-by: Pavel Machek Signed-off-by: "Theodore Ts'o" Signed-off-by: Jiri Kosina --- Documentation/filesystems/ext3.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index e5f3833a6ef8..570f9bd9be2b 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -14,6 +14,11 @@ Options When mounting an ext3 filesystem, the following option are accepted: (*) == default +ro Mount filesystem read only. Note that ext3 will replay + the journal (and thus write to the partition) even when + mounted "read only". Mount options "ro,noload" can be + used to prevent writes to the filesystem. + journal=update Update the ext3 file system's journal to the current format. @@ -27,7 +32,9 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. +noload Don't load the journal on mounting. Note that this forces + mount of inconsistent filesystem, which can lead to + various problems. data=journal All data are committed into the journal prior to being written into the main file system. @@ -92,9 +99,12 @@ nocheck debug Extra debugging information is sent to syslog. -errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=remount-ro Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. + (These mount options override the errors behavior + specified in the superblock, which can be + configured using tune2fs.) data_err=ignore(*) Just print an error message if an error occurs in a file data buffer in ordered mode. -- cgit v1.2.3 From caa790ba6cb88dccfab356960d93e2f4e0bd8704 Mon Sep 17 00:00:00 2001 From: Chris Samuel Date: Sat, 17 Jan 2009 00:01:18 +1100 Subject: trivial: cgroups: documentation typo and spelling corrections Minor typo and spelling corrections fixed whilst reading to learn about cgroups capabilities. Signed-off-by: Chris Samuel Acked-by: Paul Menage Signed-off-by: Jiri Kosina --- Documentation/cgroups/cgroups.txt | 10 +++++----- Documentation/cgroups/cpusets.txt | 12 ++++++------ Documentation/cgroups/devices.txt | 2 +- Documentation/cgroups/memory.txt | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 93feb8444489..f4f5ee97d4db 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -56,7 +56,7 @@ hierarchy, and a set of subsystems; each subsystem has system-specific state attached to each cgroup in the hierarchy. Each hierarchy has an instance of the cgroup virtual filesystem associated with it. -At any one time there may be multiple active hierachies of task +At any one time there may be multiple active hierarchies of task cgroups. Each hierarchy is a partition of all tasks in the system. User level code may create and destroy cgroups by name in an @@ -124,10 +124,10 @@ following lines: / \ Prof (15%) students (5%) -Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd go into NFS network class. -At the same time firefox/lynx will share an appropriate CPU/Memory class +At the same time Firefox/Lynx will share an appropriate CPU/Memory class depending on who launched it (prof/student). With the ability to classify tasks differently for different resources @@ -325,7 +325,7 @@ and then start a subshell 'sh' in that cgroup: Creating, modifying, using the cgroups can be done through the cgroup virtual filesystem. -To mount a cgroup hierarchy will all available subsystems, type: +To mount a cgroup hierarchy with all available subsystems, type: # mount -t cgroup xxx /dev/cgroup The "xxx" is not interpreted by the cgroup code, but will appear in @@ -521,7 +521,7 @@ always handled well. void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) (cgroup_mutex held by caller) -Called at the end of cgroup_clone() to do any paramater +Called at the end of cgroup_clone() to do any parameter initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 0611e9528c7c..f9ca389dddf4 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -131,7 +131,7 @@ Cpusets extends these two mechanisms as follows: - The hierarchy of cpusets can be mounted at /dev/cpuset, for browsing and manipulation from user space. - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendents) may contain + cpuset (except direct ancestors and descendants) may contain any overlapping CPUs or Memory Nodes. - You can list all the tasks (by pid) attached to any cpuset. @@ -226,7 +226,7 @@ nodes with memory--using the cpuset_track_online_nodes() hook. -------------------------------- If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendent, may share any of the same CPUs or +a direct ancestor or descendant, may share any of the same CPUs or Memory Nodes. A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", @@ -427,7 +427,7 @@ child cpusets have this flag enabled. When doing this, you don't usually want to leave any unpinned tasks in the top cpuset that might use non-trivial amounts of CPU, as such tasks may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendent cpusets. Even if +the particulars of this flag setting in descendant cpusets. Even if such a task could use spare CPU cycles in some other CPUs, the kernel scheduler might not consider the possibility of load balancing that task to that underused CPU. @@ -531,9 +531,9 @@ be idle. Of course it takes some searching cost to find movable tasks and/or idle CPUs, the scheduler might not search all CPUs in the domain -everytime. In fact, in some architectures, the searching ranges on +every time. In fact, in some architectures, the searching ranges on events are limited in the same socket or node where the CPU locates, -while the load balance on tick searchs all. +while the load balance on tick searches all. For example, assume CPU Z is relatively far from CPU X. Even if CPU Z is idle while CPU X and the siblings are busy, scheduler can't migrate @@ -601,7 +601,7 @@ its new cpuset, then the task will continue to use whatever subset of MPOL_BIND nodes are still allowed in the new cpuset. If the task was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its numa placement, +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, as queried by get_mempolicy(), doesn't change). If a task is moved from one cpuset to another, then the kernel will adjust the tasks memory placement, as above, the next time that the kernel attempts diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt index 7cc6e6a60672..57ca4c89fe5c 100644 --- a/Documentation/cgroups/devices.txt +++ b/Documentation/cgroups/devices.txt @@ -42,7 +42,7 @@ suffice, but we can decide the best way to adequately restrict movement as people get some experience with this. We may just want to require CAP_SYS_ADMIN, which at least is a separate bit from CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendent of the current one. Or we may want to use +isn't a descendant of the current one. Or we may want to use CAP_MAC_ADMIN, since we really are trying to lock down root. CAP_SYS_ADMIN is needed to modify the whitelist or move another diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e1501964df1e..a98a7fe7aabb 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -302,7 +302,7 @@ will be charged as a new owner of it. unevictable - # of pages cannot be reclaimed.(mlocked etc) Below is depend on CONFIG_DEBUG_VM. - inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) + inactive_ratio - VM internal parameter. (see mm/page_alloc.c) recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) recent_rotated_file - VM internal parameter. (see mm/vmscan.c) recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) -- cgit v1.2.3 From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 26 Jan 2009 11:06:57 +0100 Subject: trivial: Fix misspelling of firmware Fix misspelling of firmware. Signed-off-by: Nick Andrew Signed-off-by: Jiri Kosina --- Documentation/ia64/kvm.txt | 2 +- Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt | 2 +- arch/mips/sgi-ip27/ip27-smp.c | 2 +- arch/sparc/kernel/head_64.S | 2 +- drivers/net/sb1250-mac.c | 2 +- drivers/net/tg3.c | 2 +- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- drivers/net/wireless/iwlwifi/iwl3945-base.c | 2 +- drivers/net/wireless/libertas/cmd.c | 2 +- drivers/pci/pci.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/staging/otus/hal/hpmain.c | 2 +- drivers/usb/atm/ueagle-atm.c | 2 +- drivers/usb/serial/ChangeLog.history | 2 +- include/linux/libata.h | 2 +- kernel/power/disk.c | 4 ++-- sound/oss/pss.c | 2 +- sound/sh/aica.c | 2 +- 20 files changed, 21 insertions(+), 21 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt index 84f7cb3d5bec..ffb5c80bec3e 100644 --- a/Documentation/ia64/kvm.txt +++ b/Documentation/ia64/kvm.txt @@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. - (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu + (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu 4. Boot up Linux or Windows guests: 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt index 6c238f59b2a9..249db3a15d15 100644 --- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt +++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt @@ -1,6 +1,6 @@ * Uploaded QE firmware - If a new firwmare has been uploaded to the QE (usually by the + If a new firmware has been uploaded to the QE (usually by the boot loader), then a 'firmware' child node should be added to the QE node. This node provides information on the uploaded firmware that device drivers may need. diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5b47d6b65275..cbcd7eb83bd1 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -221,7 +221,7 @@ static void __init ip27_smp_setup(void) * Assumption to be fixed: we're always booted on logical / physical * processor 0. While we're always running on logical processor 0 * this still means this is physical processor zero; it might for - * example be disabled in the firwware. + * example be disabled in the firmware. */ alloc_cpupda(0, 0); } diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index a46c3a21e26d..3a1b7bf03cff 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -686,7 +686,7 @@ tlb_fixup_done: * point. * * There used to be enormous complexity wrt. transferring - * over from the firwmare's trap table to the Linux kernel's. + * over from the firmware's trap table to the Linux kernel's. * For example, there was a chicken & egg problem wrt. building * the OBP page tables, yet needing to be on the Linux kernel * trap table (to translate PAGE_OFFSET addresses) in order to diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 88dd2e09832f..ce7551e17ba7 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2299,7 +2299,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) eaddr = sc->sbm_hwaddr; /* - * Read the ethernet address. The firwmare left this programmed + * Read the ethernet address. The firmware left this programmed * for us in the ethernet address register for each mac. */ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index f7efcecc4108..ed60b18addac 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -11225,7 +11225,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) return tg3_phy_init(tp); /* Reading the PHY ID register can conflict with ASF - * firwmare access to the PHY hardware. + * firmware access to the PHY hardware. */ err = 0; if ((tp->tg3_flags & TG3_FLAG_ENABLE_ASF) || diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 115b70487502..f4e963ba768b 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -2362,7 +2362,7 @@ static void ipw2100_corruption_detected(struct ipw2100_priv *priv, int i) i * sizeof(struct ipw2100_status)); #ifdef IPW2100_DEBUG_C3 - /* Halt the fimrware so we can get a good image */ + /* Halt the firmware so we can get a good image */ write_register(priv->net_dev, IPW_REG_RESET_REG, IPW_AUX_HOST_RESET_REG_STOP_MASTER); j = 5; diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index b3449948a25a..f6174fdc12bf 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -8844,7 +8844,7 @@ static int ipw_wx_set_mode(struct net_device *dev, #endif /* CONFIG_IPW2200_MONITOR */ /* Free the existing firmware and reset the fw_loaded - * flag so ipw_load() will bring in the new firmawre */ + * flag so ipw_load() will bring in the new firmware */ free_firmware(); priv->ieee->iw_mode = wrqu->mode; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 663dc83be501..3889158b359c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -1337,7 +1337,7 @@ static int iwl_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index a71b08ca7c71..9d5f97dd7c73 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -2562,7 +2562,7 @@ static int iwl3945_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 639dd02d3d31..8c3605cdc64c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1649,7 +1649,7 @@ static struct cmd_ctrl_node *lbs_get_cmd_ctrl_node(struct lbs_private *priv) /** * @brief This function executes next command in command - * pending queue. It will put fimware back to PS mode + * pending queue. It will put firmware back to PS mode * if applicable. * * @param priv A pointer to struct lbs_private structure diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6d6120007af4..dab33a21d49a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -550,7 +550,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. * - * Transition a device to a new power state, using the platform formware and/or + * Transition a device to a new power state, using the platform firmware and/or * the device's PCI PM registers. * * RETURN VALUE: diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..814cb6520673 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5811,7 +5811,7 @@ static struct ibm_struct volume_driver_data = { * ThinkPads from this same time period (and earlier) probably lack the * tachometer as well. * - * Unfortunately a lot of ThinkPads with new-style ECs but whose firwmare + * Unfortunately a lot of ThinkPads with new-style ECs but whose firmware * was never fixed by IBM to report the EC firmware version string * probably support the tachometer (like the early X models), so * detecting it is quite hard. We need more data to know for sure. diff --git a/drivers/staging/otus/hal/hpmain.c b/drivers/staging/otus/hal/hpmain.c index 2e65c466aae8..dab278326931 100644 --- a/drivers/staging/otus/hal/hpmain.c +++ b/drivers/staging/otus/hal/hpmain.c @@ -152,7 +152,7 @@ u16_t zfHpInit(zdev_t* dev, u32_t frequency) else { #ifndef ZM_OTUS_LINUX_PHASE_2 - /* donwload the normal frimware */ + /* download the normal firmware */ if ((ret = zfFirmwareDownload(dev, (u32_t*)zcFwImage, (u32_t)zcFwImageSize, ZM_FIRMWARE_WLAN_ADDR)) != ZM_SUCCESS) { diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index b6483dd98acc..9cf9ff69e3e3 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -626,7 +626,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry, void *conte goto err_fw_corrupted; /* - * Start to upload formware : send reset + * Start to upload firmware : send reset */ value = 1; ret = uea_send_modem_cmd(usb, F8051_USBCS, sizeof(value), &value); diff --git a/drivers/usb/serial/ChangeLog.history b/drivers/usb/serial/ChangeLog.history index c1b279939bbf..f13fd488ebec 100644 --- a/drivers/usb/serial/ChangeLog.history +++ b/drivers/usb/serial/ChangeLog.history @@ -715,7 +715,7 @@ io_edgeport.c Change Log comments: 0.2 (01/30/2000) greg kroah-hartman Milestone 1 release. - Device is found by USB subsystem, enumerated, fimware is downloaded + Device is found by USB subsystem, enumerated, firmware is downloaded and the descriptors are printed to the debug log, config is set, and green light starts to blink. Open port works, and data can be sent and received at the default settings of the UART. Loopback connector diff --git a/include/linux/libata.h b/include/linux/libata.h index 76262d83656b..b450a2628855 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -379,7 +379,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ - ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ /* DMA mask for user DMA control: User visible values; DO NOT diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206b1979..9d1c1a0de350 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -265,7 +265,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. + * prepare the platform firmware for the power transition. * * Must be called with pm_mutex held */ @@ -378,7 +378,7 @@ static int resume_target_kernel(void) * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. + * prepare the platform firmware for the transition. * * Must be called with pm_mutex held */ diff --git a/sound/oss/pss.c b/sound/oss/pss.c index 16517a5a1301..83f5ee236b12 100644 --- a/sound/oss/pss.c +++ b/sound/oss/pss.c @@ -46,7 +46,7 @@ * load the driver as it did in previous versions. * 04-07-1999: Anthony Barbachan * Added module parameter pss_firmware to allow the user to tell - * the driver where the fireware file is located. The default + * the driver where the firmware file is located. The default * setting is the previous hardcoded setting "/etc/sound/pss_synth". * 00-03-03: Christoph Hellwig * Adapted to module_init/module_exit diff --git a/sound/sh/aica.c b/sound/sh/aica.c index f551233c5a08..583a3693df75 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -565,7 +565,7 @@ static int load_aica_firmware(void) err = request_firmware(&fw_entry, "aica_firmware.bin", &pd->dev); if (unlikely(err)) return err; - /* write firware into memory */ + /* write firmware into memory */ spu_disable(); spu_memload(0, fw_entry->data, fw_entry->size); spu_enable(); -- cgit v1.2.3 From 6d5e147dd034d9ceedc89fe39f4284700944f0c8 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 3 Feb 2009 11:57:13 +0100 Subject: trivial: Give the right path in Documentation example While the Documentation example creates /cgroup/test, it removes /test/cgroup, which is clearly not the intended path. Change that to /cgroup/test. Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jiri Kosina --- Documentation/cgroups/memcg_test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 523a9c16c400..a9263596f8d8 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -356,7 +356,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. (Shell-B) # move all tasks in /cgroup/test to /cgroup # /sbin/swapoff -a - # rmdir /test/cgroup + # rmdir /cgroup/test # kill malloc task. Of course, tmpfs v.s. swapoff test should be tested, too. -- cgit v1.2.3 From 21acb9caa2e30b100e9a1943d995bb99d40f4035 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 4 Feb 2009 10:12:08 +0100 Subject: trivial: fix where cgroup documentation is not correctly referred to cgroup documentation was moved to Documentation/cgroups/. There are some places that still refer to Documentation/controllers/, Documentation/cgroups.txt and Documentation/cpusets.txt. Fix those. Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Jiri Kosina --- Documentation/00-INDEX | 4 ++-- Documentation/cgroups/00-INDEX | 18 ++++++++++++++++++ Documentation/kernel-parameters.txt | 4 ++-- Documentation/scheduler/sched-rt-group.txt | 2 +- Documentation/vm/numa_memory_policy.txt | 3 ++- Documentation/vm/page_migration | 3 ++- Documentation/x86/x86_64/fake-numa-for-cpusets | 5 +++-- include/linux/cgroup.h | 5 ++++- init/Kconfig | 2 +- 9 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 Documentation/cgroups/00-INDEX (limited to 'Documentation') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 2a39aeba1464..d05737aaa84b 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -86,6 +86,8 @@ cachetlb.txt - describes the cache/TLB flushing interfaces Linux uses. cdrom/ - directory with information on the CD-ROM drivers that Linux has. +cgroups/ + - cgroups features, including cpusets and memory controller. connector/ - docs on the netlink based userspace<->kernel space communication mod. console/ @@ -98,8 +100,6 @@ cpu-load.txt - document describing how CPU load statistics are collected. cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. cputopology.txt - documentation on how CPU topology info is exported via sysfs. cris/ diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX new file mode 100644 index 000000000000..3f58fa3d6d00 --- /dev/null +++ b/Documentation/cgroups/00-INDEX @@ -0,0 +1,18 @@ +00-INDEX + - this file +cgroups.txt + - Control Groups definition, implementation details, examples and API. +cpuacct.txt + - CPU Accounting Controller; account CPU usage for groups of tasks. +cpusets.txt + - documents the cpusets feature; assign CPUs and Mem to a set of tasks. +devices.txt + - Device Whitelist Controller; description, interface and security. +freezer-subsystem.txt + - checkpointing; rationale to not use signals, interface. +memcg_test.txt + - Memory Resource Controller; implementation details. +memory.txt + - Memory Resource Controller; design, accounting, interface, testing. +resource_counter.txt + - Resource Counter API. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index be3bde51b564..755def2cb071 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1593,7 +1593,7 @@ and is between 256 and 4096 characters. It is defined in the file nosoftlockup [KNL] Disable the soft-lockup detector. noswapaccount [KNL] Disable accounting of swap in memory resource - controller. (See Documentation/controllers/memory.txt) + controller. (See Documentation/cgroups/memory.txt) nosync [HW,M68K] Disables sync negotiation for all devices. @@ -1932,7 +1932,7 @@ and is between 256 and 4096 characters. It is defined in the file relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cpusets.txt. + See Documentation/cgroups/cpusets.txt. reserve= [KNL,BUGS] Force the kernel to ignore some iomem area diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 3ef339f491e0..5ba4d3fc625a 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -126,7 +126,7 @@ This uses the /cgroup virtual file system and "/cgroup//cpu.rt_runtime_u to control the CPU time reserved for each control group instead. For more information on working with control groups, you should read -Documentation/cgroups.txt as well. +Documentation/cgroups/cgroups.txt as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index 6aaaeb38730c..be45dbb9d7f2 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -8,7 +8,8 @@ The current memory policy support was added to Linux 2.6 around May 2004. This document attempts to describe the concepts and APIs of the 2.6 memory policy support. -Memory policies should not be confused with cpusets (Documentation/cpusets.txt) +Memory policies should not be confused with cpusets +(Documentation/cgroups/cpusets.txt) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index d5fdfd34bbaf..6513fe2d90b8 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -37,7 +37,8 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See ../cpusets.txt). +move pages when a task is moved to another cpuset (See +Documentation/cgroups/cpusets.txt). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets index 33bb56655991..0f11d9becb0b 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets @@ -7,7 +7,8 @@ you can create fake NUMA nodes that represent contiguous chunks of memory and assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. -For more information on the features of cpusets, see Documentation/cpusets.txt. +For more information on the features of cpusets, see +Documentation/cgroups/cpusets.txt. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. @@ -32,7 +33,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +Documentation/cgroups/cpusets.txt, you can assign fake nodes (i.e. contiguous memory address spaces) to individual cpusets: [root@xroads /]# mkdir exampleset diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..b837631fe499 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -342,7 +342,10 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if the cgroup is a descendant of the current cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp); -/* Control Group subsystem type. See Documentation/cgroups.txt for details */ +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, diff --git a/init/Kconfig b/init/Kconfig index bcffc0e47647..99eb4196bd0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -565,7 +565,7 @@ config CGROUP_MEM_RES_CTLR select MM_OWNER help Provides a memory resource controller that manages both anonymous - memory and page cache. (See Documentation/controllers/memory.txt) + memory and page cache. (See Documentation/cgroups/memory.txt) Note that setting this option increases fixed memory overhead associated with each page of memory in the system. By this, -- cgit v1.2.3 From 25f3311acc3405dd0dace3571a41f450e6cc6a65 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 30 Mar 2009 21:46:41 +0200 Subject: hwmon: (ds1621) Clean up documentation * The alarms sysfs file is deprecated, and individual alarm files are self-explanatory. * The driver doesn't implement high-reslution temperature readings so don't document that. Signed-off-by: Jean Delvare Cc: Aurelien Jarno --- Documentation/hwmon/ds1621 | 51 +++------------------------------------------- 1 file changed, 3 insertions(+), 48 deletions(-) (limited to 'Documentation') diff --git a/Documentation/hwmon/ds1621 b/Documentation/hwmon/ds1621 index 1fee6f1e6bc5..5e97f333c4df 100644 --- a/Documentation/hwmon/ds1621 +++ b/Documentation/hwmon/ds1621 @@ -49,12 +49,9 @@ of up to +/- 0.5 degrees even when compared against precise temperature readings. Be sure to have a high vs. low temperature limit gap of al least 1.0 degree Celsius to avoid Tout "bouncing", though! -As for alarms, you can read the alarm status of the DS1621 via the 'alarms' -/sys file interface. The result consists mainly of bit 6 and 5 of the -configuration register of the chip; bit 6 (0x40 or 64) is the high alarm -bit and bit 5 (0x20 or 32) the low one. These bits are set when the high or -low limits are met or exceeded and are reset by the module as soon as the -respective temperature ranges are left. +The alarm bits are set when the high or low limits are met or exceeded and +are reset by the module as soon as the respective temperature ranges are +left. The alarm registers are in no way suitable to find out about the actual status of Tout. They will only tell you about its history, whether or not @@ -64,45 +61,3 @@ with neither of the alarms set. Temperature conversion of the DS1621 takes up to 1000ms; internal access to non-volatile registers may last for 10ms or below. - -High Accuracy Temperature Reading ---------------------------------- - -As said before, the temperature issued via the 9-bit i2c-bus data is -somewhat arbitrary. Internally, the temperature conversion is of a -different kind that is explained (not so...) well in the DS1621 data sheet. -To cut the long story short: Inside the DS1621 there are two oscillators, -both of them biassed by a temperature coefficient. - -Higher resolution of the temperature reading can be achieved using the -internal projection, which means taking account of REG_COUNT and REG_SLOPE -(the driver manages them): - -Taken from Dallas Semiconductors App Note 068: 'Increasing Temperature -Resolution on the DS1620' and App Note 105: 'High Resolution Temperature -Measurement with Dallas Direct-to-Digital Temperature Sensors' - -- Read the 9-bit temperature and strip the LSB (Truncate the .5 degs) -- The resulting value is TEMP_READ. -- Then, read REG_COUNT. -- And then, REG_SLOPE. - - TEMP = TEMP_READ - 0.25 + ((REG_SLOPE - REG_COUNT) / REG_SLOPE) - -Note that this is what the DONE bit in the DS1621 configuration register is -good for: Internally, one temperature conversion takes up to 1000ms. Before -that conversion is complete you will not be able to read valid things out -of REG_COUNT and REG_SLOPE. The DONE bit, as you may have guessed by now, -tells you whether the conversion is complete ("done", in plain English) and -thus, whether the values you read are good or not. - -The DS1621 has two modes of operation: "Continuous" conversion, which can -be understood as the default stand-alone mode where the chip gets the -temperature and controls external devices via its Tout pin or tells other -i2c's about it if they care. The other mode is called "1SHOT", that means -that it only figures out about the temperature when it is explicitly told -to do so; this can be seen as power saving mode. - -Now if you want to read REG_COUNT and REG_SLOPE, you have to either stop -the continuous conversions until the contents of these registers are valid, -or, in 1SHOT mode, you have to have one conversion made. -- cgit v1.2.3 From 237c8d2f54ff12bd4fea1a9d18a94ae5810271d3 Mon Sep 17 00:00:00 2001 From: Gong Jun Date: Mon, 30 Mar 2009 21:46:42 +0200 Subject: hwmon: (w83627ehf) Add support for W83667HG Add initial support for the Nuvoton W83667HG chip to the w83627ehf driver. It has been tested on ASUS P5QL PRO by Gong Jun. At the moment there is still a usability issue which is that only in6 or temp3 can be present on the W83667HG, so the driver shouldn't expose both. This will be addressed later. Signed-off-by: Gong Jun Acked-by: David Hubbard Signed-off-by: Jean Delvare --- Documentation/hwmon/w83627ehf | 29 ++++++++---- drivers/hwmon/Kconfig | 4 +- drivers/hwmon/w83627ehf.c | 106 +++++++++++++++++++++++++++--------------- 3 files changed, 92 insertions(+), 47 deletions(-) (limited to 'Documentation') diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf index d6e1ae30fa6e..b6eb59384bb3 100644 --- a/Documentation/hwmon/w83627ehf +++ b/Documentation/hwmon/w83627ehf @@ -2,30 +2,40 @@ Kernel driver w83627ehf ======================= Supported chips: - * Winbond W83627EHF/EHG/DHG (ISA access ONLY) + * Winbond W83627EHF/EHG (ISA access ONLY) Prefix: 'w83627ehf' Addresses scanned: ISA address retrieved from Super I/O registers Datasheet: - http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83627EHF_%20W83627EHGb.pdf - DHG datasheet confidential. + http://www.nuvoton.com.tw/NR/rdonlyres/A6A258F0-F0C9-4F97-81C0-C4D29E7E943E/0/W83627EHF.pdf + * Winbond W83627DHG + Prefix: 'w83627dhg' + Addresses scanned: ISA address retrieved from Super I/O registers + Datasheet: + http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf + * Winbond W83667HG + Prefix: 'w83667hg' + Addresses scanned: ISA address retrieved from Super I/O registers + Datasheet: not available Authors: Jean Delvare Yuan Mu (Winbond) Rudolf Marek David Hubbard + Gong Jun Description ----------- -This driver implements support for the Winbond W83627EHF, W83627EHG, and -W83627DHG super I/O chips. We will refer to them collectively as Winbond chips. +This driver implements support for the Winbond W83627EHF, W83627EHG, +W83627DHG and W83667HG super I/O chips. We will refer to them collectively +as Winbond chips. The chips implement three temperature sensors, five fan rotation speed sensors, ten analog voltage sensors (only nine for the 627DHG), one -VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep -warnings (control unimplemented), and some automatic fan regulation -strategies (plus manual fan control mode). +VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG and 667HG), alarms +with beep warnings (control unimplemented), and some automatic fan +regulation strategies (plus manual fan control mode). Temperatures are measured in degrees Celsius and measurement resolution is 1 degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when @@ -54,7 +64,8 @@ follows: temp1 -> pwm1 temp2 -> pwm2 temp3 -> pwm3 -prog -> pwm4 (the programmable setting is not supported by the driver) +prog -> pwm4 (not on 667HG; the programmable setting is not supported by + the driver) /sys files ---------- diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index b4eea0292c1a..0eb4170cc4af 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -827,7 +827,7 @@ config SENSORS_W83627HF will be called w83627hf. config SENSORS_W83627EHF - tristate "Winbond W83627EHF/DHG" + tristate "Winbond W83627EHF/EHG/DHG, W83667HG" select HWMON_VID help If you say yes here you get support for the hardware @@ -838,6 +838,8 @@ config SENSORS_W83627EHF chip suited for specific Intel processors that use PECI such as the Core 2 Duo. + This driver also supports the W83667HG chip. + This driver can also be built as a module. If so, the module will be called w83627ehf. diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c index 18432e34dc7a..20a9332959bb 100644 --- a/drivers/hwmon/w83627ehf.c +++ b/drivers/hwmon/w83627ehf.c @@ -36,6 +36,7 @@ w83627ehf 10 5 4 3 0x8850 0x88 0x5ca3 0x8860 0xa1 w83627dhg 9 5 4 3 0xa020 0xc1 0x5ca3 + w83667hg 9 5 3 3 0xa510 0xc1 0x5ca3 */ #include @@ -52,12 +53,13 @@ #include #include "lm75.h" -enum kinds { w83627ehf, w83627dhg }; +enum kinds { w83627ehf, w83627dhg, w83667hg }; /* used to set data->name = w83627ehf_device_names[data->sio_kind] */ static const char * w83627ehf_device_names[] = { "w83627ehf", "w83627dhg", + "w83667hg", }; static unsigned short force_id; @@ -71,6 +73,7 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID"); */ #define W83627EHF_LD_HWM 0x0b +#define W83667HG_LD_VID 0x0d #define SIO_REG_LDSEL 0x07 /* Logical device select */ #define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ @@ -83,6 +86,7 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID"); #define SIO_W83627EHF_ID 0x8850 #define SIO_W83627EHG_ID 0x8860 #define SIO_W83627DHG_ID 0xa020 +#define SIO_W83667HG_ID 0xa510 #define SIO_ID_MASK 0xFFF0 static inline void @@ -289,6 +293,7 @@ struct w83627ehf_data { u8 pwm_mode[4]; /* 0->DC variable voltage, 1->PWM variable duty cycle */ u8 pwm_enable[4]; /* 1->manual 2->thermal cruise (also called SmartFan I) */ + u8 pwm_num; /* number of pwm */ u8 pwm[4]; u8 target_temp[4]; u8 tolerance[4]; @@ -1192,7 +1197,7 @@ static void w83627ehf_device_remove_files(struct device *dev) device_remove_file(dev, &sda_fan_div[i].dev_attr); device_remove_file(dev, &sda_fan_min[i].dev_attr); } - for (i = 0; i < 4; i++) { + for (i = 0; i < data->pwm_num; i++) { device_remove_file(dev, &sda_pwm[i].dev_attr); device_remove_file(dev, &sda_pwm_mode[i].dev_attr); device_remove_file(dev, &sda_pwm_enable[i].dev_attr); @@ -1272,8 +1277,10 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev) data->name = w83627ehf_device_names[sio_data->kind]; platform_set_drvdata(pdev, data); - /* 627EHG and 627EHF have 10 voltage inputs; DHG has 9 */ - data->in_num = (sio_data->kind == w83627dhg) ? 9 : 10; + /* 627EHG and 627EHF have 10 voltage inputs; 627DHG and 667HG have 9 */ + data->in_num = (sio_data->kind == w83627ehf) ? 10 : 9; + /* 667HG has 3 pwms */ + data->pwm_num = (sio_data->kind == w83667hg) ? 3 : 4; /* Initialize the chip */ w83627ehf_init_device(data); @@ -1281,44 +1288,64 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev) data->vrm = vid_which_vrm(); superio_enter(sio_data->sioreg); /* Read VID value */ - superio_select(sio_data->sioreg, W83627EHF_LD_HWM); - if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) { - /* Set VID input sensibility if needed. In theory the BIOS - should have set it, but in practice it's not always the - case. We only do it for the W83627EHF/EHG because the - W83627DHG is more complex in this respect. */ - if (sio_data->kind == w83627ehf) { - en_vrm10 = superio_inb(sio_data->sioreg, - SIO_REG_EN_VRM10); - if ((en_vrm10 & 0x08) && data->vrm == 90) { - dev_warn(dev, "Setting VID input voltage to " - "TTL\n"); - superio_outb(sio_data->sioreg, SIO_REG_EN_VRM10, - en_vrm10 & ~0x08); - } else if (!(en_vrm10 & 0x08) && data->vrm == 100) { - dev_warn(dev, "Setting VID input voltage to " - "VRM10\n"); - superio_outb(sio_data->sioreg, SIO_REG_EN_VRM10, - en_vrm10 | 0x08); - } - } - - data->vid = superio_inb(sio_data->sioreg, SIO_REG_VID_DATA); - if (sio_data->kind == w83627ehf) /* 6 VID pins only */ - data->vid &= 0x3f; - + if (sio_data->kind == w83667hg) { + /* W83667HG has different pins for VID input and output, so + we can get the VID input values directly at logical device D + 0xe3. */ + superio_select(sio_data->sioreg, W83667HG_LD_VID); + data->vid = superio_inb(sio_data->sioreg, 0xe3); err = device_create_file(dev, &dev_attr_cpu0_vid); if (err) goto exit_release; } else { - dev_info(dev, "VID pins in output mode, CPU VID not " - "available\n"); + superio_select(sio_data->sioreg, W83627EHF_LD_HWM); + if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) { + /* Set VID input sensibility if needed. In theory the + BIOS should have set it, but in practice it's not + always the case. We only do it for the W83627EHF/EHG + because the W83627DHG is more complex in this + respect. */ + if (sio_data->kind == w83627ehf) { + en_vrm10 = superio_inb(sio_data->sioreg, + SIO_REG_EN_VRM10); + if ((en_vrm10 & 0x08) && data->vrm == 90) { + dev_warn(dev, "Setting VID input " + "voltage to TTL\n"); + superio_outb(sio_data->sioreg, + SIO_REG_EN_VRM10, + en_vrm10 & ~0x08); + } else if (!(en_vrm10 & 0x08) + && data->vrm == 100) { + dev_warn(dev, "Setting VID input " + "voltage to VRM10\n"); + superio_outb(sio_data->sioreg, + SIO_REG_EN_VRM10, + en_vrm10 | 0x08); + } + } + + data->vid = superio_inb(sio_data->sioreg, + SIO_REG_VID_DATA); + if (sio_data->kind == w83627ehf) /* 6 VID pins only */ + data->vid &= 0x3f; + + err = device_create_file(dev, &dev_attr_cpu0_vid); + if (err) + goto exit_release; + } else { + dev_info(dev, "VID pins in output mode, CPU VID not " + "available\n"); + } } /* fan4 and fan5 share some pins with the GPIO and serial flash */ - - fan5pin = !(superio_inb(sio_data->sioreg, 0x24) & 0x2); - fan4pin = !(superio_inb(sio_data->sioreg, 0x29) & 0x6); + if (sio_data->kind == w83667hg) { + fan5pin = superio_inb(sio_data->sioreg, 0x27) & 0x20; + fan4pin = superio_inb(sio_data->sioreg, 0x27) & 0x40; + } else { + fan5pin = !(superio_inb(sio_data->sioreg, 0x24) & 0x02); + fan4pin = !(superio_inb(sio_data->sioreg, 0x29) & 0x06); + } superio_exit(sio_data->sioreg); /* It looks like fan4 and fan5 pins can be alternatively used @@ -1344,7 +1371,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev) goto exit_remove; /* if fan4 is enabled create the sf3 files for it */ - if (data->has_fan & (1 << 3)) + if ((data->has_fan & (1 << 3)) && data->pwm_num >= 4) for (i = 0; i < ARRAY_SIZE(sda_sf3_arrays_fan4); i++) { if ((err = device_create_file(dev, &sda_sf3_arrays_fan4[i].dev_attr))) @@ -1372,7 +1399,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev) || (err = device_create_file(dev, &sda_fan_min[i].dev_attr))) goto exit_remove; - if (i < 4 && /* w83627ehf only has 4 pwm */ + if (i < data->pwm_num && ((err = device_create_file(dev, &sda_pwm[i].dev_attr)) || (err = device_create_file(dev, @@ -1442,6 +1469,7 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr, static const char __initdata sio_name_W83627EHF[] = "W83627EHF"; static const char __initdata sio_name_W83627EHG[] = "W83627EHG"; static const char __initdata sio_name_W83627DHG[] = "W83627DHG"; + static const char __initdata sio_name_W83667HG[] = "W83667HG"; u16 val; const char *sio_name; @@ -1466,6 +1494,10 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr, sio_data->kind = w83627dhg; sio_name = sio_name_W83627DHG; break; + case SIO_W83667HG_ID: + sio_data->kind = w83667hg; + sio_name = sio_name_W83667HG; + break; default: if (val != 0xffff) pr_debug(DRVNAME ": unsupported chip ID: 0x%04x\n", -- cgit v1.2.3 From fb4504fe84b09cbf49fda19e6630a1003d79656a Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 30 Mar 2009 21:46:43 +0200 Subject: Move the pcf8591 driver to hwmon Directory drivers/i2c/chips is going away, so drivers there must find new homes. For the pcf8591 driver, the best choice seems to be the hwmon subsystem. While the Philips PCF8591 device isn't a typical hardware monitoring chip, its DAC interface is compatible with the hwmon one, so it fits somewhat. If a better subsystem is ever created for ADC/DAC chips, the driver could be moved there. Signed-off-by: Jean Delvare Cc: Aurelien Jarno --- Documentation/hwmon/pcf8591 | 90 +++++++++++ Documentation/i2c/chips/pcf8591 | 90 ----------- drivers/hwmon/Kconfig | 14 ++ drivers/hwmon/Makefile | 1 + drivers/hwmon/pcf8591.c | 325 ++++++++++++++++++++++++++++++++++++++++ drivers/i2c/chips/Kconfig | 13 -- drivers/i2c/chips/Makefile | 1 - drivers/i2c/chips/pcf8591.c | 325 ---------------------------------------- 8 files changed, 430 insertions(+), 429 deletions(-) create mode 100644 Documentation/hwmon/pcf8591 delete mode 100644 Documentation/i2c/chips/pcf8591 create mode 100644 drivers/hwmon/pcf8591.c delete mode 100644 drivers/i2c/chips/pcf8591.c (limited to 'Documentation') diff --git a/Documentation/hwmon/pcf8591 b/Documentation/hwmon/pcf8591 new file mode 100644 index 000000000000..5628fcf4207f --- /dev/null +++ b/Documentation/hwmon/pcf8591 @@ -0,0 +1,90 @@ +Kernel driver pcf8591 +===================== + +Supported chips: + * Philips PCF8591 + Prefix: 'pcf8591' + Addresses scanned: I2C 0x48 - 0x4f + Datasheet: Publicly available at the Philips Semiconductor website + http://www.semiconductors.philips.com/pip/PCF8591P.html + +Authors: + Aurelien Jarno + valuable contributions by Jan M. Sendler , + Jean Delvare + + +Description +----------- +The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one +analog output) for the I2C bus produced by Philips Semiconductors. It +is designed to provide a byte I2C interface to up to 4 separate devices. + +The PCF8591 has 4 analog inputs programmable as single-ended or +differential inputs : +- mode 0 : four single ended inputs + Pins AIN0 to AIN3 are single ended inputs for channels 0 to 3 + +- mode 1 : three differential inputs + Pins AIN3 is the common negative differential input + Pins AIN0 to AIN2 are positive differential inputs for channels 0 to 2 + +- mode 2 : single ended and differential mixed + Pins AIN0 and AIN1 are single ended inputs for channels 0 and 1 + Pins AIN2 is the positive differential input for channel 3 + Pins AIN3 is the negative differential input for channel 3 + +- mode 3 : two differential inputs + Pins AIN0 is the positive differential input for channel 0 + Pins AIN1 is the negative differential input for channel 0 + Pins AIN2 is the positive differential input for channel 1 + Pins AIN3 is the negative differential input for channel 1 + +See the datasheet for details. + +Module parameters +----------------- + +* input_mode int + + Analog input mode: + 0 = four single ended inputs + 1 = three differential inputs + 2 = single ended and differential mixed + 3 = two differential inputs + + +Accessing PCF8591 via /sys interface +------------------------------------- + +! Be careful ! +The PCF8591 is plainly impossible to detect ! Stupid chip. +So every chip with address in the interval [48..4f] is +detected as PCF8591. If you have other chips in this address +range, the workaround is to load this module after the one +for your others chips. + +On detection (i.e. insmod, modprobe et al.), directories are being +created for each detected PCF8591: + +/sys/bus/devices/<0>-<1>/ +where <0> is the bus the chip was detected on (e. g. i2c-0) +and <1> the chip address ([48..4f]) + +Inside these directories, there are such files: +in0, in1, in2, in3, out0_enable, out0_output, name + +Name contains chip name. + +The in0, in1, in2 and in3 files are RO. Reading gives the value of the +corresponding channel. Depending on the current analog inputs configuration, +files in2 and/or in3 do not exist. Values range are from 0 to 255 for single +ended inputs and -128 to +127 for differential inputs (8-bit ADC). + +The out0_enable file is RW. Reading gives "1" for analog output enabled and +"0" for analog output disabled. Writing accepts "0" and "1" accordingly. + +The out0_output file is RW. Writing a number between 0 and 255 (8-bit DAC), send +the value to the digital-to-analog converter. Note that a voltage will +only appears on AOUT pin if aout0_enable equals 1. Reading returns the last +value written. diff --git a/Documentation/i2c/chips/pcf8591 b/Documentation/i2c/chips/pcf8591 deleted file mode 100644 index 5628fcf4207f..000000000000 --- a/Documentation/i2c/chips/pcf8591 +++ /dev/null @@ -1,90 +0,0 @@ -Kernel driver pcf8591 -===================== - -Supported chips: - * Philips PCF8591 - Prefix: 'pcf8591' - Addresses scanned: I2C 0x48 - 0x4f - Datasheet: Publicly available at the Philips Semiconductor website - http://www.semiconductors.philips.com/pip/PCF8591P.html - -Authors: - Aurelien Jarno - valuable contributions by Jan M. Sendler , - Jean Delvare - - -Description ------------ -The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one -analog output) for the I2C bus produced by Philips Semiconductors. It -is designed to provide a byte I2C interface to up to 4 separate devices. - -The PCF8591 has 4 analog inputs programmable as single-ended or -differential inputs : -- mode 0 : four single ended inputs - Pins AIN0 to AIN3 are single ended inputs for channels 0 to 3 - -- mode 1 : three differential inputs - Pins AIN3 is the common negative differential input - Pins AIN0 to AIN2 are positive differential inputs for channels 0 to 2 - -- mode 2 : single ended and differential mixed - Pins AIN0 and AIN1 are single ended inputs for channels 0 and 1 - Pins AIN2 is the positive differential input for channel 3 - Pins AIN3 is the negative differential input for channel 3 - -- mode 3 : two differential inputs - Pins AIN0 is the positive differential input for channel 0 - Pins AIN1 is the negative differential input for channel 0 - Pins AIN2 is the positive differential input for channel 1 - Pins AIN3 is the negative differential input for channel 1 - -See the datasheet for details. - -Module parameters ------------------ - -* input_mode int - - Analog input mode: - 0 = four single ended inputs - 1 = three differential inputs - 2 = single ended and differential mixed - 3 = two differential inputs - - -Accessing PCF8591 via /sys interface -------------------------------------- - -! Be careful ! -The PCF8591 is plainly impossible to detect ! Stupid chip. -So every chip with address in the interval [48..4f] is -detected as PCF8591. If you have other chips in this address -range, the workaround is to load this module after the one -for your others chips. - -On detection (i.e. insmod, modprobe et al.), directories are being -created for each detected PCF8591: - -/sys/bus/devices/<0>-<1>/ -where <0> is the bus the chip was detected on (e. g. i2c-0) -and <1> the chip address ([48..4f]) - -Inside these directories, there are such files: -in0, in1, in2, in3, out0_enable, out0_output, name - -Name contains chip name. - -The in0, in1, in2 and in3 files are RO. Reading gives the value of the -corresponding channel. Depending on the current analog inputs configuration, -files in2 and/or in3 do not exist. Values range are from 0 to 255 for single -ended inputs and -128 to +127 for differential inputs (8-bit ADC). - -The out0_enable file is RW. Reading gives "1" for analog output enabled and -"0" for analog output disabled. Writing accepts "0" and "1" accordingly. - -The out0_output file is RW. Writing a number between 0 and 255 (8-bit DAC), send -the value to the digital-to-analog converter. Note that a voltage will -only appears on AOUT pin if aout0_enable equals 1. Reading returns the last -value written. diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 0eb4170cc4af..9a22816b37dd 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -635,6 +635,20 @@ config SENSORS_PC87427 This driver can also be built as a module. If so, the module will be called pc87427. +config SENSORS_PCF8591 + tristate "Philips PCF8591 ADC/DAC" + depends on I2C + default n + help + If you say yes here you get support for Philips PCF8591 4-channel + ADC, 1-channel DAC chips. + + This driver can also be built as a module. If so, the module + will be called pcf8591. + + These devices are hard to detect and rarely found on mainstream + hardware. If unsure, say N. + config SENSORS_SIS5595 tristate "Silicon Integrated Systems Corp. SiS5595" depends on PCI diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 2e80f37f39eb..e332d6267920 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_SENSORS_MAX1619) += max1619.o obj-$(CONFIG_SENSORS_MAX6650) += max6650.o obj-$(CONFIG_SENSORS_PC87360) += pc87360.o obj-$(CONFIG_SENSORS_PC87427) += pc87427.o +obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o obj-$(CONFIG_SENSORS_SIS5595) += sis5595.o obj-$(CONFIG_SENSORS_SMSC47B397)+= smsc47b397.o obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47m1.o diff --git a/drivers/hwmon/pcf8591.c b/drivers/hwmon/pcf8591.c new file mode 100644 index 000000000000..1d7ffebd679d --- /dev/null +++ b/drivers/hwmon/pcf8591.c @@ -0,0 +1,325 @@ +/* + Copyright (C) 2001-2004 Aurelien Jarno + Ported to Linux 2.6 by Aurelien Jarno with + the help of Jean Delvare + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include + +/* Addresses to scan */ +static const unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, 0x4c, + 0x4d, 0x4e, 0x4f, I2C_CLIENT_END }; + +/* Insmod parameters */ +I2C_CLIENT_INSMOD_1(pcf8591); + +static int input_mode; +module_param(input_mode, int, 0); +MODULE_PARM_DESC(input_mode, + "Analog input mode:\n" + " 0 = four single ended inputs\n" + " 1 = three differential inputs\n" + " 2 = single ended and differential mixed\n" + " 3 = two differential inputs\n"); + +/* The PCF8591 control byte + 7 6 5 4 3 2 1 0 + | 0 |AOEF| AIP | 0 |AINC| AICH | */ + +/* Analog Output Enable Flag (analog output active if 1) */ +#define PCF8591_CONTROL_AOEF 0x40 + +/* Analog Input Programming + 0x00 = four single ended inputs + 0x10 = three differential inputs + 0x20 = single ended and differential mixed + 0x30 = two differential inputs */ +#define PCF8591_CONTROL_AIP_MASK 0x30 + +/* Autoincrement Flag (switch on if 1) */ +#define PCF8591_CONTROL_AINC 0x04 + +/* Channel selection + 0x00 = channel 0 + 0x01 = channel 1 + 0x02 = channel 2 + 0x03 = channel 3 */ +#define PCF8591_CONTROL_AICH_MASK 0x03 + +/* Initial values */ +#define PCF8591_INIT_CONTROL ((input_mode << 4) | PCF8591_CONTROL_AOEF) +#define PCF8591_INIT_AOUT 0 /* DAC out = 0 */ + +/* Conversions */ +#define REG_TO_SIGNED(reg) (((reg) & 0x80)?((reg) - 256):(reg)) + +struct pcf8591_data { + struct mutex update_lock; + + u8 control; + u8 aout; +}; + +static void pcf8591_init_client(struct i2c_client *client); +static int pcf8591_read_channel(struct device *dev, int channel); + +/* following are the sysfs callback functions */ +#define show_in_channel(channel) \ +static ssize_t show_in##channel##_input(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%d\n", pcf8591_read_channel(dev, channel));\ +} \ +static DEVICE_ATTR(in##channel##_input, S_IRUGO, \ + show_in##channel##_input, NULL); + +show_in_channel(0); +show_in_channel(1); +show_in_channel(2); +show_in_channel(3); + +static ssize_t show_out0_ouput(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev)); + return sprintf(buf, "%d\n", data->aout * 10); +} + +static ssize_t set_out0_output(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned int value; + struct i2c_client *client = to_i2c_client(dev); + struct pcf8591_data *data = i2c_get_clientdata(client); + if ((value = (simple_strtoul(buf, NULL, 10) + 5) / 10) <= 255) { + data->aout = value; + i2c_smbus_write_byte_data(client, data->control, data->aout); + return count; + } + return -EINVAL; +} + +static DEVICE_ATTR(out0_output, S_IWUSR | S_IRUGO, + show_out0_ouput, set_out0_output); + +static ssize_t show_out0_enable(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev)); + return sprintf(buf, "%u\n", !(!(data->control & PCF8591_CONTROL_AOEF))); +} + +static ssize_t set_out0_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct pcf8591_data *data = i2c_get_clientdata(client); + unsigned long val = simple_strtoul(buf, NULL, 10); + + mutex_lock(&data->update_lock); + if (val) + data->control |= PCF8591_CONTROL_AOEF; + else + data->control &= ~PCF8591_CONTROL_AOEF; + i2c_smbus_write_byte(client, data->control); + mutex_unlock(&data->update_lock); + return count; +} + +static DEVICE_ATTR(out0_enable, S_IWUSR | S_IRUGO, + show_out0_enable, set_out0_enable); + +static struct attribute *pcf8591_attributes[] = { + &dev_attr_out0_enable.attr, + &dev_attr_out0_output.attr, + &dev_attr_in0_input.attr, + &dev_attr_in1_input.attr, + NULL +}; + +static const struct attribute_group pcf8591_attr_group = { + .attrs = pcf8591_attributes, +}; + +static struct attribute *pcf8591_attributes_opt[] = { + &dev_attr_in2_input.attr, + &dev_attr_in3_input.attr, + NULL +}; + +static const struct attribute_group pcf8591_attr_group_opt = { + .attrs = pcf8591_attributes_opt, +}; + +/* + * Real code + */ + +/* Return 0 if detection is successful, -ENODEV otherwise */ +static int pcf8591_detect(struct i2c_client *client, int kind, + struct i2c_board_info *info) +{ + struct i2c_adapter *adapter = client->adapter; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE + | I2C_FUNC_SMBUS_WRITE_BYTE_DATA)) + return -ENODEV; + + /* Now, we would do the remaining detection. But the PCF8591 is plainly + impossible to detect! Stupid chip. */ + + strlcpy(info->type, "pcf8591", I2C_NAME_SIZE); + + return 0; +} + +static int pcf8591_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct pcf8591_data *data; + int err; + + if (!(data = kzalloc(sizeof(struct pcf8591_data), GFP_KERNEL))) { + err = -ENOMEM; + goto exit; + } + + i2c_set_clientdata(client, data); + mutex_init(&data->update_lock); + + /* Initialize the PCF8591 chip */ + pcf8591_init_client(client); + + /* Register sysfs hooks */ + err = sysfs_create_group(&client->dev.kobj, &pcf8591_attr_group); + if (err) + goto exit_kfree; + + /* Register input2 if not in "two differential inputs" mode */ + if (input_mode != 3) { + if ((err = device_create_file(&client->dev, + &dev_attr_in2_input))) + goto exit_sysfs_remove; + } + + /* Register input3 only in "four single ended inputs" mode */ + if (input_mode == 0) { + if ((err = device_create_file(&client->dev, + &dev_attr_in3_input))) + goto exit_sysfs_remove; + } + + return 0; + +exit_sysfs_remove: + sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt); + sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group); +exit_kfree: + kfree(data); +exit: + return err; +} + +static int pcf8591_remove(struct i2c_client *client) +{ + sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt); + sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group); + kfree(i2c_get_clientdata(client)); + return 0; +} + +/* Called when we have found a new PCF8591. */ +static void pcf8591_init_client(struct i2c_client *client) +{ + struct pcf8591_data *data = i2c_get_clientdata(client); + data->control = PCF8591_INIT_CONTROL; + data->aout = PCF8591_INIT_AOUT; + + i2c_smbus_write_byte_data(client, data->control, data->aout); + + /* The first byte transmitted contains the conversion code of the + previous read cycle. FLUSH IT! */ + i2c_smbus_read_byte(client); +} + +static int pcf8591_read_channel(struct device *dev, int channel) +{ + u8 value; + struct i2c_client *client = to_i2c_client(dev); + struct pcf8591_data *data = i2c_get_clientdata(client); + + mutex_lock(&data->update_lock); + + if ((data->control & PCF8591_CONTROL_AICH_MASK) != channel) { + data->control = (data->control & ~PCF8591_CONTROL_AICH_MASK) + | channel; + i2c_smbus_write_byte(client, data->control); + + /* The first byte transmitted contains the conversion code of + the previous read cycle. FLUSH IT! */ + i2c_smbus_read_byte(client); + } + value = i2c_smbus_read_byte(client); + + mutex_unlock(&data->update_lock); + + if ((channel == 2 && input_mode == 2) || + (channel != 3 && (input_mode == 1 || input_mode == 3))) + return (10 * REG_TO_SIGNED(value)); + else + return (10 * value); +} + +static const struct i2c_device_id pcf8591_id[] = { + { "pcf8591", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, pcf8591_id); + +static struct i2c_driver pcf8591_driver = { + .driver = { + .name = "pcf8591", + }, + .probe = pcf8591_probe, + .remove = pcf8591_remove, + .id_table = pcf8591_id, + + .class = I2C_CLASS_HWMON, /* Nearest choice */ + .detect = pcf8591_detect, + .address_data = &addr_data, +}; + +static int __init pcf8591_init(void) +{ + if (input_mode < 0 || input_mode > 3) { + printk(KERN_WARNING "pcf8591: invalid input_mode (%d)\n", + input_mode); + input_mode = 0; + } + return i2c_add_driver(&pcf8591_driver); +} + +static void __exit pcf8591_exit(void) +{ + i2c_del_driver(&pcf8591_driver); +} + +MODULE_AUTHOR("Aurelien Jarno "); +MODULE_DESCRIPTION("PCF8591 driver"); +MODULE_LICENSE("GPL"); + +module_init(pcf8591_init); +module_exit(pcf8591_exit); diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig index c80312c1f382..8f8c81eb0aee 100644 --- a/drivers/i2c/chips/Kconfig +++ b/drivers/i2c/chips/Kconfig @@ -64,19 +64,6 @@ config SENSORS_PCA9539 This driver is deprecated and will be dropped soon. Use drivers/gpio/pca953x.c instead. -config SENSORS_PCF8591 - tristate "Philips PCF8591" - depends on EXPERIMENTAL - default n - help - If you say yes here you get support for Philips PCF8591 chips. - - This driver can also be built as a module. If so, the module - will be called pcf8591. - - These devices are hard to detect and rarely found on mainstream - hardware. If unsure, say N. - config SENSORS_MAX6875 tristate "Maxim MAX6875 Power supply supervisor" depends on EXPERIMENTAL diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile index d142f238a2de..55a376037183 100644 --- a/drivers/i2c/chips/Makefile +++ b/drivers/i2c/chips/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_SENSORS_MAX6875) += max6875.o obj-$(CONFIG_SENSORS_PCA9539) += pca9539.o obj-$(CONFIG_SENSORS_PCF8574) += pcf8574.o obj-$(CONFIG_PCF8575) += pcf8575.o -obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o ifeq ($(CONFIG_I2C_DEBUG_CHIP),y) diff --git a/drivers/i2c/chips/pcf8591.c b/drivers/i2c/chips/pcf8591.c deleted file mode 100644 index 16ce3e193776..000000000000 --- a/drivers/i2c/chips/pcf8591.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - Copyright (C) 2001-2004 Aurelien Jarno - Ported to Linux 2.6 by Aurelien Jarno with - the help of Jean Delvare - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#include -#include -#include -#include -#include - -/* Addresses to scan */ -static const unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, 0x4c, - 0x4d, 0x4e, 0x4f, I2C_CLIENT_END }; - -/* Insmod parameters */ -I2C_CLIENT_INSMOD_1(pcf8591); - -static int input_mode; -module_param(input_mode, int, 0); -MODULE_PARM_DESC(input_mode, - "Analog input mode:\n" - " 0 = four single ended inputs\n" - " 1 = three differential inputs\n" - " 2 = single ended and differential mixed\n" - " 3 = two differential inputs\n"); - -/* The PCF8591 control byte - 7 6 5 4 3 2 1 0 - | 0 |AOEF| AIP | 0 |AINC| AICH | */ - -/* Analog Output Enable Flag (analog output active if 1) */ -#define PCF8591_CONTROL_AOEF 0x40 - -/* Analog Input Programming - 0x00 = four single ended inputs - 0x10 = three differential inputs - 0x20 = single ended and differential mixed - 0x30 = two differential inputs */ -#define PCF8591_CONTROL_AIP_MASK 0x30 - -/* Autoincrement Flag (switch on if 1) */ -#define PCF8591_CONTROL_AINC 0x04 - -/* Channel selection - 0x00 = channel 0 - 0x01 = channel 1 - 0x02 = channel 2 - 0x03 = channel 3 */ -#define PCF8591_CONTROL_AICH_MASK 0x03 - -/* Initial values */ -#define PCF8591_INIT_CONTROL ((input_mode << 4) | PCF8591_CONTROL_AOEF) -#define PCF8591_INIT_AOUT 0 /* DAC out = 0 */ - -/* Conversions */ -#define REG_TO_SIGNED(reg) (((reg) & 0x80)?((reg) - 256):(reg)) - -struct pcf8591_data { - struct mutex update_lock; - - u8 control; - u8 aout; -}; - -static void pcf8591_init_client(struct i2c_client *client); -static int pcf8591_read_channel(struct device *dev, int channel); - -/* following are the sysfs callback functions */ -#define show_in_channel(channel) \ -static ssize_t show_in##channel##_input(struct device *dev, struct device_attribute *attr, char *buf) \ -{ \ - return sprintf(buf, "%d\n", pcf8591_read_channel(dev, channel));\ -} \ -static DEVICE_ATTR(in##channel##_input, S_IRUGO, \ - show_in##channel##_input, NULL); - -show_in_channel(0); -show_in_channel(1); -show_in_channel(2); -show_in_channel(3); - -static ssize_t show_out0_ouput(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev)); - return sprintf(buf, "%d\n", data->aout * 10); -} - -static ssize_t set_out0_output(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - unsigned int value; - struct i2c_client *client = to_i2c_client(dev); - struct pcf8591_data *data = i2c_get_clientdata(client); - if ((value = (simple_strtoul(buf, NULL, 10) + 5) / 10) <= 255) { - data->aout = value; - i2c_smbus_write_byte_data(client, data->control, data->aout); - return count; - } - return -EINVAL; -} - -static DEVICE_ATTR(out0_output, S_IWUSR | S_IRUGO, - show_out0_ouput, set_out0_output); - -static ssize_t show_out0_enable(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct pcf8591_data *data = i2c_get_clientdata(to_i2c_client(dev)); - return sprintf(buf, "%u\n", !(!(data->control & PCF8591_CONTROL_AOEF))); -} - -static ssize_t set_out0_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - struct i2c_client *client = to_i2c_client(dev); - struct pcf8591_data *data = i2c_get_clientdata(client); - unsigned long val = simple_strtoul(buf, NULL, 10); - - mutex_lock(&data->update_lock); - if (val) - data->control |= PCF8591_CONTROL_AOEF; - else - data->control &= ~PCF8591_CONTROL_AOEF; - i2c_smbus_write_byte(client, data->control); - mutex_unlock(&data->update_lock); - return count; -} - -static DEVICE_ATTR(out0_enable, S_IWUSR | S_IRUGO, - show_out0_enable, set_out0_enable); - -static struct attribute *pcf8591_attributes[] = { - &dev_attr_out0_enable.attr, - &dev_attr_out0_output.attr, - &dev_attr_in0_input.attr, - &dev_attr_in1_input.attr, - NULL -}; - -static const struct attribute_group pcf8591_attr_group = { - .attrs = pcf8591_attributes, -}; - -static struct attribute *pcf8591_attributes_opt[] = { - &dev_attr_in2_input.attr, - &dev_attr_in3_input.attr, - NULL -}; - -static const struct attribute_group pcf8591_attr_group_opt = { - .attrs = pcf8591_attributes_opt, -}; - -/* - * Real code - */ - -/* Return 0 if detection is successful, -ENODEV otherwise */ -static int pcf8591_detect(struct i2c_client *client, int kind, - struct i2c_board_info *info) -{ - struct i2c_adapter *adapter = client->adapter; - - if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE - | I2C_FUNC_SMBUS_WRITE_BYTE_DATA)) - return -ENODEV; - - /* Now, we would do the remaining detection. But the PCF8591 is plainly - impossible to detect! Stupid chip. */ - - strlcpy(info->type, "pcf8591", I2C_NAME_SIZE); - - return 0; -} - -static int pcf8591_probe(struct i2c_client *client, - const struct i2c_device_id *id) -{ - struct pcf8591_data *data; - int err; - - if (!(data = kzalloc(sizeof(struct pcf8591_data), GFP_KERNEL))) { - err = -ENOMEM; - goto exit; - } - - i2c_set_clientdata(client, data); - mutex_init(&data->update_lock); - - /* Initialize the PCF8591 chip */ - pcf8591_init_client(client); - - /* Register sysfs hooks */ - err = sysfs_create_group(&client->dev.kobj, &pcf8591_attr_group); - if (err) - goto exit_kfree; - - /* Register input2 if not in "two differential inputs" mode */ - if (input_mode != 3) { - if ((err = device_create_file(&client->dev, - &dev_attr_in2_input))) - goto exit_sysfs_remove; - } - - /* Register input3 only in "four single ended inputs" mode */ - if (input_mode == 0) { - if ((err = device_create_file(&client->dev, - &dev_attr_in3_input))) - goto exit_sysfs_remove; - } - - return 0; - -exit_sysfs_remove: - sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt); - sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group); -exit_kfree: - kfree(data); -exit: - return err; -} - -static int pcf8591_remove(struct i2c_client *client) -{ - sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group_opt); - sysfs_remove_group(&client->dev.kobj, &pcf8591_attr_group); - kfree(i2c_get_clientdata(client)); - return 0; -} - -/* Called when we have found a new PCF8591. */ -static void pcf8591_init_client(struct i2c_client *client) -{ - struct pcf8591_data *data = i2c_get_clientdata(client); - data->control = PCF8591_INIT_CONTROL; - data->aout = PCF8591_INIT_AOUT; - - i2c_smbus_write_byte_data(client, data->control, data->aout); - - /* The first byte transmitted contains the conversion code of the - previous read cycle. FLUSH IT! */ - i2c_smbus_read_byte(client); -} - -static int pcf8591_read_channel(struct device *dev, int channel) -{ - u8 value; - struct i2c_client *client = to_i2c_client(dev); - struct pcf8591_data *data = i2c_get_clientdata(client); - - mutex_lock(&data->update_lock); - - if ((data->control & PCF8591_CONTROL_AICH_MASK) != channel) { - data->control = (data->control & ~PCF8591_CONTROL_AICH_MASK) - | channel; - i2c_smbus_write_byte(client, data->control); - - /* The first byte transmitted contains the conversion code of - the previous read cycle. FLUSH IT! */ - i2c_smbus_read_byte(client); - } - value = i2c_smbus_read_byte(client); - - mutex_unlock(&data->update_lock); - - if ((channel == 2 && input_mode == 2) || - (channel != 3 && (input_mode == 1 || input_mode == 3))) - return (10 * REG_TO_SIGNED(value)); - else - return (10 * value); -} - -static const struct i2c_device_id pcf8591_id[] = { - { "pcf8591", 0 }, - { } -}; -MODULE_DEVICE_TABLE(i2c, pcf8591_id); - -static struct i2c_driver pcf8591_driver = { - .driver = { - .name = "pcf8591", - }, - .probe = pcf8591_probe, - .remove = pcf8591_remove, - .id_table = pcf8591_id, - - .class = I2C_CLASS_HWMON, /* Nearest choice */ - .detect = pcf8591_detect, - .address_data = &addr_data, -}; - -static int __init pcf8591_init(void) -{ - if (input_mode < 0 || input_mode > 3) { - printk(KERN_WARNING "pcf8591: invalid input_mode (%d)\n", - input_mode); - input_mode = 0; - } - return i2c_add_driver(&pcf8591_driver); -} - -static void __exit pcf8591_exit(void) -{ - i2c_del_driver(&pcf8591_driver); -} - -MODULE_AUTHOR("Aurelien Jarno "); -MODULE_DESCRIPTION("PCF8591 driver"); -MODULE_LICENSE("GPL"); - -module_init(pcf8591_init); -module_exit(pcf8591_exit); -- cgit v1.2.3 From ec19920944246b4686c7772a58507a20c361dc9d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 30 Mar 2009 21:46:44 +0200 Subject: hwmon: Define a standard interface for chassis intrusion detection Define a standard interface for the chassis intrusion detection feature some hardware monitoring chips have. Some drivers have custom sysfs entries for it, but a standard interface would allow integration with user-space (namely libsensors.) Signed-off-by: Jean Delvare Acked-by: Hans de Goede Acked-by: Matt Roberds --- Documentation/hwmon/sysfs-interface | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'Documentation') diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index 6dbfd5efd991..2f10ce6a879f 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface @@ -365,6 +365,7 @@ energy[1-*]_input Cumulative energy use Unit: microJoule RO + ********** * Alarms * ********** @@ -453,6 +454,27 @@ beep_mask Bitmask for beep. RW +*********************** +* Intrusion detection * +*********************** + +intrusion[0-*]_alarm + Chassis intrusion detection + 0: OK + 1: intrusion detected + RW + Contrary to regular alarm flags which clear themselves + automatically when read, this one sticks until cleared by + the user. This is done by writing 0 to the file. Writing + other values is unsupported. + +intrusion[0-*]_beep + Chassis intrusion beep + 0: disable + 1: enable + RW + + sysfs attribute writes interpretation ------------------------------------- -- cgit v1.2.3 From 99b76233803beab302123d243eea9e41149804f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 25 Mar 2009 22:48:06 +0300 Subject: proc 2/2: remove struct proc_dir_entry::owner Setting ->owner as done currently (pde->owner = THIS_MODULE) is racy as correctly noted at bug #12454. Someone can lookup entry with NULL ->owner, thus not pinning enything, and release it later resulting in module refcount underflow. We can keep ->owner and supply it at registration time like ->proc_fops and ->data. But this leaves ->owner as easy-manipulative field (just one C assignment) and somebody will forget to unpin previous/pin current module when switching ->owner. ->proc_fops is declared as "const" which should give some thoughts. ->read_proc/->write_proc were just fixed to not require ->owner for protection. rmmod'ed directories will be empty and return "." and ".." -- no harm. And directories with tricky enough readdir and lookup shouldn't be modular. We definitely don't want such modular code. Removing ->owner will also make PDE smaller. So, let's nuke it. Kudos to Jeff Layton for reminding about this, let's say, oversight. http://bugzilla.kernel.org/show_bug.cgi?id=12454 Signed-off-by: Alexey Dobriyan --- Documentation/DocBook/procfs_example.c | 9 --------- arch/alpha/kernel/srm_env.c | 5 ----- arch/blackfin/mm/sram-alloc.c | 1 - arch/ia64/kernel/palinfo.c | 2 -- arch/ia64/sn/kernel/sn2/prominfo_proc.c | 9 ++------- arch/powerpc/kernel/rtas_flash.c | 1 - arch/sparc/kernel/led.c | 1 - arch/x86/kernel/cpu/mtrr/if.c | 10 +--------- drivers/acpi/ac.c | 1 - drivers/acpi/battery.c | 1 - drivers/acpi/button.c | 3 --- drivers/acpi/fan.c | 2 -- drivers/acpi/processor_core.c | 2 -- drivers/acpi/sbs.c | 1 - drivers/acpi/thermal.c | 2 -- drivers/acpi/video.c | 5 ----- drivers/block/ps3vram.c | 2 -- drivers/char/ipmi/ipmi_msghandler.c | 12 ++++------- drivers/char/ipmi/ipmi_si_intf.c | 6 +++--- drivers/input/input.c | 2 -- drivers/isdn/hardware/eicon/divasi.c | 1 - drivers/media/video/cpia.c | 4 +--- drivers/message/i2o/i2o_proc.c | 2 -- drivers/net/bonding/bond_main.c | 35 ++------------------------------- drivers/net/irda/vlsi_ir.c | 7 ------- drivers/net/wireless/airo.c | 1 - drivers/platform/x86/asus_acpi.c | 3 --- drivers/platform/x86/thinkpad_acpi.c | 2 -- drivers/platform/x86/toshiba_acpi.c | 3 --- drivers/rtc/rtc-proc.c | 10 ++-------- drivers/s390/block/dasd_proc.c | 2 -- drivers/scsi/scsi_devinfo.c | 2 -- drivers/scsi/scsi_proc.c | 3 --- drivers/video/via/viafbdev.c | 5 ----- fs/afs/proc.c | 1 - fs/cifs/cifs_debug.c | 1 - fs/jfs/jfs_debug.c | 1 - fs/nfs/client.c | 2 -- fs/proc/inode.c | 19 +++--------------- fs/proc/proc_tty.c | 1 - fs/reiserfs/procfs.c | 5 +---- include/linux/ipmi_smi.h | 2 +- include/linux/proc_fs.h | 4 ---- net/appletalk/atalk_proc.c | 1 - net/atm/mpoa_proc.c | 1 - net/atm/proc.c | 1 - net/can/bcm.c | 4 ---- net/can/proc.c | 2 -- net/core/pktgen.c | 1 - net/irda/irproc.c | 1 - net/llc/llc_proc.c | 1 - net/sctp/protocol.c | 8 ++------ net/sunrpc/cache.c | 4 ---- net/sunrpc/stats.c | 10 ++-------- sound/core/info.c | 31 ++--------------------------- 55 files changed, 26 insertions(+), 232 deletions(-) (limited to 'Documentation') diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c index 8c6396e4bf31..a5b11793b1e0 100644 --- a/Documentation/DocBook/procfs_example.c +++ b/Documentation/DocBook/procfs_example.c @@ -117,9 +117,6 @@ static int __init init_procfs_example(void) rv = -ENOMEM; goto out; } - - example_dir->owner = THIS_MODULE; - /* create jiffies using convenience function */ jiffies_file = create_proc_read_entry("jiffies", 0444, example_dir, @@ -130,8 +127,6 @@ static int __init init_procfs_example(void) goto no_jiffies; } - jiffies_file->owner = THIS_MODULE; - /* create foo and bar files using same callback * functions */ @@ -146,7 +141,6 @@ static int __init init_procfs_example(void) foo_file->data = &foo_data; foo_file->read_proc = proc_read_foobar; foo_file->write_proc = proc_write_foobar; - foo_file->owner = THIS_MODULE; bar_file = create_proc_entry("bar", 0644, example_dir); if(bar_file == NULL) { @@ -159,7 +153,6 @@ static int __init init_procfs_example(void) bar_file->data = &bar_data; bar_file->read_proc = proc_read_foobar; bar_file->write_proc = proc_write_foobar; - bar_file->owner = THIS_MODULE; /* create symlink */ symlink = proc_symlink("jiffies_too", example_dir, @@ -169,8 +162,6 @@ static int __init init_procfs_example(void) goto no_symlink; } - symlink->owner = THIS_MODULE; - /* everything OK */ printk(KERN_INFO "%s %s initialised\n", MODULE_NAME, MODULE_VERS); diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index 78ad7cd1bbd6..d12af472e1c0 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -218,7 +218,6 @@ srm_env_init(void) BASE_DIR); goto cleanup; } - base_dir->owner = THIS_MODULE; /* * Create per-name subdirectory @@ -229,7 +228,6 @@ srm_env_init(void) BASE_DIR, NAMED_DIR); goto cleanup; } - named_dir->owner = THIS_MODULE; /* * Create per-number subdirectory @@ -241,7 +239,6 @@ srm_env_init(void) goto cleanup; } - numbered_dir->owner = THIS_MODULE; /* * Create all named nodes @@ -254,7 +251,6 @@ srm_env_init(void) goto cleanup; entry->proc_entry->data = (void *) entry; - entry->proc_entry->owner = THIS_MODULE; entry->proc_entry->read_proc = srm_env_read; entry->proc_entry->write_proc = srm_env_write; @@ -275,7 +271,6 @@ srm_env_init(void) entry->id = var_num; entry->proc_entry->data = (void *) entry; - entry->proc_entry->owner = THIS_MODULE; entry->proc_entry->read_proc = srm_env_read; entry->proc_entry->write_proc = srm_env_write; } diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 834cab7438a8..530d1393a232 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -854,7 +854,6 @@ static int __init sram_proc_init(void) printk(KERN_WARNING "unable to create /proc/sram\n"); return -1; } - ptr->owner = THIS_MODULE; ptr->read_proc = sram_proc_read; return 0; } diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index e5c57f413ca2..a4f19c70aadd 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -1002,8 +1002,6 @@ create_palinfo_proc_entries(unsigned int cpu) *pdir = create_proc_read_entry( palinfo_entries[j].name, 0, cpu_dir, palinfo_read_entry, (void *)f.value); - if (*pdir) - (*pdir)->owner = THIS_MODULE; pdir++; } } diff --git a/arch/ia64/sn/kernel/sn2/prominfo_proc.c b/arch/ia64/sn/kernel/sn2/prominfo_proc.c index 4dcce3d0e04c..e63328818643 100644 --- a/arch/ia64/sn/kernel/sn2/prominfo_proc.c +++ b/arch/ia64/sn/kernel/sn2/prominfo_proc.c @@ -225,7 +225,6 @@ static struct proc_dir_entry *sgi_prominfo_entry; int __init prominfo_init(void) { struct proc_dir_entry **entp; - struct proc_dir_entry *p; cnodeid_t cnodeid; unsigned long nasid; int size; @@ -246,14 +245,10 @@ int __init prominfo_init(void) sprintf(name, "node%d", cnodeid); *entp = proc_mkdir(name, sgi_prominfo_entry); nasid = cnodeid_to_nasid(cnodeid); - p = create_proc_read_entry("fit", 0, *entp, read_fit_entry, + create_proc_read_entry("fit", 0, *entp, read_fit_entry, (void *)nasid); - if (p) - p->owner = THIS_MODULE; - p = create_proc_read_entry("version", 0, *entp, + create_proc_read_entry("version", 0, *entp, read_version_entry, (void *)nasid); - if (p) - p->owner = THIS_MODULE; entp++; } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 149cb112cd1a..13011a96a977 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -669,7 +669,6 @@ static void remove_flash_pde(struct proc_dir_entry *dp) { if (dp) { kfree(dp->data); - dp->owner = NULL; remove_proc_entry(dp->name, dp->parent); } } diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index adaaed4ea2fb..00d034ea2164 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -126,7 +126,6 @@ static int __init led_init(void) led = proc_create("led", 0, NULL, &led_proc_fops); if (!led) return -ENOMEM; - led->owner = THIS_MODULE; printk(KERN_INFO "led: version %s, Lars Kotthoff \n", diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 4c4214690dd1..fb73a52913a4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = { .release = mtrr_close, }; - -static struct proc_dir_entry *proc_root_mtrr; - - static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; @@ -423,11 +419,7 @@ static int __init mtrr_if_init(void) (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) return -ENODEV; - proc_root_mtrr = - proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); - - if (proc_root_mtrr) - proc_root_mtrr->owner = THIS_MODULE; + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 9b917dac7732..88e42abf5d88 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -191,7 +191,6 @@ static int acpi_ac_add_fs(struct acpi_device *device) acpi_ac_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'state' [R] */ diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 69cbc57c2d1c..3bcb5bfc45d3 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -760,7 +760,6 @@ static int acpi_battery_add_fs(struct acpi_device *device) acpi_battery_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } for (i = 0; i < ACPI_BATTERY_NUMFILES; ++i) { diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 171fd914f435..c2f06069dcd4 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -200,12 +200,10 @@ static int acpi_button_add_fs(struct acpi_device *device) if (!entry) return -ENODEV; - entry->owner = THIS_MODULE; acpi_device_dir(device) = proc_mkdir(acpi_device_bid(device), entry); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; /* 'info' [R] */ entry = proc_create_data(ACPI_BUTTON_FILE_INFO, @@ -522,7 +520,6 @@ static int __init acpi_button_init(void) acpi_button_dir = proc_mkdir(ACPI_BUTTON_CLASS, acpi_root_dir); if (!acpi_button_dir) return -ENODEV; - acpi_button_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_button_driver); if (result < 0) { remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir); diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index eaaee1660bdf..8a02944bf92d 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -193,7 +193,6 @@ static int acpi_fan_add_fs(struct acpi_device *device) acpi_fan_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'status' [R/W] */ @@ -347,7 +346,6 @@ static int __init acpi_fan_init(void) acpi_fan_dir = proc_mkdir(ACPI_FAN_CLASS, acpi_root_dir); if (!acpi_fan_dir) return -ENODEV; - acpi_fan_dir->owner = THIS_MODULE; #endif result = acpi_bus_register_driver(&acpi_fan_driver); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 0cc2fd31e376..fa2f7422d23d 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -359,7 +359,6 @@ static int acpi_processor_add_fs(struct acpi_device *device) if (!acpi_device_dir(device)) return -ENODEV; } - acpi_device_dir(device)->owner = THIS_MODULE; /* 'info' [R] */ entry = proc_create_data(ACPI_PROCESSOR_FILE_INFO, @@ -1137,7 +1136,6 @@ static int __init acpi_processor_init(void) acpi_processor_dir = proc_mkdir(ACPI_PROCESSOR_CLASS, acpi_root_dir); if (!acpi_processor_dir) return -ENOMEM; - acpi_processor_dir->owner = THIS_MODULE; /* * Check whether the system is DMI table. If yes, OSPM diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 6050ce481873..59afd52ccc12 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -488,7 +488,6 @@ acpi_sbs_add_fs(struct proc_dir_entry **dir, if (!*dir) { return -ENODEV; } - (*dir)->owner = THIS_MODULE; } /* 'info' [R] */ diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 99e6f1f8ea45..c11f9aeca706 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -1506,7 +1506,6 @@ static int acpi_thermal_add_fs(struct acpi_device *device) acpi_thermal_dir); if (!acpi_device_dir(device)) return -ENODEV; - acpi_device_dir(device)->owner = THIS_MODULE; } /* 'state' [R] */ @@ -1875,7 +1874,6 @@ static int __init acpi_thermal_init(void) acpi_thermal_dir = proc_mkdir(ACPI_THERMAL_CLASS, acpi_root_dir); if (!acpi_thermal_dir) return -ENODEV; - acpi_thermal_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_thermal_driver); if (result < 0) { diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index bb5ed059114a..67cc36dc9b82 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1125,8 +1125,6 @@ static int acpi_video_device_add_fs(struct acpi_device *device) if (!device_dir) return -ENOMEM; - device_dir->owner = THIS_MODULE; - /* 'info' [R] */ entry = proc_create_data("info", S_IRUGO, device_dir, &acpi_video_device_info_fops, acpi_driver_data(device)); @@ -1403,8 +1401,6 @@ static int acpi_video_bus_add_fs(struct acpi_device *device) if (!device_dir) return -ENOMEM; - device_dir->owner = THIS_MODULE; - /* 'info' [R] */ entry = proc_create_data("info", S_IRUGO, device_dir, &acpi_video_bus_info_fops, @@ -2131,7 +2127,6 @@ static int __init acpi_video_init(void) acpi_video_dir = proc_mkdir(ACPI_VIDEO_CLASS, acpi_root_dir); if (!acpi_video_dir) return -ENODEV; - acpi_video_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&acpi_video_bus); if (result < 0) { diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 393ed6760d78..8eddef373a91 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -551,8 +551,6 @@ static void __devinit ps3vram_proc_init(struct ps3_system_bus_device *dev) dev_warn(&dev->core, "failed to create /proc entry\n"); return; } - - pde->owner = THIS_MODULE; pde->data = priv; } diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 7a88dfd4427b..e93fc8d22fb2 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1944,7 +1944,7 @@ static int stat_file_read_proc(char *page, char **start, off_t off, int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, read_proc_t *read_proc, - void *data, struct module *owner) + void *data) { int rv = 0; #ifdef CONFIG_PROC_FS @@ -1970,7 +1970,6 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, } else { file->data = data; file->read_proc = read_proc; - file->owner = owner; mutex_lock(&smi->proc_entry_lock); /* Stick it on the list. */ @@ -1993,23 +1992,21 @@ static int add_proc_entries(ipmi_smi_t smi, int num) smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root); if (!smi->proc_dir) rv = -ENOMEM; - else - smi->proc_dir->owner = THIS_MODULE; if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "stats", stat_file_read_proc, - smi, THIS_MODULE); + smi); if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "ipmb", ipmb_file_read_proc, - smi, THIS_MODULE); + smi); if (rv == 0) rv = ipmi_smi_add_proc_entry(smi, "version", version_file_read_proc, - smi, THIS_MODULE); + smi); #endif /* CONFIG_PROC_FS */ return rv; @@ -4265,7 +4262,6 @@ static int ipmi_init_msghandler(void) return -ENOMEM; } - proc_ipmi_root->owner = THIS_MODULE; #endif /* CONFIG_PROC_FS */ setup_timer(&ipmi_timer, ipmi_timeout, 0); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 3000135f2ead..e58ea4cd55ce 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2899,7 +2899,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "type", type_file_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", @@ -2909,7 +2909,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "si_stats", stat_file_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", @@ -2919,7 +2919,7 @@ static int try_smi_init(struct smi_info *new_smi) rv = ipmi_smi_add_proc_entry(new_smi->intf, "params", param_read_proc, - new_smi, THIS_MODULE); + new_smi); if (rv) { printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n", diff --git a/drivers/input/input.c b/drivers/input/input.c index 1730d7331a5d..ec3db3ade118 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -903,8 +903,6 @@ static int __init input_proc_init(void) if (!proc_bus_input_dir) return -ENOMEM; - proc_bus_input_dir->owner = THIS_MODULE; - entry = proc_create("devices", 0, proc_bus_input_dir, &input_devices_fileops); if (!entry) diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index f4969fe0a055..69e71ebe7841 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -118,7 +118,6 @@ static int DIVA_INIT_FUNCTION create_um_idi_proc(void) return (0); um_idi_proc_entry->read_proc = um_idi_proc_read; - um_idi_proc_entry->owner = THIS_MODULE; return (1); } diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c index c3b0c8c63c76..43ab0adf3b61 100644 --- a/drivers/media/video/cpia.c +++ b/drivers/media/video/cpia.c @@ -1381,9 +1381,7 @@ static void proc_cpia_create(void) { cpia_proc_root = proc_mkdir("cpia", NULL); - if (cpia_proc_root) - cpia_proc_root->owner = THIS_MODULE; - else + if (!cpia_proc_root) LOG("Unable to initialise /proc/cpia\n"); } diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c index 9a36b5a7de57..7045c45da9b1 100644 --- a/drivers/message/i2o/i2o_proc.c +++ b/drivers/message/i2o/i2o_proc.c @@ -2037,8 +2037,6 @@ static int __init i2o_proc_fs_create(void) if (!i2o_proc_dir_root) return -1; - i2o_proc_dir_root->owner = THIS_MODULE; - list_for_each_entry(c, &i2o_controllers, list) i2o_proc_iop_add(i2o_proc_dir_root, c); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9c326a50a3ee..99610f358c40 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3444,25 +3444,12 @@ static void bond_remove_proc_entry(struct bonding *bond) */ static void bond_create_proc_dir(void) { - int len = strlen(DRV_NAME); - - for (bond_proc_dir = init_net.proc_net->subdir; bond_proc_dir; - bond_proc_dir = bond_proc_dir->next) { - if ((bond_proc_dir->namelen == len) && - !memcmp(bond_proc_dir->name, DRV_NAME, len)) { - break; - } - } - if (!bond_proc_dir) { bond_proc_dir = proc_mkdir(DRV_NAME, init_net.proc_net); - if (bond_proc_dir) { - bond_proc_dir->owner = THIS_MODULE; - } else { + if (!bond_proc_dir) printk(KERN_WARNING DRV_NAME ": Warning: cannot create /proc/net/%s\n", DRV_NAME); - } } } @@ -3471,25 +3458,7 @@ static void bond_create_proc_dir(void) */ static void bond_destroy_proc_dir(void) { - struct proc_dir_entry *de; - - if (!bond_proc_dir) { - return; - } - - /* verify that the /proc dir is empty */ - for (de = bond_proc_dir->subdir; de; de = de->next) { - /* ignore . and .. */ - if (*(de->name) != '.') { - break; - } - } - - if (de) { - if (bond_proc_dir->owner == THIS_MODULE) { - bond_proc_dir->owner = NULL; - } - } else { + if (bond_proc_dir) { remove_proc_entry(DRV_NAME, init_net.proc_net); bond_proc_dir = NULL; } diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index 1243bc8e0035..ac0e4b6b6b66 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -1871,13 +1871,6 @@ static int __init vlsi_mod_init(void) * without procfs - it's not required for the driver to work. */ vlsi_proc_root = proc_mkdir(PROC_DIR, NULL); - if (vlsi_proc_root) { - /* protect registered procdir against module removal. - * Because we are in the module init path there's no race - * window after create_proc_entry (and no barrier needed). - */ - vlsi_proc_root->owner = THIS_MODULE; - } ret = pci_register_driver(&vlsi_irda_driver); diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index 7e80aba8a148..31b1cc2b778a 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -4494,7 +4494,6 @@ static int setup_proc_entry( struct net_device *dev, goto fail; apriv->proc_entry->uid = proc_uid; apriv->proc_entry->gid = proc_gid; - apriv->proc_entry->owner = THIS_MODULE; /* Setup the StatsDelta */ entry = proc_create_data("StatsDelta", diff --git a/drivers/platform/x86/asus_acpi.c b/drivers/platform/x86/asus_acpi.c index d63f26e666a4..ba1f7497e4b9 100644 --- a/drivers/platform/x86/asus_acpi.c +++ b/drivers/platform/x86/asus_acpi.c @@ -987,7 +987,6 @@ asus_proc_add(char *name, proc_writefunc *writefunc, proc->write_proc = writefunc; proc->read_proc = readfunc; proc->data = acpi_driver_data(device); - proc->owner = THIS_MODULE; proc->uid = asus_uid; proc->gid = asus_gid; return 0; @@ -1020,7 +1019,6 @@ static int asus_hotk_add_fs(struct acpi_device *device) if (proc) { proc->read_proc = proc_read_info; proc->data = acpi_driver_data(device); - proc->owner = THIS_MODULE; proc->uid = asus_uid; proc->gid = asus_gid; } else { @@ -1436,7 +1434,6 @@ static int __init asus_acpi_init(void) printk(KERN_ERR "Asus ACPI: Unable to create /proc entry\n"); return -ENODEV; } - asus_proc_dir->owner = THIS_MODULE; result = acpi_bus_register_driver(&asus_hotk_driver); if (result < 0) { diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..3dad27a385d3 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -6992,7 +6992,6 @@ static int __init ibm_init(struct ibm_init_struct *iibm) ret = -ENODEV; goto err_out; } - entry->owner = THIS_MODULE; entry->data = ibm; entry->read_proc = &dispatch_procfs_read; if (ibm->write) @@ -7405,7 +7404,6 @@ static int __init thinkpad_acpi_module_init(void) thinkpad_acpi_module_exit(); return -ENODEV; } - proc_dir->owner = THIS_MODULE; ret = platform_driver_register(&tpacpi_pdriver); if (ret) { diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 40e60fc2e596..9f187265db8e 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -679,8 +679,6 @@ static acpi_status __init add_device(void) toshiba_proc_dir, (read_proc_t *) dispatch_read, item); - if (proc) - proc->owner = THIS_MODULE; if (proc && item->write_func) proc->write_proc = (write_proc_t *) dispatch_write; } @@ -772,7 +770,6 @@ static int __init toshiba_acpi_init(void) toshiba_acpi_exit(); return -ENODEV; } else { - toshiba_proc_dir->owner = THIS_MODULE; status = add_device(); if (ACPI_FAILURE(status)) { toshiba_acpi_exit(); diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c index 0c6257a034ff..c086fc30a84c 100644 --- a/drivers/rtc/rtc-proc.c +++ b/drivers/rtc/rtc-proc.c @@ -105,14 +105,8 @@ static const struct file_operations rtc_proc_fops = { void rtc_proc_add_device(struct rtc_device *rtc) { - if (rtc->id == 0) { - struct proc_dir_entry *ent; - - ent = proc_create_data("driver/rtc", 0, NULL, - &rtc_proc_fops, rtc); - if (ent) - ent->owner = rtc->owner; - } + if (rtc->id == 0) + proc_create_data("driver/rtc", 0, NULL, &rtc_proc_fops, rtc); } void rtc_proc_del_device(struct rtc_device *rtc) diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index 2080ba6a69b0..654daa3cdfda 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -320,7 +320,6 @@ dasd_proc_init(void) dasd_proc_root_entry = proc_mkdir("dasd", NULL); if (!dasd_proc_root_entry) goto out_nodasd; - dasd_proc_root_entry->owner = THIS_MODULE; dasd_devices_entry = proc_create("devices", S_IFREG | S_IRUGO | S_IWUSR, dasd_proc_root_entry, @@ -334,7 +333,6 @@ dasd_proc_init(void) goto out_nostatistics; dasd_statistics_entry->read_proc = dasd_statistics_read; dasd_statistics_entry->write_proc = dasd_statistics_write; - dasd_statistics_entry->owner = THIS_MODULE; return 0; out_nostatistics: diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 099b5455bbce..b13481369642 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -596,8 +596,6 @@ int __init scsi_init_devinfo(void) error = -ENOMEM; goto out; } - - p->owner = THIS_MODULE; #endif /* CONFIG_SCSI_PROC_FS */ out: diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index 82f7b2dd08a2..77fbddb507fd 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -115,8 +115,6 @@ void scsi_proc_hostdir_add(struct scsi_host_template *sht) if (!sht->proc_dir) printk(KERN_ERR "%s: proc_mkdir failed for %s\n", __func__, sht->proc_name); - else - sht->proc_dir->owner = sht->module; } mutex_unlock(&global_host_template_mutex); } @@ -163,7 +161,6 @@ void scsi_proc_host_add(struct Scsi_Host *shost) } p->write_proc = proc_scsi_write_proc; - p->owner = sht->module; } /** diff --git a/drivers/video/via/viafbdev.c b/drivers/video/via/viafbdev.c index 37b433a08ce8..e327b84820d2 100644 --- a/drivers/video/via/viafbdev.c +++ b/drivers/video/via/viafbdev.c @@ -2059,25 +2059,21 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry) if (viafb_entry) { entry = create_proc_entry("dvp0", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dvp0_proc_read; entry->write_proc = viafb_dvp0_proc_write; } entry = create_proc_entry("dvp1", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dvp1_proc_read; entry->write_proc = viafb_dvp1_proc_write; } entry = create_proc_entry("dfph", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dfph_proc_read; entry->write_proc = viafb_dfph_proc_write; } entry = create_proc_entry("dfpl", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_dfpl_proc_read; entry->write_proc = viafb_dfpl_proc_write; } @@ -2086,7 +2082,6 @@ static void viafb_init_proc(struct proc_dir_entry **viafb_entry) viaparinfo->chip_info->lvds_chip_info2.lvds_chip_name) { entry = create_proc_entry("vt1636", 0, *viafb_entry); if (entry) { - entry->owner = THIS_MODULE; entry->read_proc = viafb_vt1636_proc_read; entry->write_proc = viafb_vt1636_proc_write; } diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 7578c1ab9e0b..8630615e57fe 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -146,7 +146,6 @@ int afs_proc_init(void) proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - proc_afs->owner = THIS_MODULE; p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); if (!p) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 877e4d9a1159..7f19fefd3d45 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -404,7 +404,6 @@ cifs_proc_init(void) if (proc_fs_cifs == NULL) return; - proc_fs_cifs->owner = THIS_MODULE; proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); #ifdef CONFIG_CIFS_STATS diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 6a73de84bcef..dd824d9b0b1a 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -90,7 +90,6 @@ void jfs_proc_init(void) if (!(base = proc_mkdir("fs/jfs", NULL))) return; - base->owner = THIS_MODULE; for (i = 0; i < NPROCENT; i++) proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 574158ae2398..2277421656e7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1606,8 +1606,6 @@ int __init nfs_fs_proc_init(void) if (!proc_fs_nfs) goto error_0; - proc_fs_nfs->owner = THIS_MODULE; - /* a file of servers with which we're dealing */ p = proc_create("servers", S_IFREG|S_IRUGO, proc_fs_nfs, &nfs_server_list_fops); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e11dc22c6511..d78ade305541 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,11 +58,8 @@ static void proc_delete_inode(struct inode *inode) /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; - if (de) { - if (de->owner) - module_put(de->owner); + if (de) de_put(de); - } if (PROC_I(inode)->sysctl) sysctl_head_put(PROC_I(inode)->sysctl); clear_inode(inode); @@ -449,12 +446,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, { struct inode * inode; - if (!try_module_get(de->owner)) - goto out_mod; - inode = iget_locked(sb, ino); if (!inode) - goto out_ino; + return NULL; if (inode->i_state & I_NEW) { inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; @@ -485,16 +479,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } else { - module_put(de->owner); + } else de_put(de); - } return inode; - -out_ino: - module_put(de->owner); -out_mod: - return NULL; } int proc_fill_super(struct super_block *s) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index d153946d6d15..4a9e0f65ae60 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -152,7 +152,6 @@ void proc_tty_register_driver(struct tty_driver *driver) if (!ent) return; ent->read_proc = driver->ops->read_proc; - ent->owner = driver->owner; ent->data = driver; driver->proc_entry = ent; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index d5066400638a..9229e5514a4e 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -492,7 +492,6 @@ int reiserfs_proc_info_init(struct super_block *sb) spin_lock_init(&__PINFO(sb).lock); REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); if (REISERFS_SB(sb)->procdir) { - REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); add_file(sb, "super", show_super); @@ -556,9 +555,7 @@ int reiserfs_proc_info_global_init(void) { if (proc_info_root == NULL) { proc_info_root = proc_mkdir(proc_info_root_name, NULL); - if (proc_info_root) { - proc_info_root->owner = THIS_MODULE; - } else { + if (!proc_info_root) { reiserfs_warning(NULL, "cannot create /proc/%s", proc_info_root_name); return 1; diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 62b73668b602..f7c9c75a2775 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -230,6 +230,6 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg) automatically be dstroyed when the interface is destroyed. */ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name, read_proc_t *read_proc, - void *data, struct module *owner); + void *data); #endif /* __LINUX_IPMI_SMI_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b8bdb96eff78..fbfa3d44d33d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -41,9 +41,6 @@ enum { * while parent/subdir create the directory structure (every * /proc file has a parent, but "subdir" is NULL for all * non-directory entries). - * - * "owner" is used to protect module - * from unloading while proc_dir_entry is in use */ typedef int (read_proc_t)(char *page, char **start, off_t off, @@ -70,7 +67,6 @@ struct proc_dir_entry { * somewhere. */ const struct file_operations *proc_fops; - struct module *owner; struct proc_dir_entry *next, *parent, *subdir; void *data; read_proc_t *read_proc; diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 162199a2d74f..fd8e0847b254 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -281,7 +281,6 @@ int __init atalk_proc_init(void) atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net); if (!atalk_proc_dir) goto out; - atalk_proc_dir->owner = THIS_MODULE; p = proc_create("interface", S_IRUGO, atalk_proc_dir, &atalk_seq_interface_fops); diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 4990541ef5da..1a0f5ccea9c4 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -281,7 +281,6 @@ int mpc_proc_init(void) printk(KERN_ERR "Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME); return -ENOMEM; } - p->owner = THIS_MODULE; return 0; } diff --git a/net/atm/proc.c b/net/atm/proc.c index 49487b313f22..e7b3b273907d 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -476,7 +476,6 @@ int __init atm_proc_init(void) atm_proc_root, e->proc_fops); if (!dirent) goto err_out_remove; - dirent->owner = THIS_MODULE; e->dirent = dirent; } ret = 0; diff --git a/net/can/bcm.c b/net/can/bcm.c index b7c7d4651136..95d7f32643ae 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1604,10 +1604,6 @@ static int __init bcm_module_init(void) /* create /proc/net/can-bcm directory */ proc_dir = proc_mkdir("can-bcm", init_net.proc_net); - - if (proc_dir) - proc_dir->owner = THIS_MODULE; - return 0; } diff --git a/net/can/proc.c b/net/can/proc.c index 520fef5e5398..1463653dbe34 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -473,8 +473,6 @@ void can_init_proc(void) return; } - can_dir->owner = THIS_MODULE; - /* own procfs entries from the AF_CAN core */ pde_version = can_create_proc_readentry(CAN_PROC_VERSION, 0644, can_proc_read_version, NULL); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 32d419f5ac98..3779c1438c11 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3806,7 +3806,6 @@ static int __init pg_init(void) pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); if (!pg_proc_dir) return -ENODEV; - pg_proc_dir->owner = THIS_MODULE; pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); if (pe == NULL) { diff --git a/net/irda/irproc.c b/net/irda/irproc.c index 88e80a312732..8ff1861649e8 100644 --- a/net/irda/irproc.c +++ b/net/irda/irproc.c @@ -70,7 +70,6 @@ void __init irda_proc_register(void) proc_irda = proc_mkdir("irda", init_net.proc_net); if (proc_irda == NULL) return; - proc_irda->owner = THIS_MODULE; for (i = 0; i < ARRAY_SIZE(irda_dirs); i++) d = proc_create(irda_dirs[i].name, 0, proc_irda, diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index b58bd7c6cdf8..d208b3396d94 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -236,7 +236,6 @@ int __init llc_proc_init(void) llc_proc_dir = proc_mkdir("llc", init_net.proc_net); if (!llc_proc_dir) goto out; - llc_proc_dir->owner = THIS_MODULE; p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops); if (!p) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index cb198af8887c..8eb3e61cb701 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -106,12 +106,8 @@ static __init int sctp_proc_init(void) goto out_nomem; #ifdef CONFIG_PROC_FS if (!proc_net_sctp) { - struct proc_dir_entry *ent; - ent = proc_mkdir("sctp", init_net.proc_net); - if (ent) { - ent->owner = THIS_MODULE; - proc_net_sctp = ent; - } else + proc_net_sctp = proc_mkdir("sctp", init_net.proc_net); + if (!proc_net_sctp) goto out_free_percpu; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4735caad26ed..20029a79a5de 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -313,7 +313,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); if (cd->proc_ent == NULL) goto out_nomem; - cd->proc_ent->owner = cd->owner; cd->channel_ent = cd->content_ent = NULL; p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, @@ -321,7 +320,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->flush_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, @@ -329,7 +327,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->channel_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR, @@ -337,7 +334,6 @@ static int create_cache_proc_entries(struct cache_detail *cd) cd->content_ent = p; if (p == NULL) goto out_nomem; - p->owner = cd->owner; } return 0; out_nomem: diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 085372ef4feb..1ef6e46d9da2 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -262,14 +262,8 @@ void rpc_proc_init(void) { dprintk("RPC: registering /proc/net/rpc\n"); - if (!proc_net_rpc) { - struct proc_dir_entry *ent; - ent = proc_mkdir("rpc", init_net.proc_net); - if (ent) { - ent->owner = THIS_MODULE; - proc_net_rpc = ent; - } - } + if (!proc_net_rpc) + proc_net_rpc = proc_mkdir("rpc", init_net.proc_net); } void diff --git a/sound/core/info.c b/sound/core/info.c index 70fa87189f36..35df614f6c55 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -154,11 +154,6 @@ EXPORT_SYMBOL(snd_seq_root); struct snd_info_entry *snd_oss_root; #endif -static inline void snd_info_entry_prepare(struct proc_dir_entry *de) -{ - de->owner = THIS_MODULE; -} - static void snd_remove_proc_entry(struct proc_dir_entry *parent, struct proc_dir_entry *de) { @@ -522,32 +517,11 @@ static const struct file_operations snd_info_entry_operations = .release = snd_info_entry_release, }; -/** - * snd_create_proc_entry - create a procfs entry - * @name: the name of the proc file - * @mode: the file permission bits, S_Ixxx - * @parent: the parent proc-directory entry - * - * Creates a new proc file entry with the given name and permission - * on the given directory. - * - * Returns the pointer of new instance or NULL on failure. - */ -static struct proc_dir_entry *snd_create_proc_entry(const char *name, mode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *p; - p = create_proc_entry(name, mode, parent); - if (p) - snd_info_entry_prepare(p); - return p; -} - int __init snd_info_init(void) { struct proc_dir_entry *p; - p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL); + p = create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL); if (p == NULL) return -ENOMEM; snd_proc_root = p; @@ -974,12 +948,11 @@ int snd_info_register(struct snd_info_entry * entry) return -ENXIO; root = entry->parent == NULL ? snd_proc_root : entry->parent->p; mutex_lock(&info_mutex); - p = snd_create_proc_entry(entry->name, entry->mode, root); + p = create_proc_entry(entry->name, entry->mode, root); if (!p) { mutex_unlock(&info_mutex); return -ENOMEM; } - p->owner = entry->module; if (!S_ISDIR(entry->mode)) p->proc_fops = &snd_info_entry_operations; p->size = entry->size; -- cgit v1.2.3 From 11373542344bdc35be1e6e68b0baadd1b6f7acbb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 15:18:37 +1100 Subject: Documentation/md.txt update Update md.txt to reflect recent changes in a number of sysfs attributes. Signed-off-by: NeilBrown --- Documentation/md.txt | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/md.txt b/Documentation/md.txt index 1da9d1b1793f..4edd39ec7db9 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -164,15 +164,19 @@ All md devices contain: raid_disks a text file with a simple number indicating the number of devices in a fully functional array. If this is not yet known, the file - will be empty. If an array is being resized (not currently - possible) this will contain the larger of the old and new sizes. - Some raid level (RAID1) allow this value to be set while the - array is active. This will reconfigure the array. Otherwise - it can only be set while assembling an array. + will be empty. If an array is being resized this will contain + the new number of devices. + Some raid levels allow this value to be set while the array is + active. This will reconfigure the array. Otherwise it can only + be set while assembling an array. + A change to this attribute will not be permitted if it would + reduce the size of the array. To reduce the number of drives + in an e.g. raid5, the array size must first be reduced by + setting the 'array_size' attribute. chunk_size - This is the size if bytes for 'chunks' and is only relevant to - raid levels that involve striping (1,4,5,6,10). The address space + This is the size in bytes for 'chunks' and is only relevant to + raid levels that involve striping (0,4,5,6,10). The address space of the array is conceptually divided into chunks and consecutive chunks are striped onto neighbouring devices. The size should be at least PAGE_SIZE (4k) and should be a power @@ -183,6 +187,20 @@ All md devices contain: simply a number that is interpretted differently by different levels. It can be written while assembling an array. + array_size + This can be used to artificially constrain the available space in + the array to be less than is actually available on the combined + devices. Writing a number (in Kilobytes) which is less than + the available size will set the size. Any reconfiguration of the + array (e.g. adding devices) will not cause the size to change. + Writing the word 'default' will cause the effective size of the + array to be whatever size is actually available based on + 'level', 'chunk_size' and 'component_size'. + + This can be used to reduce the size of the array before reducing + the number of devices in a raid4/5/6, or to support external + metadata formats which mandate such clipping. + reshape_position This is either "none" or a sector number within the devices of the array where "reshape" is up to. If this is set, the three @@ -207,6 +225,11 @@ All md devices contain: about the array. It can be 0.90 (traditional format), 1.0, 1.1, 1.2 (newer format in varying locations) or "none" indicating that the kernel isn't managing metadata at all. + Alternately it can be "external:" followed by a string which + is set by user-space. This indicates that metadata is managed + by a user-space program. Any device failure or other event that + requires a metadata update will cause array activity to be + suspended until the event is acknowledged. resync_start The point at which resync should start. If no resync is needed, -- cgit v1.2.3 From 853116a10544206b6b2cf42ebc9d78fba2668888 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 14 Jan 2009 23:03:17 -0800 Subject: regulator: add get_status() Based on previous LKML discussions: * Update docs for regulator sysfs class attributes to highlight the fact that all current attributes are intended to be control inputs, including notably "state" and "opmode" which previously implied otherwise. * Define a new regulator driver get_status() method, which is the first method reporting regulator outputs instead of inputs. It can report on/off and error status; or instead of simply "on", report the actual operating mode. For the moment, this is a sysfs-only interface, not accessible to regulator clients. Such clients can use the current notification interfaces to detect errors, if the regulator reports them. Signed-off-by: David Brownell Signed-off-by: Liam Girdwood --- Documentation/ABI/testing/sysfs-class-regulator | 57 +++++++++++++++++++++---- drivers/regulator/core.c | 46 ++++++++++++++++++++ include/linux/regulator/driver.h | 17 ++++++++ 3 files changed, 111 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator index 873ef1fc1569..e091fa873792 100644 --- a/Documentation/ABI/testing/sysfs-class-regulator +++ b/Documentation/ABI/testing/sysfs-class-regulator @@ -4,8 +4,8 @@ KernelVersion: 2.6.26 Contact: Liam Girdwood Description: Some regulator directories will contain a field called - state. This reports the regulator enable status, for - regulators which can report that value. + state. This reports the regulator enable control, for + regulators which can report that input value. This will be one of the following strings: @@ -14,16 +14,54 @@ Description: 'unknown' 'enabled' means the regulator output is ON and is supplying - power to the system. + power to the system (assuming no error prevents it). 'disabled' means the regulator output is OFF and is not - supplying power to the system.. + supplying power to the system (unless some non-Linux + control has enabled it). 'unknown' means software cannot determine the state, or the reported state is invalid. NOTE: this field can be used in conjunction with microvolts - and microamps to determine regulator output levels. + or microamps to determine configured regulator output levels. + + +What: /sys/class/regulator/.../status +Description: + Some regulator directories will contain a field called + "status". This reports the current regulator status, for + regulators which can report that output value. + + This will be one of the following strings: + + off + on + error + fast + normal + idle + standby + + "off" means the regulator is not supplying power to the + system. + + "on" means the regulator is supplying power to the system, + and the regulator can't report a detailed operation mode. + + "error" indicates an out-of-regulation status such as being + disabled due to thermal shutdown, or voltage being unstable + because of problems with the input power supply. + + "fast", "normal", "idle", and "standby" are all detailed + regulator operation modes (described elsewhere). They + imply "on", but provide more detail. + + Note that regulator status is a function of many inputs, + not limited to control inputs from Linux. For example, + the actual load presented may trigger "error" status; or + a regulator may be enabled by another user, even though + Linux did not enable it. What: /sys/class/regulator/.../type @@ -58,7 +96,7 @@ Description: Some regulator directories will contain a field called microvolts. This holds the regulator output voltage setting measured in microvolts (i.e. E-6 Volts), for regulators - which can report that voltage. + which can report the control input for voltage. NOTE: This value should not be used to determine the regulator output voltage level as this value is the same regardless of @@ -73,7 +111,7 @@ Description: Some regulator directories will contain a field called microamps. This holds the regulator output current limit setting measured in microamps (i.e. E-6 Amps), for regulators - which can report that current. + which can report the control input for a current limit. NOTE: This value should not be used to determine the regulator output current level as this value is the same regardless of @@ -87,7 +125,7 @@ Contact: Liam Girdwood Description: Some regulator directories will contain a field called opmode. This holds the current regulator operating mode, - for regulators which can report it. + for regulators which can report that control input value. The opmode value can be one of the following strings: @@ -101,7 +139,8 @@ Description: NOTE: This value should not be used to determine the regulator output operating mode as this value is the same regardless of - whether the regulator is enabled or disabled. + whether the regulator is enabled or disabled. A "status" + attribute may be available to determine the actual mode. What: /sys/class/regulator/.../min_microvolts diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index f511a406fcaa..0ff95c3ccf5b 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -312,6 +312,47 @@ static ssize_t regulator_state_show(struct device *dev, } static DEVICE_ATTR(state, 0444, regulator_state_show, NULL); +static ssize_t regulator_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct regulator_dev *rdev = dev_get_drvdata(dev); + int status; + char *label; + + status = rdev->desc->ops->get_status(rdev); + if (status < 0) + return status; + + switch (status) { + case REGULATOR_STATUS_OFF: + label = "off"; + break; + case REGULATOR_STATUS_ON: + label = "on"; + break; + case REGULATOR_STATUS_ERROR: + label = "error"; + break; + case REGULATOR_STATUS_FAST: + label = "fast"; + break; + case REGULATOR_STATUS_NORMAL: + label = "normal"; + break; + case REGULATOR_STATUS_IDLE: + label = "idle"; + break; + case REGULATOR_STATUS_STANDBY: + label = "standby"; + break; + default: + return -ERANGE; + } + + return sprintf(buf, "%s\n", label); +} +static DEVICE_ATTR(status, 0444, regulator_status_show, NULL); + static ssize_t regulator_min_uA_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1744,6 +1785,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev) if (status < 0) return status; } + if (ops->get_status) { + status = device_create_file(dev, &dev_attr_status); + if (status < 0) + return status; + } /* some attributes are type-specific */ if (rdev->desc->type == REGULATOR_CURRENT) { diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 2dae05705f13..6e957aae7629 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -21,6 +21,17 @@ struct regulator_dev; struct regulator_init_data; +enum regulator_status { + REGULATOR_STATUS_OFF, + REGULATOR_STATUS_ON, + REGULATOR_STATUS_ERROR, + /* fast/normal/idle/standby are flavors of "on" */ + REGULATOR_STATUS_FAST, + REGULATOR_STATUS_NORMAL, + REGULATOR_STATUS_IDLE, + REGULATOR_STATUS_STANDBY, +}; + /** * struct regulator_ops - regulator operations. * @@ -72,6 +83,12 @@ struct regulator_ops { int (*set_mode) (struct regulator_dev *, unsigned int mode); unsigned int (*get_mode) (struct regulator_dev *); + /* report regulator status ... most other accessors report + * control inputs, this reports results of combining inputs + * from Linux (and other sources) with the actual load. + */ + int (*get_status)(struct regulator_dev *); + /* get most efficient regulator operating mode for load */ unsigned int (*get_optimum_mode) (struct regulator_dev *, int input_uV, int output_uV, int load_uA); -- cgit v1.2.3 From 214c8adb87b880690b103e86baffc46f8288fa62 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 28 Oct 2008 17:22:01 +0200 Subject: exofs: Documentation Added some documentation in exofs.txt, as well as a BUGS file. For further reading, operation instructions, example scripts and up to date infomation and code please see: http://open-osd.org Signed-off-by: Boaz Harrosh --- Documentation/filesystems/exofs.txt | 176 ++++++++++++++++++++++++++++++++++++ fs/exofs/BUGS | 3 + 2 files changed, 179 insertions(+) create mode 100644 Documentation/filesystems/exofs.txt create mode 100644 fs/exofs/BUGS (limited to 'Documentation') diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt new file mode 100644 index 000000000000..0ced74c2f73c --- /dev/null +++ b/Documentation/filesystems/exofs.txt @@ -0,0 +1,176 @@ +=============================================================================== +WHAT IS EXOFS? +=============================================================================== + +exofs is a file system that uses an OSD and exports the API of a normal Linux +file system. Users access exofs like any other local file system, and exofs +will in turn issue commands to the local OSD initiator. + +OSD is a new T10 command set that views storage devices not as a large/flat +array of sectors but as a container of objects, each having a length, quota, +time attributes and more. Each object is addressed by a 64bit ID, and is +contained in a 64bit ID partition. Each object has associated attributes +attached to it, which are integral part of the object and provide metadata about +the object. The standard defines some common obligatory attributes, but user +attributes can be added as needed. + +=============================================================================== +ENVIRONMENT +=============================================================================== + +To use this file system, you need to have an object store to run it on. You +may download a target from: +http://open-osd.org + +See Documentation/scsi/osd.txt for how to setup a working osd environment. + +=============================================================================== +USAGE +=============================================================================== + +1. Download and compile exofs and open-osd initiator: + You need an external Kernel source tree or kernel headers from your + distribution. (anything based on 2.6.26 or later). + + a. download open-osd including exofs source using: + [parent-directory]$ git clone git://git.open-osd.org/open-osd.git + + b. Build the library module like this: + [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd + + This will build both the open-osd initiator as well as the exofs kernel + module. Use whatever parameters you compiled your Kernel with and + $(KER_DIR) above pointing to the Kernel you compile against. See the file + open-osd/top-level-Makefile for an example. + +2. Get the OSD initiator and target set up properly, and login to the target. + See Documentation/scsi/osd.txt for farther instructions. Also see ./do-osd + for example script that does all these steps. + +3. Insmod the exofs.ko module: + [exofs]$ insmod exofs.ko + +4. Make sure the directory where you want to mount exists. If not, create it. + (For example, mkdir /mnt/exofs) + +5. At first run you will need to invoke the mkfs.exofs application + + As an example, this will create the file system on: + /dev/osd0 partition ID 65536 + + mkfs.exofs --pid=65536 --format /dev/osd0 + + The --format is optional if not specified no OSD_FORMAT will be + preformed and a clean file system will be created in the specified pid, + in the available space of the target. (Use --format=size_in_meg to limit + the total LUN space available) + + If pid already exist it will be deleted and a new one will be created in it's + place. Be careful. + + An exofs lives inside a single OSD partition. You can create multiple exofs + filesystems on the same device using multiple pids. + + (run mkfs.exofs without any parameters for usage help message) + +6. Mount the file system. + + For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs: + + mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/ + +7. For reference (See do-exofs example script): + do-exofs start - an example of how to perform the above steps. + do-exofs stop - an example of how to unmount the file system. + do-exofs format - an example of how to format and mkfs a new exofs. + +8. Extra compilation flags (uncomment in fs/exofs/Kbuild): + CONFIG_EXOFS_DEBUG - for debug messages and extra checks. + +=============================================================================== +exofs mount options +=============================================================================== +Similar to any mount command: + mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory + +Where: + -t exofs: specifies the exofs file system + + /dev/osdX: X is a decimal number. /dev/osdX was created after a successful + login into an OSD target. + + mount_exofs_directory: The directory to mount the file system on + + exofs specific options: Options are separated by commas (,) + pid= - The partition number to mount/create as + container of the filesystem. + This option is mandatory + to= - Timeout in ticks for a single command + default is (60 * HZ) [for debugging only] + +=============================================================================== +DESIGN +=============================================================================== + +* The file system control block (AKA on-disk superblock) resides in an object + with a special ID (defined in common.h). + Information included in the file system control block is used to fill the + in-memory superblock structure at mount time. This object is created before + the file system is used by mkexofs.c It contains information such as: + - The file system's magic number + - The next inode number to be allocated + +* Each file resides in its own object and contains the data (and it will be + possible to extend the file over multiple objects, though this has not been + implemented yet). + +* A directory is treated as a file, and essentially contains a list of pairs for files that are found in that directory. The object + IDs correspond to the files' inode numbers and will be allocated according to + a bitmap (stored in a separate object). Now they are allocated using a + counter. + +* Each file's control block (AKA on-disk inode) is stored in its object's + attributes. This applies to both regular files and other types (directories, + device files, symlinks, etc.). + +* Credentials are generated per object (inode and superblock) when they is + created in memory (read off disk or created). The credential works for all + operations and is used as long as the object remains in memory. + +* Async OSD operations are used whenever possible, but the target may execute + them out of order. The operations that concern us are create, delete, + readpage, writepage, update_inode, and truncate. The following pairs of + operations should execute in the order written, and we need to prevent them + from executing in reverse order: + - The following are handled with the OBJ_CREATED and OBJ_2BCREATED + flags. OBJ_CREATED is set when we know the object exists on the OSD - + in create's callback function, and when we successfully do a read_inode. + OBJ_2BCREATED is set in the beginning of the create function, so we + know that we should wait. + - create/delete: delete should wait until the object is created + on the OSD. + - create/readpage: readpage should be able to return a page + full of zeroes in this case. If there was a write already + en-route (i.e. create, writepage, readpage) then the page + would be locked, and so it would really be the same as + create/writepage. + - create/writepage: if writepage is called for a sync write, it + should wait until the object is created on the OSD. + Otherwise, it should just return. + - create/truncate: truncate should wait until the object is + created on the OSD. + - create/update_inode: update_inode should wait until the + object is created on the OSD. + - Handled by VFS locks: + - readpage/delete: shouldn't happen because of page lock. + - writepage/delete: shouldn't happen because of page lock. + - readpage/writepage: shouldn't happen because of page lock. + +=============================================================================== +LICENSE/COPYRIGHT +=============================================================================== +The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel +version 2.6.10). All files include the original copyrights, and the license +is GPL version 2 (only version 2, as is true for the Linux kernel). The +Linux kernel can be downloaded from www.kernel.org. diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS new file mode 100644 index 000000000000..1b2d4c63a579 --- /dev/null +++ b/fs/exofs/BUGS @@ -0,0 +1,3 @@ +- Out-of-space may cause a severe problem if the object (and directory entry) + were written, but the inode attributes failed. Then if the filesystem was + unmounted and mounted the kernel can get into an endless loop doing a readdir. -- cgit v1.2.3 From ef12fefabf94b6a902ad3abd3eb124b00560c445 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 31 Mar 2009 10:02:22 +0530 Subject: cpuacct: add per-cgroup utime/stime statistics Add per-cgroup cpuacct controller statistics like the system and user time consumed by the group of tasks. Changelog: v7 - Changed the name of the statistic from utime to user and from stime to system so that in future we could easily add other statistics like irq, softirq, steal times etc easily. v6 - Fixed a bug in the error path of cpuacct_create() (pointed by Li Zefan). v5 - In cpuacct_stats_show(), use cputime64_to_clock_t() since we are operating on a 64bit variable here. v4 - Remove comments in cpuacct_update_stats() which explained why rcu_read_lock() was needed (as per Peter Zijlstra's review comments). - Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt as per KAMEZAWA Hiroyuki's review comments. v3 - Fix a small race in the cpuacct hierarchy walk. v2 - stime and utime now exported in clock_t units instead of msecs. - Addressed the code review comments from Balbir and Li Zefan. - Moved to -tip tree. v1 - Moved the stime/utime accounting to cpuacct controller. Earlier versions - http://lkml.org/lkml/2009/2/25/129 Signed-off-by: Bharata B Rao Signed-off-by: Balaji Rao Cc: Dhaval Giani Cc: Paul Menage Cc: Andrew Morton Cc: KAMEZAWA Hiroyuki Reviewed-by: Li Zefan Acked-by: Peter Zijlstra Acked-by: Balbir Singh Tested-by: Balbir Singh LKML-Reference: <20090331043222.GA4093@in.ibm.com> Signed-off-by: Ingo Molnar --- Documentation/cgroups/cpuacct.txt | 18 ++++++++ kernel/sched.c | 87 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 99 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt index bb775fbe43d7..8b930946c52a 100644 --- a/Documentation/cgroups/cpuacct.txt +++ b/Documentation/cgroups/cpuacct.txt @@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell process (bash) into it. CPU time consumed by this bash and its children can be obtained from g1/cpuacct.usage and the same is accumulated in /cgroups/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. +system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. diff --git a/kernel/sched.c b/kernel/sched.c index c8d7f17bd036..8d1bdbe8aafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + #ifdef CONFIG_CGROUP_CPUACCT static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} #endif static inline void inc_cpu_load(struct rq *rq, unsigned long load) @@ -4236,6 +4248,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -4297,6 +4311,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else cpustat->system = cputime64_add(cpustat->system, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + /* Account for system time used */ acct_update_integrals(p); } @@ -9539,6 +9555,7 @@ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -9563,20 +9580,32 @@ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; if (!ca) - return ERR_PTR(-ENOMEM); + goto out; ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; if (cgrp->parent) ca->parent = cgroup_ca(cgrp->parent); return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); } /* destroy an existing cpu accounting group */ @@ -9584,7 +9613,10 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } @@ -9671,6 +9703,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, return 0; } +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); + } + return 0; +} + static struct cftype files[] = { { .name = "usage", @@ -9681,7 +9732,10 @@ static struct cftype files[] = { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, - + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -9716,6 +9770,27 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * Charge the system/user time to the task's accounting group. + */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + percpu_counter_add(&ca->cpustat[idx], val); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, -- cgit v1.2.3 From 2584e517320bd48dc8d20e38a2621a2dbe58fade Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 31 Mar 2009 15:21:26 -0700 Subject: mm: reintroduce and deprecate rlimit based access for SHM_HUGETLB Allow non root users with sufficient mlock rlimits to be able to allocate hugetlb backed shm for now. Deprecate this though. This is being deprecated because the mlock based rlimit checks for SHM_HUGETLB is not consistent with mmap based huge page allocations. Signed-off-by: Ravikiran Thirumalai Reviewed-by: Mel Gorman Cc: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 12 ++++++++++++ fs/hugetlbfs/inode.c | 13 +++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5e02b83ac12b..ea7d1bdad34d 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -311,6 +311,18 @@ Who: Vlad Yasevich --------------------------- +What: Ability for non root users to shm_get hugetlb pages based on mlock + resource limits +When: 2.6.31 +Why: Non root users need to be part of /proc/sys/vm/hugetlb_shm_group or + have CAP_IPC_LOCK to be able to allocate shm segments backed by + huge pages. The mlock based rlimit check to allow shm hugetlb is + inconsistent with mmap based allocations. Hence it is being + deprecated. +Who: Ravikiran Thirumalai + +--------------------------- + What: CONFIG_THERMAL_HWMON When: January 2009 Why: This option was introduced just to allow older lm-sensors userspace diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index bc56df8ce001..23a3c76711e0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -949,6 +949,7 @@ static int can_do_hugetlb_shm(void) struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; + int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; @@ -958,8 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) - return ERR_PTR(-EPERM); + if (!can_do_hugetlb_shm()) { + if (user_shm_lock(size, user)) { + unlock_shm = 1; + WARN_ONCE(1, + "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + } else + return ERR_PTR(-EPERM); + } root = hugetlbfs_vfsmount->mnt_root; quick_string.name = name; @@ -999,6 +1006,8 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: + if (unlock_shm) + user_shm_unlock(size, user); return ERR_PTR(error); } -- cgit v1.2.3 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.3 From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. [randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); -- cgit v1.2.3 From 3cdbbeebb77348176bd6a03fd86e11bc281c529e Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 31 Mar 2009 15:23:53 -0700 Subject: drivers/misc/isl29003.c: driver for the ISL29003 ambient light sensor Add a driver for Intersil's ISL29003 ambient light sensor device plus some documentation. Inspired by tsl2550.c, a driver for a similar device. It is put in drivers/misc for now until the industrial I/O framework gets merged. Signed-off-by: Daniel Mack Acked-by: Jonathan Cameron Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/misc-devices/isl29003 | 62 +++++ drivers/misc/Kconfig | 10 + drivers/misc/Makefile | 1 + drivers/misc/isl29003.c | 470 ++++++++++++++++++++++++++++++++++++ 4 files changed, 543 insertions(+) create mode 100644 Documentation/misc-devices/isl29003 create mode 100644 drivers/misc/isl29003.c (limited to 'Documentation') diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003 new file mode 100644 index 000000000000..c4ff5f38e010 --- /dev/null +++ b/Documentation/misc-devices/isl29003 @@ -0,0 +1,62 @@ +Kernel driver isl29003 +===================== + +Supported chips: +* Intersil ISL29003 +Prefix: 'isl29003' +Addresses scanned: none +Datasheet: +http://www.intersil.com/data/fn/fn7464.pdf + +Author: Daniel Mack + + +Description +----------- +The ISL29003 is an integrated light sensor with a 16-bit integrating type +ADC, I2C user programmable lux range select for optimized counts/lux, and +I2C multi-function control and monitoring capabilities. The internal ADC +provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by +artificial light sources. + +The driver allows to set the lux range, the bit resolution, the operational +mode (see below) and the power state of device and can read the current lux +value, of course. + + +Detection +--------- + +The ISL29003 does not have an ID register which could be used to identify +it, so the detection routine will just try to read from the configured I2C +addess and consider the device to be present as soon as it ACKs the +transfer. + + +Sysfs entries +------------- + +range: + 0: 0 lux to 1000 lux (default) + 1: 0 lux to 4000 lux + 2: 0 lux to 16,000 lux + 3: 0 lux to 64,000 lux + +resolution: + 0: 2^16 cycles (default) + 1: 2^12 cycles + 2: 2^8 cycles + 3: 2^4 cycles + +mode: + 0: diode1's current (unsigned 16bit) (default) + 1: diode1's current (unsigned 16bit) + 2: difference between diodes (l1 - l2, signed 15bit) + +power_state: + 0: device is disabled (default) + 1: device is enabled + +lux (read only): + returns the value from the last sensor reading + diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 1c484084ed4f..5f3bff434621 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -223,6 +223,16 @@ config DELL_LAPTOP This driver adds support for rfkill and backlight control to Dell laptops. +config ISL29003 + tristate "Intersil ISL29003 ambient light sensor" + depends on I2C && SYSFS + help + If you say yes here you get support for the Intersil ISL29003 + ambient light sensor. + + This driver can also be built as a module. If so, the module + will be called isl29003. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index bc1199830554..7871f05dcb9b 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_HP_ILO) += hpilo.o +obj-$(CONFIG_ISL29003) += isl29003.o obj-$(CONFIG_C2PORT) += c2port/ obj-y += eeprom/ diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c new file mode 100644 index 000000000000..2e2a5923d4c2 --- /dev/null +++ b/drivers/misc/isl29003.c @@ -0,0 +1,470 @@ +/* + * isl29003.c - Linux kernel module for + * Intersil ISL29003 ambient light sensor + * + * See file:Documentation/misc-devices/isl29003 + * + * Copyright (c) 2009 Daniel Mack + * + * Based on code written by + * Rodolfo Giometti + * Eurotech S.p.A. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#define ISL29003_DRV_NAME "isl29003" +#define DRIVER_VERSION "1.0" + +#define ISL29003_REG_COMMAND 0x00 +#define ISL29003_ADC_ENABLED (1 << 7) +#define ISL29003_ADC_PD (1 << 6) +#define ISL29003_TIMING_INT (1 << 5) +#define ISL29003_MODE_SHIFT (2) +#define ISL29003_MODE_MASK (0x3 << ISL29003_MODE_SHIFT) +#define ISL29003_RES_SHIFT (0) +#define ISL29003_RES_MASK (0x3 << ISL29003_RES_SHIFT) + +#define ISL29003_REG_CONTROL 0x01 +#define ISL29003_INT_FLG (1 << 5) +#define ISL29003_RANGE_SHIFT (2) +#define ISL29003_RANGE_MASK (0x3 << ISL29003_RANGE_SHIFT) +#define ISL29003_INT_PERSISTS_SHIFT (0) +#define ISL29003_INT_PERSISTS_MASK (0xf << ISL29003_INT_PERSISTS_SHIFT) + +#define ISL29003_REG_IRQ_THRESH_HI 0x02 +#define ISL29003_REG_IRQ_THRESH_LO 0x03 +#define ISL29003_REG_LSB_SENSOR 0x04 +#define ISL29003_REG_MSB_SENSOR 0x05 +#define ISL29003_REG_LSB_TIMER 0x06 +#define ISL29003_REG_MSB_TIMER 0x07 + +#define ISL29003_NUM_CACHABLE_REGS 4 + +struct isl29003_data { + struct i2c_client *client; + struct mutex lock; + u8 reg_cache[ISL29003_NUM_CACHABLE_REGS]; +}; + +static int gain_range[] = { + 1000, 4000, 16000, 64000 +}; + +/* + * register access helpers + */ + +static int __isl29003_read_reg(struct i2c_client *client, + u32 reg, u8 mask, u8 shift) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + return (data->reg_cache[reg] & mask) >> shift; +} + +static int __isl29003_write_reg(struct i2c_client *client, + u32 reg, u8 mask, u8 shift, u8 val) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int ret = 0; + u8 tmp; + + if (reg >= ISL29003_NUM_CACHABLE_REGS) + return -EINVAL; + + mutex_lock(&data->lock); + + tmp = data->reg_cache[reg]; + tmp &= ~mask; + tmp |= val << shift; + + ret = i2c_smbus_write_byte_data(client, reg, tmp); + if (!ret) + data->reg_cache[reg] = tmp; + + mutex_unlock(&data->lock); + return ret; +} + +/* + * internally used functions + */ + +/* range */ +static int isl29003_get_range(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_CONTROL, + ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT); +} + +static int isl29003_set_range(struct i2c_client *client, int range) +{ + return __isl29003_write_reg(client, ISL29003_REG_CONTROL, + ISL29003_RANGE_MASK, ISL29003_RANGE_SHIFT, range); +} + +/* resolution */ +static int isl29003_get_resolution(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT); +} + +static int isl29003_set_resolution(struct i2c_client *client, int res) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT, res); +} + +/* mode */ +static int isl29003_get_mode(struct i2c_client *client) +{ + return __isl29003_read_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT); +} + +static int isl29003_set_mode(struct i2c_client *client, int mode) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_RES_MASK, ISL29003_RES_SHIFT, mode); +} + +/* power_state */ +static int isl29003_set_power_state(struct i2c_client *client, int state) +{ + return __isl29003_write_reg(client, ISL29003_REG_COMMAND, + ISL29003_ADC_ENABLED | ISL29003_ADC_PD, 0, + state ? ISL29003_ADC_ENABLED : ISL29003_ADC_PD); +} + +static int isl29003_get_power_state(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + u8 cmdreg = data->reg_cache[ISL29003_REG_COMMAND]; + return ~cmdreg & ISL29003_ADC_PD; +} + +static int isl29003_get_adc_value(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int lsb, msb, range, bitdepth; + + mutex_lock(&data->lock); + lsb = i2c_smbus_read_byte_data(client, ISL29003_REG_LSB_SENSOR); + + if (lsb < 0) { + mutex_unlock(&data->lock); + return lsb; + } + + msb = i2c_smbus_read_byte_data(client, ISL29003_REG_MSB_SENSOR); + mutex_unlock(&data->lock); + + if (msb < 0) + return msb; + + range = isl29003_get_range(client); + bitdepth = (4 - isl29003_get_resolution(client)) * 4; + return (((msb << 8) | lsb) * gain_range[range]) >> bitdepth; +} + +/* + * sysfs layer + */ + +/* range */ +static ssize_t isl29003_show_range(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%i\n", isl29003_get_range(client)); +} + +static ssize_t isl29003_store_range(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3)) + return -EINVAL; + + ret = isl29003_set_range(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(range, S_IWUSR | S_IRUGO, + isl29003_show_range, isl29003_store_range); + + +/* resolution */ +static ssize_t isl29003_show_resolution(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_resolution(client)); +} + +static ssize_t isl29003_store_resolution(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3)) + return -EINVAL; + + ret = isl29003_set_resolution(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(resolution, S_IWUSR | S_IRUGO, + isl29003_show_resolution, isl29003_store_resolution); + +/* mode */ +static ssize_t isl29003_show_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_mode(client)); +} + +static ssize_t isl29003_store_mode(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 2)) + return -EINVAL; + + ret = isl29003_set_mode(client, val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, + isl29003_show_mode, isl29003_store_mode); + + +/* power state */ +static ssize_t isl29003_show_power_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + return sprintf(buf, "%d\n", isl29003_get_power_state(client)); +} + +static ssize_t isl29003_store_power_state(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + unsigned long val; + int ret; + + if ((strict_strtoul(buf, 10, &val) < 0) || (val > 1)) + return -EINVAL; + + ret = isl29003_set_power_state(client, val); + return ret ? ret : count; +} + +static DEVICE_ATTR(power_state, S_IWUSR | S_IRUGO, + isl29003_show_power_state, isl29003_store_power_state); + + +/* lux */ +static ssize_t isl29003_show_lux(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + + /* No LUX data if not operational */ + if (!isl29003_get_power_state(client)) + return -EBUSY; + + return sprintf(buf, "%d\n", isl29003_get_adc_value(client)); +} + +static DEVICE_ATTR(lux, S_IRUGO, isl29003_show_lux, NULL); + +static struct attribute *isl29003_attributes[] = { + &dev_attr_range.attr, + &dev_attr_resolution.attr, + &dev_attr_mode.attr, + &dev_attr_power_state.attr, + &dev_attr_lux.attr, + NULL +}; + +static const struct attribute_group isl29003_attr_group = { + .attrs = isl29003_attributes, +}; + +static int isl29003_init_client(struct i2c_client *client) +{ + struct isl29003_data *data = i2c_get_clientdata(client); + int i; + + /* read all the registers once to fill the cache. + * if one of the reads fails, we consider the init failed */ + for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++) { + int v = i2c_smbus_read_byte_data(client, i); + if (v < 0) + return -ENODEV; + + data->reg_cache[i] = v; + } + + /* set defaults */ + isl29003_set_range(client, 0); + isl29003_set_resolution(client, 0); + isl29003_set_mode(client, 0); + isl29003_set_power_state(client, 0); + + return 0; +} + +/* + * I2C layer + */ + +static int __devinit isl29003_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + struct isl29003_data *data; + int err = 0; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE)) + return -EIO; + + data = kzalloc(sizeof(struct isl29003_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->client = client; + i2c_set_clientdata(client, data); + mutex_init(&data->lock); + + /* initialize the ISL29003 chip */ + err = isl29003_init_client(client); + if (err) + goto exit_kfree; + + /* register sysfs hooks */ + err = sysfs_create_group(&client->dev.kobj, &isl29003_attr_group); + if (err) + goto exit_kfree; + + dev_info(&client->dev, "driver version %s enabled\n", DRIVER_VERSION); + return 0; + +exit_kfree: + kfree(data); + return err; +} + +static int __devexit isl29003_remove(struct i2c_client *client) +{ + sysfs_remove_group(&client->dev.kobj, &isl29003_attr_group); + isl29003_set_power_state(client, 0); + kfree(i2c_get_clientdata(client)); + return 0; +} + +#ifdef CONFIG_PM +static int isl29003_suspend(struct i2c_client *client, pm_message_t mesg) +{ + return isl29003_set_power_state(client, 0); +} + +static int isl29003_resume(struct i2c_client *client) +{ + int i; + struct isl29003_data *data = i2c_get_clientdata(client); + + /* restore registers from cache */ + for (i = 0; i < ARRAY_SIZE(data->reg_cache); i++) + if (!i2c_smbus_write_byte_data(client, i, data->reg_cache[i])) + return -EIO; + + return 0; +} + +#else +#define isl29003_suspend NULL +#define isl29003_resume NULL +#endif /* CONFIG_PM */ + +static const struct i2c_device_id isl29003_id[] = { + { "isl29003", 0 }, + {} +}; +MODULE_DEVICE_TABLE(i2c, isl29003_id); + +static struct i2c_driver isl29003_driver = { + .driver = { + .name = ISL29003_DRV_NAME, + .owner = THIS_MODULE, + }, + .suspend = isl29003_suspend, + .resume = isl29003_resume, + .probe = isl29003_probe, + .remove = __devexit_p(isl29003_remove), + .id_table = isl29003_id, +}; + +static int __init isl29003_init(void) +{ + return i2c_add_driver(&isl29003_driver); +} + +static void __exit isl29003_exit(void) +{ + i2c_del_driver(&isl29003_driver); +} + +MODULE_AUTHOR("Daniel Mack "); +MODULE_DESCRIPTION("ISL29003 ambient light sensor driver"); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(DRIVER_VERSION); + +module_init(isl29003_init); +module_exit(isl29003_exit); + -- cgit v1.2.3 From 2b872903c5d66bccdf4306a35e3e94d4da8555d3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 31 Mar 2009 15:24:25 -0700 Subject: hp_accel: small documentation updates Fix english in Documentation, add "how to test" description. Signed-off-by: Pavel Machek Cc: Eric Piel Cc: Vladimir Botka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/hwmon/lis3lv02d | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d index 287f8c902656..effe949a7282 100644 --- a/Documentation/hwmon/lis3lv02d +++ b/Documentation/hwmon/lis3lv02d @@ -1,11 +1,11 @@ Kernel driver lis3lv02d -================== +======================= Supported chips: * STMicroelectronics LIS3LV02DL and LIS3LV02DQ -Author: +Authors: Yan Burman Eric Piel @@ -15,7 +15,7 @@ Description This driver provides support for the accelerometer found in various HP laptops sporting the feature officially called "HP Mobile Data -Protection System 3D" or "HP 3D DriveGuard". It detect automatically +Protection System 3D" or "HP 3D DriveGuard". It detects automatically laptops with this sensor. Known models (for now the HP 2133, nc6420, nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis automatically oriented on standard way (eg: you can directly play @@ -27,7 +27,7 @@ position - 3D position that the accelerometer reports. Format: "(x,y,z)" calibrate - read: values (x, y, z) that are used as the base for input class device operation. write: forces the base to be recalibrated with the current - position. + position. rate - reports the sampling rate of the accelerometer device in HZ This driver also provides an absolute input class device, allowing @@ -48,7 +48,7 @@ For better compatibility between the various laptops. The values reported by the accelerometer are converted into a "standard" organisation of the axes (aka "can play neverball out of the box"): * When the laptop is horizontal the position reported is about 0 for X and Y -and a positive value for Z + and a positive value for Z * If the left side is elevated, X increases (becomes positive) * If the front side (where the touchpad is) is elevated, Y decreases (becomes negative) @@ -59,3 +59,13 @@ email to the authors to add it to the database. When reporting a new laptop, please include the output of "dmidecode" plus the value of /sys/devices/platform/lis3lv02d/position in these four cases. +Q&A +--- + +Q: How do I safely simulate freefall? I have an HP "portable +workstation" which has about 3.5kg and a plastic case, so letting it +fall to the ground is out of question... + +A: The sensor is pretty sensitive, so your hands can do it. Lift it +into free space, follow the fall with your hands for like 10 +centimeters. That should be enough to trigger the detection. -- cgit v1.2.3 From 72f5de92e199f96cfcea125aefc76c138d8c553c Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Tue, 31 Mar 2009 15:24:29 -0700 Subject: hwmon: Add LTC4215 driver Add Linux support for the Linear Technology LTC4215 Hot Swap controller I2C monitoring interface. I have tested the driver with my board, and it appears to work fine. With the power supplies disabled, it reads 11.93V input, 1.93V output, no current and no power. With the supplies enabled, it reads 11.93V input, 11.98V output, no current, no power. I'm not drawing any current at the moment, so this is reasonable. The value in the sense register never reads anything except 0, so I expect to get zero from the current and power calculations. I didn't attempt to support changing any of the chip's settings or enabling the FET. I'm not sure even how to do that and still fit within the hwmon framework. :) Signed-off-by: Ira W. Snyder Cc: Jean Delvare Cc: "Mark M. Hoffman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/hwmon/ltc4215 | 50 ++++++ drivers/hwmon/Kconfig | 11 ++ drivers/hwmon/Makefile | 1 + drivers/hwmon/ltc4215.c | 364 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 426 insertions(+) create mode 100644 Documentation/hwmon/ltc4215 create mode 100644 drivers/hwmon/ltc4215.c (limited to 'Documentation') diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215 new file mode 100644 index 000000000000..2e6a21eb656c --- /dev/null +++ b/Documentation/hwmon/ltc4215 @@ -0,0 +1,50 @@ +Kernel driver ltc4215 +===================== + +Supported chips: + * Linear Technology LTC4215 + Prefix: 'ltc4215' + Addresses scanned: 0x44 + Datasheet: + http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697 + +Author: Ira W. Snyder + + +Description +----------- + +The LTC4215 controller allows a board to be safely inserted and removed +from a live backplane. + + +Usage Notes +----------- + +This driver does not probe for LTC4215 devices, due to the fact that some +of the possible addresses are unfriendly to probing. You will need to use +the "force" parameter to tell the driver where to find the device. + +Example: the following will load the driver for an LTC4215 at address 0x44 +on I2C bus #0: +$ modprobe ltc4215 force=0,0x44 + + +Sysfs entries +------------- + +The LTC4215 has built-in limits for overvoltage, undervoltage, and +undercurrent warnings. This makes it very likely that the reference +circuit will be used. + +in1_input input voltage +in2_input output voltage + +in1_min_alarm input undervoltage alarm +in1_max_alarm input overvoltage alarm + +curr1_input current +curr1_max_alarm overcurrent alarm + +power1_input power usage +power1_alarm power bad alarm diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 8a91c9971558..bc6810c22dd7 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -571,6 +571,17 @@ config SENSORS_LM93 This driver can also be built as a module. If so, the module will be called lm93. +config SENSORS_LTC4215 + tristate "Linear Technology LTC4215" + depends on I2C && EXPERIMENTAL + default n + help + If you say yes here you get support for Linear Technology LTC4215 + Hot Swap Controller I2C interface. + + This driver can also be built as a module. If so, the module will + be called ltc4215. + config SENSORS_LTC4245 tristate "Linear Technology LTC4245" depends on I2C && EXPERIMENTAL diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 81c88822a3eb..4da261d16abf 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_SENSORS_LM90) += lm90.o obj-$(CONFIG_SENSORS_LM92) += lm92.o obj-$(CONFIG_SENSORS_LM93) += lm93.o obj-$(CONFIG_SENSORS_LM95241) += lm95241.o +obj-$(CONFIG_SENSORS_LTC4215) += ltc4215.o obj-$(CONFIG_SENSORS_LTC4245) += ltc4245.o obj-$(CONFIG_SENSORS_MAX1111) += max1111.o obj-$(CONFIG_SENSORS_MAX1619) += max1619.o diff --git a/drivers/hwmon/ltc4215.c b/drivers/hwmon/ltc4215.c new file mode 100644 index 000000000000..9386e2a39211 --- /dev/null +++ b/drivers/hwmon/ltc4215.c @@ -0,0 +1,364 @@ +/* + * Driver for Linear Technology LTC4215 I2C Hot Swap Controller + * + * Copyright (C) 2009 Ira W. Snyder + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * Datasheet: + * http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned short normal_i2c[] = { I2C_CLIENT_END }; + +/* Insmod parameters */ +I2C_CLIENT_INSMOD_1(ltc4215); + +/* Here are names of the chip's registers (a.k.a. commands) */ +enum ltc4215_cmd { + LTC4215_CONTROL = 0x00, /* rw */ + LTC4215_ALERT = 0x01, /* rw */ + LTC4215_STATUS = 0x02, /* ro */ + LTC4215_FAULT = 0x03, /* rw */ + LTC4215_SENSE = 0x04, /* rw */ + LTC4215_SOURCE = 0x05, /* rw */ + LTC4215_ADIN = 0x06, /* rw */ +}; + +struct ltc4215_data { + struct device *hwmon_dev; + + struct mutex update_lock; + bool valid; + unsigned long last_updated; /* in jiffies */ + + /* Registers */ + u8 regs[7]; +}; + +static struct ltc4215_data *ltc4215_update_device(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ltc4215_data *data = i2c_get_clientdata(client); + s32 val; + int i; + + mutex_lock(&data->update_lock); + + /* The chip's A/D updates 10 times per second */ + if (time_after(jiffies, data->last_updated + HZ / 10) || !data->valid) { + + dev_dbg(&client->dev, "Starting ltc4215 update\n"); + + /* Read all registers */ + for (i = 0; i < ARRAY_SIZE(data->regs); i++) { + val = i2c_smbus_read_byte_data(client, i); + if (unlikely(val < 0)) + data->regs[i] = 0; + else + data->regs[i] = val; + } + + data->last_updated = jiffies; + data->valid = 1; + } + + mutex_unlock(&data->update_lock); + + return data; +} + +/* Return the voltage from the given register in millivolts */ +static int ltc4215_get_voltage(struct device *dev, u8 reg) +{ + struct ltc4215_data *data = ltc4215_update_device(dev); + const u8 regval = data->regs[reg]; + u32 voltage = 0; + + switch (reg) { + case LTC4215_SENSE: + /* 151 uV per increment */ + voltage = regval * 151 / 1000; + break; + case LTC4215_SOURCE: + /* 60.5 mV per increment */ + voltage = regval * 605 / 10; + break; + case LTC4215_ADIN: + /* The ADIN input is divided by 12.5, and has 4.82 mV + * per increment, so we have the additional multiply */ + voltage = regval * 482 * 125 / 1000; + break; + default: + /* If we get here, the developer messed up */ + WARN_ON_ONCE(1); + break; + } + + return voltage; +} + +/* Return the current from the sense resistor in mA */ +static unsigned int ltc4215_get_current(struct device *dev) +{ + struct ltc4215_data *data = ltc4215_update_device(dev); + + /* The strange looking conversions that follow are fixed-point + * math, since we cannot do floating point in the kernel. + * + * Step 1: convert sense register to microVolts + * Step 2: convert voltage to milliAmperes + * + * If you play around with the V=IR equation, you come up with + * the following: X uV / Y mOhm == Z mA + * + * With the resistors that are fractions of a milliOhm, we multiply + * the voltage and resistance by 10, to shift the decimal point. + * Now we can use the normal division operator again. + */ + + /* Calculate voltage in microVolts (151 uV per increment) */ + const unsigned int voltage = data->regs[LTC4215_SENSE] * 151; + + /* Calculate current in milliAmperes (4 milliOhm sense resistor) */ + const unsigned int curr = voltage / 4; + + return curr; +} + +static ssize_t ltc4215_show_voltage(struct device *dev, + struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute *attr = to_sensor_dev_attr(da); + const int voltage = ltc4215_get_voltage(dev, attr->index); + + return snprintf(buf, PAGE_SIZE, "%d\n", voltage); +} + +static ssize_t ltc4215_show_current(struct device *dev, + struct device_attribute *da, + char *buf) +{ + const unsigned int curr = ltc4215_get_current(dev); + + return snprintf(buf, PAGE_SIZE, "%u\n", curr); +} + +static ssize_t ltc4215_show_power(struct device *dev, + struct device_attribute *da, + char *buf) +{ + const unsigned int curr = ltc4215_get_current(dev); + const int output_voltage = ltc4215_get_voltage(dev, LTC4215_ADIN); + + /* current in mA * voltage in mV == power in uW */ + const unsigned int power = abs(output_voltage * curr); + + return snprintf(buf, PAGE_SIZE, "%u\n", power); +} + +static ssize_t ltc4215_show_alarm(struct device *dev, + struct device_attribute *da, + char *buf) +{ + struct sensor_device_attribute_2 *attr = to_sensor_dev_attr_2(da); + struct ltc4215_data *data = ltc4215_update_device(dev); + const u8 reg = data->regs[attr->index]; + const u32 mask = attr->nr; + + return snprintf(buf, PAGE_SIZE, "%u\n", (reg & mask) ? 1 : 0); +} + +/* These macros are used below in constructing device attribute objects + * for use with sysfs_create_group() to make a sysfs device file + * for each register. + */ + +#define LTC4215_VOLTAGE(name, ltc4215_cmd_idx) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_voltage, NULL, ltc4215_cmd_idx) + +#define LTC4215_CURRENT(name) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_current, NULL, 0); + +#define LTC4215_POWER(name) \ + static SENSOR_DEVICE_ATTR(name, S_IRUGO, \ + ltc4215_show_power, NULL, 0); + +#define LTC4215_ALARM(name, mask, reg) \ + static SENSOR_DEVICE_ATTR_2(name, S_IRUGO, \ + ltc4215_show_alarm, NULL, (mask), reg) + +/* Construct a sensor_device_attribute structure for each register */ + +/* Current */ +LTC4215_CURRENT(curr1_input); +LTC4215_ALARM(curr1_max_alarm, (1 << 2), LTC4215_STATUS); + +/* Power (virtual) */ +LTC4215_POWER(power1_input); +LTC4215_ALARM(power1_alarm, (1 << 3), LTC4215_STATUS); + +/* Input Voltage */ +LTC4215_VOLTAGE(in1_input, LTC4215_ADIN); +LTC4215_ALARM(in1_max_alarm, (1 << 0), LTC4215_STATUS); +LTC4215_ALARM(in1_min_alarm, (1 << 1), LTC4215_STATUS); + +/* Output Voltage */ +LTC4215_VOLTAGE(in2_input, LTC4215_SOURCE); + +/* Finally, construct an array of pointers to members of the above objects, + * as required for sysfs_create_group() + */ +static struct attribute *ltc4215_attributes[] = { + &sensor_dev_attr_curr1_input.dev_attr.attr, + &sensor_dev_attr_curr1_max_alarm.dev_attr.attr, + + &sensor_dev_attr_power1_input.dev_attr.attr, + &sensor_dev_attr_power1_alarm.dev_attr.attr, + + &sensor_dev_attr_in1_input.dev_attr.attr, + &sensor_dev_attr_in1_max_alarm.dev_attr.attr, + &sensor_dev_attr_in1_min_alarm.dev_attr.attr, + + &sensor_dev_attr_in2_input.dev_attr.attr, + + NULL, +}; + +static const struct attribute_group ltc4215_group = { + .attrs = ltc4215_attributes, +}; + +static int ltc4215_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ltc4215_data *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto out_kzalloc; + } + + i2c_set_clientdata(client, data); + mutex_init(&data->update_lock); + + /* Initialize the LTC4215 chip */ + /* TODO */ + + /* Register sysfs hooks */ + ret = sysfs_create_group(&client->dev.kobj, <c4215_group); + if (ret) + goto out_sysfs_create_group; + + data->hwmon_dev = hwmon_device_register(&client->dev); + if (IS_ERR(data->hwmon_dev)) { + ret = PTR_ERR(data->hwmon_dev); + goto out_hwmon_device_register; + } + + return 0; + +out_hwmon_device_register: + sysfs_remove_group(&client->dev.kobj, <c4215_group); +out_sysfs_create_group: + kfree(data); +out_kzalloc: + return ret; +} + +static int ltc4215_remove(struct i2c_client *client) +{ + struct ltc4215_data *data = i2c_get_clientdata(client); + + hwmon_device_unregister(data->hwmon_dev); + sysfs_remove_group(&client->dev.kobj, <c4215_group); + + kfree(data); + + return 0; +} + +static int ltc4215_detect(struct i2c_client *client, + int kind, + struct i2c_board_info *info) +{ + struct i2c_adapter *adapter = client->adapter; + + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -ENODEV; + + if (kind < 0) { /* probed detection - check the chip type */ + s32 v; /* 8 bits from the chip, or -ERRNO */ + + /* + * Register 0x01 bit b7 is reserved, expect 0 + * Register 0x03 bit b6 and b7 are reserved, expect 0 + */ + v = i2c_smbus_read_byte_data(client, LTC4215_ALERT); + if (v < 0 || (v & (1 << 7)) != 0) + return -ENODEV; + + v = i2c_smbus_read_byte_data(client, LTC4215_FAULT); + if (v < 0 || (v & ((1 << 6) | (1 << 7))) != 0) + return -ENODEV; + } + + strlcpy(info->type, "ltc4215", I2C_NAME_SIZE); + dev_info(&adapter->dev, "ltc4215 %s at address 0x%02x\n", + kind < 0 ? "probed" : "forced", + client->addr); + + return 0; +} + +static const struct i2c_device_id ltc4215_id[] = { + { "ltc4215", ltc4215 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, ltc4215_id); + +/* This is the driver that will be inserted */ +static struct i2c_driver ltc4215_driver = { + .class = I2C_CLASS_HWMON, + .driver = { + .name = "ltc4215", + }, + .probe = ltc4215_probe, + .remove = ltc4215_remove, + .id_table = ltc4215_id, + .detect = ltc4215_detect, + .address_data = &addr_data, +}; + +static int __init ltc4215_init(void) +{ + return i2c_add_driver(<c4215_driver); +} + +static void __exit ltc4215_exit(void) +{ + i2c_del_driver(<c4215_driver); +} + +MODULE_AUTHOR("Ira W. Snyder "); +MODULE_DESCRIPTION("LTC4215 driver"); +MODULE_LICENSE("GPL"); + +module_init(ltc4215_init); +module_exit(ltc4215_exit); -- cgit v1.2.3 From 3f1c6ebf57b815ad709e89291e446935fee78f75 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 31 Mar 2009 15:24:38 -0700 Subject: powerpc: add mmc-spi-slot bindings The bindings describes a case where MMC/SD/SDIO slot directly connected to a SPI bus. Such setups are widely used on embedded PowerPC boards. The patch also adds the mmc-spi-slot entry to the OpenFirmware modalias table. Signed-off-by: Anton Vorontsov Cc: David Brownell Cc: Benjamin Herrenschmidt Cc: Kumar Gala Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../powerpc/dts-bindings/mmc-spi-slot.txt | 23 ++++++++++++++++++++++ drivers/of/base.c | 1 + 2 files changed, 24 insertions(+) create mode 100644 Documentation/powerpc/dts-bindings/mmc-spi-slot.txt (limited to 'Documentation') diff --git a/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt new file mode 100644 index 000000000000..c39ac2891951 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt @@ -0,0 +1,23 @@ +MMC/SD/SDIO slot directly connected to a SPI bus + +Required properties: +- compatible : should be "mmc-spi-slot". +- reg : should specify SPI address (chip-select number). +- spi-max-frequency : maximum frequency for this device (Hz). +- voltage-ranges : two cells are required, first cell specifies minimum + slot voltage (mV), second cell specifies maximum slot voltage (mV). + Several ranges could be specified. +- gpios : (optional) may specify GPIOs in this order: Card-Detect GPIO, + Write-Protect GPIO. + +Example: + + mmc-slot@0 { + compatible = "fsl,mpc8323rdb-mmc-slot", + "mmc-spi-slot"; + reg = <0>; + gpios = <&qe_pio_d 14 1 + &qe_pio_d 15 0>; + voltage-ranges = <3300 3300>; + spi-max-frequency = <50000000>; + }; diff --git a/drivers/of/base.c b/drivers/of/base.c index cd17092b82bd..41c5dfd85358 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -446,6 +446,7 @@ struct of_modalias_table { }; static struct of_modalias_table of_modalias_table[] = { { "fsl,mcu-mpc8349emitx", "mcu-mpc8349emitx" }, + { "mmc-spi-slot", "mmc_spi" }, }; /** -- cgit v1.2.3 From ddb53d48da5b0e691f35e703ac29118747f86c99 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 31 Mar 2009 15:25:40 -0700 Subject: fbdev: remove cyblafb driver A tridentfb driver has all the functionality of the cyblafb driver without the bugs of the latter. Changes to the tridentfb driver: - FBINFO_READS_FAST added to the tridentfb. The cyblafb used a blitter for scrolling which is faster than color expansion on Cyberblade chipsets. The blitter is slower on a discrete Blade3D core. Use the blitter for scrolling in the tridentfb only for integrated Blade3D cores. Now, scrolling speed is about equal for the tridentfb and the cyblafb. - a copyright notice addition is done on request of Jani Monoses (the first author of the tridentfb). Tested on AGP Blade3D card and PCChips M787CLR motherboard: VIA C3 cpu + VT8601 north bridge (aka Cyberblade/i1). Signed-off-by: Krzysztof Helt Cc: "Jani Monoses" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fb/00-INDEX | 2 - Documentation/fb/cyblafb/bugs | 13 - Documentation/fb/cyblafb/credits | 7 - Documentation/fb/cyblafb/documentation | 17 - Documentation/fb/cyblafb/fb.modes | 154 --- Documentation/fb/cyblafb/performance | 79 -- Documentation/fb/cyblafb/todo | 31 - Documentation/fb/cyblafb/usage | 217 ---- Documentation/fb/cyblafb/whatsnew | 29 - Documentation/fb/cyblafb/whycyblafb | 85 -- drivers/video/Kconfig | 28 +- drivers/video/cyblafb.c | 1683 -------------------------------- drivers/video/tridentfb.c | 6 +- 13 files changed, 7 insertions(+), 2344 deletions(-) delete mode 100644 Documentation/fb/cyblafb/bugs delete mode 100644 Documentation/fb/cyblafb/credits delete mode 100644 Documentation/fb/cyblafb/documentation delete mode 100644 Documentation/fb/cyblafb/fb.modes delete mode 100644 Documentation/fb/cyblafb/performance delete mode 100644 Documentation/fb/cyblafb/todo delete mode 100644 Documentation/fb/cyblafb/usage delete mode 100644 Documentation/fb/cyblafb/whatsnew delete mode 100644 Documentation/fb/cyblafb/whycyblafb delete mode 100644 drivers/video/cyblafb.c (limited to 'Documentation') diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX index caabbd395e61..a618fd99c9f0 100644 --- a/Documentation/fb/00-INDEX +++ b/Documentation/fb/00-INDEX @@ -11,8 +11,6 @@ aty128fb.txt - info on the ATI Rage128 frame buffer driver. cirrusfb.txt - info on the driver for Cirrus Logic chipsets. -cyblafb/ - - directory with documentation files related to the cyblafb driver. deferred_io.txt - an introduction to deferred IO. fbcon.txt diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs deleted file mode 100644 index 9443a6d72cdd..000000000000 --- a/Documentation/fb/cyblafb/bugs +++ /dev/null @@ -1,13 +0,0 @@ -Bugs -==== - -I currently don't know of any bug. Please do send reports to: - - linux-fbdev-devel@lists.sourceforge.net - - Knut_Petersen@t-online.de. - - -Untested features -================= - -All LCD stuff is untested. If it worked in tridentfb, it should work in -cyblafb. Please test and report the results to Knut_Petersen@t-online.de. diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits deleted file mode 100644 index 0eb3b443dc2b..000000000000 --- a/Documentation/fb/cyblafb/credits +++ /dev/null @@ -1,7 +0,0 @@ -Thanks to -========= - * Alan Hourihane, for writing the X trident driver - * Jani Monoses, for writing the tridentfb driver - * Antonino A. Daplas, for review of the first published - version of cyblafb and some code - * Jochen Hein, for testing and a helpfull bug report diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation deleted file mode 100644 index bb1aac048425..000000000000 --- a/Documentation/fb/cyblafb/documentation +++ /dev/null @@ -1,17 +0,0 @@ -Available Documentation -======================= - -Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22, -2001, available from VIA: - - http://www.viavpsd.com/product/6/15/DS8601A182.pdf - -The datasheet is incomplete, some registers that need to be programmed are not -explained at all and important bits are listed as "reserved". But you really -need the datasheet to understand the code. "p. xxx" comments refer to page -numbers of this document. - -XFree/XOrg drivers are available and of good quality, looking at the code -there is a good idea if the datasheet does not provide enough information -or if the datasheet seems to be wrong. - diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes deleted file mode 100644 index fe0e5223ba86..000000000000 --- a/Documentation/fb/cyblafb/fb.modes +++ /dev/null @@ -1,154 +0,0 @@ -# -# Sample fb.modes file -# -# Provides an incomplete list of working modes for -# the cyberblade/i1 graphics core. -# -# The value 4294967256 is used instead of -40. Of course, -40 is not -# a really reasonable value, but chip design does not always follow -# logic. Believe me, it's ok, and it's the way the BIOS does it. -# -# fbset requires 4294967256 in fb.modes and -40 as an argument to -# the -t parameter. That's also not too reasonable, and it might change -# in the future or might even be differt for your current version. -# - -mode "640x480-50" - geometry 640 480 2048 4096 8 - timings 47619 4294967256 24 17 0 216 3 -endmode - -mode "640x480-60" - geometry 640 480 2048 4096 8 - timings 39682 4294967256 24 17 0 216 3 -endmode - -mode "640x480-70" - geometry 640 480 2048 4096 8 - timings 34013 4294967256 24 17 0 216 3 -endmode - -mode "640x480-72" - geometry 640 480 2048 4096 8 - timings 33068 4294967256 24 17 0 216 3 -endmode - -mode "640x480-75" - geometry 640 480 2048 4096 8 - timings 31746 4294967256 24 17 0 216 3 -endmode - -mode "640x480-80" - geometry 640 480 2048 4096 8 - timings 29761 4294967256 24 17 0 216 3 -endmode - -mode "640x480-85" - geometry 640 480 2048 4096 8 - timings 28011 4294967256 24 17 0 216 3 -endmode - -mode "800x600-50" - geometry 800 600 2048 4096 8 - timings 30303 96 24 14 0 136 11 -endmode - -mode "800x600-60" - geometry 800 600 2048 4096 8 - timings 25252 96 24 14 0 136 11 -endmode - -mode "800x600-70" - geometry 800 600 2048 4096 8 - timings 21645 96 24 14 0 136 11 -endmode - -mode "800x600-72" - geometry 800 600 2048 4096 8 - timings 21043 96 24 14 0 136 11 -endmode - -mode "800x600-75" - geometry 800 600 2048 4096 8 - timings 20202 96 24 14 0 136 11 -endmode - -mode "800x600-80" - geometry 800 600 2048 4096 8 - timings 18939 96 24 14 0 136 11 -endmode - -mode "800x600-85" - geometry 800 600 2048 4096 8 - timings 17825 96 24 14 0 136 11 -endmode - -mode "1024x768-50" - geometry 1024 768 2048 4096 8 - timings 19054 144 24 29 0 120 3 -endmode - -mode "1024x768-60" - geometry 1024 768 2048 4096 8 - timings 15880 144 24 29 0 120 3 -endmode - -mode "1024x768-70" - geometry 1024 768 2048 4096 8 - timings 13610 144 24 29 0 120 3 -endmode - -mode "1024x768-72" - geometry 1024 768 2048 4096 8 - timings 13232 144 24 29 0 120 3 -endmode - -mode "1024x768-75" - geometry 1024 768 2048 4096 8 - timings 12703 144 24 29 0 120 3 -endmode - -mode "1024x768-80" - geometry 1024 768 2048 4096 8 - timings 11910 144 24 29 0 120 3 -endmode - -mode "1024x768-85" - geometry 1024 768 2048 4096 8 - timings 11209 144 24 29 0 120 3 -endmode - -mode "1280x1024-50" - geometry 1280 1024 2048 4096 8 - timings 11114 232 16 39 0 160 3 -endmode - -mode "1280x1024-60" - geometry 1280 1024 2048 4096 8 - timings 9262 232 16 39 0 160 3 -endmode - -mode "1280x1024-70" - geometry 1280 1024 2048 4096 8 - timings 7939 232 16 39 0 160 3 -endmode - -mode "1280x1024-72" - geometry 1280 1024 2048 4096 8 - timings 7719 232 16 39 0 160 3 -endmode - -mode "1280x1024-75" - geometry 1280 1024 2048 4096 8 - timings 7410 232 16 39 0 160 3 -endmode - -mode "1280x1024-80" - geometry 1280 1024 2048 4096 8 - timings 6946 232 16 39 0 160 3 -endmode - -mode "1280x1024-85" - geometry 1280 1024 2048 4096 8 - timings 6538 232 16 39 0 160 3 -endmode diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance deleted file mode 100644 index 8d15d5dfc6b3..000000000000 --- a/Documentation/fb/cyblafb/performance +++ /dev/null @@ -1,79 +0,0 @@ -Speed -===== - -CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data -for mode 1280x1024-[8,16,32]@61 Hz. - -Test 1: Cat a file with 2000 lines of 0 characters. -Test 2: Cat a file with 2000 lines of 80 characters. -Test 3: Cat a file with 2000 lines of 160 characters. - -All values show system time use in seconds, kernel 2.6.12 was used for -the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a -patch that speeds up kernel bitblitting a lot ( > 20%). - -+-----------+-----------------------------------------------------+ -| | not accelerated | -| TRIDENTFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 4.31 | 4.33 | 6.05 | 12.81 | ---- | ---- | -| Test 2 | 67.94 | 5.44 | 123.16 | 14.79 | ---- | ---- | -| Test 3 | 131.36 | 6.55 | 240.12 | 16.76 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | | | completely bro- | -| | | | ken, monitor | -| | | | switches off | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | accelerated | -| TRIDENTFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | ---- | ---- | 20.62 | 1.22 | ---- | ---- | -| Test 2 | ---- | ---- | 22.61 | 3.19 | ---- | ---- | -| Test 3 | ---- | ---- | 24.59 | 5.16 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | broken, writing | broken, ok only | completely bro- | -| | to wrong places | if bgcolor is | ken, monitor | -| | on screen + bug | black, bug in | switches off | -| | in fillrect() | fillrect() | | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | not accelerated | -| VESAFB +-----------------+-----------------+-----------------+ -| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 4.26 | 3.76 | 5.99 | 7.23 | ---- | ---- | -| Test 2 | 65.65 | 4.89 | 120.88 | 9.08 | ---- | ---- | -| Test 3 | 126.91 | 5.94 | 235.77 | 11.03 | ---- | ---- | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | vga=0x307 | vga=0x31a | vga=0x31b not | -| | fh=80kHz | fh=80kHz | supported by | -| | fv=75kHz | fv=75kHz | video BIOS and | -| | | | hardware | -+-----------+-----------------+-----------------+-----------------+ - - -+-----------+-----------------------------------------------------+ -| | accelerated | -| CYBLAFB +-----------------+-----------------+-----------------+ -| | 8 bpp | 16 bpp | 32 bpp | -| | noypan | ypan | noypan | ypan | noypan | ypan | -+-----------+--------+--------+--------+--------+--------+--------+ -| Test 1 | 8.02 | 0.23 | 19.04 | 0.61 | 57.12 | 2.74 | -| Test 2 | 8.38 | 0.55 | 19.39 | 0.92 | 57.54 | 3.13 | -| Test 3 | 8.73 | 0.86 | 19.74 | 1.24 | 57.95 | 3.51 | -+-----------+--------+--------+--------+--------+--------+--------+ -| Comments | | | | -| | | | | -| | | | | -| | | | | -+-----------+-----------------+-----------------+-----------------+ diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo deleted file mode 100644 index c5f6d0eae545..000000000000 --- a/Documentation/fb/cyblafb/todo +++ /dev/null @@ -1,31 +0,0 @@ -TODO / Missing features -======================= - -Verify LCD stuff "stretch" and "center" options are - completely untested ... this code needs to be - verified. As I don't have access to such - hardware, please contact me if you are - willing run some tests. - -Interlaced video modes The reason that interleaved - modes are disabled is that I do not know - the meaning of the vertical interlace - parameter. Also the datasheet mentions a - bit d8 of a horizontal interlace parameter, - but nowhere the lower 8 bits. Please help - if you can. - -low-res double scan modes Who needs it? - -accelerated color blitting Who needs it? The console driver does use color - blitting for nothing but drawing the penguine, - everything else is done using color expanding - blitting of 1bpp character bitmaps. - -ioctls Who needs it? - -TV-out Will be done later. Use "vga= " at boot time - to set a suitable video mode. - -??? Feel free to contact me if you have any - feature requests diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage deleted file mode 100644 index a39bb3d402a2..000000000000 --- a/Documentation/fb/cyblafb/usage +++ /dev/null @@ -1,217 +0,0 @@ -CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated -into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and -tested using a VIA EPIA 5000 board. - -Cyblafb - compiled into the kernel or as a module? -================================================== - -You might compile cyblafb either as a module or compile it permanently into the -kernel. - -Unless you have a real reason to do so you should not compile both vesafb and -cyblafb permanently into the kernel. It's possible and it helps during the -developement cycle, but it's useless and will at least block some otherwise -usefull memory for ordinary users. - -Selecting Modes -=============== - - Startup Mode - ============ - - First of all, you might use the "vga=???" boot parameter as it is - documented in vesafb.txt and svga.txt. Cyblafb will detect the video - mode selected and will use the geometry and timings found by - inspecting the hardware registers. - - video=cyblafb vga=0x317 - - Alternatively you might use a combination of the mode, ref and bpp - parameters. If you compiled the driver into the kernel, add something - like this to the kernel command line: - - video=cyblafb:1280x1024,bpp=16,ref=50 ... - - If you compiled the driver as a module, the same mode would be - selected by the following command: - - modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ... - - None of the modes possible to select as startup modes are affected by - the problems described at the end of the next subsection. - - For all startup modes cyblafb chooses a virtual x resolution of 2048, - the only exception is mode 1280x1024 in combination with 32 bpp. This - allows ywrap scrolling for all those modes if rotation is 0 or 2, and - also fast scrolling if rotation is 1 or 3. The default virtual y reso- - lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32, - again with the only exception of 1280x1024 at 32 bpp. - - Please do set your video memory size to 8 Mb in the Bios setup. Other - values will work, but performace is decreased for a lot of modes. - - Mode changes using fbset - ======================== - - You might use fbset to change the video mode, see "man fbset". Cyblafb - generally does assume that you know what you are doing. But it does - some checks, especially those that are needed to prevent you from - damaging your hardware. - - - only 8, 16, 24 and 32 bpp video modes are accepted - - interlaced video modes are not accepted - - double scan video modes are not accepted - - if a flat panel is found, cyblafb does not allow you - to program a resolution higher than the physical - resolution of the flat panel monitor - - cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp - and (currently) 24 bit modes use a doubled vclk internally, - the dotclock limit as seen by fbset is 115 MHz for those - modes and 230 MHz for 8 and 16 bpp modes. - - cyblafb will allow you to select very high resolutions as - long as the hardware can be programmed to these modes. The - documented limit 1600x1200 is not enforced, but don't expect - perfect signal quality. - - Any request that violates the rules given above will be either changed - to something the hardware supports or an error value will be returned. - - If you program a virtual y resolution higher than the hardware limit, - cyblafb will silently decrease that value to the highest possible - value. The same is true for a virtual x resolution that is not - supported by the hardware. Cyblafb tries to adapt vyres first because - vxres decides if ywrap scrolling is possible or not. - - Attempts to disable acceleration are ignored, I believe that this is - safe. - - Some video modes that should work do not work as expected. If you use - the standard fb.modes, fbset 640x480-60 will program that mode, but - you will see a vertical area, about two characters wide, with only - much darker characters than the other characters on the screen. - Cyblafb does allow that mode to be set, as it does not violate the - official specifications. It would need a lot of code to reliably sort - out all invalid modes, playing around with the margin values will - give a valid mode quickly. And if cyblafb would detect such an invalid - mode, should it silently alter the requested values or should it - report an error? Both options have some pros and cons. As stated - above, none of the startup modes are affected, and if you set - verbosity to 1 or higher, cyblafb will print the fbset command that - would be needed to program that mode using fbset. - - -Other Parameters -================ - - -crt don't autodetect, assume monitor connected to - standard VGA connector - -fp don't autodetect, assume flat panel display - connected to flat panel monitor interface - -nativex inform driver about native x resolution of - flat panel monitor connected to special - interface (should be autodetected) - -stretch stretch image to adapt low resolution modes to - higer resolutions of flat panel monitors - connected to special interface - -center center image to adapt low resolution modes to - higer resolutions of flat panel monitors - connected to special interface - -memsize use if autodetected memsize is wrong ... - should never be necessary - -nopcirr disable PCI read retry -nopciwr disable PCI write retry -nopcirb disable PCI read bursts -nopciwb disable PCI write bursts - -bpp bpp for specified modes - valid values: 8 || 16 || 24 || 32 - -ref refresh rate for specified mode - valid values: 50 <= ref <= 85 - -mode 640x480 or 800x600 or 1024x768 or 1280x1024 - if not specified, the startup mode will be detected - and used, so you might also use the vga=??? parameter - described in vesafb.txt. If you do not specify a mode, - bpp and ref parameters are ignored. - -verbosity 0 is the default, increase to at least 2 for every - bug report! - -Development hints -================= - -It's much faster do compile a module and to load the new version after -unloading the old module than to compile a new kernel and to reboot. So if you -try to work on cyblafb, it might be a good idea to use cyblafb as a module. -In real life, fast often means dangerous, and that's also the case here. If -you introduce a serious bug when cyblafb is compiled into the kernel, the -kernel will lock or oops with a high probability before the file system is -mounted, and the danger for your data is low. If you load a broken own version -of cyblafb on a running system, the danger for the integrity of the file -system is much higher as you might need a hard reset afterwards. Decide -yourself. - -Module unloading, the vfb method -================================ - -If you want to unload/reload cyblafb using the virtual framebuffer, you need -to enable vfb support in the kernel first. After that, load the modules as -shown below: - - modprobe vfb vfb_enable=1 - modprobe fbcon - modprobe cyblafb - fbset -fb /dev/fb1 1280x1024-60 -vyres 2662 - con2fb /dev/fb1 /dev/tty1 - ... - -If you now made some changes to cyblafb and want to reload it, you might do it -as show below: - - con2fb /dev/fb0 /dev/tty1 - ... - rmmod cyblafb - modprobe cyblafb - con2fb /dev/fb1 /dev/tty1 - ... - -Of course, you might choose another mode, and most certainly you also want to -map some other /dev/tty* to the real framebuffer device. You might also choose -to compile fbcon as a kernel module or place it permanently in the kernel. - -I do not know of any way to unload fbcon, and fbcon will prevent the -framebuffer device loaded first from unloading. [If there is a way, then -please add a description here!] - -Module unloading, the vesafb method -=================================== - -Configure the kernel: - - <*> Support for frame buffer devices - [*] VESA VGA graphics support - Cyberblade/i1 support - -Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan -parameter is important, choose any vga parameter you like as long as it is -a graphics mode. - -After booting, load cyblafb without any mode and bpp parameter and assign -cyblafb to individual ttys using con2fb, e.g.: - - modprobe cyblafb - con2fb /dev/fb1 /dev/tty1 - -Unloading cyblafb works without problems after you assign vesafb to all -ttys again, e.g.: - - con2fb /dev/fb0 /dev/tty1 - rmmod cyblafb diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew deleted file mode 100644 index 76c07a26e044..000000000000 --- a/Documentation/fb/cyblafb/whatsnew +++ /dev/null @@ -1,29 +0,0 @@ -0.62 -==== - - - the vesafb parameter has been removed as I decided to allow the - feature without any special parameter. - - - Cyblafb does not use the vga style of panning any longer, now the - "right view" register in the graphics engine IO space is used. Without - that change it was impossible to use all available memory, and without - access to all available memory it is impossible to ywrap. - - - The imageblit function now uses hardware acceleration for all font - widths. Hardware blitting across pixel column 2048 is broken in the - cyberblade/i1 graphics core, but we work around that hardware bug. - - - modes with vxres != xres are supported now. - - - ywrap scrolling is supported now and the default. This is a big - performance gain. - - - default video modes use vyres > yres and vxres > xres to allow - almost optimal scrolling speed for normal and rotated screens - - - some features mainly usefull for debugging the upper layers of the - framebuffer system have been added, have a look at the code - - - fixed: Oops after unloading cyblafb when reading /proc/io* - - - we work around some bugs of the higher framebuffer layers. diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb deleted file mode 100644 index a123bc11e698..000000000000 --- a/Documentation/fb/cyblafb/whycyblafb +++ /dev/null @@ -1,85 +0,0 @@ -I tried the following framebuffer drivers: - - - TRIDENTFB is full of bugs. Acceleration is broken for Blade3D - graphics cores like the cyberblade/i1. It claims to support a great - number of devices, but documentation for most of these devices is - unfortunately not available. There is _no_ reason to use tridentfb - for cyberblade/i1 + CRT users. VESAFB is faster, and the one - advantage, mode switching, is broken in tridentfb. - - - VESAFB is used by many distributions as a standard. Vesafb does - not support mode switching. VESAFB is a bit faster than the working - configurations of TRIDENTFB, but it is still too slow, even if you - use ypan. - - - EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1 - graphics core, but it still has serious bugs and developement seems - to have stopped. This is the one driver with TV-out support. If you - do need this feature, try epiafb. - -None of these drivers was a real option for me. - -I believe that is unreasonable to change code that announces to support 20 -devices if I only have more or less sufficient documentation for exactly one -of these. The risk of breaking device foo while fixing device bar is too high. - -So I decided to start CyBlaFB as a stripped down tridentfb. - -All code specific to other Trident chips has been removed. After that there -were a lot of cosmetic changes to increase the readability of the code. All -register names were changed to those mnemonics used in the datasheet. Function -and macro names were changed if they hindered easy understanding of the code. - -After that I debugged the code and implemented some new features. I'll try to -give a little summary of the main changes: - - - calculation of vertical and horizontal timings was fixed - - - video signal quality has been improved dramatically - - - acceleration: - - - fillrect and copyarea were fixed and reenabled - - - color expanding imageblit was newly implemented, color - imageblit (only used to draw the penguine) still uses the - generic code. - - - init of the acceleration engine was improved and moved to a - place where it really works ... - - - sync function has a timeout now and tries to reset and - reinit the accel engine if necessary - - - fewer slow copyarea calls when doing ypan scrolling by using - undocumented bit d21 of screen start address stored in - CR2B[5]. BIOS does use it also, so this should be safe. - - - cyblafb rejects any attempt to set modes that would cause vclk - values above reasonable 230 MHz. 32bit modes use a clock - multiplicator of 2, so fbset does show the correct values for - pixclock but not for vclk in this case. The fbset limit is 115 MHz - for 32 bpp modes. - - - cyblafb rejects modes known to be broken or unimplemented (all - interlaced modes, all doublescan modes for now) - - - cyblafb now works independant of the video mode in effect at startup - time (tridentfb does not init all needed registers to reasonable - values) - - - switching between video modes does work reliably now - - - the first video mode now is the one selected on startup using the - vga=???? mechanism or any of - - 640x480, 800x600, 1024x768, 1280x1024 - - 8, 16, 24 or 32 bpp - - refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32 - is limited to 63Hz) - - - pci retry and pci burst mode are settable (try to disable if you - experience latency problems) - - - built as a module cyblafb might be unloaded and reloaded using - the vfb module and con2vt or might be used together with vesafb - diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index c19f6feb4e53..db7a4f42edad 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -1597,30 +1597,6 @@ config FB_VT8623 Driver for CastleRock integrated graphics core in the VIA VT8623 [Apollo CLE266] chipset. -config FB_CYBLA - tristate "Cyberblade/i1 support" - depends on FB && PCI && X86_32 && !64BIT - select FB_CFB_IMAGEBLIT - ---help--- - This driver is supposed to support the Trident Cyberblade/i1 - graphics core integrated in the VIA VT8601A North Bridge, - also known as VIA Apollo PLE133. - - Status: - - Developed, tested and working on EPIA 5000 and EPIA 800. - - Does work reliable on all systems with CRT/LCD connected to - normal VGA ports. - - Should work on systems that do use the internal LCD port, but - this is absolutely not tested. - - Character imageblit, copyarea and rectangle fill are hw accelerated, - ypan scrolling is used by default. - - Please do read . - - To compile this driver as a module, choose M here: the - module will be called cyblafb. - config FB_TRIDENT tristate "Trident support" depends on FB && PCI @@ -1633,8 +1609,8 @@ config FB_TRIDENT and Blade XP. There are also integrated versions of these chips called CyberXXXX, CyberImage or CyberBlade. These chips are mostly found in laptops - but also on some motherboards. For more information, read - + but also on some motherboards including early VIA EPIA motherboards. + For more information, read Say Y if you have such a graphics board. diff --git a/drivers/video/cyblafb.c b/drivers/video/cyblafb.c deleted file mode 100644 index 9704b73135f5..000000000000 --- a/drivers/video/cyblafb.c +++ /dev/null @@ -1,1683 +0,0 @@ -/* - * Frame buffer driver for Trident Cyberblade/i1 graphics core - * - * Copyright 2005 Knut Petersen - * - * CREDITS: - * tridentfb.c by Jani Monoses - * see files above for further credits - * - */ - -#define CYBLAFB_DEBUG 0 -#define CYBLAFB_KD_GRAPHICS_QUIRK 1 - -#define CYBLAFB_PIXMAPSIZE 8192 - -#include -#include -#include -#include -#include -#include -#include