diff options
Diffstat (limited to 'Documentation')
37 files changed, 1168 insertions, 394 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss index 0a92a7c93a62..4f29e5f1ebfa 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss @@ -31,3 +31,31 @@ Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: A symbolic link to /sys/block/cciss!cXdY + +Where: /sys/bus/pci/devices/<dev>/ccissX/rescan +Date: August 2009 +Kernel Version: 2.6.31 +Contact: iss_storagedev@hp.com +Description: Kicks of a rescan of the controller to discover logical + drive topology changes. + +Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/lunid +Date: August 2009 +Kernel Version: 2.6.31 +Contact: iss_storagedev@hp.com +Description: Displays the 8-byte LUN ID used to address logical + drive Y of controller X. + +Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/raid_level +Date: August 2009 +Kernel Version: 2.6.31 +Contact: iss_storagedev@hp.com +Description: Displays the RAID level of logical drive Y of + controller X. + +Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/usage_count +Date: August 2009 +Kernel Version: 2.6.31 +Contact: iss_storagedev@hp.com +Description: Displays the usage count (number of opens) of logical drive Y + of controller X. diff --git a/Documentation/ABI/testing/sysfs-class-usb_host b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc index 46b66ad1f1b4..4e8106f7cfd9 100644 --- a/Documentation/ABI/testing/sysfs-class-usb_host +++ b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc @@ -1,4 +1,4 @@ -What: /sys/class/usb_host/usb_hostN/wusb_chid +What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_chid Date: July 2008 KernelVersion: 2.6.27 Contact: David Vrabel <david.vrabel@csr.com> @@ -9,7 +9,7 @@ Description: Set an all zero CHID to stop the host controller. -What: /sys/class/usb_host/usb_hostN/wusb_trust_timeout +What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_trust_timeout Date: July 2008 KernelVersion: 2.6.27 Contact: David Vrabel <david.vrabel@csr.com> diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable deleted file mode 100644 index 175bb4f70512..000000000000 --- a/Documentation/ABI/testing/sysfs-devices-cache_disable +++ /dev/null @@ -1,18 +0,0 @@ -What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X -Date: August 2008 -KernelVersion: 2.6.27 -Contact: mark.langsdorf@amd.com -Description: These files exist in every cpu's cache index directories. - There are currently 2 cache_disable_# files in each - directory. Reading from these files on a supported - processor will return that cache disable index value - for that processor and node. Writing to one of these - files will cause the specificed cache index to be disabled. - - Currently, only AMD Family 10h Processors support cache index - disable, and only for their L3 caches. See the BIOS and - Kernel Developer's Guide at - http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf - for formatting information and other details on the - cache index disable. -Users: joachim.deguara@amd.com diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu new file mode 100644 index 000000000000..a703b9e9aeb9 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -0,0 +1,156 @@ +What: /sys/devices/system/cpu/ +Date: pre-git history +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: + A collection of both global and individual CPU attributes + + Individual CPU attributes are contained in subdirectories + named by the kernel's logical CPU number, e.g.: + + /sys/devices/system/cpu/cpu#/ + +What: /sys/devices/system/cpu/sched_mc_power_savings + /sys/devices/system/cpu/sched_smt_power_savings +Date: June 2006 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: Discover and adjust the kernel's multi-core scheduler support. + + Possible values are: + + 0 - No power saving load balance (default value) + 1 - Fill one thread/core/package first for long running threads + 2 - Also bias task wakeups to semi-idle cpu package for power + savings + + sched_mc_power_savings is dependent upon SCHED_MC, which is + itself architecture dependent. + + sched_smt_power_savings is dependent upon SCHED_SMT, which + is itself architecture dependent. + + The two files are independent of each other. It is possible + that one file may be present without the other. + + Introduced by git commit 5c45bf27. + + +What: /sys/devices/system/cpu/kernel_max + /sys/devices/system/cpu/offline + /sys/devices/system/cpu/online + /sys/devices/system/cpu/possible + /sys/devices/system/cpu/present +Date: December 2008 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: CPU topology files that describe kernel limits related to + hotplug. Briefly: + + kernel_max: the maximum cpu index allowed by the kernel + configuration. + + offline: cpus that are not online because they have been + HOTPLUGGED off or exceed the limit of cpus allowed by the + kernel configuration (kernel_max above). + + online: cpus that are online and being scheduled. + + possible: cpus that have been allocated resources and can be + brought online if they are present. + + present: cpus that have been identified as being present in + the system. + + See Documentation/cputopology.txt for more information. + + + +What: /sys/devices/system/cpu/cpu#/node +Date: October 2009 +Contact: Linux memory management mailing list <linux-mm@kvack.org> +Description: Discover NUMA node a CPU belongs to + + When CONFIG_NUMA is enabled, a symbolic link that points + to the corresponding NUMA node directory. + + For example, the following symlink is created for cpu42 + in NUMA node 2: + + /sys/devices/system/cpu/cpu42/node2 -> ../../node/node2 + + +What: /sys/devices/system/cpu/cpu#/topology/core_id + /sys/devices/system/cpu/cpu#/topology/core_siblings + /sys/devices/system/cpu/cpu#/topology/core_siblings_list + /sys/devices/system/cpu/cpu#/topology/physical_package_id + /sys/devices/system/cpu/cpu#/topology/thread_siblings + /sys/devices/system/cpu/cpu#/topology/thread_siblings_list +Date: December 2008 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: CPU topology files that describe a logical CPU's relationship + to other cores and threads in the same physical package. + + One cpu# directory is created per logical CPU in the system, + e.g. /sys/devices/system/cpu/cpu42/. + + Briefly, the files above are: + + core_id: the CPU core ID of cpu#. Typically it is the + hardware platform's identifier (rather than the kernel's). + The actual value is architecture and platform dependent. + + core_siblings: internal kernel map of cpu#'s hardware threads + within the same physical_package_id. + + core_siblings_list: human-readable list of the logical CPU + numbers within the same physical_package_id as cpu#. + + physical_package_id: physical package id of cpu#. Typically + corresponds to a physical socket number, but the actual value + is architecture and platform dependent. + + thread_siblings: internel kernel map of cpu#'s hardware + threads within the same core as cpu# + + thread_siblings_list: human-readable list of cpu#'s hardware + threads within the same core as cpu# + + See Documentation/cputopology.txt for more information. + + +What: /sys/devices/system/cpu/cpuidle/current_driver + /sys/devices/system/cpu/cpuidle/current_governer_ro +Date: September 2007 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: Discover cpuidle policy and mechanism + + Various CPUs today support multiple idle levels that are + differentiated by varying exit latencies and power + consumption during idle. + + Idle policy (governor) is differentiated from idle mechanism + (driver) + + current_driver: displays current idle mechanism + + current_governor_ro: displays current idle policy + + See files in Documentation/cpuidle/ for more information. + + +What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X +Date: August 2008 +KernelVersion: 2.6.27 +Contact: mark.langsdorf@amd.com +Description: These files exist in every cpu's cache index directories. + There are currently 2 cache_disable_# files in each + directory. Reading from these files on a supported + processor will return that cache disable index value + for that processor and node. Writing to one of these + files will cause the specificed cache index to be disabled. + + Currently, only AMD Family 10h Processors support cache index + disable, and only for their L3 caches. See the BIOS and + Kernel Developer's Guide at + http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf + for formatting information and other details on the + cache index disable. +Users: joachim.deguara@amd.com diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index b7f9d3b4bbf6..72651f788f4e 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -232,7 +232,7 @@ your e-mail client so that it sends your patches untouched. When sending patches to Linus, always follow step #7. Large changes are not appropriate for mailing lists, and some -maintainers. If your patch, uncompressed, exceeds 40 kB in size, +maintainers. If your patch, uncompressed, exceeds 300 kB in size, it is preferred that you store your patch on an Internet-accessible server, and provide instead a URL (link) pointing to your patch. diff --git a/Documentation/arm/tcm.txt b/Documentation/arm/tcm.txt new file mode 100644 index 000000000000..77fd9376e6d7 --- /dev/null +++ b/Documentation/arm/tcm.txt @@ -0,0 +1,147 @@ +ARM TCM (Tightly-Coupled Memory) handling in Linux +---- +Written by Linus Walleij <linus.walleij@stericsson.com> + +Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory). +This is usually just a few (4-64) KiB of RAM inside the ARM +processor. + +Due to being embedded inside the CPU The TCM has a +Harvard-architecture, so there is an ITCM (instruction TCM) +and a DTCM (data TCM). The DTCM can not contain any +instructions, but the ITCM can actually contain data. +The size of DTCM or ITCM is minimum 4KiB so the typical +minimum configuration is 4KiB ITCM and 4KiB DTCM. + +ARM CPU:s have special registers to read out status, physical +location and size of TCM memories. arch/arm/include/asm/cputype.h +defines a CPUID_TCM register that you can read out from the +system control coprocessor. Documentation from ARM can be found +at http://infocenter.arm.com, search for "TCM Status Register" +to see documents for all CPUs. Reading this register you can +determine if ITCM (bit 0) and/or DTCM (bit 16) is present in the +machine. + +There is further a TCM region register (search for "TCM Region +Registers" at the ARM site) that can report and modify the location +size of TCM memories at runtime. This is used to read out and modify +TCM location and size. Notice that this is not a MMU table: you +actually move the physical location of the TCM around. At the +place you put it, it will mask any underlying RAM from the +CPU so it is usually wise not to overlap any physical RAM with +the TCM. + +The TCM memory can then be remapped to another address again using +the MMU, but notice that the TCM if often used in situations where +the MMU is turned off. To avoid confusion the current Linux +implementation will map the TCM 1 to 1 from physical to virtual +memory in the location specified by the machine. + +TCM is used for a few things: + +- FIQ and other interrupt handlers that need deterministic + timing and cannot wait for cache misses. + +- Idle loops where all external RAM is set to self-refresh + retention mode, so only on-chip RAM is accessible by + the CPU and then we hang inside ITCM waiting for an + interrupt. + +- Other operations which implies shutting off or reconfiguring + the external RAM controller. + +There is an interface for using TCM on the ARM architecture +in <asm/tcm.h>. Using this interface it is possible to: + +- Define the physical address and size of ITCM and DTCM. + +- Tag functions to be compiled into ITCM. + +- Tag data and constants to be allocated to DTCM and ITCM. + +- Have the remaining TCM RAM added to a special + allocation pool with gen_pool_create() and gen_pool_add() + and provice tcm_alloc() and tcm_free() for this + memory. Such a heap is great for things like saving + device state when shutting off device power domains. + +A machine that has TCM memory shall select HAVE_TCM in +arch/arm/Kconfig for itself, and then the +rest of the functionality will depend on the physical +location and size of ITCM and DTCM to be defined in +mach/memory.h for the machine. Code that needs to use +TCM shall #include <asm/tcm.h> If the TCM is not located +at the place given in memory.h it will be moved using +the TCM Region registers. + +Functions to go into itcm can be tagged like this: +int __tcmfunc foo(int bar); + +Variables to go into dtcm can be tagged like this: +int __tcmdata foo; + +Constants can be tagged like this: +int __tcmconst foo; + +To put assembler into TCM just use +.section ".tcm.text" or .section ".tcm.data" +respectively. + +Example code: + +#include <asm/tcm.h> + +/* Uninitialized data */ +static u32 __tcmdata tcmvar; +/* Initialized data */ +static u32 __tcmdata tcmassigned = 0x2BADBABEU; +/* Constant */ +static const u32 __tcmconst tcmconst = 0xCAFEBABEU; + +static void __tcmlocalfunc tcm_to_tcm(void) +{ + int i; + for (i = 0; i < 100; i++) + tcmvar ++; +} + +static void __tcmfunc hello_tcm(void) +{ + /* Some abstract code that runs in ITCM */ + int i; + for (i = 0; i < 100; i++) { + tcmvar ++; + } + tcm_to_tcm(); +} + +static void __init test_tcm(void) +{ + u32 *tcmem; + int i; + + hello_tcm(); + printk("Hello TCM executed from ITCM RAM\n"); + + printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar); + tcmvar = 0xDEADBEEFU; + printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar); + + printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned); + + printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst); + + /* Allocate some TCM memory from the pool */ + tcmem = tcm_alloc(20); + if (tcmem) { + printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem); + tcmem[0] = 0xDEADBEEFU; + tcmem[1] = 0x2BADBABEU; + tcmem[2] = 0xCAFEBABEU; + tcmem[3] = 0xDEADBEEFU; + tcmem[4] = 0x2BADBABEU; + for (i = 0; i < 5; i++) + printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]); + tcm_free(tcmem, 20); + } +} diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 455d4e6d346d..0b33bfe7dde9 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -227,7 +227,14 @@ as the path relative to the root of the cgroup file system. Each cgroup is represented by a directory in the cgroup file system containing the following files describing that cgroup: - - tasks: list of tasks (by pid) attached to that cgroup + - tasks: list of tasks (by pid) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread id into this file + moves the thread into this cgroup. + - cgroup.procs: list of tgids in the cgroup. This list is not + guaranteed to be sorted or free of duplicate tgids, and userspace + should sort/uniquify the list if this property is required. + Writing a tgid into this file moves all threads with that tgid into + this cgroup. - notify_on_release flag: run the release agent on exit? - release_agent: the path to use for release notifications (this file exists in the top cgroup only) @@ -374,7 +381,7 @@ Now you want to do something with this cgroup. In this directory you can find several files: # ls -notify_on_release tasks +cgroup.procs notify_on_release tasks (plus whatever files added by the attached subsystems) Now attach your shell to this cgroup: diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index b41f3e58aefa..f1c5c4bccd3e 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -1,15 +1,28 @@ -Export cpu topology info via sysfs. Items (attributes) are similar +Export CPU topology info via sysfs. Items (attributes) are similar to /proc/cpuinfo. 1) /sys/devices/system/cpu/cpuX/topology/physical_package_id: -represent the physical package id of cpu X; + + physical package id of cpuX. Typically corresponds to a physical + socket number, but the actual value is architecture and platform + dependent. + 2) /sys/devices/system/cpu/cpuX/topology/core_id: -represent the cpu core id to cpu X; + + the CPU core ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + 3) /sys/devices/system/cpu/cpuX/topology/thread_siblings: -represent the thread siblings to cpu X in the same core; + + internel kernel map of cpuX's hardware threads within the same + core as cpuX + 4) /sys/devices/system/cpu/cpuX/topology/core_siblings: -represent the thread siblings to cpu X in the same physical package; + + internal kernel map of cpuX's hardware threads within the same + physical_package_id. To implement it in an architecture-neutral way, a new source file, drivers/base/topology.c, is to export the 4 attributes. @@ -32,32 +45,32 @@ not defined by include/asm-XXX/topology.h: 3) thread_siblings: just the given CPU 4) core_siblings: just the given CPU -Additionally, cpu topology information is provided under +Additionally, CPU topology information is provided under /sys/devices/system/cpu and includes these files. The internal source for the output is in brackets ("[]"). - kernel_max: the maximum cpu index allowed by the kernel configuration. + kernel_max: the maximum CPU index allowed by the kernel configuration. [NR_CPUS-1] - offline: cpus that are not online because they have been + offline: CPUs that are not online because they have been HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit - of cpus allowed by the kernel configuration (kernel_max + of CPUs allowed by the kernel configuration (kernel_max above). [~cpu_online_mask + cpus >= NR_CPUS] - online: cpus that are online and being scheduled [cpu_online_mask] + online: CPUs that are online and being scheduled [cpu_online_mask] - possible: cpus that have been allocated resources and can be + possible: CPUs that have been allocated resources and can be brought online if they are present. [cpu_possible_mask] - present: cpus that have been identified as being present in the + present: CPUs that have been identified as being present in the system. [cpu_present_mask] The format for the above output is compatible with cpulist_parse() [see <linux/cpumask.h>]. Some examples follow. -In this example, there are 64 cpus in the system but cpus 32-63 exceed +In this example, there are 64 CPUs in the system but cpus 32-63 exceed the kernel max which is limited to 0..31 by the NR_CPUS config option -being 32. Note also that cpus 2 and 4-31 are not online but could be +being 32. Note also that CPUs 2 and 4-31 are not online but could be brought online as they are both present and possible. kernel_max: 31 @@ -67,8 +80,8 @@ brought online as they are both present and possible. present: 0-31 In this example, the NR_CPUS config option is 128, but the kernel was -started with possible_cpus=144. There are 4 cpus in the system and cpu2 -was manually taken offline (and is the only cpu that can be brought +started with possible_cpus=144. There are 4 CPUs in the system and cpu2 +was manually taken offline (and is the only CPU that can be brought online.) kernel_max: 127 @@ -78,4 +91,4 @@ online.) present: 0-3 See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter -as well as more information on the various cpumask's. +as well as more information on the various cpumasks. diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt index 59a91e5c6909..611f5a5499b1 100644 --- a/Documentation/debugging-via-ohci1394.txt +++ b/Documentation/debugging-via-ohci1394.txt @@ -64,14 +64,14 @@ be used to view the printk buffer of a remote machine, even with live update. Bernhard Kaindl enhanced firescope to support accessing 64-bit machines from 32-bit firescope and vice versa: -- ftp://ftp.suse.de/private/bk/firewire/tools/firescope-0.2.2.tar.bz2 +- http://halobates.de/firewire/firescope-0.2.2.tar.bz2 and he implemented fast system dump (alpha version - read README.txt): -- ftp://ftp.suse.de/private/bk/firewire/tools/firedump-0.1.tar.bz2 +- http://halobates.de/firewire/firedump-0.1.tar.bz2 There is also a gdb proxy for firewire which allows to use gdb to access data which can be referenced from symbols found by gdb in vmlinux: -- ftp://ftp.suse.de/private/bk/firewire/tools/fireproxy-0.33.tar.bz2 +- http://halobates.de/firewire/fireproxy-0.33.tar.bz2 The latest version of this gdb proxy (fireproxy-0.34) can communicate (not yet stable) with kgdb over an memory-based communication module (kgdbom). @@ -178,7 +178,7 @@ Step-by-step instructions for using firescope with early OHCI initialization: Notes ----- -Documentation and specifications: ftp://ftp.suse.de/private/bk/firewire/docs +Documentation and specifications: http://halobates.de/firewire/ FireWire is a trademark of Apple Inc. - for more information please refer to: http://en.wikipedia.org/wiki/FireWire diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 89a47b5aff07..bc693fffabe0 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -418,6 +418,14 @@ When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg <johannes@sipsolutions.net> +--------------------------- + +What: CONFIG_INOTIFY +When: 2.6.33 +Why: last user (audit) will be converted to the newer more generic + and more easily maintained fsnotify subsystem +Who: Eric Paris <eparis@redhat.com> + ---------------------------- What: lock_policy_rwsem_* and unlock_policy_rwsem_* will not be @@ -451,3 +459,33 @@ Why: OSS sound_core grabs all legacy minors (0-255) of SOUND_MAJOR will also allow making ALSA OSS emulation independent of sound_core. The dependency will be broken then too. Who: Tejun Heo <tj@kernel.org> + +---------------------------- + +What: Support for VMware's guest paravirtuliazation technique [VMI] will be + dropped. +When: 2.6.37 or earlier. +Why: With the recent innovations in CPU hardware acceleration technologies + from Intel and AMD, VMware ran a few experiments to compare these + techniques to guest paravirtualization technique on VMware's platform. + These hardware assisted virtualization techniques have outperformed the + performance benefits provided by VMI in most of the workloads. VMware + expects that these hardware features will be ubiquitous in a couple of + years, as a result, VMware has started a phased retirement of this + feature from the hypervisor. We will be removing this feature from the + Kernel too. Right now we are targeting 2.6.37 but can retire earlier if + technical reasons (read opportunity to remove major chunk of pvops) + arise. + + Please note that VMI has always been an optimization and non-VMI kernels + still work fine on VMware's platform. + Latest versions of VMware's product which support VMI are, + Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence + releases for these products will continue supporting VMI. + + For more details about VMI retirement take a look at this, + http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html + +Who: Alok N Kataria <akataria@vmware.com> + +---------------------------- diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 570f9bd9be2b..05d5cf1d743f 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -123,10 +123,18 @@ resuid=n The user ID which may use the reserved blocks. sb=n Use alternate superblock at this location. -quota -noquota -grpquota -usrquota +quota These options are ignored by the filesystem. They +noquota are used only by quota tools to recognize volumes +grpquota where quota should be turned on. See documentation +usrquota in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). + +jqfmt=<quota type> These options tell filesystem details about quota +usrjquota=<file> so that quota information can be properly updated +grpjquota=<file> during journal replay. They replace the above + quota options. See documentation in the quota-tools + package for more details + (http://sourceforge.net/projects/linuxquota). bh (*) ext3 associates buffer heads to data pages to nobh (a) cache disk block mapping information diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 18b5ec8cea45..6d94e0696f8c 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -134,9 +134,15 @@ ro Mount filesystem read only. Note that ext4 will mount options "ro,noload" can be used to prevent writes to the filesystem. +journal_checksum Enable checksumming of the journal transactions. + This will allow the recovery code in e2fsck and the + kernel to detect corruption in the kernel. It is a + compatible change and will be ignored by older kernels. + journal_async_commit Commit block can be written to disk without waiting for descriptor blocks. If enabled older kernels cannot - mount the device. + mount the device. This will enable 'journal_checksum' + internally. journal=update Update the ext4 file system's journal to the current format. @@ -282,9 +288,16 @@ stripe=n Number of filesystem blocks that mballoc will try to use for allocation size and alignment. For RAID5/6 systems this should be the number of data disks * RAID chunk size in file system blocks. -delalloc (*) Deferring block allocation until write-out time. -nodelalloc Disable delayed allocation. Blocks are allocation - when data is copied from user to page cache. + +delalloc (*) Defer block allocation until just before ext4 + writes out the block(s) in question. This + allows ext4 to better allocation decisions + more efficiently. +nodelalloc Disable delayed allocation. Blocks are allocated + when the data is copied from userspace to the + page cache, either via the write(2) system call + or when an mmap'ed page which was previously + unallocated is written for the first time. max_batch_time=usec Maximum amount of time ext4 should wait for additional filesystem operations to be batch diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b5aee7838a00..2c48f945546b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname> .............................................................................. File Content mb_groups details of multiblock allocator buddy cache of free blocks - mb_history multiblock allocation history .............................................................................. diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index b58b84b50fa2..eed520fd0c8e 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -102,7 +102,7 @@ shortname=lower|win95|winnt|mixed winnt: emulate the Windows NT rule for display/create. mixed: emulate the Windows NT rule for display, emulate the Windows 95 rule for create. - Default setting is `lower'. + Default setting is `mixed'. tz=UTC -- Interpret timestamps as UTC rather than local time. This option disables the conversion of timestamps diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt index 84eb26808dee..cb8a3a00cc92 100644 --- a/Documentation/flexible-arrays.txt +++ b/Documentation/flexible-arrays.txt @@ -1,5 +1,5 @@ Using flexible arrays in the kernel -Last updated for 2.6.31 +Last updated for 2.6.32 Jonathan Corbet <corbet@lwn.net> Large contiguous memory allocations can be unreliable in the Linux kernel. @@ -40,6 +40,13 @@ argument is passed directly to the internal memory allocation calls. With the current code, using flags to ask for high memory is likely to lead to notably unpleasant side effects. +It is also possible to define flexible arrays at compile time with: + + DEFINE_FLEX_ARRAY(name, element_size, total); + +This macro will result in a definition of an array with the given name; the +element size and total will be checked for validity at compile time. + Storing data into a flexible array is accomplished with a call to: int flex_array_put(struct flex_array *array, unsigned int element_nr, @@ -76,16 +83,30 @@ particular element has never been allocated. Note that it is possible to get back a valid pointer for an element which has never been stored in the array. Memory for array elements is allocated one page at a time; a single allocation could provide memory for several -adjacent elements. The flexible array code does not know if a specific -element has been written; it only knows if the associated memory is -present. So a flex_array_get() call on an element which was never stored -in the array has the potential to return a pointer to random data. If the -caller does not have a separate way to know which elements were actually -stored, it might be wise, at least, to add GFP_ZERO to the flags argument -to ensure that all elements are zeroed. - -There is no way to remove a single element from the array. It is possible, -though, to remove all elements with a call to: +adjacent elements. Flexible array elements are normally initialized to the +value FLEX_ARRAY_FREE (defined as 0x6c in <linux/poison.h>), so errors +involving that number probably result from use of unstored array entries. +Note that, if array elements are allocated with __GFP_ZERO, they will be +initialized to zero and this poisoning will not happen. + +Individual elements in the array can be cleared with: + + int flex_array_clear(struct flex_array *array, unsigned int element_nr); + +This function will set the given element to FLEX_ARRAY_FREE and return +zero. If storage for the indicated element is not allocated for the array, +flex_array_clear() will return -EINVAL instead. Note that clearing an +element does not release the storage associated with it; to reduce the +allocated size of an array, call: + + int flex_array_shrink(struct flex_array *array); + +The return value will be the number of pages of memory actually freed. +This function works by scanning the array for pages containing nothing but +FLEX_ARRAY_FREE bytes, so (1) it can be expensive, and (2) it will not work +if the array's pages are allocated with __GFP_ZERO. + +It is possible to remove all elements of an array with a call to: void flex_array_free_parts(struct flex_array *array); diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215 index 2e6a21eb656c..c196a1846259 100644 --- a/Documentation/hwmon/ltc4215 +++ b/Documentation/hwmon/ltc4215 @@ -22,12 +22,13 @@ Usage Notes ----------- This driver does not probe for LTC4215 devices, due to the fact that some -of the possible addresses are unfriendly to probing. You will need to use -the "force" parameter to tell the driver where to find the device. +of the possible addresses are unfriendly to probing. You will have to +instantiate the devices explicitly. Example: the following will load the driver for an LTC4215 at address 0x44 on I2C bus #0: -$ modprobe ltc4215 force=0,0x44 +$ modprobe ltc4215 +$ echo ltc4215 0x44 > /sys/bus/i2c/devices/i2c-0/new_device Sysfs entries diff --git a/Documentation/hwmon/ltc4245 b/Documentation/hwmon/ltc4245 index bae7a3adc5d8..02838a47d862 100644 --- a/Documentation/hwmon/ltc4245 +++ b/Documentation/hwmon/ltc4245 @@ -23,12 +23,13 @@ Usage Notes ----------- This driver does not probe for LTC4245 devices, due to the fact that some -of the possible addresses are unfriendly to probing. You will need to use -the "force" parameter to tell the driver where to find the device. +of the possible addresses are unfriendly to probing. You will have to +instantiate the devices explicitly. Example: the following will load the driver for an LTC4245 at address 0x23 on I2C bus #1: -$ modprobe ltc4245 force=1,0x23 +$ modprobe ltc4245 +$ echo ltc4245 0x23 > /sys/bus/i2c/devices/i2c-1/new_device Sysfs entries diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface index dcbd502c8792..82def883361b 100644 --- a/Documentation/hwmon/sysfs-interface +++ b/Documentation/hwmon/sysfs-interface @@ -353,10 +353,20 @@ power[1-*]_average Average power use Unit: microWatt RO -power[1-*]_average_interval Power use averaging interval +power[1-*]_average_interval Power use averaging interval. A poll + notification is sent to this file if the + hardware changes the averaging interval. Unit: milliseconds RW +power[1-*]_average_interval_max Maximum power use averaging interval + Unit: milliseconds + RO + +power[1-*]_average_interval_min Minimum power use averaging interval + Unit: milliseconds + RO + power[1-*]_average_highest Historical average maximum power use Unit: microWatt RO @@ -365,6 +375,18 @@ power[1-*]_average_lowest Historical average minimum power use Unit: microWatt RO +power[1-*]_average_max A poll notification is sent to + power[1-*]_average when power use + rises above this value. + Unit: microWatt + RW + +power[1-*]_average_min A poll notification is sent to + power[1-*]_average when power use + sinks below this value. + Unit: microWatt + RW + power[1-*]_input Instantaneous power use Unit: microWatt RO @@ -381,6 +403,39 @@ power[1-*]_reset_history Reset input_highest, input_lowest, average_highest and average_lowest. WO +power[1-*]_accuracy Accuracy of the power meter. + Unit: Percent + RO + +power[1-*]_alarm 1 if the system is drawing more power than the + cap allows; 0 otherwise. A poll notification is + sent to this file when the power use exceeds the + cap. This file only appears if the cap is known + to be enforced by hardware. + RO + +power[1-*]_cap If power use rises above this limit, the + system should take action to reduce power use. + A poll notification is sent to this file if the + cap is changed by the hardware. The *_cap + files only appear if the cap is known to be + enforced by hardware. + Unit: microWatt + RW + +power[1-*]_cap_hyst Margin of hysteresis built around capping and + notification. + Unit: microWatt + RW + +power[1-*]_cap_max Maximum cap that can be set. + Unit: microWatt + RO + +power[1-*]_cap_min Minimum cap that can be set. + Unit: microWatt + RO + ********** * Energy * ********** diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4 index c5b37c570554..ac540c71c7eb 100644 --- a/Documentation/i2c/busses/i2c-piix4 +++ b/Documentation/i2c/busses/i2c-piix4 @@ -8,7 +8,7 @@ Supported adapters: Datasheet: Only available via NDA from ServerWorks * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges Datasheet: Not publicly available - * AMD SB900 + * AMD Hudson-2 Datasheet: Not publicly available * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge Datasheet: Publicly available at the SMSC website http://www.smsc.com diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices index c740b7b41088..e89490270aba 100644 --- a/Documentation/i2c/instantiating-devices +++ b/Documentation/i2c/instantiating-devices @@ -188,7 +188,7 @@ segment, the address is sufficient to uniquely identify the device to be deleted. Example: -# echo eeprom 0x50 > /sys/class/i2c-adapter/i2c-3/new_device +# echo eeprom 0x50 > /sys/bus/i2c/devices/i2c-3/new_device While this interface should only be used when in-kernel device declaration can't be done, there is a variety of cases where it can be helpful: diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt index 744687dd195b..8a366959f5cc 100644 --- a/Documentation/infiniband/user_mad.txt +++ b/Documentation/infiniband/user_mad.txt @@ -128,8 +128,8 @@ Setting IsSM Capability Bit To create the appropriate character device files automatically with udev, a rule like - KERNEL="umad*", NAME="infiniband/%k" - KERNEL="issm*", NAME="infiniband/%k" + KERNEL=="umad*", NAME="infiniband/%k" + KERNEL=="issm*", NAME="infiniband/%k" can be used. This will create device nodes named diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt index f847501e50b5..afe3f8da9018 100644 --- a/Documentation/infiniband/user_verbs.txt +++ b/Documentation/infiniband/user_verbs.txt @@ -58,7 +58,7 @@ Memory pinning To create the appropriate character device files automatically with udev, a rule like - KERNEL="uverbs*", NAME="infiniband/%k" + KERNEL=="uverbs*", NAME="infiniband/%k" can be used. This will create device nodes named diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6fa7292947e5..9107b387e91f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -671,6 +671,7 @@ and is between 256 and 4096 characters. It is defined in the file earlyprintk= [X86,SH,BLACKFIN] earlyprintk=vga earlyprintk=serial[,ttySn[,baudrate]] + earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] Append ",keep" to not disable it when the real console diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 6d03487ef1c7..aafcaa634191 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt @@ -199,18 +199,22 @@ kind to allow it (and it often doesn't!). Not all bits in the mask can be modified. Not all bits that can be modified do anything. Not all hot keys can be individually controlled -by the mask. Some models do not support the mask at all, and in those -models, hot keys cannot be controlled individually. The behaviour of -the mask is, therefore, highly dependent on the ThinkPad model. +by the mask. Some models do not support the mask at all. The behaviour +of the mask is, therefore, highly dependent on the ThinkPad model. + +The driver will filter out any unmasked hotkeys, so even if the firmware +doesn't allow disabling an specific hotkey, the driver will not report +events for unmasked hotkeys. Note that unmasking some keys prevents their default behavior. For example, if Fn+F5 is unmasked, that key will no longer enable/disable -Bluetooth by itself. +Bluetooth by itself in firmware. -Note also that not all Fn key combinations are supported through ACPI. -For example, on the X40, the brightness, volume and "Access IBM" buttons -do not generate ACPI events even with this driver. They *can* be used -through the "ThinkPad Buttons" utility, see http://www.nongnu.org/tpb/ +Note also that not all Fn key combinations are supported through ACPI +depending on the ThinkPad model and firmware version. On those +ThinkPads, it is still possible to support some extra hotkeys by +polling the "CMOS NVRAM" at least 10 times per second. The driver +attempts to enables this functionality automatically when required. procfs notes: @@ -255,18 +259,11 @@ sysfs notes: 1: does nothing hotkey_mask: - bit mask to enable driver-handling (and depending on + bit mask to enable reporting (and depending on the firmware, ACPI event generation) for each hot key (see above). Returns the current status of the hot keys mask, and allows one to modify it. - Note: when NVRAM polling is active, the firmware mask - will be different from the value returned by - hotkey_mask. The driver will retain enabled bits for - hotkeys that are under NVRAM polling even if the - firmware refuses them, and will not set these bits on - the firmware hot key mask. - hotkey_all_mask: bit mask that should enable event reporting for all supported hot keys, when echoed to hotkey_mask above. @@ -279,7 +276,8 @@ sysfs notes: bit mask that should enable event reporting for all supported hot keys, except those which are always handled by the firmware anyway. Echo it to - hotkey_mask above, to use. + hotkey_mask above, to use. This is the default mask + used by the driver. hotkey_source_mask: bit mask that selects which hot keys will the driver @@ -287,9 +285,10 @@ sysfs notes: based on the capabilities reported by the ACPI firmware, but it can be overridden at runtime. - Hot keys whose bits are set in both hotkey_source_mask - and also on hotkey_mask are polled for in NVRAM. Only a - few hot keys are available through CMOS NVRAM polling. + Hot keys whose bits are set in hotkey_source_mask are + polled for in NVRAM, and reported as hotkey events if + enabled in hotkey_mask. Only a few hot keys are + available through CMOS NVRAM polling. Warning: when in NVRAM mode, the volume up/down/mute keys are synthesized according to changes in the mixer, @@ -525,6 +524,7 @@ compatibility purposes when hotkey_report_mode is set to 1. 0x2305 System is waking up from suspend to eject bay 0x2404 System is waking up from hibernation to undock 0x2405 System is waking up from hibernation to eject bay +0x5010 Brightness level changed/control event The above events are never propagated by the driver. @@ -532,7 +532,6 @@ The above events are never propagated by the driver. 0x4003 Undocked (see 0x2x04), can sleep again 0x500B Tablet pen inserted into its storage bay 0x500C Tablet pen removed from its storage bay -0x5010 Brightness level changed (newer Lenovo BIOSes) The above events are propagated by the driver. @@ -621,6 +620,8 @@ For Lenovo models *with* ACPI backlight control: 2. Do *NOT* load up ACPI video, enable the hotkeys in thinkpad-acpi, and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process these keys on userspace somehow (e.g. by calling xbacklight). + The driver will do this automatically if it detects that ACPI video + has been disabled. Bluetooth @@ -1459,3 +1460,8 @@ Sysfs interface changelog: 0x020400: Marker for 16 LEDs support. Also, LEDs that are known to not exist in a given model are not registered with the LED sysfs class anymore. + +0x020500: Updated hotkey driver, hotkey_mask is always available + and it is always able to disable hot keys. Very old + thinkpads are properly supported. hotkey_bios_mask + is deprecated and marked for removal. diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index ba9373f82ab5..098de5bce00a 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -42,7 +42,6 @@ #include <signal.h> #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" -#include <linux/virtio_ids.h> #include "linux/virtio_net.h" #include "linux/virtio_blk.h" #include "linux/virtio_console.h" diff --git a/Documentation/i2c/chips/eeprom b/Documentation/misc-devices/eeprom index f7e8104b5764..f7e8104b5764 100644 --- a/Documentation/i2c/chips/eeprom +++ b/Documentation/misc-devices/eeprom diff --git a/Documentation/i2c/chips/max6875 b/Documentation/misc-devices/max6875 index 10ca43cd1a72..1e89ee3ccc1b 100644 --- a/Documentation/i2c/chips/max6875 +++ b/Documentation/misc-devices/max6875 @@ -42,10 +42,12 @@ General Remarks Valid addresses for the MAX6875 are 0x50 and 0x52. Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56. -The driver does not probe any address, so you must force the address. +The driver does not probe any address, so you explicitly instantiate the +devices. Example: -$ modprobe max6875 force=0,0x50 +$ modprobe max6875 +$ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple addresses. For example, for address 0x50, it also reserves 0x51. diff --git a/Documentation/scsi/hptiop.txt b/Documentation/scsi/hptiop.txt index a6eb4add1be6..9605179711f4 100644 --- a/Documentation/scsi/hptiop.txt +++ b/Documentation/scsi/hptiop.txt @@ -3,6 +3,25 @@ HIGHPOINT ROCKETRAID 3xxx/4xxx ADAPTER DRIVER (hptiop) Controller Register Map ------------------------- +For RR44xx Intel IOP based adapters, the controller IOP is accessed via PCI BAR0 and BAR2: + + BAR0 offset Register + 0x11C5C Link Interface IRQ Set + 0x11C60 Link Interface IRQ Clear + + BAR2 offset Register + 0x10 Inbound Message Register 0 + 0x14 Inbound Message Register 1 + 0x18 Outbound Message Register 0 + 0x1C Outbound Message Register 1 + 0x20 Inbound Doorbell Register + 0x24 Inbound Interrupt Status Register + 0x28 Inbound Interrupt Mask Register + 0x30 Outbound Interrupt Status Register + 0x34 Outbound Interrupt Mask Register + 0x40 Inbound Queue Port + 0x44 Outbound Queue Port + For Intel IOP based adapters, the controller IOP is accessed via PCI BAR0: BAR0 offset Register @@ -93,7 +112,7 @@ The driver exposes following sysfs attributes: ----------------------------------------------------------------------------- -Copyright (C) 2006-2007 HighPoint Technologies, Inc. All Rights Reserved. +Copyright (C) 2006-2009 HighPoint Technologies, Inc. All Rights Reserved. This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 1c8eb4518ce0..fd9a2f67edf2 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -522,7 +522,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. pcm_devs - Number of PCM devices assigned to each card (default = 1, up to 4) pcm_substreams - Number of PCM substreams assigned to each PCM - (default = 8, up to 16) + (default = 8, up to 128) hrtimer - Use hrtimer (=1, default) or system timer (=0) fake_buffer - Fake buffer allocations (default = 1) diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index f1708b79f963..4c7f9aee5c4e 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt @@ -209,6 +209,7 @@ AD1884A / AD1883 / AD1984A / AD1984B laptop laptop with HP jack sensing mobile mobile devices with HP jack sensing thinkpad Lenovo Thinkpad X300 + touchsmart HP Touchsmart AD1884 ====== @@ -358,6 +359,7 @@ STAC9227/9228/9229/927x 5stack-no-fp D965 5stack without front panel dell-3stack Dell Dimension E520 dell-bios Fixes with Dell BIOS setup + volknob Fixes with volume-knob widget 0x24 auto BIOS setup (default) STAC92HD71B* diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index 70d68ce8640a..a87dc277a5ca 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -1,5 +1,5 @@ Generic Thermal Sysfs driver How To -========================= +=================================== Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com> @@ -10,20 +10,20 @@ Copyright (c) 2008 Intel Corporation 0. Introduction -The generic thermal sysfs provides a set of interfaces for thermal zone devices (sensors) -and thermal cooling devices (fan, processor...) to register with the thermal management -solution and to be a part of it. +The generic thermal sysfs provides a set of interfaces for thermal zone +devices (sensors) and thermal cooling devices (fan, processor...) to register +with the thermal management solution and to be a part of it. -This how-to focuses on enabling new thermal zone and cooling devices to participate -in thermal management. -This solution is platform independent and any type of thermal zone devices and -cooling devices should be able to make use of the infrastructure. +This how-to focuses on enabling new thermal zone and cooling devices to +participate in thermal management. +This solution is platform independent and any type of thermal zone devices +and cooling devices should be able to make use of the infrastructure. -The main task of the thermal sysfs driver is to expose thermal zone attributes as well -as cooling device attributes to the user space. -An intelligent thermal management application can make decisions based on inputs -from thermal zone attributes (the current temperature and trip point temperature) -and throttle appropriate devices. +The main task of the thermal sysfs driver is to expose thermal zone attributes +as well as cooling device attributes to the user space. +An intelligent thermal management application can make decisions based on +inputs from thermal zone attributes (the current temperature and trip point +temperature) and throttle appropriate devices. [0-*] denotes any positive number starting from 0 [1-*] denotes any positive number starting from 1 @@ -31,77 +31,77 @@ and throttle appropriate devices. 1. thermal sysfs driver interface functions 1.1 thermal zone device interface -1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *name, int trips, - void *devdata, struct thermal_zone_device_ops *ops) - - This interface function adds a new thermal zone device (sensor) to - /sys/class/thermal folder as thermal_zone[0-*]. - It tries to bind all the thermal cooling devices registered at the same time. - - name: the thermal zone name. - trips: the total number of trip points this thermal zone supports. - devdata: device private data - ops: thermal zone device call-backs. - .bind: bind the thermal zone device with a thermal cooling device. - .unbind: unbind the thermal zone device with a thermal cooling device. - .get_temp: get the current temperature of the thermal zone. - .get_mode: get the current mode (user/kernel) of the thermal zone. - "kernel" means thermal management is done in kernel. - "user" will prevent kernel thermal driver actions upon trip points - so that user applications can take charge of thermal management. - .set_mode: set the mode (user/kernel) of the thermal zone. - .get_trip_type: get the type of certain trip point. - .get_trip_temp: get the temperature above which the certain trip point - will be fired. +1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *name, + int trips, void *devdata, struct thermal_zone_device_ops *ops) + + This interface function adds a new thermal zone device (sensor) to + /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the + thermal cooling devices registered at the same time. + + name: the thermal zone name. + trips: the total number of trip points this thermal zone supports. + devdata: device private data + ops: thermal zone device call-backs. + .bind: bind the thermal zone device with a thermal cooling device. + .unbind: unbind the thermal zone device with a thermal cooling device. + .get_temp: get the current temperature of the thermal zone. + .get_mode: get the current mode (user/kernel) of the thermal zone. + - "kernel" means thermal management is done in kernel. + - "user" will prevent kernel thermal driver actions upon trip points + so that user applications can take charge of thermal management. + .set_mode: set the mode (user/kernel) of the thermal zone. + .get_trip_type: get the type of certain trip point. + .get_trip_temp: get the temperature above which the certain trip point + will be fired. 1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz) - This interface function removes the thermal zone device. - It deletes the corresponding entry form /sys/class/thermal folder and unbind all - the thermal cooling devices it uses. + This interface function removes the thermal zone device. + It deletes the corresponding entry form /sys/class/thermal folder and + unbind all the thermal cooling devices it uses. 1.2 thermal cooling device interface 1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name, - void *devdata, struct thermal_cooling_device_ops *) - - This interface function adds a new thermal cooling device (fan/processor/...) to - /sys/class/thermal/ folder as cooling_device[0-*]. - It tries to bind itself to all the thermal zone devices register at the same time. - name: the cooling device name. - devdata: device private data. - ops: thermal cooling devices call-backs. - .get_max_state: get the Maximum throttle state of the cooling device. - .get_cur_state: get the Current throttle state of the cooling device. - .set_cur_state: set the Current throttle state of the cooling device. + void *devdata, struct thermal_cooling_device_ops *) + + This interface function adds a new thermal cooling device (fan/processor/...) + to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself + to all the thermal zone devices register at the same time. + name: the cooling device name. + devdata: device private data. + ops: thermal cooling devices call-backs. + .get_max_state: get the Maximum throttle state of the cooling device. + .get_cur_state: get the Current throttle state of the cooling device. + .set_cur_state: set the Current throttle state of the cooling device. 1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) - This interface function remove the thermal cooling device. - It deletes the corresponding entry form /sys/class/thermal folder and unbind - itself from all the thermal zone devices using it. + This interface function remove the thermal cooling device. + It deletes the corresponding entry form /sys/class/thermal folder and + unbind itself from all the thermal zone devices using it. 1.3 interface for binding a thermal zone device with a thermal cooling device 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, - int trip, struct thermal_cooling_device *cdev); + int trip, struct thermal_cooling_device *cdev); - This interface function bind a thermal cooling device to the certain trip point - of a thermal zone device. - This function is usually called in the thermal zone device .bind callback. - tz: the thermal zone device - cdev: thermal cooling device - trip: indicates which trip point the cooling devices is associated with - in this thermal zone. + This interface function bind a thermal cooling device to the certain trip + point of a thermal zone device. + This function is usually called in the thermal zone device .bind callback. + tz: the thermal zone device + cdev: thermal cooling device + trip: indicates which trip point the cooling devices is associated with + in this thermal zone. 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, - int trip, struct thermal_cooling_device *cdev); + int trip, struct thermal_cooling_device *cdev); - This interface function unbind a thermal cooling device from the certain trip point - of a thermal zone device. - This function is usually called in the thermal zone device .unbind callback. - tz: the thermal zone device - cdev: thermal cooling device - trip: indicates which trip point the cooling devices is associated with - in this thermal zone. + This interface function unbind a thermal cooling device from the certain + trip point of a thermal zone device. This function is usually called in + the thermal zone device .unbind callback. + tz: the thermal zone device + cdev: thermal cooling device + trip: indicates which trip point the cooling devices is associated with + in this thermal zone. 2. sysfs attributes structure @@ -114,153 +114,166 @@ if hwmon is compiled in or built as a module. Thermal zone device sys I/F, created once it's registered: /sys/class/thermal/thermal_zone[0-*]: - |-----type: Type of the thermal zone - |-----temp: Current temperature - |-----mode: Working mode of the thermal zone - |-----trip_point_[0-*]_temp: Trip point temperature - |-----trip_point_[0-*]_type: Trip point type + |---type: Type of the thermal zone + |---temp: Current temperature + |---mode: Working mode of the thermal zone + |---trip_point_[0-*]_temp: Trip point temperature + |---trip_point_[0-*]_type: Trip point type Thermal cooling device sys I/F, created once it's registered: /sys/class/thermal/cooling_device[0-*]: - |-----type : Type of the cooling device(processor/fan/...) - |-----max_state: Maximum cooling state of the cooling device - |-----cur_state: Current cooling state of the cooling device + |---type: Type of the cooling device(processor/fan/...) + |---max_state: Maximum cooling state of the cooling device + |---cur_state: Current cooling state of the cooling device -These two dynamic attributes are created/removed in pairs. -They represent the relationship between a thermal zone and its associated cooling device. -They are created/removed for each -thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device successful execution. +Then next two dynamic attributes are created/removed in pairs. They represent +the relationship between a thermal zone and its associated cooling device. +They are created/removed for each successful execution of +thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device. -/sys/class/thermal/thermal_zone[0-*] - |-----cdev[0-*]: The [0-*]th cooling device in the current thermal zone - |-----cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with +/sys/class/thermal/thermal_zone[0-*]: + |---cdev[0-*]: [0-*]th cooling device in current thermal zone + |---cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with Besides the thermal zone device sysfs I/F and cooling device sysfs I/F, -the generic thermal driver also creates a hwmon sysfs I/F for each _type_ of -thermal zone device. E.g. the generic thermal driver registers one hwmon class device -and build the associated hwmon sysfs I/F for all the registered ACPI thermal zones. +the generic thermal driver also creates a hwmon sysfs I/F for each _type_ +of thermal zone device. E.g. the generic thermal driver registers one hwmon +class device and build the associated hwmon sysfs I/F for all the registered +ACPI thermal zones. + /sys/class/hwmon/hwmon[0-*]: - |-----name: The type of the thermal zone devices. - |-----temp[1-*]_input: The current temperature of thermal zone [1-*]. - |-----temp[1-*]_critical: The critical trip point of thermal zone [1-*]. + |---name: The type of the thermal zone devices + |---temp[1-*]_input: The current temperature of thermal zone [1-*] + |---temp[1-*]_critical: The critical trip point of thermal zone [1-*] + Please read Documentation/hwmon/sysfs-interface for additional information. *************************** * Thermal zone attributes * *************************** -type Strings which represent the thermal zone type. - This is given by thermal zone driver as part of registration. - Eg: "acpitz" indicates it's an ACPI thermal device. - In order to keep it consistent with hwmon sys attribute, - this should be a short, lowercase string, - not containing spaces nor dashes. - RO - Required - -temp Current temperature as reported by thermal zone (sensor) - Unit: millidegree Celsius - RO - Required - -mode One of the predefined values in [kernel, user] - This file gives information about the algorithm - that is currently managing the thermal zone. - It can be either default kernel based algorithm - or user space application. - RW - Optional - kernel = Thermal management in kernel thermal zone driver. - user = Preventing kernel thermal zone driver actions upon - trip points so that user application can take full - charge of the thermal management. - -trip_point_[0-*]_temp The temperature above which trip point will be fired - Unit: millidegree Celsius - RO - Optional - -trip_point_[0-*]_type Strings which indicate the type of the trip point - E.g. it can be one of critical, hot, passive, - active[0-*] for ACPI thermal zone. - RO - Optional - -cdev[0-*] Sysfs link to the thermal cooling device node where the sys I/F - for cooling device throttling control represents. - RO - Optional - -cdev[0-*]_trip_point The trip point with which cdev[0-*] is associated in this thermal zone - -1 means the cooling device is not associated with any trip point. - RO - Optional - -****************************** -* Cooling device attributes * -****************************** - -type String which represents the type of device - eg: For generic ACPI: this should be "Fan", - "Processor" or "LCD" - eg. For memory controller device on intel_menlow platform: - this should be "Memory controller" - RO - Required - -max_state The maximum permissible cooling state of this cooling device. - RO - Required - -cur_state The current cooling state of this cooling device. - the value can any integer numbers between 0 and max_state, - cur_state == 0 means no cooling - cur_state == max_state means the maximum cooling. - RW - Required +type + Strings which represent the thermal zone type. + This is given by thermal zone driver as part of registration. + E.g: "acpitz" indicates it's an ACPI thermal device. + In order to keep it consistent with hwmon sys attribute; this should + be a short, lowercase string, not containing spaces nor dashes. + RO, Required + +temp + Current temperature as reported by thermal zone (sensor). + Unit: millidegree Celsius + RO, Required + +mode + One of the predefined values in [kernel, user]. + This file gives information about the algorithm that is currently + managing the thermal zone. It can be either default kernel based + algorithm or user space application. + kernel = Thermal management in kernel thermal zone driver. + user = Preventing kernel thermal zone driver actions upon + trip points so that user application can take full + charge of the thermal management. + RW, Optional + +trip_point_[0-*]_temp + The temperature above which trip point will be fired. + Unit: millidegree Celsius + RO, Optional + +trip_point_[0-*]_type + Strings which indicate the type of the trip point. + E.g. it can be one of critical, hot, passive, active[0-*] for ACPI + thermal zone. + RO, Optional + +cdev[0-*] + Sysfs link to the thermal cooling device node where the sys I/F + for cooling device throttling control represents. + RO, Optional + +cdev[0-*]_trip_point + The trip point with which cdev[0-*] is associated in this thermal + zone; -1 means the cooling device is not associated with any trip + point. + RO, Optional + +passive + Attribute is only present for zones in which the passive cooling + policy is not supported by native thermal driver. Default is zero + and can be set to a temperature (in millidegrees) to enable a + passive trip point for the zone. Activation is done by polling with + an interval of 1 second. + Unit: millidegrees Celsius + RW, Optional + +***************************** +* Cooling device attributes * +***************************** + +type + String which represents the type of device, e.g: + - for generic ACPI: should be "Fan", "Processor" or "LCD" + - for memory controller device on intel_menlow platform: + should be "Memory controller". + RO, Required + +max_state + The maximum permissible cooling state of this cooling device. + RO, Required + +cur_state + The current cooling state of this cooling device. + The value can any integer numbers between 0 and max_state: + - cur_state == 0 means no cooling + - cur_state == max_state means the maximum cooling. + RW, Required 3. A simple implementation -ACPI thermal zone may support multiple trip points like critical/hot/passive/active. -If an ACPI thermal zone supports critical, passive, active[0] and active[1] at the same time, -it may register itself as a thermal_zone_device (thermal_zone1) with 4 trip points in all. -It has one processor and one fan, which are both registered as thermal_cooling_device. -If the processor is listed in _PSL method, and the fan is listed in _AL0 method, -the sys I/F structure will be built like this: +ACPI thermal zone may support multiple trip points like critical, hot, +passive, active. If an ACPI thermal zone supports critical, passive, +active[0] and active[1] at the same time, it may register itself as a +thermal_zone_device (thermal_zone1) with 4 trip points in all. +It has one processor and one fan, which are both registered as +thermal_cooling_device. + +If the processor is listed in _PSL method, and the fan is listed in _AL0 +method, the sys I/F structure will be built like this: /sys/class/thermal: |thermal_zone1: - |-----type: acpitz - |-----temp: 37000 - |-----mode: kernel - |-----trip_point_0_temp: 100000 - |-----trip_point_0_type: critical - |-----trip_point_1_temp: 80000 - |-----trip_point_1_type: passive - |-----trip_point_2_temp: 70000 - |-----trip_point_2_type: active0 - |-----trip_point_3_temp: 60000 - |-----trip_point_3_type: active1 - |-----cdev0: --->/sys/class/thermal/cooling_device0 - |-----cdev0_trip_point: 1 /* cdev0 can be used for passive */ - |-----cdev1: --->/sys/class/thermal/cooling_device3 - |-----cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/ + |---type: acpitz + |---temp: 37000 + |---mode: kernel + |---trip_point_0_temp: 100000 + |---trip_point_0_type: critical + |---trip_point_1_temp: 80000 + |---trip_point_1_type: passive + |---trip_point_2_temp: 70000 + |---trip_point_2_type: active0 + |---trip_point_3_temp: 60000 + |---trip_point_3_type: active1 + |---cdev0: --->/sys/class/thermal/cooling_device0 + |---cdev0_trip_point: 1 /* cdev0 can be used for passive */ + |---cdev1: --->/sys/class/thermal/cooling_device3 + |---cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/ |cooling_device0: - |-----type: Processor - |-----max_state: 8 - |-----cur_state: 0 + |---type: Processor + |---max_state: 8 + |---cur_state: 0 |cooling_device3: - |-----type: Fan - |-----max_state: 2 - |-----cur_state: 0 + |---type: Fan + |---max_state: 2 + |---cur_state: 0 /sys/class/hwmon: |hwmon0: - |-----name: acpitz - |-----temp1_input: 37000 - |-----temp1_crit: 100000 + |---name: acpitz + |---temp1_input: 37000 + |---temp1_crit: 100000 diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 957b22fde2df..8179692fbb90 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1231,6 +1231,7 @@ something like this simple program: #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> +#include <string.h> #define _STR(x) #x #define STR(x) _STR(x) @@ -1265,6 +1266,7 @@ const char *find_debugfs(void) return NULL; } + strcat(debugfs, "/tracing/"); debugfs_found = 1; return debugfs; diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt new file mode 100644 index 000000000000..3ffadf8da61f --- /dev/null +++ b/Documentation/vm/hwpoison.txt @@ -0,0 +1,136 @@ +What is hwpoison? + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery''). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment: + + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * This focusses on pages detected as corrupted in the background. + * When the current CPU tries to consume corruption the currently + * running process can just be killed directly instead. This implies + * that if the error cannot be handled for some reason it's safe to + * just ignore it because no corruption has been consumed yet. Instead + * when that happens another machine check will happen. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * Some of the operations here are somewhat inefficient and have non + * linear algorithmic complexity, because the data structures have not + * been optimized for this case. This is in particular the case + * for the mapping from a vma to a process. Since this case is expected + * to be rare we hope we can get away with this. + +The code consists of a the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. + +For the KVM use there was need for a new signal type so that +KVM can inject the machine check into the guest with the proper +address. This in theory allows other applications to handle +memory failures too. The expection is that near all applications +won't do that, but some very specialized ones might. + +--- + +There are two (actually three) modi memory failure recovery can be in: + +vm.memory_failure_recovery sysctl set to zero: + All memory failures cause a panic. Do not attempt recovery. + (on x86 this can be also affected by the tolerant level of the + MCE subsystem) + +early kill + (can be controlled globally and per process) + Send SIGBUS to the application as soon as the error is detected + This allows applications who can process memory errors in a gentle + way (e.g. drop affected object) + This is the mode used by KVM qemu. + +late kill + Send SIGBUS when the application runs into the corrupted page. + This is best for memory error unaware applications and default + Note some pages are always handled as late kill. + +--- + +User control: + +vm.memory_failure_recovery + See sysctl.txt + +vm.memory_failure_early_kill + Enable early kill mode globally + +PR_MCE_KILL + Set early/late kill mode/revert to system default + arg1: PR_MCE_KILL_CLEAR: Revert to system default + arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode + PR_MCE_KILL_EARLY: Early kill + PR_MCE_KILL_LATE: Late kill + PR_MCE_KILL_DEFAULT: Use system global default +PR_MCE_KILL_GET + return current mode + + +--- + +Testing: + +madvise(MADV_POISON, ....) + (as root) + Poison a page in the process for testing + + +hwpoison-inject module through debugfs + /sys/debug/hwpoison/corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file + + +Architecture specific MCE injector + +x86 has mce-inject, mce-test + +Some portable hwpoison test programs in mce-test, see blow. + +--- + +References: + +http://halobates.de/mce-lc09-2.pdf + Overview presentation from LinuxCon 09 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + Test suite (hwpoison specific portable tests in tsrc) + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86 specific injector + + +--- + +Limitations: + +- Not all page types are supported and never will. Most kernel internal +objects cannot be recovered, only LRU pages for now. +- Right now hugepage support is missing. + +--- +Andi Kleen, Oct 2009 + diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 72a22f65960e..262d8e6793a3 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -52,15 +52,15 @@ The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, readable by all but writable only by root: max_kernel_pages - set to maximum number of kernel pages that KSM may use - e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" + e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages" Value 0 imposes no limit on the kernel pages KSM may use; but note that any process using MADV_MERGEABLE can cause KSM to allocate these pages, unswappable until it exits. - Default: 2000 (chosen for demonstration purposes) + Default: quarter of memory (chosen to not pin too much) pages_to_scan - how many present pages to scan before ksmd goes to sleep - e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" - Default: 200 (chosen for demonstration purposes) + e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" + Default: 100 (chosen for demonstration purposes) sleep_millisecs - how many milliseconds ksmd should sleep before next scan e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" @@ -70,7 +70,8 @@ run - set 0 to stop ksmd from running but keep merged pages, set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", set 2 to stop ksmd and unmerge all pages currently merged, but leave mergeable areas registered for next run - Default: 1 (for immediate use by apps which register) + Default: 0 (must be changed to 1 to activate KSM, + except if CONFIG_SYSFS is disabled) The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: @@ -86,4 +87,4 @@ pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. Izik Eidus, -Hugh Dickins, 30 July 2009 +Hugh Dickins, 24 Sept 2009 diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index fa1a30d9e9d5..4793c6aac733 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -2,7 +2,10 @@ * page-types: Tool for querying page flags * * Copyright (C) 2009 Intel corporation - * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> + * + * Authors: Wu Fengguang <fengguang.wu@intel.com> + * + * Released under the General Public License (GPL). */ #define _LARGEFILE64_SOURCE @@ -69,7 +72,9 @@ #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 +#define KPF_KSM 21 /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 @@ -116,7 +121,9 @@ static char *page_flag_names[] = { [KPF_COMPOUND_TAIL] = "T:compound_tail", [KPF_HUGE] = "G:huge", [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", [KPF_NOPAGE] = "n:nopage", + [KPF_KSM] = "x:ksm", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", @@ -152,9 +159,6 @@ static unsigned long opt_size[MAX_ADDR_RANGES]; static int nr_vmas; static unsigned long pg_start[MAX_VMAS]; static unsigned long pg_end[MAX_VMAS]; -static unsigned long voffset; - -static int pagemap_fd; #define MAX_BIT_FILTERS 64 static int nr_bit_filters; @@ -163,9 +167,16 @@ static uint64_t opt_bits[MAX_BIT_FILTERS]; static int page_size; -#define PAGES_BATCH (64 << 10) /* 64k pages */ +static int pagemap_fd; static int kpageflags_fd; +static int opt_hwpoison; +static int opt_unpoison; + +static char *hwpoison_debug_fs = "/debug/hwpoison"; +static int hwpoison_inject_fd; +static int hwpoison_forget_fd; + #define HASH_SHIFT 13 #define HASH_SIZE (1 << HASH_SHIFT) #define HASH_MASK (HASH_SIZE - 1) @@ -207,6 +218,74 @@ static void fatal(const char *x, ...) exit(EXIT_FAILURE); } +static int checked_open(const char *pathname, int flags) +{ + int fd = open(pathname, flags); + + if (fd < 0) { + perror(pathname); + exit(EXIT_FAILURE); + } + + return fd; +} + +/* + * pagemap/kpageflags routines + */ + +static unsigned long do_u64_read(int fd, char *name, + uint64_t *buf, + unsigned long index, + unsigned long count) +{ + long bytes; + + if (index > ULONG_MAX / 8) + fatal("index overflow: %lu\n", index); + + if (lseek(fd, index * 8, SEEK_SET) < 0) { + perror(name); + exit(EXIT_FAILURE); + } + + bytes = read(fd, buf, count * 8); + if (bytes < 0) { + perror(name); + exit(EXIT_FAILURE); + } + if (bytes % 8) + fatal("partial read: %lu bytes\n", bytes); + + return bytes / 8; +} + +static unsigned long kpageflags_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages); +} + +static unsigned long pagemap_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); +} + +static unsigned long pagemap_pfn(uint64_t val) +{ + unsigned long pfn; + + if (val & PM_PRESENT) + pfn = PM_PFRAME(val); + else + pfn = 0; + + return pfn; +} + /* * page flag names @@ -255,7 +334,8 @@ static char *page_flag_longname(uint64_t flags) * page list and summary */ -static void show_page_range(unsigned long offset, uint64_t flags) +static void show_page_range(unsigned long voffset, + unsigned long offset, uint64_t flags) { static uint64_t flags0; static unsigned long voff; @@ -281,7 +361,8 @@ static void show_page_range(unsigned long offset, uint64_t flags) count = 1; } -static void show_page(unsigned long offset, uint64_t flags) +static void show_page(unsigned long voffset, + unsigned long offset, uint64_t flags) { if (opt_pid) printf("%lx\t", voffset); @@ -362,6 +443,62 @@ static uint64_t well_known_flags(uint64_t flags) return flags; } +static uint64_t kpageflags_flags(uint64_t flags) +{ + flags = expand_overloaded_flags(flags); + + if (!opt_raw) + flags = well_known_flags(flags); + + return flags; +} + +/* + * page actions + */ + +static void prepare_hwpoison_fd(void) +{ + char buf[100]; + + if (opt_hwpoison && !hwpoison_inject_fd) { + sprintf(buf, "%s/corrupt-pfn", hwpoison_debug_fs); + hwpoison_inject_fd = checked_open(buf, O_WRONLY); + } + + if (opt_unpoison && !hwpoison_forget_fd) { + sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs); + hwpoison_forget_fd = checked_open(buf, O_WRONLY); + } +} + +static int hwpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_inject_fd, buf, len); + if (len < 0) { + perror("hwpoison inject"); + return len; + } + return 0; +} + +static int unpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_forget_fd, buf, len); + if (len < 0) { + perror("hwpoison forget"); + return len; + } + return 0; +} /* * page frame walker @@ -394,104 +531,83 @@ static int hash_slot(uint64_t flags) exit(EXIT_FAILURE); } -static void add_page(unsigned long offset, uint64_t flags) +static void add_page(unsigned long voffset, + unsigned long offset, uint64_t flags) { - flags = expand_overloaded_flags(flags); - - if (!opt_raw) - flags = well_known_flags(flags); + flags = kpageflags_flags(flags); if (!bit_mask_ok(flags)) return; + if (opt_hwpoison) + hwpoison_page(offset); + if (opt_unpoison) + unpoison_page(offset); + if (opt_list == 1) - show_page_range(offset, flags); + show_page_range(voffset, offset, flags); else if (opt_list == 2) - show_page(offset, flags); + show_page(voffset, offset, flags); nr_pages[hash_slot(flags)]++; total_pages++; } -static void walk_pfn(unsigned long index, unsigned long count) +#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ +static void walk_pfn(unsigned long voffset, + unsigned long index, + unsigned long count) { + uint64_t buf[KPAGEFLAGS_BATCH]; unsigned long batch; - unsigned long n; + unsigned long pages; unsigned long i; - if (index > ULONG_MAX / KPF_BYTES) - fatal("index overflow: %lu\n", index); - - lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); - while (count) { - uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; - - batch = min_t(unsigned long, count, PAGES_BATCH); - n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); - if (n == 0) + batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); + pages = kpageflags_read(buf, index, batch); + if (pages == 0) break; - if (n < 0) { - perror(PROC_KPAGEFLAGS); - exit(EXIT_FAILURE); - } - if (n % KPF_BYTES != 0) - fatal("partial read: %lu bytes\n", n); - n = n / KPF_BYTES; + for (i = 0; i < pages; i++) + add_page(voffset + i, index + i, buf[i]); - for (i = 0; i < n; i++) - add_page(index + i, kpageflags_buf[i]); - - index += batch; - count -= batch; + index += pages; + count -= pages; } } - -#define PAGEMAP_BATCH 4096 -static unsigned long task_pfn(unsigned long pgoff) +#define PAGEMAP_BATCH (64 << 10) +static void walk_vma(unsigned long index, unsigned long count) { - static uint64_t buf[PAGEMAP_BATCH]; - static unsigned long start; - static long count; - uint64_t pfn; + uint64_t buf[PAGEMAP_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long pfn; + unsigned long i; - if (pgoff < start || pgoff >= start + count) { - if (lseek64(pagemap_fd, - (uint64_t)pgoff * PM_ENTRY_BYTES, - SEEK_SET) < 0) { - perror("pagemap seek"); - exit(EXIT_FAILURE); - } - count = read(pagemap_fd, buf, sizeof(buf)); - if (count == 0) - return 0; - if (count < 0) { - perror("pagemap read"); - exit(EXIT_FAILURE); - } - if (count % PM_ENTRY_BYTES) { - fatal("pagemap read not aligned.\n"); - exit(EXIT_FAILURE); - } - count /= PM_ENTRY_BYTES; - start = pgoff; - } + while (count) { + batch = min_t(unsigned long, count, PAGEMAP_BATCH); + pages = pagemap_read(buf, index, batch); + if (pages == 0) + break; - pfn = buf[pgoff - start]; - if (pfn & PM_PRESENT) - pfn = PM_PFRAME(pfn); - else - pfn = 0; + for (i = 0; i < pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (pfn) + walk_pfn(index + i, pfn, 1); + } - return pfn; + index += pages; + count -= pages; + } } static void walk_task(unsigned long index, unsigned long count) { - int i = 0; const unsigned long end = index + count; + unsigned long start; + int i = 0; while (index < end) { @@ -501,15 +617,11 @@ static void walk_task(unsigned long index, unsigned long count) if (pg_start[i] >= end) return; - voffset = max_t(unsigned long, pg_start[i], index); - index = min_t(unsigned long, pg_end[i], end); + start = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); - assert(voffset < index); - for (; voffset < index; voffset++) { - unsigned long pfn = task_pfn(voffset); - if (pfn) - walk_pfn(pfn, 1); - } + assert(start < index); + walk_vma(start, index - start); } } @@ -527,18 +639,14 @@ static void walk_addr_ranges(void) { int i; - kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY); - if (kpageflags_fd < 0) { - perror(PROC_KPAGEFLAGS); - exit(EXIT_FAILURE); - } + kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); if (!nr_addr_ranges) add_addr_range(0, ULONG_MAX); for (i = 0; i < nr_addr_ranges; i++) if (!opt_pid) - walk_pfn(opt_offset[i], opt_size[i]); + walk_pfn(0, opt_offset[i], opt_size[i]); else walk_task(opt_offset[i], opt_size[i]); @@ -575,6 +683,8 @@ static void usage(void) " -l|--list Show page details in ranges\n" " -L|--list-each Show page details one by one\n" " -N|--no-summary Don't show summay info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" " -h|--help Show this usage message\n" "addr-spec:\n" " N one page at offset N (unit: pages)\n" @@ -624,11 +734,7 @@ static void parse_pid(const char *str) opt_pid = parse_number(str); sprintf(buf, "/proc/%d/pagemap", opt_pid); - pagemap_fd = open(buf, O_RDONLY); - if (pagemap_fd < 0) { - perror(buf); - exit(EXIT_FAILURE); - } + pagemap_fd = checked_open(buf, O_RDONLY); sprintf(buf, "/proc/%d/maps", opt_pid); file = fopen(buf, "r"); @@ -788,6 +894,8 @@ static struct option opts[] = { { "list" , 0, NULL, 'l' }, { "list-each" , 0, NULL, 'L' }, { "no-summary", 0, NULL, 'N' }, + { "hwpoison" , 0, NULL, 'X' }, + { "unpoison" , 0, NULL, 'x' }, { "help" , 0, NULL, 'h' }, { NULL , 0, NULL, 0 } }; @@ -799,7 +907,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:lLNh", opts, NULL)) != -1) { + "rp:f:a:b:lLNXxh", opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -825,6 +933,14 @@ int main(int argc, char *argv[]) case 'N': opt_no_summary = 1; break; + case 'X': + opt_hwpoison = 1; + prepare_hwpoison_fd(); + break; + case 'x': + opt_unpoison = 1; + prepare_hwpoison_fd(); + break; case 'h': usage(); exit(0); @@ -844,7 +960,7 @@ int main(int argc, char *argv[]) walk_addr_ranges(); if (opt_list == 1) - show_page_range(0, 0); /* drain the buffer */ + show_page_range(0, 0, 0); /* drain the buffer */ if (opt_no_summary) return 0; diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 600a304a828c..df09b9650a81 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -57,7 +57,9 @@ There are three components to pagemap: 16. COMPOUND_TAIL 16. HUGE 18. UNEVICTABLE + 19. HWPOISON 20. NOPAGE + 21. KSM Short descriptions to the page flags: @@ -86,9 +88,15 @@ Short descriptions to the page flags: 17. HUGE this is an integral part of a HugeTLB page +19. HWPOISON + hardware detected memory corruption on this page: don't touch the data! + 20. NOPAGE no page frame exists at the requested address +21. KSM + identical memory pages dynamically shared between one or more processes + [IO related page flags] 1. ERROR IO error occurred 3. UPTODATE page has up-to-date data diff --git a/Documentation/w1/masters/ds2482 b/Documentation/w1/masters/ds2482 index 9210d6fa5024..299b91c7609f 100644 --- a/Documentation/w1/masters/ds2482 +++ b/Documentation/w1/masters/ds2482 @@ -24,8 +24,8 @@ General Remarks Valid addresses are 0x18, 0x19, 0x1a, and 0x1b. However, the device cannot be detected without writing to the i2c bus, so no -detection is done. -You should force the device address. +detection is done. You should instantiate the device explicitly. -$ modprobe ds2482 force=0,0x18 +$ modprobe ds2482 +$ echo ds2482 0x18 > /sys/bus/i2c/devices/i2c-0/new_device |