diff options
193 files changed, 4525 insertions, 1704 deletions
| diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt index dfab71848dc8..5cc699ba5453 100644 --- a/Documentation/acpi/apei/einj.txt +++ b/Documentation/acpi/apei/einj.txt @@ -48,12 +48,19 @@ directory apei/einj. The following files are provided.  - param1    This file is used to set the first error parameter value. Effect of    parameter depends on error_type specified. For memory error, this is -  physical memory address. +  physical memory address.  Only available if param_extension module +  parameter is specified.  - param2    This file is used to set the second error parameter value. Effect of    parameter depends on error_type specified. For memory error, this is -  physical memory address mask. +  physical memory address mask.  Only available if param_extension +  module parameter is specified. + +Injecting parameter support is a BIOS version specific extension, that +is, it only works on some BIOS version.  If you want to use it, please +make sure your BIOS version has the proper support and specify +"param_extension=y" in module parameter.  For more information about EINJ, please refer to ACPI specification  version 4.0, section 17.5. diff --git a/Documentation/devicetree/bindings/gpio/gpio_keys.txt b/Documentation/devicetree/bindings/gpio/gpio_keys.txt index 7190c99d7611..5c2c02140a62 100644 --- a/Documentation/devicetree/bindings/gpio/gpio_keys.txt +++ b/Documentation/devicetree/bindings/gpio/gpio_keys.txt @@ -10,7 +10,7 @@ Optional properties:  Each button (key) is represented as a sub-node of "gpio-keys":  Subnode properties: -	- gpios: OF devcie-tree gpio specificatin. +	- gpios: OF device-tree gpio specification.  	- label: Descriptive name of the key.  	- linux,code: Keycode to emit. 
diff --git a/Documentation/devicetree/bindings/input/fsl-mma8450.txt b/Documentation/devicetree/bindings/input/fsl-mma8450.txt new file mode 100644 index 000000000000..a00c94ccbdee --- /dev/null +++ b/Documentation/devicetree/bindings/input/fsl-mma8450.txt @@ -0,0 +1,11 @@ +* Freescale MMA8450 3-Axis Accelerometer + +Required properties: +- compatible : "fsl,mma8450". + +Example: + +accelerometer: mma8450@1c { +	compatible = "fsl,mma8450"; +	reg = <0x1c>; +}; diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 7be15e44d481..82a5d250d75e 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -143,8 +143,7 @@ o provide a way to configure fault attributes    failslab, fail_page_alloc, and fail_make_request use this way.    Helper functions: -	init_fault_attr_dentries(entries, attr, name); -	void cleanup_fault_attr_dentries(entries); +	fault_create_debugfs_attr(name, parent, attr);  - module parameters diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index ea0bace0124a..43f48098220d 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -296,15 +296,6 @@ Who:	Ravikiran Thirumalai <kiran@scalex86.org>  --------------------------- -What:	CONFIG_THERMAL_HWMON -When:	January 2009 -Why:	This option was introduced just to allow older lm-sensors userspace -	to keep working over the upgrade to 2.6.26. At the scheduled time of -	removal fixed lm-sensors (2.x or 3.x) should be readily available. 
-Who:	Rene Herman <rene.herman@gmail.com> - ---------------------------- -  What:	Code that is now under CONFIG_WIRELESS_EXT_SYSFS  	(in net/core/net-sysfs.c)  When:	After the only user (hal) has seen a release with the patches diff --git a/Documentation/frv/booting.txt b/Documentation/frv/booting.txt index ace200b7c214..37c4d84a0e57 100644 --- a/Documentation/frv/booting.txt +++ b/Documentation/frv/booting.txt @@ -106,13 +106,20 @@ separated by spaces:        To use the first on-chip serial port at baud rate 115200, no parity, 8        bits, and no flow control. -  (*) root=/dev/<xxxx> +  (*) root=<xxxx> -      This specifies the device upon which the root filesystem resides. For -      example: +      This specifies the device upon which the root filesystem resides. It +      may be specified by major and minor number, device path, or even +      partition uuid, if supported.  For example:  	/dev/nfs	NFS root filesystem  	/dev/mtdblock3	Fourth RedBoot partition on the System Flash +	PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=1 +		first partition after the partition with the given UUID +	253:0		Device with major 253 and minor 0 + +      Authoritative information can be found in +      "Documentation/kernel-parameters.txt".    
(*) rw diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 72ba8d51dbc1..845a191004b1 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -292,6 +292,7 @@ Code  Seq#(hex)	Include File		Comments  					<mailto:buk@buks.ipn.de>  0xA0	all	linux/sdp/sdp.h		Industrial Device Project  					<mailto:kenji@bitgate.com> +0xA2	00-0F	arch/tile/include/asm/hardwall.h  0xA3	80-8F	Port ACL		in development:  					<mailto:tlewis@mindspring.com>  0xA3	90-9F	linux/dtlk.h diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 26a83743af19..e279b7242912 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -163,6 +163,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.  			See also Documentation/power/pm.txt, pci=noacpi +	acpi_rsdp=	[ACPI,EFI,KEXEC] +			Pass the RSDP address to the kernel, mostly used +			on machines running EFI runtime service to boot the +			second kernel for kdump. +  	acpi_apic_instance=	[ACPI, IOAPIC]  			Format: <int>  			2: use 2nd APIC table, if available @@ -546,6 +551,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.  			/proc/<pid>/coredump_filter.  			See also Documentation/filesystems/proc.txt. +	cpuidle.off=1	[CPU_IDLE] +			disable the cpuidle sub-system +  	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver  			Format:  			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] @@ -2240,6 +2248,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.  	ro		[KNL] Mount root device read-only on boot  	root=		[KNL] Root filesystem +			See name_to_dev_t comment in init/do_mounts.c.  	
rootdelay=	[KNL] Delay (in seconds) to pause before attempting to  			mount the root filesystem diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index c93bed66e25d..97d45f276fe6 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt @@ -129,6 +129,20 @@ decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for  the first of these. You can find out all valid major numbers by  looking into include/linux/major.h. +In addition to major and minor numbers, if the device containing your +root partition uses a partition table format with unique partition +identifiers, then you may use them.  For instance, +"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF".  It is also +possible to reference another partition on the same device using a +known partition UUID as the starting point.  For example, +if partition 5 of the device has the UUID of +00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as +follows: +  PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2 + +Authoritative information can be found in +"Documentation/kernel-parameters.txt". +  2.2) ro, rw  ----------- diff --git a/MAINTAINERS b/MAINTAINERS index 0d2fcda465eb..07cfd8deaad5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3367,6 +3367,12 @@ F:	drivers/net/ixgb/  F:	drivers/net/ixgbe/  F:	drivers/net/ixgbevf/ +INTEL MRST PMU DRIVER +M:	Len Brown <len.brown@intel.com> +L:	linux-pm@lists.linux-foundation.org +S:	Supported +F:	arch/x86/platform/mrst/pmu.* +  INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT  L:	linux-wireless@vger.kernel.org  S:	Orphan @@ -6319,6 +6325,7 @@ F:	include/linux/sysv_fs.h  TARGET SUBSYSTEM  M:	Nicholas A. 
Bellinger <nab@linux-iscsi.org>  L:	linux-scsi@vger.kernel.org +L:	target-devel@vger.kernel.org  L:	http://groups.google.com/group/linux-iscsi-target-dev  W:	http://www.linux-iscsi.org  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/nab/lio-core-2.6.git master diff --git a/arch/Kconfig b/arch/Kconfig index 26b0e2397a57..4b0669cbb3b0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -178,4 +178,7 @@ config HAVE_ARCH_MUTEX_CPU_RELAX  config HAVE_RCU_TABLE_FREE  	bool +config ARCH_HAVE_NMI_SAFE_CMPXCHG +	bool +  source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index ca2da8da6e9c..60cde53d266c 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -14,6 +14,7 @@ config ALPHA  	select AUTO_IRQ_AFFINITY if SMP  	select GENERIC_IRQ_SHOW  	select ARCH_WANT_OPTIONAL_GPIOLIB +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  	help  	  The Alpha is a 64-bit general-purpose processor designed and  	  marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5e1e54197227..d7ee0d4c072d 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -30,6 +30,7 @@  #include <linux/uaccess.h>  #include <linux/random.h>  #include <linux/hw_breakpoint.h> +#include <linux/cpuidle.h>  #include <asm/cacheflush.h>  #include <asm/leds.h> @@ -196,7 +197,8 @@ void cpu_idle(void)  				cpu_relax();  			} else {  				stop_critical_timings(); -				pm_idle(); +				if (cpuidle_call_idle()) +					pm_idle();  				start_critical_timings();  				/*  				 * This will eventually be removed - pm_idle diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e9d689b7c833..197e96f70405 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -10,6 +10,7 @@ config AVR32  	select GENERIC_IRQ_PROBE  	select HARDIRQS_SW_RESEND  	select GENERIC_IRQ_SHOW +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  	help  	  AVR32 is a high-performance 32-bit RISC microprocessor core,  	  designed for 
cost-sensitive embedded applications, with particular diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 850265373611..466af40c5822 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -158,7 +158,7 @@ static int sync_serial_open(struct inode *inode, struct file *file);  static int sync_serial_release(struct inode *inode, struct file *file);  static unsigned int sync_serial_poll(struct file *filp, poll_table *wait); -static int sync_serial_ioctl(struct file *file, +static long sync_serial_ioctl(struct file *file,  	unsigned int cmd, unsigned long arg);  static ssize_t sync_serial_write(struct file *file, const char *buf,  	size_t count, loff_t *ppos); @@ -625,11 +625,11 @@ static int sync_serial_open(struct inode *inode, struct file *file)  			*R_IRQ_MASK1_SET = 1 << port->data_avail_bit;  		DEBUG(printk(KERN_DEBUG "sser%d rec started\n", dev));  	} -	ret = 0; +	err = 0;  out:  	mutex_unlock(&sync_serial_mutex); -	return ret; +	return err;  }  static int sync_serial_release(struct inode *inode, struct file *file) diff --git a/arch/cris/arch-v10/kernel/irq.c b/arch/cris/arch-v10/kernel/irq.c index 907cfb5a873d..ba0e5965d6e3 100644 --- a/arch/cris/arch-v10/kernel/irq.c +++ b/arch/cris/arch-v10/kernel/irq.c @@ -20,6 +20,9 @@  #define crisv10_mask_irq(irq_nr) (*R_VECT_MASK_CLR = 1 << (irq_nr));  #define crisv10_unmask_irq(irq_nr) (*R_VECT_MASK_SET = 1 << (irq_nr)); +extern void kgdb_init(void); +extern void breakpoint(void); +  /* don't use set_int_vector, it bypasses the linux interrupt handlers. it is   * global just so that the kernel gdb can use it.   
*/ diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 29b74a105830..332f19c54557 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -11,8 +11,6 @@  #ifdef __KERNEL__ -#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -  #ifndef __ASSEMBLY__  #include <asm/types.h>  #include <asm/processor.h> @@ -67,8 +65,10 @@ struct thread_info {  #define init_thread_info	(init_thread_union.thread_info) +#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR  /* thread information allocation */ -#define alloc_thread_info(tsk, node) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) +#define alloc_thread_info_node(tsk, node)	\ +	((struct thread_info *) __get_free_pages(GFP_KERNEL, 1))  #define free_thread_info(ti) free_pages((unsigned long) (ti), 1)  #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index cb884e489425..bad27a6ff407 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,6 +7,7 @@ config FRV  	select HAVE_PERF_EVENTS  	select HAVE_GENERIC_HARDIRQS  	select GENERIC_IRQ_SHOW +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  config ZONE_DMA  	bool diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 64c7ab7e7a81..124854714958 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -28,6 +28,7 @@ config IA64  	select IRQ_PER_CPU  	select GENERIC_IRQ_SHOW  	select ARCH_WANT_OPTIONAL_GPIOLIB +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  	default y  	help  	  The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 284cd3771eaa..9e8ee9d2b8ca 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -6,6 +6,7 @@ config M68K  	select GENERIC_ATOMIC64 if MMU  	select HAVE_GENERIC_HARDIRQS if !MMU  	select GENERIC_IRQ_SHOW if !MMU +	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS  config RWSEM_GENERIC_SPINLOCK  	bool diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 65adc86a230e..e077b0bf56ca 100644 --- a/arch/parisc/Kconfig 
+++ b/arch/parisc/Kconfig @@ -15,6 +15,7 @@ config PARISC  	select HAVE_GENERIC_HARDIRQS  	select GENERIC_IRQ_PROBE  	select IRQ_PER_CPU +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  	help  	  The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 374c475e56a3..6926b61acfea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -136,6 +136,7 @@ config PPC  	select HAVE_SYSCALL_TRACEPOINTS  	select HAVE_BPF_JIT if (PPC64 && NET)  	select HAVE_ARCH_JUMP_LABEL +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  config EARLY_PRINTK  	bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 17dbb4318261..ed5cb5af5281 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -81,6 +81,7 @@ config S390  	select INIT_ALL_POSSIBLE  	select HAVE_IRQ_WORK  	select HAVE_PERF_EVENTS +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  	select HAVE_KERNEL_GZIP  	select HAVE_KERNEL_BZIP2  	select HAVE_KERNEL_LZMA diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 748ff1920068..ff9177c8f643 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -11,6 +11,7 @@ config SUPERH  	select HAVE_DMA_ATTRS  	select HAVE_IRQ_WORK  	select HAVE_PERF_EVENTS +	select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)  	select PERF_USE_VMALLOC  	select HAVE_KERNEL_GZIP  	select HAVE_KERNEL_BZIP2 diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 84db0d6ccd0d..3c45de1db716 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -16,12 +16,13 @@  #include <linux/thread_info.h>  #include <linux/irqflags.h>  #include <linux/smp.h> +#include <linux/cpuidle.h>  #include <asm/pgalloc.h>  #include <asm/system.h>  #include <linux/atomic.h>  #include <asm/smp.h> -void (*pm_idle)(void) = NULL; +static void (*pm_idle)(void);  static int hlt_counter; @@ -100,7 +101,8 @@ void cpu_idle(void)  			local_irq_disable();  			/* Don't trace irqs off for idle */  			stop_critical_timings(); -			pm_idle(); +			if (cpuidle_call_idle()) +				
pm_idle();  			/*  			 * Sanity check to ensure that pm_idle() returns  			 * with IRQs enabled diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 1074dddcb104..42c67beadcae 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -54,6 +54,7 @@ config SPARC64  	select HAVE_PERF_EVENTS  	select PERF_USE_VMALLOC  	select IRQ_PREFLOW_FASTEOI +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  config ARCH_DEFCONFIG  	string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 0249b8b4db54..b30f71ac0d06 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -12,6 +12,7 @@ config TILE  	select GENERIC_PENDING_IRQ if SMP  	select GENERIC_IRQ_SHOW  	select SYS_HYPERVISOR +	select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386  # FIXME: investigate whether we need/want these options.  #	select HAVE_IOREMAP_PROT diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 849ab2fa1f5c..aec60dc06007 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -2,3 +2,41 @@ include include/asm-generic/Kbuild.asm  header-y += ucontext.h  header-y += hardwall.h + +generic-y += bug.h +generic-y += bugs.h +generic-y += cputime.h +generic-y += device.h +generic-y += div64.h +generic-y += emergency-restart.h +generic-y += errno.h +generic-y += fb.h +generic-y += fcntl.h +generic-y += ioctl.h +generic-y += ioctls.h +generic-y += ipc.h +generic-y += ipcbuf.h +generic-y += irq_regs.h +generic-y += kdebug.h +generic-y += local.h +generic-y += module.h +generic-y += msgbuf.h +generic-y += mutex.h +generic-y += param.h +generic-y += parport.h +generic-y += poll.h +generic-y += posix_types.h +generic-y += resource.h +generic-y += scatterlist.h +generic-y += sembuf.h +generic-y += serial.h +generic-y += shmbuf.h +generic-y += shmparam.h +generic-y += socket.h +generic-y += sockios.h +generic-y += statfs.h +generic-y += termbits.h +generic-y += termios.h +generic-y += types.h +generic-y += ucontext.h +generic-y += xor.h diff --git a/arch/tile/include/asm/bug.h 
b/arch/tile/include/asm/bug.h deleted file mode 100644 index b12fd89e42e9..000000000000 --- a/arch/tile/include/asm/bug.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/bug.h> diff --git a/arch/tile/include/asm/bugs.h b/arch/tile/include/asm/bugs.h deleted file mode 100644 index 61791e1ad9f5..000000000000 --- a/arch/tile/include/asm/bugs.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/bugs.h> diff --git a/arch/tile/include/asm/cputime.h b/arch/tile/include/asm/cputime.h deleted file mode 100644 index 6d68ad7e0ea3..000000000000 --- a/arch/tile/include/asm/cputime.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/cputime.h> diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h deleted file mode 100644 index f0a4c256403b..000000000000 --- a/arch/tile/include/asm/device.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/device.h> diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h deleted file mode 100644 index 6cd978cefb28..000000000000 --- a/arch/tile/include/asm/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/div64.h> diff --git a/arch/tile/include/asm/emergency-restart.h b/arch/tile/include/asm/emergency-restart.h deleted file mode 100644 index 3711bd9d50bd..000000000000 --- a/arch/tile/include/asm/emergency-restart.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/emergency-restart.h> diff --git a/arch/tile/include/asm/errno.h b/arch/tile/include/asm/errno.h deleted file mode 100644 index 4c82b503d92f..000000000000 --- a/arch/tile/include/asm/errno.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/errno.h> diff --git a/arch/tile/include/asm/fb.h b/arch/tile/include/asm/fb.h deleted file mode 100644 index 3a4988e8df45..000000000000 --- a/arch/tile/include/asm/fb.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/fb.h> diff --git a/arch/tile/include/asm/fcntl.h b/arch/tile/include/asm/fcntl.h deleted file mode 100644 index 46ab12db5739..000000000000 --- a/arch/tile/include/asm/fcntl.h +++ 
/dev/null @@ -1 +0,0 @@ -#include <asm-generic/fcntl.h> diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h index 51537ff9265a..c66f7933beaa 100644 --- a/arch/tile/include/asm/fixmap.h +++ b/arch/tile/include/asm/fixmap.h @@ -75,12 +75,6 @@ extern void __set_fixmap(enum fixed_addresses idx,  #define set_fixmap(idx, phys) \  		__set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ -		__set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) -  #define clear_fixmap(idx) \  		__set_fixmap(idx, 0, __pgprot(0)) diff --git a/arch/tile/include/asm/ioctl.h b/arch/tile/include/asm/ioctl.h deleted file mode 100644 index b279fe06dfe5..000000000000 --- a/arch/tile/include/asm/ioctl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctl.h> diff --git a/arch/tile/include/asm/ioctls.h b/arch/tile/include/asm/ioctls.h deleted file mode 100644 index ec34c760665e..000000000000 --- a/arch/tile/include/asm/ioctls.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctls.h> diff --git a/arch/tile/include/asm/ipc.h b/arch/tile/include/asm/ipc.h deleted file mode 100644 index a46e3d9c2a3f..000000000000 --- a/arch/tile/include/asm/ipc.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipc.h> diff --git a/arch/tile/include/asm/ipcbuf.h b/arch/tile/include/asm/ipcbuf.h deleted file mode 100644 index 84c7e51cb6d0..000000000000 --- a/arch/tile/include/asm/ipcbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipcbuf.h> diff --git a/arch/tile/include/asm/irq_regs.h b/arch/tile/include/asm/irq_regs.h deleted file mode 100644 index 3dd9c0b70270..000000000000 --- a/arch/tile/include/asm/irq_regs.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/irq_regs.h> diff --git a/arch/tile/include/asm/kdebug.h b/arch/tile/include/asm/kdebug.h deleted file mode 100644 index 6ece1b037665..000000000000 --- a/arch/tile/include/asm/kdebug.h +++ /dev/null @@ -1 +0,0 @@ -#include 
<asm-generic/kdebug.h> diff --git a/arch/tile/include/asm/local.h b/arch/tile/include/asm/local.h deleted file mode 100644 index c11c530f74d0..000000000000 --- a/arch/tile/include/asm/local.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local.h> diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h deleted file mode 100644 index 1e4b79fe8584..000000000000 --- a/arch/tile/include/asm/module.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/module.h> diff --git a/arch/tile/include/asm/msgbuf.h b/arch/tile/include/asm/msgbuf.h deleted file mode 100644 index 809134c644a6..000000000000 --- a/arch/tile/include/asm/msgbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/msgbuf.h> diff --git a/arch/tile/include/asm/mutex.h b/arch/tile/include/asm/mutex.h deleted file mode 100644 index ff6101aa2c71..000000000000 --- a/arch/tile/include/asm/mutex.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/mutex-dec.h> diff --git a/arch/tile/include/asm/param.h b/arch/tile/include/asm/param.h deleted file mode 100644 index 965d45427975..000000000000 --- a/arch/tile/include/asm/param.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/param.h> diff --git a/arch/tile/include/asm/parport.h b/arch/tile/include/asm/parport.h deleted file mode 100644 index cf252af64590..000000000000 --- a/arch/tile/include/asm/parport.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/parport.h> diff --git a/arch/tile/include/asm/poll.h b/arch/tile/include/asm/poll.h deleted file mode 100644 index c98509d3149e..000000000000 --- a/arch/tile/include/asm/poll.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/poll.h> diff --git a/arch/tile/include/asm/posix_types.h b/arch/tile/include/asm/posix_types.h deleted file mode 100644 index 22cae6230ceb..000000000000 --- a/arch/tile/include/asm/posix_types.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/posix_types.h> diff --git a/arch/tile/include/asm/resource.h b/arch/tile/include/asm/resource.h deleted file mode 
100644 index 04bc4db8921b..000000000000 --- a/arch/tile/include/asm/resource.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/resource.h> diff --git a/arch/tile/include/asm/scatterlist.h b/arch/tile/include/asm/scatterlist.h deleted file mode 100644 index 35d786fe93ae..000000000000 --- a/arch/tile/include/asm/scatterlist.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/scatterlist.h> diff --git a/arch/tile/include/asm/sembuf.h b/arch/tile/include/asm/sembuf.h deleted file mode 100644 index 7673b83cfef7..000000000000 --- a/arch/tile/include/asm/sembuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/sembuf.h> diff --git a/arch/tile/include/asm/serial.h b/arch/tile/include/asm/serial.h deleted file mode 100644 index a0cb0caff152..000000000000 --- a/arch/tile/include/asm/serial.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/serial.h> diff --git a/arch/tile/include/asm/shmbuf.h b/arch/tile/include/asm/shmbuf.h deleted file mode 100644 index 83c05fc2de38..000000000000 --- a/arch/tile/include/asm/shmbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/shmbuf.h> diff --git a/arch/tile/include/asm/shmparam.h b/arch/tile/include/asm/shmparam.h deleted file mode 100644 index 93f30deb95d0..000000000000 --- a/arch/tile/include/asm/shmparam.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/shmparam.h> diff --git a/arch/tile/include/asm/socket.h b/arch/tile/include/asm/socket.h deleted file mode 100644 index 6b71384b9d8b..000000000000 --- a/arch/tile/include/asm/socket.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/socket.h> diff --git a/arch/tile/include/asm/sockios.h b/arch/tile/include/asm/sockios.h deleted file mode 100644 index def6d4746ee7..000000000000 --- a/arch/tile/include/asm/sockios.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/sockios.h> diff --git a/arch/tile/include/asm/statfs.h b/arch/tile/include/asm/statfs.h deleted file mode 100644 index 0b91fe198c20..000000000000 --- a/arch/tile/include/asm/statfs.h +++ /dev/null @@ -1 
+0,0 @@ -#include <asm-generic/statfs.h> diff --git a/arch/tile/include/asm/termbits.h b/arch/tile/include/asm/termbits.h deleted file mode 100644 index 3935b106de79..000000000000 --- a/arch/tile/include/asm/termbits.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termbits.h> diff --git a/arch/tile/include/asm/termios.h b/arch/tile/include/asm/termios.h deleted file mode 100644 index 280d78a9d966..000000000000 --- a/arch/tile/include/asm/termios.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termios.h> diff --git a/arch/tile/include/asm/types.h b/arch/tile/include/asm/types.h deleted file mode 100644 index b9e79bc580dd..000000000000 --- a/arch/tile/include/asm/types.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/types.h> diff --git a/arch/tile/include/asm/ucontext.h b/arch/tile/include/asm/ucontext.h deleted file mode 100644 index 9bc07b9f30fb..000000000000 --- a/arch/tile/include/asm/ucontext.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ucontext.h> diff --git a/arch/tile/include/asm/xor.h b/arch/tile/include/asm/xor.h deleted file mode 100644 index c82eb12a5b18..000000000000 --- a/arch/tile/include/asm/xor.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/xor.h> diff --git a/arch/tile/include/hv/drv_srom_intf.h b/arch/tile/include/hv/drv_srom_intf.h new file mode 100644 index 000000000000..6395faa6d9e6 --- /dev/null +++ b/arch/tile/include/hv/drv_srom_intf.h @@ -0,0 +1,41 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. 
+ */ + +/** + * @file drv_srom_intf.h + * Interface definitions for the SPI Flash ROM driver. + */ + +#ifndef _SYS_HV_INCLUDE_DRV_SROM_INTF_H +#define _SYS_HV_INCLUDE_DRV_SROM_INTF_H + +/** Read this offset to get the total device size. */ +#define SROM_TOTAL_SIZE_OFF   0xF0000000 + +/** Read this offset to get the device sector size. */ +#define SROM_SECTOR_SIZE_OFF  0xF0000004 + +/** Read this offset to get the device page size. */ +#define SROM_PAGE_SIZE_OFF    0xF0000008 + +/** Write this offset to flush any pending writes. */ +#define SROM_FLUSH_OFF        0xF1000000 + +/** Write this offset, plus the byte offset of the start of a sector, to + *  erase a sector.  Any write data is ignored, but there must be at least + *  one byte of write data.  Only applies when the driver is in MTD mode. + */ +#define SROM_ERASE_OFF        0xF2000000 + +#endif /* _SYS_HV_INCLUDE_DRV_SROM_INTF_H */ diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index c4be58cc5d50..f6f50f2a5e37 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -78,7 +78,6 @@ static struct clocksource cycle_counter_cs = {  	.rating = 300,  	.read = clocksource_get_cycles,  	.mask = CLOCKSOURCE_MASK(64), -	.shift = 22,   /* typical value, e.g. x86 tsc uses this */  	.flags = CLOCK_SOURCE_IS_CONTINUOUS,  }; @@ -91,8 +90,6 @@ void __init setup_clock(void)  	cycles_per_sec = hv_sysconf(HV_SYSCONF_CPU_SPEED);  	sched_clock_mult =  		clocksource_hz2mult(cycles_per_sec, SCHED_CLOCK_SHIFT); -	cycle_counter_cs.mult = -		clocksource_hz2mult(cycles_per_sec, cycle_counter_cs.shift);  }  void __init calibrate_delay(void) @@ -107,7 +104,7 @@ void __init calibrate_delay(void)  void __init time_init(void)  {  	/* Initialize and register the clock source. */ -	clocksource_register(&cycle_counter_cs); +	clocksource_register_hz(&cycle_counter_cs, cycles_per_sec);  	/* Start up the tile-timer interrupt source on the boot cpu. 
*/  	setup_tile_timer(); diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 4e10c4023028..7309988c9794 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -836,8 +836,7 @@ void __init mem_init(void)  #endif  #ifdef CONFIG_FLATMEM -	if (!mem_map) -		BUG(); +	BUG_ON(!mem_map);  #endif  #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7cf916fc1ce7..6a47bb22657f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86  	select USE_GENERIC_SMP_HELPERS if SMP  	select HAVE_BPF_JIT if (X86_64 && NET)  	select CLKEVT_I8253 +	select ARCH_HAVE_NMI_SAFE_CMPXCHG  config INSTRUCTION_DECODER  	def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d02804d650c4..d8e8eefbe24c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,8 +40,6 @@  #include <linux/compiler.h>  #include <asm/page.h> -#include <xen/xen.h> -  #define build_mmio_read(name, size, type, reg, barrier) \  static inline type name(const volatile void __iomem *addr) \  { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ @@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void);  extern bool is_early_ioremap_ptep(pte_t *ptep);  #ifdef CONFIG_XEN +#include <xen/xen.h>  struct bio_vec;  extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 219371546afd..0d1171c97729 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)  		     :: "a" (eax), "c" (ecx));  } -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); -  extern void select_idle_routine(const struct cpuinfo_x86 *c);  extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5812404a0d4c..f50e7fb2a201 100644 --- 
a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,  }  EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) +{ +	if (!need_resched()) { +		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) +			clflush((void *)&current_thread_info()->flags); + +		__monitor((void *)&current_thread_info()->flags, 0, 0); +		smp_mb(); +		if (!need_resched()) +			__mwait(ax, cx); +	} +} +  void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)  {  	unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e1ba8cb24e4e..e7e3b019c439 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -438,29 +438,6 @@ void cpu_idle_wait(void)  }  EXPORT_SYMBOL_GPL(cpu_idle_wait); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. 
- */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ -	if (!need_resched()) { -		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) -			clflush((void *)&current_thread_info()->flags); - -		__monitor((void *)&current_thread_info()->flags, 0, 0); -		smp_mb(); -		if (!need_resched()) -			__mwait(ax, cx); -	} -} -  /* Default MONITOR/MWAIT with no hints, used for default C1 state */  static void mwait_idle(void)  { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3d0dc59067b..7a3b65107a27 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@  #include <linux/uaccess.h>  #include <linux/io.h>  #include <linux/kdebug.h> +#include <linux/cpuidle.h>  #include <asm/pgtable.h>  #include <asm/system.h> @@ -109,7 +110,8 @@ void cpu_idle(void)  			local_irq_disable();  			/* Don't trace irqs off for idle */  			stop_critical_timings(); -			pm_idle(); +			if (cpuidle_idle_call()) +				pm_idle();  			start_critical_timings();  		}  		tick_nohz_restart_sched_tick(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca6f7ab8df33..f693e44e1bf6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@  #include <linux/uaccess.h>  #include <linux/io.h>  #include <linux/ftrace.h> +#include <linux/cpuidle.h>  #include <asm/pgtable.h>  #include <asm/system.h> @@ -136,7 +137,8 @@ void cpu_idle(void)  			enter_idle();  			/* Don't trace irqs off for idle */  			stop_critical_timings(); -			pm_idle(); +			if (cpuidle_idle_call()) +				pm_idle();  			start_critical_timings();  			/* In many cases the interrupt that ended idle diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index f61ccdd49341..1ea38775a6d3 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,3 +1,4 @@  obj-$(CONFIG_X86_MRST)		+= mrst.o  obj-$(CONFIG_X86_MRST)		+= vrtc.o  obj-$(CONFIG_EARLY_PRINTK_MRST)	+= 
early_printk_mrst.o +obj-$(CONFIG_X86_MRST)		+= pmu.o diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c new file mode 100644 index 000000000000..9281da7d91bd --- /dev/null +++ b/arch/x86/platform/mrst/pmu.c @@ -0,0 +1,817 @@ +/* + * mrst/pmu.c - driver for MRST Power Management Unit + * + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <linux/cpuidle.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/seq_file.h> +#include <linux/sfi.h> +#include <asm/intel_scu_ipc.h> +#include "pmu.h" + +#define IPCMSG_FW_REVISION	0xF4 + +struct mrst_device { +	u16 pci_dev_num;	/* DEBUG only */ +	u16 lss; +	u16 latest_request; +	unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */ +}; + +/* + * comlete list of MRST PCI devices + */ +static struct mrst_device mrst_devs[] = { +/*  0 */ { 0x0800, LSS_SPI0 },		/* Moorestown SPI Ctrl 0 */ +/*  1 */ { 0x0801, LSS_SPI1 },		/* Moorestown SPI Ctrl 1 */ +/*  2 */ { 0x0802, LSS_I2C0 },		/* Moorestown I2C 0 */ +/*  3 */ { 0x0803, LSS_I2C1 },		/* Moorestown I2C 1 */ +/*  4 */ { 0x0804, LSS_I2C2 },		/* Moorestown I2C 2 */ +/*  5 */ { 0x0805, LSS_KBD },		/* Moorestown Keyboard Ctrl */ +/*  6 */ { 0x0806, LSS_USB_HC },	/* Moorestown USB Ctrl */ +/*  7 */ { 0x0807, LSS_SD_HC0 },	/* Moorestown SD Host Ctrl 0 */ +/*  8 */ { 0x0808, LSS_SD_HC1 },	/* Moorestown SD Host Ctrl 1 */ +/*  9 */ { 0x0809, LSS_NAND },		/* Moorestown NAND Ctrl */ +/* 10 */ { 0x080a, LSS_AUDIO },		/* Moorestown Audio Ctrl */ +/* 11 */ { 0x080b, LSS_IMAGING },	/* Moorestown ISP */ +/* 12 */ { 0x080c, LSS_SECURITY },	/* Moorestown Security Controller */ +/* 13 */ { 0x080d, LSS_DISPLAY },	/* Moorestown External Displays */ +/* 14 */ { 0x080e, 0 },			/* Moorestown SCU IPC */ +/* 15 */ { 0x080f, LSS_GPIO },		/* Moorestown GPIO Controller */ +/* 16 */ { 0x0810, 0 },			/* Moorestown Power Management Unit */ +/* 17 */ { 0x0811, LSS_USB_OTG },	/* Moorestown OTG Ctrl */ +/* 18 */ { 0x0812, LSS_SPI2 },		/* Moorestown SPI Ctrl 2 */ +/* 19 */ { 0x0813, 0 },			/* Moorestown SC DMA */ +/* 20 */ { 0x0814, LSS_AUDIO_LPE },	/* Moorestown LPE DMA */ +/* 21 */ { 0x0815, LSS_AUDIO_SSP },	/* Moorestown SSP0 */ + +/* 22 */ { 0x084F, LSS_SD_HC2 },	/* Moorestown SD Host Ctrl 2 */ + +/* 23 */ { 0x4102, 0 
},			/* Lincroft */ +/* 24 */ { 0x4110, 0 },			/* Lincroft */ +}; + +/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */ +static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; +static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, +					0x0804, 0x0805, 0x080f, 0}; + +/* handle concurrent SMP invokations of pmu_pci_set_power_state() */ +static spinlock_t mrst_pmu_power_state_lock; + +static unsigned int wake_counters[MRST_NUM_LSS];	/* DEBUG only */ +static unsigned int pmu_irq_stats[INT_INVALID + 1];	/* DEBUG only */ + +static int graphics_is_off; +static int lss_s0i3_enabled; +static bool mrst_pmu_s0i3_enable; + +/*  debug counters */ +static u32 pmu_wait_ready_calls; +static u32 pmu_wait_ready_udelays; +static u32 pmu_wait_ready_udelays_max; +static u32 pmu_wait_done_calls; +static u32 pmu_wait_done_udelays; +static u32 pmu_wait_done_udelays_max; +static u32 pmu_set_power_state_entry; +static u32 pmu_set_power_state_send_cmd; + +static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num) +{ +	int index = 0; + +	if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815)) +		index = pci_dev_num - 0x800; +	else if (pci_dev_num == 0x084F) +		index = 22; +	else if (pci_dev_num == 0x4102) +		index = 23; +	else if (pci_dev_num == 0x4110) +		index = 24; + +	if (pci_dev_num != mrst_devs[index].pci_dev_num) { +		WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num); +		return 0; +	} + +	return &mrst_devs[index]; +} + +/** + * mrst_pmu_validate_cstates + * @dev: cpuidle_device + * + * Certain states are not appropriate for governor to pick in some cases. + * This function will be called as cpuidle_device's prepare callback and + * thus tells governor to ignore such states when selecting the next state + * to enter. + */ + +#define IDLE_STATE4_IS_C6	4 +#define IDLE_STATE5_IS_S0I3	5 + +int mrst_pmu_invalid_cstates(void) +{ +	int cpu = smp_processor_id(); + +	/* +	 * Demote to C4 if the PMU is busy. 
+	 * Since LSS changes leave the busy bit clear... +	 * busy means either the PMU is waiting for an ACK-C6 that +	 * isn't coming due to an MWAIT that returned immediately; +	 * or we returned from S0i3 successfully, and the PMU +	 * is not done sending us interrupts. +	 */ +	if (pmu_read_busy_status()) +		return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3; + +	/* +	 * Disallow S0i3 if: PMU is not initialized, or CPU1 is active, +	 * or if device LSS is insufficient, or the GPU is active, +	 * or if it has been explicitly disabled. +	 */ +	if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) || +	    !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable) +		return 1 << IDLE_STATE5_IS_S0I3; +	else +		return 0; +} + +/* + * pmu_update_wake_counters(): read PM_WKS, update wake_counters[] + * DEBUG only. + */ +static void pmu_update_wake_counters(void) +{ +	int lss; +	u32 wake_status; + +	wake_status = pmu_read_wks(); + +	for (lss = 0; lss < MRST_NUM_LSS; ++lss) { +		if (wake_status & (1 << lss)) +			wake_counters[lss]++; +	} +} + +int mrst_pmu_s0i3_entry(void) +{ +	int status; + +	/* Clear any possible error conditions */ +	pmu_write_ics(0x300); + +	/* set wake control to current D-states */ +	pmu_write_wssc(S0I3_SSS_TARGET); + +	status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd); +	pmu_update_wake_counters(); +	return status; +} + +/* poll for maximum of 5ms for busy bit to clear */ +static int pmu_wait_ready(void) +{ +	int udelays; + +	pmu_wait_ready_calls++; + +	for (udelays = 0; udelays < 500; ++udelays) { +		if (udelays > pmu_wait_ready_udelays_max) +			pmu_wait_ready_udelays_max = udelays; + +		if (pmu_read_busy_status() == 0) +			return 0; + +		udelay(10); +		pmu_wait_ready_udelays++; +	} + +	/* +	 * if this fires, observe +	 * /sys/kernel/debug/mrst_pmu_wait_ready_calls +	 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays +	 */ +	WARN_ONCE(1, "SCU not ready for 5ms"); +	return -EBUSY; +} +/* poll for maximum of 50ms us for 
busy bit to clear */ +static int pmu_wait_done(void) +{ +	int udelays; + +	pmu_wait_done_calls++; + +	for (udelays = 0; udelays < 500; ++udelays) { +		if (udelays > pmu_wait_done_udelays_max) +			pmu_wait_done_udelays_max = udelays; + +		if (pmu_read_busy_status() == 0) +			return 0; + +		udelay(100); +		pmu_wait_done_udelays++; +	} + +	/* +	 * if this fires, observe +	 * /sys/kernel/debug/mrst_pmu_wait_done_calls +	 * /sys/kernel/debug/mrst_pmu_wait_done_udelays +	 */ +	WARN_ONCE(1, "SCU not done for 50ms"); +	return -EBUSY; +} + +u32 mrst_pmu_msi_is_disabled(void) +{ +	return pmu_msi_is_disabled(); +} + +void mrst_pmu_enable_msi(void) +{ +	pmu_msi_enable(); +} + +/** + * pmu_irq - pmu driver interrupt handler + * Context: interrupt context + */ +static irqreturn_t pmu_irq(int irq, void *dummy) +{ +	union pmu_pm_ics pmu_ics; + +	pmu_ics.value = pmu_read_ics(); + +	if (!pmu_ics.bits.pending) +		return IRQ_NONE; + +	switch (pmu_ics.bits.cause) { +	case INT_SPURIOUS: +	case INT_CMD_DONE: +	case INT_CMD_ERR: +	case INT_WAKE_RX: +	case INT_SS_ERROR: +	case INT_S0IX_MISS: +	case INT_NO_ACKC6: +		pmu_irq_stats[pmu_ics.bits.cause]++; +		break; +	default: +		pmu_irq_stats[INT_INVALID]++; +	} + +	pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */ + +	return IRQ_HANDLED; +} + +/* + * Translate PCI power management to MRST LSS D-states + */ +static int pci_2_mrst_state(int lss, pci_power_t pci_state) +{ +	switch (pci_state) { +	case PCI_D0: +		if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET) +			return D0i1; +		else +			return D0; +	case PCI_D1: +		return D0i1; +	case PCI_D2: +		return D0i2; +	case PCI_D3hot: +	case PCI_D3cold: +		return D0i3; +	default: +		WARN(1, "pci_state %d\n", pci_state); +		return 0; +	} +} + +static int pmu_issue_command(u32 pm_ssc) +{ +	union pmu_pm_set_cfg_cmd_t command; + +	if (pmu_read_busy_status()) { +		pr_debug("pmu is busy, Operation not permitted\n"); +		return -1; +	} + +	/* +	 * enable interrupts in PMU so that interrupts are +	 * 
propagated when ioc bit for a particular set +	 * command is set +	 */ + +	pmu_irq_enable(); + +	/* Configure the sub systems for pmu2 */ + +	pmu_write_ssc(pm_ssc); + +	/* +	 * Send the set config command for pmu its configured +	 * for mode CM_IMMEDIATE & hence with No Trigger +	 */ + +	command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE; +	command.pmu2_params.d_param.cfg_delay = 0; +	command.pmu2_params.d_param.rsvd = 0; + +	/* construct the command to send SET_CFG to particular PMU */ +	command.pmu2_params.d_param.cmd = SET_CFG_CMD; +	command.pmu2_params.d_param.ioc = 0; +	command.pmu2_params.d_param.mode_id = 0; +	command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0; + +	/* write the value of PM_CMD into particular PMU */ +	pr_debug("pmu command being written %x\n", +			command.pmu_pm_set_cfg_cmd_value); + +	pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value); + +	return 0; +} + +static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state) +{ +	u16 existing_request; +	int i; + +	for (i = 0; ids[i]; ++i) { +		struct mrst_device *mrst_dev; + +		mrst_dev = pci_id_2_mrst_dev(ids[i]); +		if (unlikely(!mrst_dev)) +			continue; + +		existing_request = mrst_dev->latest_request; +		if (existing_request < pci_state) +			pci_state = existing_request; +	} +	return pci_state; +} + +/** + * pmu_pci_set_power_state - Callback function is used by all the PCI devices + *			for a platform  specific device power on/shutdown. 
+ */ + +int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state) +{ +	u32 old_sss, new_sss; +	int status = 0; +	struct mrst_device *mrst_dev; + +	pmu_set_power_state_entry++; + +	BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL); +	BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold); + +	mrst_dev = pci_id_2_mrst_dev(pdev->device); +	if (unlikely(!mrst_dev)) +		return -ENODEV; + +	mrst_dev->pci_state_counts[pci_state]++;	/* count invocations */ + +	/* PMU driver calls self as part of PCI initialization, ignore */ +	if (pdev->device == PCI_DEV_ID_MRST_PMU) +		return 0; + +	BUG_ON(!pmu_reg); /* SW bug if called before initialized */ + +	spin_lock(&mrst_pmu_power_state_lock); + +	if (pdev->d3_delay) { +		dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n", +			pdev->d3_delay); +		pdev->d3_delay = 0; +	} +	/* +	 * If Lincroft graphics, simply remember state +	 */ +	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY +		&& !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) { +		if (pci_state == PCI_D0) +			graphics_is_off = 0; +		else +			graphics_is_off = 1; +		goto ret; +	} + +	if (!mrst_dev->lss) +		goto ret;	/* device with no LSS */ + +	if (mrst_dev->latest_request == pci_state) +		goto ret;	/* no change */ + +	mrst_dev->latest_request = pci_state;	/* record latest request */ + +	/* +	 * LSS9 and LSS10 contain multiple PCI devices. 
+	 * Use the lowest numbered (highest power) state in the LSS +	 */ +	if (mrst_dev->lss == 9) +		pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state); +	else if (mrst_dev->lss == 10) +		pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state); + +	status = pmu_wait_ready(); +	if (status) +		goto ret; + +	old_sss = pmu_read_sss(); +	new_sss = old_sss & ~SSMSK(3, mrst_dev->lss); +	new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state), +			mrst_dev->lss); + +	if (new_sss == old_sss) +		goto ret;	/* nothing to do */ + +	pmu_set_power_state_send_cmd++; + +	status = pmu_issue_command(new_sss); + +	if (unlikely(status != 0)) { +		dev_err(&pdev->dev, "Failed to Issue a PM command\n"); +		goto ret; +	} + +	if (pmu_wait_done()) +		goto ret; + +	lss_s0i3_enabled = +	((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET); +ret: +	spin_unlock(&mrst_pmu_power_state_lock); +	return status; +} + +#ifdef CONFIG_DEBUG_FS +static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static inline const char *d0ix_name(int state) +{ +	return d0ix_names[(int) state]; +} + +static int debug_mrst_pmu_show(struct seq_file *s, void *unused) +{ +	struct pci_dev *pdev = NULL; +	u32 cur_pmsss; +	int lss; + +	seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET); + +	cur_pmsss = pmu_read_sss(); + +	seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET); + +	seq_printf(s, "0x%08X Current SSS ", cur_pmsss); +	seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n"); + +	if (cpumask_equal(cpu_online_mask, cpumask_of(0))) +		seq_printf(s, "cpu0 is only cpu online\n"); +	else +		seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n"); + +	seq_printf(s, "GFX: %s\n", graphics_is_off ? 
"" : "[BLOCKS s0i3]"); + + +	for_each_pci_dev(pdev) { +		int pos; +		u16 pmcsr; +		struct mrst_device *mrst_dev; +		int i; + +		mrst_dev = pci_id_2_mrst_dev(pdev->device); + +		seq_printf(s, "%s %04x/%04X %-16.16s ", +			dev_name(&pdev->dev), +			pdev->vendor, pdev->device, +			dev_driver_string(&pdev->dev)); + +		if (unlikely (!mrst_dev)) { +			seq_printf(s, " UNKNOWN\n"); +			continue; +		} + +		if (mrst_dev->lss) +			seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss, +				d0ix_name(((cur_pmsss >> +					(mrst_dev->lss * 2)) & 0x3))); +		else +			seq_printf(s, "            "); + +		/* PCI PM config space setting */ +		pos = pci_find_capability(pdev, PCI_CAP_ID_PM); +		if (pos != 0) { +			pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); +		seq_printf(s, "PCI-%-4s", +			pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK)); +		} else { +			seq_printf(s, "        "); +		} + +		seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request)); +		for (i = 0; i <= PCI_D3cold; ++i) +			seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]); + +		if (mrst_dev->lss) { +			unsigned int lssmask; + +			lssmask = SSMSK(D0i3, mrst_dev->lss); + +			if ((lssmask & S0I3_SSS_TARGET) && +				((lssmask & cur_pmsss) != +					(lssmask & S0I3_SSS_TARGET))) +						seq_printf(s , "[BLOCKS s0i3]"); +		} + +		seq_printf(s, "\n"); +	} +	seq_printf(s, "Wake Counters:\n"); +	for (lss = 0; lss < MRST_NUM_LSS; ++lss) +		seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]); + +	seq_printf(s, "Interrupt Counters:\n"); +	seq_printf(s, +		"INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n" +		"INT_CMD_ERR  \t%8u\n" "INT_WAKE_RX  \t%8u\n" +		"INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n" +		"INT_NO_ACKC6 \t%8u\n" "INT_INVALID  \t%8u\n", +		pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE], +		pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX], +		pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS], +		pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]); + +	seq_printf(s, 
"mrst_pmu_wait_ready_calls          %8d\n", +			pmu_wait_ready_calls); +	seq_printf(s, "mrst_pmu_wait_ready_udelays        %8d\n", +			pmu_wait_ready_udelays); +	seq_printf(s, "mrst_pmu_wait_ready_udelays_max    %8d\n", +			pmu_wait_ready_udelays_max); +	seq_printf(s, "mrst_pmu_wait_done_calls           %8d\n", +			pmu_wait_done_calls); +	seq_printf(s, "mrst_pmu_wait_done_udelays         %8d\n", +			pmu_wait_done_udelays); +	seq_printf(s, "mrst_pmu_wait_done_udelays_max     %8d\n", +			pmu_wait_done_udelays_max); +	seq_printf(s, "mrst_pmu_set_power_state_entry     %8d\n", +			pmu_set_power_state_entry); +	seq_printf(s, "mrst_pmu_set_power_state_send_cmd  %8d\n", +			pmu_set_power_state_send_cmd); +	seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status()); + +	return 0; +} + +static int debug_mrst_pmu_open(struct inode *inode, struct file *file) +{ +	return single_open(file, debug_mrst_pmu_show, NULL); +} + +static const struct file_operations devices_state_operations = { +	.open		= debug_mrst_pmu_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; +#endif	/* DEBUG_FS */ + +/* + * Validate SCU PCI shim PCI vendor capability byte + * against LSS hard-coded in mrst_devs[] above. + * DEBUG only. 
+ */ +static void pmu_scu_firmware_debug(void) +{ +	struct pci_dev *pdev = NULL; + +	for_each_pci_dev(pdev) { +		struct mrst_device *mrst_dev; +		u8 pci_config_lss; +		int pos; + +		mrst_dev = pci_id_2_mrst_dev(pdev->device); +		if (unlikely(!mrst_dev)) { +			printk(KERN_ERR FW_BUG "pmu: Unknown " +				"PCI device 0x%04X\n", pdev->device); +			continue; +		} + +		if (mrst_dev->lss == 0) +			continue;	 /* no LSS in our table */ + +		pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR); +		if (!pos != 0) { +			printk(KERN_ERR FW_BUG "pmu: 0x%04X " +				"missing PCI Vendor Capability\n", +				pdev->device); +			continue; +		} +		pci_read_config_byte(pdev, pos + 4, &pci_config_lss); +		if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) { +			printk(KERN_ERR FW_BUG "pmu: 0x%04X " +				"invalid PCI Vendor Capability 0x%x " +				" expected LSS 0x%X\n", +				pdev->device, pci_config_lss, mrst_dev->lss); +			continue; +		} +		pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK; + +		if (mrst_dev->lss == pci_config_lss) +			continue; + +		printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n", +			pdev->device, pci_config_lss, mrst_dev->lss); +	} +} + +/** + * pmu_probe + */ +static int __devinit pmu_probe(struct pci_dev *pdev, +				   const struct pci_device_id *pci_id) +{ +	int ret; +	struct mrst_pmu_reg *pmu; + +	/* Init the device */ +	ret = pci_enable_device(pdev); +	if (ret) { +		dev_err(&pdev->dev, "Unable to Enable PCI device\n"); +		return ret; +	} + +	ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME); +	if (ret < 0) { +		dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n"); +		goto out_err1; +	} + +	/* Map the memory of PMU reg base */ +	pmu = pci_iomap(pdev, 0, 0); +	if (!pmu) { +		dev_err(&pdev->dev, "Unable to map the PMU address space\n"); +		ret = -ENOMEM; +		goto out_err2; +	} + +#ifdef CONFIG_DEBUG_FS +	/* /sys/kernel/debug/mrst_pmu */ +	(void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO, +				NULL, NULL, &devices_state_operations); +#endif +	
pmu_reg = pmu;	/* success */ + +	if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) { +		dev_err(&pdev->dev, "Registering isr has failed\n"); +		ret = -1; +		goto out_err3; +	} + +	pmu_scu_firmware_debug(); + +	pmu_write_wkc(S0I3_WAKE_SOURCES);	/* Enable S0i3 wakeup sources */ + +	pmu_wait_ready(); + +	pmu_write_ssc(D0I1_ACG_SSS_TARGET);	/* Enable Auto-Clock_Gating */ +	pmu_write_cmd(0x201); + +	spin_lock_init(&mrst_pmu_power_state_lock); + +	/* Enable the hardware interrupt */ +	pmu_irq_enable(); +	return 0; + +out_err3: +	free_irq(pdev->irq, NULL); +	pci_iounmap(pdev, pmu_reg); +	pmu_reg = NULL; +out_err2: +	pci_release_region(pdev, 0); +out_err1: +	pci_disable_device(pdev); +	return ret; +} + +static void __devexit pmu_remove(struct pci_dev *pdev) +{ +	dev_err(&pdev->dev, "Mid PM pmu_remove called\n"); + +	/* Freeing up the irq */ +	free_irq(pdev->irq, NULL); + +	pci_iounmap(pdev, pmu_reg); +	pmu_reg = NULL; + +	/* disable the current PCI device */ +	pci_release_region(pdev, 0); +	pci_disable_device(pdev); +} + +static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = { +	{ PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 }, +	{ } +}; + +MODULE_DEVICE_TABLE(pci, pmu_pci_ids); + +static struct pci_driver driver = { +	.name = MRST_PMU_DRV_NAME, +	.id_table = pmu_pci_ids, +	.probe = pmu_probe, +	.remove = __devexit_p(pmu_remove), +}; + +/** + * pmu_pci_register - register the PMU driver as PCI device + */ +static int __init pmu_pci_register(void) +{ +	return pci_register_driver(&driver); +} + +/* Register and probe via fs_initcall() to preceed device_initcall() */ +fs_initcall(pmu_pci_register); + +static void __exit mid_pci_cleanup(void) +{ +	pci_unregister_driver(&driver); +} + +static int ia_major; +static int ia_minor; + +static int pmu_sfi_parse_oem(struct sfi_table_header *table) +{ +	struct sfi_table_simple *sb; + +	sb = (struct sfi_table_simple *)table; +	ia_major = (sb->pentry[1] >> 0) & 0xFFFF; +	ia_minor = (sb->pentry[1] >> 16) & 0xFFFF; +	printk(KERN_INFO 
"mrst_pmu: IA FW version v%x.%x\n", +		ia_major, ia_minor); + +	return 0; +} + +static int __init scu_fw_check(void) +{ +	int ret; +	u32 fw_version; + +	if (!pmu_reg) +		return 0;	/* this driver didn't probe-out */ + +	sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem); + +	if (ia_major < 0x6005 || ia_minor < 0x1525) { +		WARN(1, "mrst_pmu: IA FW version too old\n"); +		return -1; +	} + +	ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0, +					&fw_version, 1); + +	if (ret) { +		WARN(1, "mrst_pmu: IPC FW version? %d\n", ret); +	} else { +		int scu_major = (fw_version >> 8) & 0xFF; +		int scu_minor = (fw_version >> 0) & 0xFF; + +		printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version); + +		if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) { +			printk(KERN_INFO "mrst_pmu: enabling S0i3\n"); +			mrst_pmu_s0i3_enable = true; +		} else { +			WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X", +					scu_major, scu_minor); +		} +	} +	return 0; +} +late_initcall(scu_fw_check); +module_exit(mid_pci_cleanup); diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h new file mode 100644 index 000000000000..bfbfe64b167b --- /dev/null +++ b/arch/x86/platform/mrst/pmu.h @@ -0,0 +1,234 @@ +/* + * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c + * + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _MRST_PMU_H_ +#define _MRST_PMU_H_ + +#define PCI_DEV_ID_MRST_PMU		0x0810 +#define MRST_PMU_DRV_NAME		"mrst_pmu" +#define	PCI_SUB_CLASS_MASK		0xFF00 + +#define	PCI_VENDOR_CAP_LOG_ID_MASK	0x7F +#define PCI_VENDOR_CAP_LOG_SS_MASK	0x80 + +#define SUB_SYS_ALL_D0I1	0x01155555 +#define S0I3_WAKE_SOURCES	0x00001FFF + +#define PM_S0I3_COMMAND					\ +	((0 << 31) |	/* Reserved */			\ +	(0 << 30) |	/* Core must be idle */		\ +	(0xc2 << 22) |	/* ACK C6 trigger */		\ +	(3 << 19) |	/* Trigger on DMI message */	\ +	(3 << 16) |	/* Enter S0i3 */		\ +	(0 << 13) |	/* Numeric mode ID (sw) */	\ +	(3 << 9) |	/* Trigger mode */		\ +	(0 << 8) |	/* Do not interrupt */		\ +	(1 << 0))	/* Set configuration */ + +#define	LSS_DMI		0 +#define	LSS_SD_HC0	1 +#define	LSS_SD_HC1	2 +#define	LSS_NAND	3 +#define	LSS_IMAGING	4 +#define	LSS_SECURITY	5 +#define	LSS_DISPLAY	6 +#define	LSS_USB_HC	7 +#define	LSS_USB_OTG	8 +#define	LSS_AUDIO	9 +#define	LSS_AUDIO_LPE	9 +#define	LSS_AUDIO_SSP	9 +#define	LSS_I2C0	10 +#define	LSS_I2C1	10 +#define	LSS_I2C2	10 +#define	LSS_KBD		10 +#define	LSS_SPI0	10 +#define	LSS_SPI1	10 +#define	LSS_SPI2	10 +#define	LSS_GPIO	10 +#define	LSS_SRAM	11	/* used by SCU, do not touch */ +#define	LSS_SD_HC2	12 +/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */ +#define MRST_NUM_LSS	13 + +#define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) + +#define	SSMSK(mask, lss) ((mask) << ((lss) * 2)) +#define	D0	0 +#define	D0i1	1 +#define	D0i2	2 +#define	D0i3	3 + +#define S0I3_SSS_TARGET	(		\ +	SSMSK(D0i1, LSS_DMI) |		\ +	SSMSK(D0i3, LSS_SD_HC0) |	\ +	SSMSK(D0i3, LSS_SD_HC1) |	\ +	SSMSK(D0i3, LSS_NAND) |		\ +	SSMSK(D0i3, LSS_SD_HC2) |	\ +	SSMSK(D0i3, LSS_IMAGING) |	\ +	SSMSK(D0i3, LSS_SECURITY) |	\ +	SSMSK(D0i3, LSS_DISPLAY) |	\ +	SSMSK(D0i3, LSS_USB_HC) |	\ +	SSMSK(D0i3, LSS_USB_OTG) |	\ +	SSMSK(D0i3, LSS_AUDIO) |	\ +	SSMSK(D0i1, LSS_I2C0)) + +/* + * D0i1 on Langwell is Autonomous Clock Gating (ACG). + * Enable ACG on every LSS except camera and audio + */ +#define D0I1_ACG_SSS_TARGET	 \ +	(SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO)) + +enum cm_mode { +	CM_NOP,			/* ignore the config mode value */ +	CM_IMMEDIATE, +	CM_DELAY, +	CM_TRIGGER, +	CM_INVALID +}; + +enum sys_state { +	SYS_STATE_S0I0, +	SYS_STATE_S0I1, +	SYS_STATE_S0I2, +	SYS_STATE_S0I3, +	SYS_STATE_S3, +	SYS_STATE_S5 +}; + +#define SET_CFG_CMD	1 + +enum int_status { +	INT_SPURIOUS = 0, +	INT_CMD_DONE = 1, +	INT_CMD_ERR = 2, +	INT_WAKE_RX = 3, +	INT_SS_ERROR = 4, +	INT_S0IX_MISS = 5, +	INT_NO_ACKC6 = 6, +	INT_INVALID = 7, +}; + +/* PMU register interface */ +static struct mrst_pmu_reg { +	u32 pm_sts;		/* 0x00 */ +	u32 pm_cmd;		/* 0x04 */ +	u32 pm_ics;		/* 0x08 */ +	u32 _resv1;		/* 0x0C */ +	u32 pm_wkc[2];		/* 0x10 */ +	u32 pm_wks[2];		/* 0x18 */ +	u32 pm_ssc[4];		/* 0x20 */ +	u32 pm_sss[4];		/* 0x30 */ +	u32 pm_wssc[4];		/* 0x40 */ +	u32 pm_c3c4;		/* 0x50 */ +	u32 pm_c5c6;		/* 0x54 */ +	u32 pm_msi_disable;	/* 0x58 */ +} *pmu_reg; + +static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); } +static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); } +static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); } +static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); } + +static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); } 
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); } +static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); } +static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); } +static inline void pmu_write_wssc(u32 arg) +					{ writel(arg, &pmu_reg->pm_wssc[0]); } + +static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); } +static inline u32 pmu_msi_is_disabled(void) +				{ return readl(&pmu_reg->pm_msi_disable); } + +union pmu_pm_ics { +	struct { +		u32 cause:8; +		u32 enable:1; +		u32 pending:1; +		u32 reserved:22; +	} bits; +	u32 value; +}; + +static inline void pmu_irq_enable(void) +{ +	union pmu_pm_ics pmu_ics; + +	pmu_ics.value = pmu_read_ics(); +	pmu_ics.bits.enable = 1; +	pmu_write_ics(pmu_ics.value); +} + +union pmu_pm_status { +	struct { +		u32 pmu_rev:8; +		u32 pmu_busy:1; +		u32 mode_id:4; +		u32 Reserved:19; +	} pmu_status_parts; +	u32 pmu_status_value; +}; + +static inline int pmu_read_busy_status(void) +{ +	union pmu_pm_status result; + +	result.pmu_status_value = pmu_read_sts(); + +	return result.pmu_status_parts.pmu_busy; +} + +/* pmu set config parameters */ +struct cfg_delay_param_t { +	u32 cmd:8; +	u32 ioc:1; +	u32 cfg_mode:4; +	u32 mode_id:3; +	u32 sys_state:3; +	u32 cfg_delay:8; +	u32 rsvd:5; +}; + +struct cfg_trig_param_t { +	u32 cmd:8; +	u32 ioc:1; +	u32 cfg_mode:4; +	u32 mode_id:3; +	u32 sys_state:3; +	u32 cfg_trig_type:3; +	u32 cfg_trig_val:8; +	u32 cmbi:1; +	u32 rsvd1:1; +}; + +union pmu_pm_set_cfg_cmd_t { +	union { +		struct cfg_delay_param_t d_param; +		struct cfg_trig_param_t t_param; +	} pmu2_params; +	u32 pmu_pm_set_cfg_cmd_value; +}; + +#ifdef FUTURE_PATCH +extern int mrst_s0i3_entry(u32 regval, u32 *regaddr); +#else +static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; } +#endif +#endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 60aeeb56948f..a9627e2e3295 100644 --- a/arch/x86/xen/setup.c +++ 
b/arch/x86/xen/setup.c @@ -9,6 +9,7 @@  #include <linux/mm.h>  #include <linux/pm.h>  #include <linux/memblock.h> +#include <linux/cpuidle.h>  #include <asm/elf.h>  #include <asm/vdso.h> @@ -426,7 +427,7 @@ void __init xen_arch_setup(void)  #ifdef CONFIG_X86_32  	boot_cpu_data.hlt_works_ok = 1;  #endif -	pm_idle = default_idle; +	disable_cpuidle();  	boot_option_idle_override = IDLE_HALT;  	fiddle_vdso(); diff --git a/block/blk-core.c b/block/blk-core.c index b850bedad229..b627558c461f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1368,8 +1368,10 @@ static bool should_fail_request(struct hd_struct *part, unsigned int bytes)  static int __init fail_make_request_debugfs(void)  { -	return init_fault_attr_dentries(&fail_make_request, -					"fail_make_request"); +	struct dentry *dir = fault_create_debugfs_attr("fail_make_request", +						NULL, &fail_make_request); + +	return IS_ERR(dir) ? PTR_ERR(dir) : 0;  }  late_initcall(fail_make_request_debugfs); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4f0c06c7a338..780354888958 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -28,7 +28,10 @@ int blk_should_fake_timeout(struct request_queue *q)  static int __init fail_io_timeout_debugfs(void)  { -	return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); +	struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", +						NULL, &fail_io_timeout); + +	return IS_ERR(dir) ? PTR_ERR(dir) : 0;  }  late_initcall(fail_io_timeout_debugfs); diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 73863d86f022..76dc02f15574 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -126,6 +126,12 @@ u8 ACPI_INIT_GLOBAL(acpi_gbl_copy_dsdt_locally, FALSE);   */  u8 ACPI_INIT_GLOBAL(acpi_gbl_truncate_io_addresses, FALSE); +/* + * Disable runtime checking and repair of values returned by control methods. + * Use only if the repair is causing a problem on a particular machine. 
+ */ +u8 ACPI_INIT_GLOBAL(acpi_gbl_disable_auto_repair, FALSE); +  /* acpi_gbl_FADT is a local copy of the FADT, converted to a common format. */  struct acpi_table_fadt acpi_gbl_FADT; diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index c7f743ca395b..5552125d8340 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -357,6 +357,7 @@ struct acpi_predefined_data {  	char *pathname;  	const union acpi_predefined_info *predefined;  	union acpi_operand_object *parent_package; +	struct acpi_namespace_node *node;  	u32 flags;  	u8 node_flags;  }; diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 94e73c97cf85..c445cca490ea 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -468,6 +468,7 @@ static const union acpi_predefined_info predefined_names[] =  	{{"_SWS", 0, ACPI_RTYPE_INTEGER}},  	{{"_TC1", 0, ACPI_RTYPE_INTEGER}},  	{{"_TC2", 0, ACPI_RTYPE_INTEGER}}, +	{{"_TDL", 0, ACPI_RTYPE_INTEGER}},  	{{"_TIP", 1, ACPI_RTYPE_INTEGER}},  	{{"_TIV", 1, ACPI_RTYPE_INTEGER}},  	{{"_TMP", 0, ACPI_RTYPE_INTEGER}}, diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c index 9fb03fa8ffde..c845c8089f39 100644 --- a/drivers/acpi/acpica/nspredef.c +++ b/drivers/acpi/acpica/nspredef.c @@ -193,14 +193,20 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node,  	}  	/* -	 * 1) We have a return value, but if one wasn't expected, just exit, this is -	 * not a problem. For example, if the "Implicit Return" feature is -	 * enabled, methods will always return a value. +	 * Return value validation and possible repair.  	 * -	 * 2) If the return value can be of any type, then we cannot perform any -	 * validation, exit. +	 * 1) Don't perform return value validation/repair if this feature +	 * has been disabled via a global option. +	 * +	 * 2) We have a return value, but if one wasn't expected, just exit, +	 * this is not a problem. 
For example, if the "Implicit Return" +	 * feature is enabled, methods will always return a value. +	 * +	 * 3) If the return value can be of any type, then we cannot perform +	 * any validation, just exit.  	 */ -	if ((!predefined->info.expected_btypes) || +	if (acpi_gbl_disable_auto_repair || +	    (!predefined->info.expected_btypes) ||  	    (predefined->info.expected_btypes == ACPI_RTYPE_ALL)) {  		goto cleanup;  	} @@ -212,6 +218,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node,  		goto cleanup;  	}  	data->predefined = predefined; +	data->node = node;  	data->node_flags = node->flags;  	data->pathname = pathname; diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index 973883babee1..024c4f263f87 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -503,6 +503,21 @@ acpi_ns_repair_TSS(struct acpi_predefined_data *data,  {  	union acpi_operand_object *return_object = *return_object_ptr;  	acpi_status status; +	struct acpi_namespace_node *node; + +	/* +	 * We can only sort the _TSS return package if there is no _PSS in the +	 * same scope. This is because if _PSS is present, the ACPI specification +	 * dictates that the _TSS Power Dissipation field is to be ignored, and +	 * therefore some BIOSs leave garbage values in the _TSS Power field(s). +	 * In this case, it is best to just return the _TSS package as-is. 
+	 * (May, 2011) +	 */ +	status = +	    acpi_ns_get_node(data->node, "^_PSS", ACPI_NS_NO_UPSEARCH, &node); +	if (ACPI_SUCCESS(status)) { +		return (AE_OK); +	}  	status = acpi_ns_check_sorted_list(data, return_object, 5, 1,  					   ACPI_SORT_DESCENDING, diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 48db0944ce4a..62365f6075dd 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -126,12 +126,29 @@ acpi_tb_add_table(struct acpi_table_desc *table_desc, u32 *table_index)  	}  	/* -	 * Originally, we checked the table signature for "SSDT" or "PSDT" here. -	 * Next, we added support for OEMx tables, signature "OEM". -	 * Valid tables were encountered with a null signature, so we've just -	 * given up on validating the signature, since it seems to be a waste -	 * of code. The original code was removed (05/2008). +	 * Validate the incoming table signature. +	 * +	 * 1) Originally, we checked the table signature for "SSDT" or "PSDT". +	 * 2) We added support for OEMx tables, signature "OEM". +	 * 3) Valid tables were encountered with a null signature, so we just +	 *    gave up on validating the signature, (05/2008). +	 * 4) We encountered non-AML tables such as the MADT, which caused +	 *    interpreter errors and kernel faults. So now, we once again allow +	 *    only "SSDT", "OEMx", and now, also a null signature. (05/2011).  	 */ +	if ((table_desc->pointer->signature[0] != 0x00) && +	    (!ACPI_COMPARE_NAME(table_desc->pointer->signature, ACPI_SIG_SSDT)) +	    && (ACPI_STRNCMP(table_desc->pointer->signature, "OEM", 3))) { +		ACPI_ERROR((AE_INFO, +			    "Table has invalid signature [%4.4s] (0x%8.8X), must be SSDT or OEMx", +			    acpi_ut_valid_acpi_name(*(u32 *)table_desc-> +						    pointer-> +						    signature) ? 
table_desc-> +			    pointer->signature : "????", +			    *(u32 *)table_desc->pointer->signature)); + +		return_ACPI_STATUS(AE_BAD_SIGNATURE); +	}  	(void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index f739a70b1c70..c34aa51af4ee 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig @@ -10,9 +10,11 @@ config ACPI_APEI  	  error injection.  config ACPI_APEI_GHES -	tristate "APEI Generic Hardware Error Source" +	bool "APEI Generic Hardware Error Source"  	depends on ACPI_APEI && X86  	select ACPI_HED +	select LLIST +	select GENERIC_ALLOCATOR  	help  	  Generic Hardware Error Source provides a way to report  	  platform hardware errors (such as that from chipset). It @@ -30,6 +32,13 @@ config ACPI_APEI_PCIEAER  	  PCIe AER errors may be reported via APEI firmware first mode.  	  Turn on this option to enable the corresponding support. +config ACPI_APEI_MEMORY_FAILURE +	bool "APEI memory error recovering support" +	depends on ACPI_APEI && MEMORY_FAILURE +	help +	  Memory errors may be reported via APEI firmware first mode. +	  Turn on this option to enable the memory recovering support. +  config ACPI_APEI_EINJ  	tristate "APEI Error INJection (EINJ)"  	depends on ACPI_APEI && DEBUG_FS diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 4a904a4bf05f..8041248fce9b 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -157,9 +157,10 @@ EXPORT_SYMBOL_GPL(apei_exec_noop);   * Interpret the specified action. Go through whole action table,   * execute all instructions belong to the action.   */ -int apei_exec_run(struct apei_exec_context *ctx, u8 action) +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, +		    bool optional)  { -	int rc; +	int rc = -ENOENT;  	u32 i, ip;  	struct acpi_whea_header *entry;  	apei_exec_ins_func_t run; @@ -198,9 +199,9 @@ rewind:  			goto rewind;  	} -	return 0; +	return !optional && rc < 0 ? 
rc : 0;  } -EXPORT_SYMBOL_GPL(apei_exec_run); +EXPORT_SYMBOL_GPL(__apei_exec_run);  typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx,  				      struct acpi_whea_header *entry, @@ -603,3 +604,29 @@ struct dentry *apei_get_debugfs_dir(void)  	return dapei;  }  EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); + +int apei_osc_setup(void) +{ +	static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c"; +	acpi_handle handle; +	u32 capbuf[3]; +	struct acpi_osc_context context = { +		.uuid_str	= whea_uuid_str, +		.rev		= 1, +		.cap.length	= sizeof(capbuf), +		.cap.pointer	= capbuf, +	}; + +	capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; +	capbuf[OSC_SUPPORT_TYPE] = 0; +	capbuf[OSC_CONTROL_TYPE] = 0; + +	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)) +	    || ACPI_FAILURE(acpi_run_osc(handle, &context))) +		return -EIO; +	else { +		kfree(context.ret.pointer); +		return 0; +	} +} +EXPORT_SYMBOL_GPL(apei_osc_setup); diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index ef0581f2094d..f57050e7a5e7 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -50,7 +50,18 @@ static inline u64 apei_exec_ctx_get_output(struct apei_exec_context *ctx)  	return ctx->value;  } -int apei_exec_run(struct apei_exec_context *ctx, u8 action); +int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional); + +static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action) +{ +	return __apei_exec_run(ctx, action, 0); +} + +/* It is optional whether the firmware provides the action */ +static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action) +{ +	return __apei_exec_run(ctx, action, 1); +}  /* Common instruction implementation */ @@ -113,4 +124,6 @@ void apei_estatus_print(const char *pfx,  			const struct acpi_hest_generic_status *estatus);  int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus);  int apei_estatus_check(const 
struct acpi_hest_generic_status *estatus); + +int apei_osc_setup(void);  #endif diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index f74b2ea11f21..589b96c38704 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -46,7 +46,8 @@   * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the   * EINJ table through an unpublished extension. Use with caution as   * most will ignore the parameter and make their own choice of address - * for error injection. + * for error injection.  This extension is used only if + * param_extension module parameter is specified.   */  struct einj_parameter {  	u64 type; @@ -65,6 +66,9 @@ struct einj_parameter {  	((struct acpi_whea_header *)((char *)(tab) +			\  				    sizeof(struct acpi_table_einj))) +static bool param_extension; +module_param(param_extension, bool, 0); +  static struct acpi_table_einj *einj_tab;  static struct apei_resources einj_resources; @@ -285,7 +289,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)  	einj_exec_ctx_init(&ctx); -	rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION); +	rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION);  	if (rc)  		return rc;  	apei_exec_ctx_set_input(&ctx, type); @@ -323,7 +327,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)  	rc = __einj_error_trigger(trigger_paddr);  	if (rc)  		return rc; -	rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION); +	rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION);  	return rc;  } @@ -489,14 +493,6 @@ static int __init einj_init(void)  				     einj_debug_dir, NULL, &error_type_fops);  	if (!fentry)  		goto err_cleanup; -	fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, -				    einj_debug_dir, &error_param1); -	if (!fentry) -		goto err_cleanup; -	fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, -				    einj_debug_dir, &error_param2); -	if (!fentry) -		goto err_cleanup;  	fentry = debugfs_create_file("error_inject", S_IWUSR,  			
	     einj_debug_dir, NULL, &error_inject_fops);  	if (!fentry) @@ -513,12 +509,23 @@ static int __init einj_init(void)  	rc = apei_exec_pre_map_gars(&ctx);  	if (rc)  		goto err_release; -	param_paddr = einj_get_parameter_address(); -	if (param_paddr) { -		einj_param = ioremap(param_paddr, sizeof(*einj_param)); -		rc = -ENOMEM; -		if (!einj_param) -			goto err_unmap; +	if (param_extension) { +		param_paddr = einj_get_parameter_address(); +		if (param_paddr) { +			einj_param = ioremap(param_paddr, sizeof(*einj_param)); +			rc = -ENOMEM; +			if (!einj_param) +				goto err_unmap; +			fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, +						    einj_debug_dir, &error_param1); +			if (!fentry) +				goto err_unmap; +			fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR, +						    einj_debug_dir, &error_param2); +			if (!fentry) +				goto err_unmap; +		} else +			pr_warn(EINJ_PFX "Parameter extension is not supported.\n");  	}  	pr_info(EINJ_PFX "Error INJection is initialized.\n"); @@ -526,6 +533,8 @@ static int __init einj_init(void)  	return 0;  err_unmap: +	if (einj_param) +		iounmap(einj_param);  	apei_exec_post_unmap_gars(&ctx);  err_release:  	apei_resources_release(&einj_resources); diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index a4cfb64c86a1..903549df809b 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -33,7 +33,7 @@  #define ERST_DBG_PFX			"ERST DBG: " -#define ERST_DBG_RECORD_LEN_MAX		4096 +#define ERST_DBG_RECORD_LEN_MAX		0x4000  static void *erst_dbg_buf;  static unsigned int erst_dbg_buf_len; @@ -213,6 +213,10 @@ static struct miscdevice erst_dbg_dev = {  static __init int erst_dbg_init(void)  { +	if (erst_disable) { +		pr_info(ERST_DBG_PFX "ERST support is disabled.\n"); +		return -ENODEV; +	}  	return misc_register(&erst_dbg_dev);  } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6053f4780df9..2ca59dc69f7f 100644 --- a/drivers/acpi/apei/erst.c +++ 
b/drivers/acpi/apei/erst.c @@ -642,7 +642,7 @@ static int __erst_write_to_storage(u64 offset)  	int rc;  	erst_exec_ctx_init(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);  	if (rc)  		return rc;  	apei_exec_ctx_set_input(&ctx, offset); @@ -666,7 +666,7 @@ static int __erst_write_to_storage(u64 offset)  	if (rc)  		return rc;  	val = apei_exec_ctx_get_output(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_END); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);  	if (rc)  		return rc; @@ -681,7 +681,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)  	int rc;  	erst_exec_ctx_init(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);  	if (rc)  		return rc;  	apei_exec_ctx_set_input(&ctx, offset); @@ -709,7 +709,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)  	if (rc)  		return rc;  	val = apei_exec_ctx_get_output(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_END); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);  	if (rc)  		return rc; @@ -724,7 +724,7 @@ static int __erst_clear_from_storage(u64 record_id)  	int rc;  	erst_exec_ctx_init(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);  	if (rc)  		return rc;  	apei_exec_ctx_set_input(&ctx, record_id); @@ -748,7 +748,7 @@ static int __erst_clear_from_storage(u64 record_id)  	if (rc)  		return rc;  	val = apei_exec_ctx_get_output(&ctx); -	rc = apei_exec_run(&ctx, ACPI_ERST_END); +	rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);  	if (rc)  		return rc; diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index f703b2881153..0784f99a4665 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -12,7 +12,7 @@   * For more information about Generic Hardware Error Source, please   * refer to ACPI Specification version 4.0, section 17.3.2.6   * - * Copyright 
2010 Intel Corp. + * Copyright 2010,2011 Intel Corp.   *   Author: Huang Ying <ying.huang@intel.com>   *   * This program is free software; you can redistribute it and/or @@ -42,6 +42,9 @@  #include <linux/mutex.h>  #include <linux/ratelimit.h>  #include <linux/vmalloc.h> +#include <linux/irq_work.h> +#include <linux/llist.h> +#include <linux/genalloc.h>  #include <acpi/apei.h>  #include <acpi/atomicio.h>  #include <acpi/hed.h> @@ -53,6 +56,30 @@  #define GHES_PFX	"GHES: "  #define GHES_ESTATUS_MAX_SIZE		65536 +#define GHES_ESOURCE_PREALLOC_MAX_SIZE	65536 + +#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3 + +/* This is just an estimation for memory pool allocation */ +#define GHES_ESTATUS_CACHE_AVG_SIZE	512 + +#define GHES_ESTATUS_CACHES_SIZE	4 + +#define GHES_ESTATUS_IN_CACHE_MAX_NSEC	10000000000ULL +/* Prevent too many caches are allocated because of RCU */ +#define GHES_ESTATUS_CACHE_ALLOCED_MAX	(GHES_ESTATUS_CACHES_SIZE * 3 / 2) + +#define GHES_ESTATUS_CACHE_LEN(estatus_len)			\ +	(sizeof(struct ghes_estatus_cache) + (estatus_len)) +#define GHES_ESTATUS_FROM_CACHE(estatus_cache)			\ +	((struct acpi_hest_generic_status *)			\ +	 ((struct ghes_estatus_cache *)(estatus_cache) + 1)) + +#define GHES_ESTATUS_NODE_LEN(estatus_len)			\ +	(sizeof(struct ghes_estatus_node) + (estatus_len)) +#define GHES_ESTATUS_FROM_NODE(estatus_node)				\ +	((struct acpi_hest_generic_status *)				\ +	 ((struct ghes_estatus_node *)(estatus_node) + 1))  /*   * One struct ghes is created for each generic hardware error source. 
@@ -77,6 +104,22 @@ struct ghes {  	};  }; +struct ghes_estatus_node { +	struct llist_node llnode; +	struct acpi_hest_generic *generic; +}; + +struct ghes_estatus_cache { +	u32 estatus_len; +	atomic_t count; +	struct acpi_hest_generic *generic; +	unsigned long long time_in; +	struct rcu_head rcu; +}; + +int ghes_disable; +module_param_named(disable, ghes_disable, bool, 0); +  static int ghes_panic_timeout	__read_mostly = 30;  /* @@ -121,6 +164,22 @@ static struct vm_struct *ghes_ioremap_area;  static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);  static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); +/* + * printk is not safe in NMI context.  So in NMI handler, we allocate + * required memory from lock-less memory allocator + * (ghes_estatus_pool), save estatus into it, put them into lock-less + * list (ghes_estatus_llist), then delay printk into IRQ context via + * irq_work (ghes_proc_irq_work).  ghes_estatus_size_request record + * required pool size by all NMI error source. + */ +static struct gen_pool *ghes_estatus_pool; +static unsigned long ghes_estatus_pool_size_request; +static struct llist_head ghes_estatus_llist; +static struct irq_work ghes_proc_irq_work; + +struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; +static atomic_t ghes_estatus_cache_alloced; +  static int ghes_ioremap_init(void)  {  	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, @@ -180,6 +239,55 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr)  	__flush_tlb_one(vaddr);  } +static int ghes_estatus_pool_init(void) +{ +	ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1); +	if (!ghes_estatus_pool) +		return -ENOMEM; +	return 0; +} + +static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool, +					      struct gen_pool_chunk *chunk, +					      void *data) +{ +	free_page(chunk->start_addr); +} + +static void ghes_estatus_pool_exit(void) +{ +	gen_pool_for_each_chunk(ghes_estatus_pool, +				
ghes_estatus_pool_free_chunk_page, NULL); +	gen_pool_destroy(ghes_estatus_pool); +} + +static int ghes_estatus_pool_expand(unsigned long len) +{ +	unsigned long i, pages, size, addr; +	int ret; + +	ghes_estatus_pool_size_request += PAGE_ALIGN(len); +	size = gen_pool_size(ghes_estatus_pool); +	if (size >= ghes_estatus_pool_size_request) +		return 0; +	pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE; +	for (i = 0; i < pages; i++) { +		addr = __get_free_page(GFP_KERNEL); +		if (!addr) +			return -ENOMEM; +		ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1); +		if (ret) +			return ret; +	} + +	return 0; +} + +static void ghes_estatus_pool_shrink(unsigned long len) +{ +	ghes_estatus_pool_size_request -= PAGE_ALIGN(len); +} +  static struct ghes *ghes_new(struct acpi_hest_generic *generic)  {  	struct ghes *ghes; @@ -341,43 +449,196 @@ static void ghes_clear_estatus(struct ghes *ghes)  	ghes->flags &= ~GHES_TO_CLEAR;  } -static void ghes_do_proc(struct ghes *ghes) +static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)  { -	int sev, processed = 0; +	int sev, sec_sev;  	struct acpi_hest_generic_data *gdata; -	sev = ghes_severity(ghes->estatus->error_severity); -	apei_estatus_for_each_section(ghes->estatus, gdata) { -#ifdef CONFIG_X86_MCE +	sev = ghes_severity(estatus->error_severity); +	apei_estatus_for_each_section(estatus, gdata) { +		sec_sev = ghes_severity(gdata->error_severity);  		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,  				 CPER_SEC_PLATFORM_MEM)) { -			apei_mce_report_mem_error( -				sev == GHES_SEV_CORRECTED, -				(struct cper_sec_mem_err *)(gdata+1)); -			processed = 1; -		} +			struct cper_sec_mem_err *mem_err; +			mem_err = (struct cper_sec_mem_err *)(gdata+1); +#ifdef CONFIG_X86_MCE +			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, +						  mem_err);  #endif +#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE +			if (sev == GHES_SEV_RECOVERABLE && +			    sec_sev == GHES_SEV_RECOVERABLE && +			    
mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { +				unsigned long pfn; +				pfn = mem_err->physical_addr >> PAGE_SHIFT; +				memory_failure_queue(pfn, 0, 0); +			} +#endif +		}  	}  } -static void ghes_print_estatus(const char *pfx, struct ghes *ghes) +static void __ghes_print_estatus(const char *pfx, +				 const struct acpi_hest_generic *generic, +				 const struct acpi_hest_generic_status *estatus)  { -	/* Not more than 2 messages every 5 seconds */ -	static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2); -  	if (pfx == NULL) { -		if (ghes_severity(ghes->estatus->error_severity) <= +		if (ghes_severity(estatus->error_severity) <=  		    GHES_SEV_CORRECTED)  			pfx = KERN_WARNING HW_ERR;  		else  			pfx = KERN_ERR HW_ERR;  	} -	if (__ratelimit(&ratelimit)) { -		printk( -	"%s""Hardware error from APEI Generic Hardware Error Source: %d\n", -	pfx, ghes->generic->header.source_id); -		apei_estatus_print(pfx, ghes->estatus); +	printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n", +	       pfx, generic->header.source_id); +	apei_estatus_print(pfx, estatus); +} + +static int ghes_print_estatus(const char *pfx, +			      const struct acpi_hest_generic *generic, +			      const struct acpi_hest_generic_status *estatus) +{ +	/* Not more than 2 messages every 5 seconds */ +	static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); +	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2); +	struct ratelimit_state *ratelimit; + +	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED) +		ratelimit = &ratelimit_corrected; +	else +		ratelimit = &ratelimit_uncorrected; +	if (__ratelimit(ratelimit)) { +		__ghes_print_estatus(pfx, generic, estatus); +		return 1;  	} +	return 0; +} + +/* + * GHES error status reporting throttle, to report more kinds of + * errors, instead of just most frequently occurred errors. 
+ */ +static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus) +{ +	u32 len; +	int i, cached = 0; +	unsigned long long now; +	struct ghes_estatus_cache *cache; +	struct acpi_hest_generic_status *cache_estatus; + +	len = apei_estatus_len(estatus); +	rcu_read_lock(); +	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { +		cache = rcu_dereference(ghes_estatus_caches[i]); +		if (cache == NULL) +			continue; +		if (len != cache->estatus_len) +			continue; +		cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); +		if (memcmp(estatus, cache_estatus, len)) +			continue; +		atomic_inc(&cache->count); +		now = sched_clock(); +		if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC) +			cached = 1; +		break; +	} +	rcu_read_unlock(); +	return cached; +} + +static struct ghes_estatus_cache *ghes_estatus_cache_alloc( +	struct acpi_hest_generic *generic, +	struct acpi_hest_generic_status *estatus) +{ +	int alloced; +	u32 len, cache_len; +	struct ghes_estatus_cache *cache; +	struct acpi_hest_generic_status *cache_estatus; + +	alloced = atomic_add_return(1, &ghes_estatus_cache_alloced); +	if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) { +		atomic_dec(&ghes_estatus_cache_alloced); +		return NULL; +	} +	len = apei_estatus_len(estatus); +	cache_len = GHES_ESTATUS_CACHE_LEN(len); +	cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len); +	if (!cache) { +		atomic_dec(&ghes_estatus_cache_alloced); +		return NULL; +	} +	cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); +	memcpy(cache_estatus, estatus, len); +	cache->estatus_len = len; +	atomic_set(&cache->count, 0); +	cache->generic = generic; +	cache->time_in = sched_clock(); +	return cache; +} + +static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) +{ +	u32 len; + +	len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); +	len = GHES_ESTATUS_CACHE_LEN(len); +	gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); +	atomic_dec(&ghes_estatus_cache_alloced); +} + +static void 
ghes_estatus_cache_rcu_free(struct rcu_head *head) +{ +	struct ghes_estatus_cache *cache; + +	cache = container_of(head, struct ghes_estatus_cache, rcu); +	ghes_estatus_cache_free(cache); +} + +static void ghes_estatus_cache_add( +	struct acpi_hest_generic *generic, +	struct acpi_hest_generic_status *estatus) +{ +	int i, slot = -1, count; +	unsigned long long now, duration, period, max_period = 0; +	struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; + +	new_cache = ghes_estatus_cache_alloc(generic, estatus); +	if (new_cache == NULL) +		return; +	rcu_read_lock(); +	now = sched_clock(); +	for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { +		cache = rcu_dereference(ghes_estatus_caches[i]); +		if (cache == NULL) { +			slot = i; +			slot_cache = NULL; +			break; +		} +		duration = now - cache->time_in; +		if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { +			slot = i; +			slot_cache = cache; +			break; +		} +		count = atomic_read(&cache->count); +		period = duration; +		do_div(period, (count + 1)); +		if (period > max_period) { +			max_period = period; +			slot = i; +			slot_cache = cache; +		} +	} +	/* new_cache must be put into array after its contents are written */ +	smp_wmb(); +	if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, +				  slot_cache, new_cache) == slot_cache) { +		if (slot_cache) +			call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); +	} else +		ghes_estatus_cache_free(new_cache); +	rcu_read_unlock();  }  static int ghes_proc(struct ghes *ghes) @@ -387,9 +648,11 @@ static int ghes_proc(struct ghes *ghes)  	rc = ghes_read_estatus(ghes, 0);  	if (rc)  		goto out; -	ghes_print_estatus(NULL, ghes); -	ghes_do_proc(ghes); - +	if (!ghes_estatus_cached(ghes->estatus)) { +		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus)) +			ghes_estatus_cache_add(ghes->generic, ghes->estatus); +	} +	ghes_do_proc(ghes->estatus);  out:  	ghes_clear_estatus(ghes);  	return 0; @@ -447,6 +710,45 @@ static int ghes_notify_sci(struct notifier_block 
*this,  	return ret;  } +static void ghes_proc_in_irq(struct irq_work *irq_work) +{ +	struct llist_node *llnode, *next, *tail = NULL; +	struct ghes_estatus_node *estatus_node; +	struct acpi_hest_generic *generic; +	struct acpi_hest_generic_status *estatus; +	u32 len, node_len; + +	/* +	 * Because the time order of estatus in list is reversed, +	 * revert it back to proper order. +	 */ +	llnode = llist_del_all(&ghes_estatus_llist); +	while (llnode) { +		next = llnode->next; +		llnode->next = tail; +		tail = llnode; +		llnode = next; +	} +	llnode = tail; +	while (llnode) { +		next = llnode->next; +		estatus_node = llist_entry(llnode, struct ghes_estatus_node, +					   llnode); +		estatus = GHES_ESTATUS_FROM_NODE(estatus_node); +		len = apei_estatus_len(estatus); +		node_len = GHES_ESTATUS_NODE_LEN(len); +		ghes_do_proc(estatus); +		if (!ghes_estatus_cached(estatus)) { +			generic = estatus_node->generic; +			if (ghes_print_estatus(NULL, generic, estatus)) +				ghes_estatus_cache_add(generic, estatus); +		} +		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, +			      node_len); +		llnode = next; +	} +} +  static int ghes_notify_nmi(struct notifier_block *this,  				  unsigned long cmd, void *data)  { @@ -476,7 +778,8 @@ static int ghes_notify_nmi(struct notifier_block *this,  	if (sev_global >= GHES_SEV_PANIC) {  		oops_begin(); -		ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); +		__ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic, +				     ghes_global->estatus);  		/* reboot to log the error! 
*/  		if (panic_timeout == 0)  			panic_timeout = ghes_panic_timeout; @@ -484,12 +787,34 @@ static int ghes_notify_nmi(struct notifier_block *this,  	}  	list_for_each_entry_rcu(ghes, &ghes_nmi, list) { +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +		u32 len, node_len; +		struct ghes_estatus_node *estatus_node; +		struct acpi_hest_generic_status *estatus; +#endif  		if (!(ghes->flags & GHES_TO_CLEAR))  			continue; -		/* Do not print estatus because printk is not NMI safe */ -		ghes_do_proc(ghes); +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +		if (ghes_estatus_cached(ghes->estatus)) +			goto next; +		/* Save estatus for further processing in IRQ context */ +		len = apei_estatus_len(ghes->estatus); +		node_len = GHES_ESTATUS_NODE_LEN(len); +		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, +						      node_len); +		if (estatus_node) { +			estatus_node->generic = ghes->generic; +			estatus = GHES_ESTATUS_FROM_NODE(estatus_node); +			memcpy(estatus, ghes->estatus, len); +			llist_add(&estatus_node->llnode, &ghes_estatus_llist); +		} +next: +#endif  		ghes_clear_estatus(ghes);  	} +#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	irq_work_queue(&ghes_proc_irq_work); +#endif  out:  	raw_spin_unlock(&ghes_nmi_lock); @@ -504,10 +829,26 @@ static struct notifier_block ghes_notifier_nmi = {  	.notifier_call = ghes_notify_nmi,  }; +static unsigned long ghes_esource_prealloc_size( +	const struct acpi_hest_generic *generic) +{ +	unsigned long block_length, prealloc_records, prealloc_size; + +	block_length = min_t(unsigned long, generic->error_block_length, +			     GHES_ESTATUS_MAX_SIZE); +	prealloc_records = max_t(unsigned long, +				 generic->records_to_preallocate, 1); +	prealloc_size = min_t(unsigned long, block_length * prealloc_records, +			      GHES_ESOURCE_PREALLOC_MAX_SIZE); + +	return prealloc_size; +} +  static int __devinit ghes_probe(struct platform_device *ghes_dev)  {  	struct acpi_hest_generic *generic;  	struct ghes *ghes = NULL; +	unsigned long len;  	int rc 
= -EINVAL;  	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; @@ -573,6 +914,8 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev)  		mutex_unlock(&ghes_list_mutex);  		break;  	case ACPI_HEST_NOTIFY_NMI: +		len = ghes_esource_prealloc_size(generic); +		ghes_estatus_pool_expand(len);  		mutex_lock(&ghes_list_mutex);  		if (list_empty(&ghes_nmi))  			register_die_notifier(&ghes_notifier_nmi); @@ -597,6 +940,7 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)  {  	struct ghes *ghes;  	struct acpi_hest_generic *generic; +	unsigned long len;  	ghes = platform_get_drvdata(ghes_dev);  	generic = ghes->generic; @@ -627,6 +971,8 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)  		 * freed after NMI handler finishes.  		 */  		synchronize_rcu(); +		len = ghes_esource_prealloc_size(generic); +		ghes_estatus_pool_shrink(len);  		break;  	default:  		BUG(); @@ -662,15 +1008,43 @@ static int __init ghes_init(void)  		return -EINVAL;  	} +	if (ghes_disable) { +		pr_info(GHES_PFX "GHES is not enabled!\n"); +		return -EINVAL; +	} + +	init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); +  	rc = ghes_ioremap_init();  	if (rc)  		goto err; -	rc = platform_driver_register(&ghes_platform_driver); +	rc = ghes_estatus_pool_init();  	if (rc)  		goto err_ioremap_exit; +	rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE * +				      GHES_ESTATUS_CACHE_ALLOCED_MAX); +	if (rc) +		goto err_pool_exit; + +	rc = platform_driver_register(&ghes_platform_driver); +	if (rc) +		goto err_pool_exit; + +	rc = apei_osc_setup(); +	if (rc == 0 && osc_sb_apei_support_acked) +		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n"); +	else if (rc == 0 && !osc_sb_apei_support_acked) +		pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n"); +	else if (rc && osc_sb_apei_support_acked) +		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n"); +	else +		
pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); +  	return 0; +err_pool_exit: +	ghes_estatus_pool_exit();  err_ioremap_exit:  	ghes_ioremap_exit();  err: @@ -680,6 +1054,7 @@ err:  static void __exit ghes_exit(void)  {  	platform_driver_unregister(&ghes_platform_driver); +	ghes_estatus_pool_exit();  	ghes_ioremap_exit();  } diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 181bc2f7bb74..05fee06f4d6e 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -231,16 +231,17 @@ void __init acpi_hest_init(void)  		goto err;  	} -	rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); -	if (rc) -		goto err; - -	rc = hest_ghes_dev_register(ghes_count); -	if (!rc) { -		pr_info(HEST_PFX "Table parsing has been initialized.\n"); -		return; +	if (!ghes_disable) { +		rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); +		if (rc) +			goto err; +		rc = hest_ghes_dev_register(ghes_count); +		if (rc) +			goto err;  	} +	pr_info(HEST_PFX "Table parsing has been initialized.\n"); +	return;  err:  	hest_disable = 1;  } diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 2c661353e8f2..87c0a8daa99a 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -55,6 +55,9 @@  #define ACPI_BATTERY_NOTIFY_INFO	0x81  #define ACPI_BATTERY_NOTIFY_THRESHOLD   0x82 +/* Battery power unit: 0 means mW, 1 means mA */ +#define ACPI_BATTERY_POWER_UNIT_MA	1 +  #define _COMPONENT		ACPI_BATTERY_COMPONENT  ACPI_MODULE_NAME("battery"); @@ -91,11 +94,6 @@ MODULE_DEVICE_TABLE(acpi, battery_device_ids);  enum {  	ACPI_BATTERY_ALARM_PRESENT,  	ACPI_BATTERY_XINFO_PRESENT, -	/* For buggy DSDTs that report negative 16-bit values for either -	 * charging or discharging current and/or report 0 as 65536 -	 * due to bad math. 
-	 */ -	ACPI_BATTERY_QUIRK_SIGNED16_CURRENT,  	ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY,  }; @@ -301,7 +299,8 @@ static enum power_supply_property energy_battery_props[] = {  #ifdef CONFIG_ACPI_PROCFS_POWER  inline char *acpi_battery_units(struct acpi_battery *battery)  { -	return (battery->power_unit)?"mA":"mW"; +	return (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) ? +		"mA" : "mW";  }  #endif @@ -461,9 +460,17 @@ static int acpi_battery_get_state(struct acpi_battery *battery)  	battery->update_time = jiffies;  	kfree(buffer.pointer); -	if (test_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags) && -	    battery->rate_now != -1) +	/* For buggy DSDTs that report negative 16-bit values for either +	 * charging or discharging current and/or report 0 as 65536 +	 * due to bad math. +	 */ +	if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA && +		battery->rate_now != ACPI_BATTERY_VALUE_UNKNOWN && +		(s16)(battery->rate_now) < 0) {  		battery->rate_now = abs((s16)battery->rate_now); +		printk_once(KERN_WARNING FW_BUG "battery: (dis)charge rate" +			" invalid.\n"); +	}  	if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags)  	    && battery->capacity_now >= 0 && battery->capacity_now <= 100) @@ -544,7 +551,7 @@ static int sysfs_add_battery(struct acpi_battery *battery)  {  	int result; -	if (battery->power_unit) { +	if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) {  		battery->bat.properties = charge_battery_props;  		battery->bat.num_properties =  			ARRAY_SIZE(charge_battery_props); @@ -566,18 +573,16 @@ static int sysfs_add_battery(struct acpi_battery *battery)  static void sysfs_remove_battery(struct acpi_battery *battery)  { -	if (!battery->bat.dev) +	mutex_lock(&battery->lock); +	if (!battery->bat.dev) { +		mutex_unlock(&battery->lock);  		return; +	} +  	device_remove_file(battery->bat.dev, &alarm_attr);  	power_supply_unregister(&battery->bat);  	battery->bat.dev = NULL; -} - -static void acpi_battery_quirks(struct 
acpi_battery *battery) -{ -	if (dmi_name_in_vendors("Acer") && battery->power_unit) { -		set_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags); -	} +	mutex_unlock(&battery->lock);  }  /* @@ -592,7 +597,7 @@ static void acpi_battery_quirks(struct acpi_battery *battery)   *   * Handle this correctly so that they won't break userspace.   */ -static void acpi_battery_quirks2(struct acpi_battery *battery) +static void acpi_battery_quirks(struct acpi_battery *battery)  {  	if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags))  		return ; @@ -623,13 +628,15 @@ static int acpi_battery_update(struct acpi_battery *battery)  		result = acpi_battery_get_info(battery);  		if (result)  			return result; -		acpi_battery_quirks(battery);  		acpi_battery_init_alarm(battery);  	} -	if (!battery->bat.dev) -		sysfs_add_battery(battery); +	if (!battery->bat.dev) { +		result = sysfs_add_battery(battery); +		if (result) +			return result; +	}  	result = acpi_battery_get_state(battery); -	acpi_battery_quirks2(battery); +	acpi_battery_quirks(battery);  	return result;  } @@ -863,7 +870,7 @@ DECLARE_FILE_FUNCTIONS(alarm);  		}, \  	} -static struct battery_file { +static const struct battery_file {  	struct file_operations ops;  	mode_t mode;  	const char *name; @@ -948,9 +955,12 @@ static int battery_notify(struct notifier_block *nb,  	struct acpi_battery *battery = container_of(nb, struct acpi_battery,  						    pm_nb);  	switch (mode) { +	case PM_POST_HIBERNATION:  	case PM_POST_SUSPEND: -		sysfs_remove_battery(battery); -		sysfs_add_battery(battery); +		if (battery->bat.dev) { +			sysfs_remove_battery(battery); +			sysfs_add_battery(battery); +		}  		break;  	} @@ -975,25 +985,33 @@ static int acpi_battery_add(struct acpi_device *device)  	if (ACPI_SUCCESS(acpi_get_handle(battery->device->handle,  			"_BIX", &handle)))  		set_bit(ACPI_BATTERY_XINFO_PRESENT, &battery->flags); -	acpi_battery_update(battery); +	result = acpi_battery_update(battery); +	if (result) 
+		goto fail;  #ifdef CONFIG_ACPI_PROCFS_POWER  	result = acpi_battery_add_fs(device);  #endif -	if (!result) { -		printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n", -			ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device), -			device->status.battery_present ? "present" : "absent"); -	} else { +	if (result) {  #ifdef CONFIG_ACPI_PROCFS_POWER  		acpi_battery_remove_fs(device);  #endif -		kfree(battery); +		goto fail;  	} +	printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n", +		ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device), +		device->status.battery_present ? "present" : "absent"); +  	battery->pm_nb.notifier_call = battery_notify;  	register_pm_notifier(&battery->pm_nb);  	return result; + +fail: +	sysfs_remove_battery(battery); +	mutex_destroy(&battery->lock); +	kfree(battery); +	return result;  }  static int acpi_battery_remove(struct acpi_device *device, int type) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index d1e06c182cdb..437ddbf0c49a 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -39,6 +39,7 @@  #include <linux/pci.h>  #include <acpi/acpi_bus.h>  #include <acpi/acpi_drivers.h> +#include <acpi/apei.h>  #include <linux/dmi.h>  #include <linux/suspend.h> @@ -519,6 +520,7 @@ out_kfree:  }  EXPORT_SYMBOL(acpi_run_osc); +bool osc_sb_apei_support_acked;  static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48";  static void acpi_bus_osc_support(void)  { @@ -541,11 +543,19 @@ static void acpi_bus_osc_support(void)  #if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE)  	capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT;  #endif + +	if (!ghes_disable) +		capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT;  	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))  		return; -	if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) +	if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) { +		u32 *capbuf_ret = context.ret.pointer; +		if (context.ret.length > OSC_SUPPORT_TYPE) +			osc_sb_apei_support_acked = +				
capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT;  		kfree(context.ret.pointer); -	/* do we need to check the returned cap? Sounds no */ +	} +	/* do we need to check other returned cap? Sounds no */  }  /* -------------------------------------------------------------------------- diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 1864ad3cf895..19a61136d848 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -77,7 +77,7 @@ struct dock_dependent_device {  	struct list_head list;  	struct list_head hotplug_list;  	acpi_handle handle; -	struct acpi_dock_ops *ops; +	const struct acpi_dock_ops *ops;  	void *context;  }; @@ -589,7 +589,7 @@ EXPORT_SYMBOL_GPL(unregister_dock_notifier);   * the dock driver after _DCK is executed.   */  int -register_hotplug_dock_device(acpi_handle handle, struct acpi_dock_ops *ops, +register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops,  			     void *context)  {  	struct dock_dependent_device *dd; diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c index 05b44201a614..22f918bacd35 100644 --- a/drivers/acpi/ec_sys.c +++ b/drivers/acpi/ec_sys.c @@ -92,7 +92,7 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,  	return count;  } -static struct file_operations acpi_ec_io_ops = { +static const struct file_operations acpi_ec_io_ops = {  	.owner = THIS_MODULE,  	.open  = acpi_ec_open_io,  	.read  = acpi_ec_read_io, diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 467479f07c1f..0f0356ca1a9e 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -110,7 +110,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state)  	return result;  } -static struct thermal_cooling_device_ops fan_cooling_ops = { +static const struct thermal_cooling_device_ops fan_cooling_ops = {  	.get_max_state = fan_get_max_state,  	.get_cur_state = fan_get_cur_state,  	.set_cur_state = fan_set_cur_state, diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 
372f9b70f7f4..fa32f584229f 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -155,7 +155,7 @@ static u32 acpi_osi_handler(acpi_string interface, u32 supported)  {  	if (!strcmp("Linux", interface)) { -		printk(KERN_NOTICE FW_BUG PREFIX +		printk_once(KERN_NOTICE FW_BUG PREFIX  			"BIOS _OSI(Linux) query %s%s\n",  			osi_linux.enable ? "honored" : "ignored",  			osi_linux.cmdline ? " via cmdline" : @@ -237,8 +237,23 @@ void acpi_os_vprintf(const char *fmt, va_list args)  #endif  } +#ifdef CONFIG_KEXEC +static unsigned long acpi_rsdp; +static int __init setup_acpi_rsdp(char *arg) +{ +	acpi_rsdp = simple_strtoul(arg, NULL, 16); +	return 0; +} +early_param("acpi_rsdp", setup_acpi_rsdp); +#endif +  acpi_physical_address __init acpi_os_get_root_pointer(void)  { +#ifdef CONFIG_KEXEC +	if (acpi_rsdp) +		return acpi_rsdp; +#endif +  	if (efi_enabled) {  		if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)  			return efi.acpi20; @@ -1083,7 +1098,13 @@ struct osi_setup_entry {  	bool enable;  }; -static struct osi_setup_entry __initdata osi_setup_entries[OSI_STRING_ENTRIES_MAX]; +static struct osi_setup_entry __initdata +		osi_setup_entries[OSI_STRING_ENTRIES_MAX] = { +	{"Module Device", true}, +	{"Processor Device", true}, +	{"3.0 _SCP Extensions", true}, +	{"Processor Aggregator Device", true}, +};  void __init acpi_osi_setup(char *str)  { diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index f907cfbfa13c..7f9eba9a0b02 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -303,6 +303,61 @@ void acpi_pci_irq_del_prt(struct pci_bus *bus)  /* --------------------------------------------------------------------------                            PCI Interrupt Routing Support     -------------------------------------------------------------------------- */ +#ifdef CONFIG_X86_IO_APIC +extern int noioapicquirk; +extern int noioapicreroute; + +static int bridge_has_boot_interrupt_variant(struct pci_bus *bus) +{ +	struct pci_bus *bus_it; + +	for (bus_it = 
bus ; bus_it ; bus_it = bus_it->parent) { +		if (!bus_it->self) +			return 0; +		if (bus_it->self->irq_reroute_variant) +			return bus_it->self->irq_reroute_variant; +	} +	return 0; +} + +/* + * Some chipsets (e.g. Intel 6700PXH) generate a legacy INTx when the IRQ + * entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel does + * during interrupt handling). When this INTx generation cannot be disabled, + * we reroute these interrupts to their legacy equivalent to get rid of + * spurious interrupts. + */ +static int acpi_reroute_boot_interrupt(struct pci_dev *dev, +				       struct acpi_prt_entry *entry) +{ +	if (noioapicquirk || noioapicreroute) { +		return 0; +	} else { +		switch (bridge_has_boot_interrupt_variant(dev->bus)) { +		case 0: +			/* no rerouting necessary */ +			return 0; +		case INTEL_IRQ_REROUTE_VARIANT: +			/* +			 * Remap according to INTx routing table in 6700PXH +			 * specs, intel order number 302628-002, section +			 * 2.15.2. Other chipsets (80332, ...) have the same +			 * mapping and are handled here as well. 
+			 */ +			dev_info(&dev->dev, "PCI IRQ %d -> rerouted to legacy " +				 "IRQ %d\n", entry->index, +				 (entry->index % 4) + 16); +			entry->index = (entry->index % 4) + 16; +			return 1; +		default: +			dev_warn(&dev->dev, "Cannot reroute IRQ %d to legacy " +				 "IRQ: unknown mapping\n", entry->index); +			return -1; +		} +	} +} +#endif /* CONFIG_X86_IO_APIC */ +  static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin)  {  	struct acpi_prt_entry *entry; @@ -311,6 +366,9 @@ static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin)  	entry = acpi_pci_irq_find_prt_entry(dev, pin);  	if (entry) { +#ifdef CONFIG_X86_IO_APIC +		acpi_reroute_boot_interrupt(dev, entry); +#endif /* CONFIG_X86_IO_APIC */  		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %s[%c] _PRT entry\n",  				  pci_name(dev), pin_name(pin)));  		return entry; diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d06078d660ad..2672c798272f 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -485,7 +485,8 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)  		root->secondary.end = 0xFF;  		printk(KERN_WARNING FW_BUG PREFIX  		       "no secondary bus range in _CRS\n"); -		status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN,					       NULL, &bus); +		status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN, +					       NULL, &bus);  		if (ACPI_SUCCESS(status))  			root->secondary.start = bus;  		else if (status == AE_NOT_FOUND) diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 79cb65332894..870550d6a4bf 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -244,7 +244,7 @@ processor_set_cur_state(struct thermal_cooling_device *cdev,  	return result;  } -struct thermal_cooling_device_ops processor_cooling_ops = { +const struct thermal_cooling_device_ops processor_cooling_ops = {  	.get_max_state = processor_get_max_state,  	
.get_cur_state = processor_get_cur_state,  	.set_cur_state = processor_set_cur_state, diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 50658ff887d9..6e36d0c0057c 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -130,6 +130,9 @@ struct acpi_sbs {  #define to_acpi_sbs(x) container_of(x, struct acpi_sbs, charger) +static int acpi_sbs_remove(struct acpi_device *device, int type); +static int acpi_battery_get_state(struct acpi_battery *battery); +  static inline int battery_scale(int log)  {  	int scale = 1; @@ -195,6 +198,8 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy,  	if ((!battery->present) && psp != POWER_SUPPLY_PROP_PRESENT)  		return -ENODEV; + +	acpi_battery_get_state(battery);  	switch (psp) {  	case POWER_SUPPLY_PROP_STATUS:  		if (battery->rate_now < 0) @@ -225,11 +230,17 @@ static int acpi_sbs_battery_get_property(struct power_supply *psy,  	case POWER_SUPPLY_PROP_POWER_NOW:  		val->intval = abs(battery->rate_now) *  				acpi_battery_ipscale(battery) * 1000; +		val->intval *= (acpi_battery_mode(battery)) ? +				(battery->voltage_now * +				acpi_battery_vscale(battery) / 1000) : 1;  		break;  	case POWER_SUPPLY_PROP_CURRENT_AVG:  	case POWER_SUPPLY_PROP_POWER_AVG:  		val->intval = abs(battery->rate_avg) *  				acpi_battery_ipscale(battery) * 1000; +		val->intval *= (acpi_battery_mode(battery)) ? 
+				(battery->voltage_now * +				acpi_battery_vscale(battery) / 1000) : 1;  		break;  	case POWER_SUPPLY_PROP_CAPACITY:  		val->intval = battery->state_of_charge; @@ -903,8 +914,6 @@ static void acpi_sbs_callback(void *context)  	}  } -static int acpi_sbs_remove(struct acpi_device *device, int type); -  static int acpi_sbs_add(struct acpi_device *device)  {  	struct acpi_sbs *sbs; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 6c949602cbd1..3ed80b2ca907 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -428,6 +428,22 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {  		DMI_MATCH(DMI_PRODUCT_NAME, "1000 Series"),  		},  	}, +	{ +	.callback = init_old_suspend_ordering, +	.ident = "Asus A8N-SLI DELUXE", +	.matches = { +		DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), +		DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI DELUXE"), +		}, +	}, +	{ +	.callback = init_old_suspend_ordering, +	.ident = "Asus A8N-SLI Premium", +	.matches = { +		DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), +		DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI Premium"), +		}, +	},  	{},  };  #endif /* CONFIG_SUSPEND */ diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 77255f250dbb..c538d0ef10ff 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -149,12 +149,12 @@ static int param_get_debug_level(char *buffer, const struct kernel_param *kp)  	return result;  } -static struct kernel_param_ops param_ops_debug_layer = { +static const struct kernel_param_ops param_ops_debug_layer = {  	.set = param_set_uint,  	.get = param_get_debug_layer,  }; -static struct kernel_param_ops param_ops_debug_level = { +static const struct kernel_param_ops param_ops_debug_level = {  	.set = param_set_uint,  	.get = param_get_debug_level,  }; diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 2607e17b520f..48fbc647b178 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -812,7 +812,7 @@ acpi_thermal_unbind_cooling_device(struct 
thermal_zone_device *thermal,  				thermal_zone_unbind_cooling_device);  } -static struct thermal_zone_device_ops acpi_thermal_zone_ops = { +static const struct thermal_zone_device_ops acpi_thermal_zone_ops = {  	.bind = acpi_thermal_bind_cooling_device,  	.unbind	= acpi_thermal_unbind_cooling_device,  	.get_temp = thermal_get_temp, diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index ada4b4d9bdc8..08a44b532f7c 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -307,7 +307,7 @@ video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long st  	return acpi_video_device_lcd_set_level(video, level);  } -static struct thermal_cooling_device_ops video_cooling_ops = { +static const struct thermal_cooling_device_ops video_cooling_ops = {  	.get_max_state = video_get_max_state,  	.get_cur_state = video_get_cur_state,  	.set_cur_state = video_set_cur_state, diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index e0a5b555cee1..bb7c5f1085cc 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -218,12 +218,12 @@ static void ata_acpi_dev_uevent(acpi_handle handle, u32 event, void *data)  	ata_acpi_uevent(dev->link->ap, dev, event);  } -static struct acpi_dock_ops ata_acpi_dev_dock_ops = { +static const struct acpi_dock_ops ata_acpi_dev_dock_ops = {  	.handler = ata_acpi_dev_notify_dock,  	.uevent = ata_acpi_dev_uevent,  }; -static struct acpi_dock_ops ata_acpi_ap_dock_ops = { +static const struct acpi_dock_ops ata_acpi_ap_dock_ops = {  	.handler = ata_acpi_ap_notify_dock,  	.uevent = ata_acpi_ap_uevent,  }; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 49502bc5360a..423fd56bf612 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -616,5 +616,16 @@ config MSM_SMD_PKT  	  Enables userspace clients to read and write to some packet SMD  	  ports via device interface for MSM chipset. 
+config TILE_SROM +	bool "Character-device access via hypervisor to the Tilera SPI ROM" +	depends on TILE +	default y +	---help--- +	  This device provides character-level read-write access +	  to the SROM, typically via the "0", "1", and "2" devices +	  in /dev/srom/.  The Tilera hypervisor makes the flash +	  device appear much like a simple EEPROM, and knows +	  how to partition a single ROM for multiple purposes. +  endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 7a00672bd85d..32762ba769c2 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -63,3 +63,5 @@ obj-$(CONFIG_RAMOOPS)		+= ramoops.o  obj-$(CONFIG_JS_RTC)		+= js-rtc.o  js-rtc-y = rtc.o + +obj-$(CONFIG_TILE_SROM)		+= tile-srom.o diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index fca0c51bbc90..810aff9e750f 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -147,6 +147,14 @@ static int __init ramoops_probe(struct platform_device *pdev)  	cxt->phys_addr = pdata->mem_address;  	cxt->record_size = pdata->record_size;  	cxt->dump_oops = pdata->dump_oops; +	/* +	 * Update the module parameter variables as well so they are visible +	 * through /sys/module/ramoops/parameters/ +	 */ +	mem_size = pdata->mem_size; +	mem_address = pdata->mem_address; +	record_size = pdata->record_size; +	dump_oops = pdata->dump_oops;  	if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {  		pr_err("request mem region failed\n"); diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c new file mode 100644 index 000000000000..cf3ee008dca2 --- /dev/null +++ b/drivers/char/tile-srom.c @@ -0,0 +1,481 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. 
+ * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * SPI Flash ROM driver + * + * This source code is derived from code provided in "Linux Device + * Drivers, Third Edition", by Jonathan Corbet, Alessandro Rubini, and + * Greg Kroah-Hartman, published by O'Reilly Media, Inc. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/kernel.h>	/* printk() */ +#include <linux/slab.h>		/* kmalloc() */ +#include <linux/fs.h>		/* everything... */ +#include <linux/errno.h>	/* error codes */ +#include <linux/types.h>	/* size_t */ +#include <linux/proc_fs.h> +#include <linux/fcntl.h>	/* O_ACCMODE */ +#include <linux/aio.h> +#include <linux/pagemap.h> +#include <linux/hugetlb.h> +#include <linux/uaccess.h> +#include <linux/platform_device.h> +#include <hv/hypervisor.h> +#include <linux/ioctl.h> +#include <linux/cdev.h> +#include <linux/delay.h> +#include <hv/drv_srom_intf.h> + +/* + * Size of our hypervisor I/O requests.  We break up large transfers + * so that we don't spend large uninterrupted spans of time in the + * hypervisor.  Erasing an SROM sector takes a significant fraction of + * a second, so if we allowed the user to, say, do one I/O to write the + * entire ROM, we'd get soft lockup timeouts, or worse. + */ +#define SROM_CHUNK_SIZE ((size_t)4096) + +/* + * When hypervisor is busy (e.g. erasing), poll the status periodically. 
+ */ + +/* + * Interval to poll the state in msec + */ +#define SROM_WAIT_TRY_INTERVAL 20 + +/* + * Maximum times to poll the state + */ +#define SROM_MAX_WAIT_TRY_TIMES 1000 + +struct srom_dev { +	int hv_devhdl;			/* Handle for hypervisor device */ +	u32 total_size;			/* Size of this device */ +	u32 sector_size;		/* Size of a sector */ +	u32 page_size;			/* Size of a page */ +	struct mutex lock;		/* Allow only one accessor at a time */ +}; + +static int srom_major;			/* Dynamic major by default */ +module_param(srom_major, int, 0); +MODULE_AUTHOR("Tilera Corporation"); +MODULE_LICENSE("GPL"); + +static int srom_devs;			/* Number of SROM partitions */ +static struct cdev srom_cdev; +static struct class *srom_class; +static struct srom_dev *srom_devices; + +/* + * Handle calling the hypervisor and managing EAGAIN/EBUSY. + */ + +static ssize_t _srom_read(int hv_devhdl, void *buf, +			  loff_t off, size_t count) +{ +	int retval, retries = SROM_MAX_WAIT_TRY_TIMES; +	for (;;) { +		retval = hv_dev_pread(hv_devhdl, 0, (HV_VirtAddr)buf, +				      count, off); +		if (retval >= 0) +			return retval; +		if (retval == HV_EAGAIN) +			continue; +		if (retval == HV_EBUSY && --retries > 0) { +			msleep(SROM_WAIT_TRY_INTERVAL); +			continue; +		} +		pr_err("_srom_read: error %d\n", retval); +		return -EIO; +	} +} + +static ssize_t _srom_write(int hv_devhdl, const void *buf, +			   loff_t off, size_t count) +{ +	int retval, retries = SROM_MAX_WAIT_TRY_TIMES; +	for (;;) { +		retval = hv_dev_pwrite(hv_devhdl, 0, (HV_VirtAddr)buf, +				       count, off); +		if (retval >= 0) +			return retval; +		if (retval == HV_EAGAIN) +			continue; +		if (retval == HV_EBUSY && --retries > 0) { +			msleep(SROM_WAIT_TRY_INTERVAL); +			continue; +		} +		pr_err("_srom_write: error %d\n", retval); +		return -EIO; +	} +} + +/** + * srom_open() - Device open routine. + * @inode: Inode for this device. + * @filp: File for this specific open of the device. + * + * Returns zero, or an error code. 
+ */ +static int srom_open(struct inode *inode, struct file *filp) +{ +	filp->private_data = &srom_devices[iminor(inode)]; +	return 0; +} + + +/** + * srom_release() - Device release routine. + * @inode: Inode for this device. + * @filp: File for this specific open of the device. + * + * Returns zero, or an error code. + */ +static int srom_release(struct inode *inode, struct file *filp) +{ +	struct srom_dev *srom = filp->private_data; +	char dummy; + +	/* Make sure we've flushed anything written to the ROM. */ +	mutex_lock(&srom->lock); +	if (srom->hv_devhdl >= 0) +		_srom_write(srom->hv_devhdl, &dummy, SROM_FLUSH_OFF, 1); +	mutex_unlock(&srom->lock); + +	filp->private_data = NULL; + +	return 0; +} + + +/** + * srom_read() - Read data from the device. + * @filp: File for this specific open of the device. + * @buf: User's data buffer. + * @count: Number of bytes requested. + * @f_pos: File position. + * + * Returns number of bytes read, or an error code. + */ +static ssize_t srom_read(struct file *filp, char __user *buf, +			 size_t count, loff_t *f_pos) +{ +	int retval = 0; +	void *kernbuf; +	struct srom_dev *srom = filp->private_data; + +	kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL); +	if (!kernbuf) +		return -ENOMEM; + +	if (mutex_lock_interruptible(&srom->lock)) { +		retval = -ERESTARTSYS; +		kfree(kernbuf); +		return retval; +	} + +	while (count) { +		int hv_retval; +		int bytes_this_pass = min(count, SROM_CHUNK_SIZE); + +		hv_retval = _srom_read(srom->hv_devhdl, kernbuf, +				       *f_pos, bytes_this_pass); +		if (hv_retval > 0) { +			if (copy_to_user(buf, kernbuf, hv_retval) != 0) { +				retval = -EFAULT; +				break; +			} +		} else if (hv_retval <= 0) { +			if (retval == 0) +				retval = hv_retval; +			break; +		} + +		retval += hv_retval; +		*f_pos += hv_retval; +		buf += hv_retval; +		count -= hv_retval; +	} + +	mutex_unlock(&srom->lock); +	kfree(kernbuf); + +	return retval; +} + +/** + * srom_write() - Write data to the device. 
+ * @filp: File for this specific open of the device. + * @buf: User's data buffer. + * @count: Number of bytes requested. + * @f_pos: File position. + * + * Returns number of bytes written, or an error code. + */ +static ssize_t srom_write(struct file *filp, const char __user *buf, +			  size_t count, loff_t *f_pos) +{ +	int retval = 0; +	void *kernbuf; +	struct srom_dev *srom = filp->private_data; + +	kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL); +	if (!kernbuf) +		return -ENOMEM; + +	if (mutex_lock_interruptible(&srom->lock)) { +		retval = -ERESTARTSYS; +		kfree(kernbuf); +		return retval; +	} + +	while (count) { +		int hv_retval; +		int bytes_this_pass = min(count, SROM_CHUNK_SIZE); + +		if (copy_from_user(kernbuf, buf, bytes_this_pass) != 0) { +			retval = -EFAULT; +			break; +		} + +		hv_retval = _srom_write(srom->hv_devhdl, kernbuf, +					*f_pos, bytes_this_pass); +		if (hv_retval <= 0) { +			if (retval == 0) +				retval = hv_retval; +			break; +		} + +		retval += hv_retval; +		*f_pos += hv_retval; +		buf += hv_retval; +		count -= hv_retval; +	} + +	mutex_unlock(&srom->lock); +	kfree(kernbuf); + +	return retval; +} + +/* Provide our own implementation so we can use srom->total_size. 
*/ +loff_t srom_llseek(struct file *filp, loff_t offset, int origin) +{ +	struct srom_dev *srom = filp->private_data; + +	if (mutex_lock_interruptible(&srom->lock)) +		return -ERESTARTSYS; + +	switch (origin) { +	case SEEK_END: +		offset += srom->total_size; +		break; +	case SEEK_CUR: +		offset += filp->f_pos; +		break; +	} + +	if (offset < 0 || offset > srom->total_size) { +		offset = -EINVAL; +	} else { +		filp->f_pos = offset; +		filp->f_version = 0; +	} + +	mutex_unlock(&srom->lock); + +	return offset; +} + +static ssize_t total_show(struct device *dev, +			  struct device_attribute *attr, char *buf) +{ +	struct srom_dev *srom = dev_get_drvdata(dev); +	return sprintf(buf, "%u\n", srom->total_size); +} + +static ssize_t sector_show(struct device *dev, +			   struct device_attribute *attr, char *buf) +{ +	struct srom_dev *srom = dev_get_drvdata(dev); +	return sprintf(buf, "%u\n", srom->sector_size); +} + +static ssize_t page_show(struct device *dev, +			 struct device_attribute *attr, char *buf) +{ +	struct srom_dev *srom = dev_get_drvdata(dev); +	return sprintf(buf, "%u\n", srom->page_size); +} + +static struct device_attribute srom_dev_attrs[] = { +	__ATTR(total_size, S_IRUGO, total_show, NULL), +	__ATTR(sector_size, S_IRUGO, sector_show, NULL), +	__ATTR(page_size, S_IRUGO, page_show, NULL), +	__ATTR_NULL +}; + +static char *srom_devnode(struct device *dev, mode_t *mode) +{ +	*mode = S_IRUGO | S_IWUSR; +	return kasprintf(GFP_KERNEL, "srom/%s", dev_name(dev)); +} + +/* + * The fops + */ +static const struct file_operations srom_fops = { +	.owner =     THIS_MODULE, +	.llseek =    srom_llseek, +	.read =	     srom_read, +	.write =     srom_write, +	.open =	     srom_open, +	.release =   srom_release, +}; + +/** + * srom_setup_minor() - Initialize per-minor information. + * @srom: Per-device SROM state. + * @index: Device to set up. 
+ */ +static int srom_setup_minor(struct srom_dev *srom, int index) +{ +	struct device *dev; +	int devhdl = srom->hv_devhdl; + +	mutex_init(&srom->lock); + +	if (_srom_read(devhdl, &srom->total_size, +		       SROM_TOTAL_SIZE_OFF, sizeof(srom->total_size)) < 0) +		return -EIO; +	if (_srom_read(devhdl, &srom->sector_size, +		       SROM_SECTOR_SIZE_OFF, sizeof(srom->sector_size)) < 0) +		return -EIO; +	if (_srom_read(devhdl, &srom->page_size, +		       SROM_PAGE_SIZE_OFF, sizeof(srom->page_size)) < 0) +		return -EIO; + +	dev = device_create(srom_class, &platform_bus, +			    MKDEV(srom_major, index), srom, "%d", index); +	return IS_ERR(dev) ? PTR_ERR(dev) : 0; +} + +/** srom_init() - Initialize the driver's module. */ +static int srom_init(void) +{ +	int result, i; +	dev_t dev = MKDEV(srom_major, 0); + +	/* +	 * Start with a plausible number of partitions; the krealloc() call +	 * below will yield about log(srom_devs) additional allocations. +	 */ +	srom_devices = kzalloc(4 * sizeof(struct srom_dev), GFP_KERNEL); + +	/* Discover the number of srom partitions. */ +	for (i = 0; ; i++) { +		int devhdl; +		char buf[20]; +		struct srom_dev *new_srom_devices = +			krealloc(srom_devices, (i+1) * sizeof(struct srom_dev), +				 GFP_KERNEL | __GFP_ZERO); +		if (!new_srom_devices) { +			result = -ENOMEM; +			goto fail_mem; +		} +		srom_devices = new_srom_devices; +		sprintf(buf, "srom/0/%d", i); +		devhdl = hv_dev_open((HV_VirtAddr)buf, 0); +		if (devhdl < 0) { +			if (devhdl != HV_ENODEV) +				pr_notice("srom/%d: hv_dev_open failed: %d.\n", +					  i, devhdl); +			break; +		} +		srom_devices[i].hv_devhdl = devhdl; +	} +	srom_devs = i; + +	/* Bail out early if we have no partitions at all. */ +	if (srom_devs == 0) { +		result = -ENODEV; +		goto fail_mem; +	} + +	/* Register our major, and accept a dynamic number. 
*/ +	if (srom_major) +		result = register_chrdev_region(dev, srom_devs, "srom"); +	else { +		result = alloc_chrdev_region(&dev, 0, srom_devs, "srom"); +		srom_major = MAJOR(dev); +	} +	if (result < 0) +		goto fail_mem; + +	/* Register a character device. */ +	cdev_init(&srom_cdev, &srom_fops); +	srom_cdev.owner = THIS_MODULE; +	srom_cdev.ops = &srom_fops; +	result = cdev_add(&srom_cdev, dev, srom_devs); +	if (result < 0) +		goto fail_chrdev; + +	/* Create a sysfs class. */ +	srom_class = class_create(THIS_MODULE, "srom"); +	if (IS_ERR(srom_class)) { +		result = PTR_ERR(srom_class); +		goto fail_cdev; +	} +	srom_class->dev_attrs = srom_dev_attrs; +	srom_class->devnode = srom_devnode; + +	/* Do per-partition initialization */ +	for (i = 0; i < srom_devs; i++) { +		result = srom_setup_minor(srom_devices + i, i); +		if (result < 0) +			goto fail_class; +	} + +	return 0; + +fail_class: +	for (i = 0; i < srom_devs; i++) +		device_destroy(srom_class, MKDEV(srom_major, i)); +	class_destroy(srom_class); +fail_cdev: +	cdev_del(&srom_cdev); +fail_chrdev: +	unregister_chrdev_region(dev, srom_devs); +fail_mem: +	kfree(srom_devices); +	return result; +} + +/** srom_cleanup() - Clean up the driver's module. 
*/ +static void srom_cleanup(void) +{ +	int i; +	for (i = 0; i < srom_devs; i++) +		device_destroy(srom_class, MKDEV(srom_major, i)); +	class_destroy(srom_class); +	cdev_del(&srom_cdev); +	unregister_chrdev_region(MKDEV(srom_major, 0), srom_devs); +	kfree(srom_devices); +} + +module_init(srom_init); +module_exit(srom_cleanup); diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 7fc2f108f490..3f4051a7c5a7 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -80,7 +80,7 @@ enum tis_defaults {  static LIST_HEAD(tis_chips);  static DEFINE_SPINLOCK(tis_lock); -#ifdef CONFIG_PNP +#if defined(CONFIG_PNP) && defined(CONFIG_ACPI)  static int is_itpm(struct pnp_dev *dev)  {  	struct acpi_device *acpi = pnp_acpi_device(dev); @@ -93,6 +93,11 @@ static int is_itpm(struct pnp_dev *dev)  	return 0;  } +#else +static inline int is_itpm(struct pnp_dev *dev) +{ +	return 0; +}  #endif  static int check_locality(struct tpm_chip *chip, int l) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index bf5092455a8f..d4c542372886 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -25,9 +25,19 @@ DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);  DEFINE_MUTEX(cpuidle_lock);  LIST_HEAD(cpuidle_detected_devices); -static void (*pm_idle_old)(void);  static int enabled_devices; +static int off __read_mostly; +static int initialized __read_mostly; + +int cpuidle_disabled(void) +{ +	return off; +} +void disable_cpuidle(void) +{ +	off = 1; +}  #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)  static void cpuidle_kick_cpus(void) @@ -46,25 +56,23 @@ static int __cpuidle_register_device(struct cpuidle_device *dev);   * cpuidle_idle_call - the main idle loop   *   * NOTE: no locks or semaphores should be used here + * return non-zero on failure   */ -static void cpuidle_idle_call(void) +int cpuidle_idle_call(void)  {  	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);  	struct cpuidle_state 
*target_state;  	int next_state; +	if (off) +		return -ENODEV; + +	if (!initialized) +		return -ENODEV; +  	/* check if the device is ready */ -	if (!dev || !dev->enabled) { -		if (pm_idle_old) -			pm_idle_old(); -		else -#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE) -			default_idle(); -#else -			local_irq_enable(); -#endif -		return; -	} +	if (!dev || !dev->enabled) +		return -EBUSY;  #if 0  	/* shows regressions, re-enable for 2.6.29 */ @@ -89,7 +97,7 @@ static void cpuidle_idle_call(void)  	next_state = cpuidle_curr_governor->select(dev);  	if (need_resched()) {  		local_irq_enable(); -		return; +		return 0;  	}  	target_state = &dev->states[next_state]; @@ -114,6 +122,8 @@ static void cpuidle_idle_call(void)  	/* give the governor an opportunity to reflect on the outcome */  	if (cpuidle_curr_governor->reflect)  		cpuidle_curr_governor->reflect(dev); + +	return 0;  }  /** @@ -121,10 +131,10 @@ static void cpuidle_idle_call(void)   */  void cpuidle_install_idle_handler(void)  { -	if (enabled_devices && (pm_idle != cpuidle_idle_call)) { +	if (enabled_devices) {  		/* Make sure all changes finished before we switch to new idle */  		smp_wmb(); -		pm_idle = cpuidle_idle_call; +		initialized = 1;  	}  } @@ -133,8 +143,8 @@ void cpuidle_install_idle_handler(void)   */  void cpuidle_uninstall_idle_handler(void)  { -	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) { -		pm_idle = pm_idle_old; +	if (enabled_devices) { +		initialized = 0;  		cpuidle_kick_cpus();  	}  } @@ -427,7 +437,8 @@ static int __init cpuidle_init(void)  {  	int ret; -	pm_idle_old = pm_idle; +	if (cpuidle_disabled()) +		return -ENODEV;  	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);  	if (ret) @@ -438,4 +449,5 @@ static int __init cpuidle_init(void)  	return 0;  } +module_param(off, int, 0444);  core_initcall(cpuidle_init); diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h index 33e50d556f17..38c3fd8b9d76 100644 --- a/drivers/cpuidle/cpuidle.h +++ 
b/drivers/cpuidle/cpuidle.h @@ -13,6 +13,7 @@ extern struct list_head cpuidle_governors;  extern struct list_head cpuidle_detected_devices;  extern struct mutex cpuidle_lock;  extern spinlock_t cpuidle_driver_lock; +extern int cpuidle_disabled(void);  /* idle loop */  extern void cpuidle_install_idle_handler(void); diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index fd1601e3d125..3f7e3cedd133 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -26,6 +26,9 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)  	if (!drv)  		return -EINVAL; +	if (cpuidle_disabled()) +		return -ENODEV; +  	spin_lock(&cpuidle_driver_lock);  	if (cpuidle_curr_driver) {  		spin_unlock(&cpuidle_driver_lock); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index 724c164d31c9..ea2f8e7aa24a 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -81,6 +81,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)  	if (!gov || !gov->select)  		return -EINVAL; +	if (cpuidle_disabled()) +		return -ENODEV; +  	mutex_lock(&cpuidle_lock);  	if (__cpuidle_find_governor(gov->name) == NULL) {  		ret = 0; diff --git a/drivers/eisa/pci_eisa.c b/drivers/eisa/pci_eisa.c index 30da70d06a6d..cdae207028a7 100644 --- a/drivers/eisa/pci_eisa.c +++ b/drivers/eisa/pci_eisa.c @@ -45,13 +45,13 @@ static int __init pci_eisa_init(struct pci_dev *pdev,  	return 0;  } -static struct pci_device_id __initdata pci_eisa_pci_tbl[] = { +static struct pci_device_id pci_eisa_pci_tbl[] = {  	{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,  	  PCI_CLASS_BRIDGE_EISA << 8, 0xffff00, 0 },  	{ 0, }  }; -static struct pci_driver __initdata pci_eisa_driver = { +static struct pci_driver __refdata pci_eisa_driver = {  	.name		= "pci_eisa",  	.id_table	= pci_eisa_pci_tbl,  	.probe		= pci_eisa_init, diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index eacb05e6cfb3..eb80b549ed8d 100644 --- a/drivers/firmware/efivars.c +++ 
b/drivers/firmware/efivars.c @@ -157,7 +157,7 @@ utf16_strnlen(efi_char16_t *s, size_t maxlength)  	return length;  } -static unsigned long +static inline unsigned long  utf16_strlen(efi_char16_t *s)  {  	return utf16_strnlen(s, ~0UL); @@ -580,8 +580,8 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type,  	return -1;  } -static u64 efi_pstore_write(enum pstore_type_id type, int part, size_t size, -			    struct pstore_info *psi) +static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, +			    size_t size, struct pstore_info *psi)  {  	return 0;  } diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index ce281d152275..67df91af8424 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -483,7 +483,7 @@ static int gpio_keys_get_devtree_pdata(struct device *dev,  	buttons = kzalloc(pdata->nbuttons * (sizeof *buttons), GFP_KERNEL);  	if (!buttons) -		return -ENODEV; +		return -ENOMEM;  	pp = NULL;  	i = 0; diff --git a/drivers/input/keyboard/lm8323.c b/drivers/input/keyboard/lm8323.c index ab0acaf7fe8f..756348a7f93a 100644 --- a/drivers/input/keyboard/lm8323.c +++ b/drivers/input/keyboard/lm8323.c @@ -754,8 +754,11 @@ fail3:  	device_remove_file(&client->dev, &dev_attr_disable_kp);  fail2:  	while (--pwm >= 0) -		if (lm->pwm[pwm].enabled) +		if (lm->pwm[pwm].enabled) { +			device_remove_file(lm->pwm[pwm].cdev.dev, +					   &dev_attr_time);  			led_classdev_unregister(&lm->pwm[pwm].cdev); +		}  fail1:  	input_free_device(idev);  	kfree(lm); @@ -775,8 +778,10 @@ static int __devexit lm8323_remove(struct i2c_client *client)  	device_remove_file(&lm->client->dev, &dev_attr_disable_kp);  	for (i = 0; i < 3; i++) -		if (lm->pwm[i].enabled) +		if (lm->pwm[i].enabled) { +			device_remove_file(lm->pwm[i].cdev.dev, &dev_attr_time);  			led_classdev_unregister(&lm->pwm[i].cdev); +		}  	kfree(lm); diff --git a/drivers/input/keyboard/tegra-kbc.c 
b/drivers/input/keyboard/tegra-kbc.c index da3828fc2c09..f270447ba951 100644 --- a/drivers/input/keyboard/tegra-kbc.c +++ b/drivers/input/keyboard/tegra-kbc.c @@ -19,6 +19,7 @@   * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.   */ +#include <linux/kernel.h>  #include <linux/module.h>  #include <linux/input.h>  #include <linux/platform_device.h> @@ -37,7 +38,7 @@  #define KBC_ROW_SCAN_DLY	5  /* KBC uses a 32KHz clock so a cycle = 1/32Khz */ -#define KBC_CYCLE_USEC	32 +#define KBC_CYCLE_MS	32  /* KBC Registers */ @@ -647,7 +648,7 @@ static int __devinit tegra_kbc_probe(struct platform_device *pdev)  	debounce_cnt = min(pdata->debounce_cnt, KBC_MAX_DEBOUNCE_CNT);  	scan_time_rows = (KBC_ROW_SCAN_TIME + debounce_cnt) * num_rows;  	kbc->repoll_dly = KBC_ROW_SCAN_DLY + scan_time_rows + pdata->repeat_cnt; -	kbc->repoll_dly = ((kbc->repoll_dly * KBC_CYCLE_USEC) + 999) / 1000; +	kbc->repoll_dly = DIV_ROUND_UP(kbc->repoll_dly, KBC_CYCLE_MS);  	input_dev->name = pdev->name;  	input_dev->id.bustype = BUS_HOST; diff --git a/drivers/input/misc/kxtj9.c b/drivers/input/misc/kxtj9.c index c456f63b6bae..783597a9a64a 100644 --- a/drivers/input/misc/kxtj9.c +++ b/drivers/input/misc/kxtj9.c @@ -21,6 +21,7 @@  #include <linux/i2c.h>  #include <linux/input.h>  #include <linux/interrupt.h> +#include <linux/module.h>  #include <linux/slab.h>  #include <linux/input/kxtj9.h>  #include <linux/input-polldev.h> diff --git a/drivers/input/misc/mma8450.c b/drivers/input/misc/mma8450.c index 20f8f9284f02..6c76cf792991 100644 --- a/drivers/input/misc/mma8450.c +++ b/drivers/input/misc/mma8450.c @@ -24,6 +24,7 @@  #include <linux/delay.h>  #include <linux/i2c.h>  #include <linux/input-polldev.h> +#include <linux/of_device.h>  #define MMA8450_DRV_NAME	"mma8450" @@ -229,10 +230,17 @@ static const struct i2c_device_id mma8450_id[] = {  };  MODULE_DEVICE_TABLE(i2c, mma8450_id); +static const struct of_device_id mma8450_dt_ids[] = { +	{ .compatible = "fsl,mma8450", }, +	{ /* sentinel */ 
} +}; +MODULE_DEVICE_TABLE(i2c, mma8450_dt_ids); +  static struct i2c_driver mma8450_driver = {  	.driver = {  		.name	= MMA8450_DRV_NAME,  		.owner	= THIS_MODULE, +		.of_match_table = mma8450_dt_ids,  	},  	.probe		= mma8450_probe,  	.remove		= __devexit_p(mma8450_remove), diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c index 95577c15ae56..4d17d9f3320b 100644 --- a/drivers/input/mouse/hgpk.c +++ b/drivers/input/mouse/hgpk.c @@ -32,6 +32,7 @@  #define DEBUG  #include <linux/slab.h>  #include <linux/input.h> +#include <linux/module.h>  #include <linux/serio.h>  #include <linux/libps2.h>  #include <linux/delay.h> diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c index bc3b5187f3a3..131f9d1c921b 100644 --- a/drivers/input/touchscreen/ad7879.c +++ b/drivers/input/touchscreen/ad7879.c @@ -249,12 +249,14 @@ static void __ad7879_enable(struct ad7879 *ts)  static void __ad7879_disable(struct ad7879 *ts)  { +	u16 reg = (ts->cmd_crtl2 & ~AD7879_PM(-1)) | +		AD7879_PM(AD7879_PM_SHUTDOWN);  	disable_irq(ts->irq);  	if (del_timer_sync(&ts->timer))  		ad7879_ts_event_release(ts); -	ad7879_write(ts, AD7879_REG_CTRL2, AD7879_PM(AD7879_PM_SHUTDOWN)); +	ad7879_write(ts, AD7879_REG_CTRL2, reg);  } diff --git a/drivers/of/base.c b/drivers/of/base.c index 3ff22e32b602..fb28b5af733b 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -17,14 +17,39 @@   *      as published by the Free Software Foundation; either version   *      2 of the License, or (at your option) any later version.   
*/ +#include <linux/ctype.h>  #include <linux/module.h>  #include <linux/of.h>  #include <linux/spinlock.h>  #include <linux/slab.h>  #include <linux/proc_fs.h> +/** + * struct alias_prop - Alias property in 'aliases' node + * @link:	List node to link the structure in aliases_lookup list + * @alias:	Alias property name + * @np:		Pointer to device_node that the alias stands for + * @id:		Index value from end of alias name + * @stem:	Alias string without the index + * + * The structure represents one alias property of 'aliases' node as + * an entry in aliases_lookup list. + */ +struct alias_prop { +	struct list_head link; +	const char *alias; +	struct device_node *np; +	int id; +	char stem[0]; +}; + +static LIST_HEAD(aliases_lookup); +  struct device_node *allnodes;  struct device_node *of_chosen; +struct device_node *of_aliases; + +static DEFINE_MUTEX(of_aliases_mutex);  /* use when traversing tree through the allnext, child, sibling,   * or parent members of struct device_node. @@ -988,3 +1013,108 @@ out_unlock:  }  #endif /* defined(CONFIG_OF_DYNAMIC) */ +static void of_alias_add(struct alias_prop *ap, struct device_node *np, +			 int id, const char *stem, int stem_len) +{ +	ap->id = id; +	ap->np = np; +	strncpy(ap->stem, stem, stem_len); +	ap->stem[stem_len] = 0; +	list_add_tail(&ap->link, &aliases_lookup); +	pr_debug("adding DT alias:%s: stem=%s id=%i node=%s\n", +		 ap->alias, ap->stem, ap->id, np ? np->full_name : NULL); +} + +/** + * of_alias_scan() - Scan all properties of 'aliases' node + * + * The function scans all the properties of 'aliases' node and populates + * the global lookup table with the properties.  It returns nothing; + * aliases whose target node or storage is unavailable are skipped. 
+ */ +__init void of_alias_scan(void) +{ +	struct property *pp; + +	if (!of_aliases) +		return; + +	for_each_property(pp, of_aliases->properties) { +		const char *start = pp->name; +		const char *end = start + strlen(start); +		struct device_node *np; +		struct alias_prop *ap; +		int id, len; + +		/* Skip those we do not want to proceed */ +		if (!strcmp(pp->name, "name") || +		    !strcmp(pp->name, "phandle") || +		    !strcmp(pp->name, "linux,phandle")) +			continue; + +		np = of_find_node_by_path(pp->value); +		if (!np) +			continue; + +		/* walk alias backwards to extract the id and 'stem' string */ +		while (isdigit(*(end-1)) && end > start) +			end--; +		len = end - start; +		id = strlen(end) ? simple_strtoul(end, NULL, 10) : -1; + +		/* Allocate an alias_prop with enough space for the stem */ +		ap = early_init_dt_alloc_memory_arch(sizeof(*ap) + len + 1, 4); +		if (!ap) +			continue; +		ap->alias = start; +		of_alias_add(ap, np, id, start, len); +	} +} + +/** + * of_alias_get_id() - Get alias id for the given device_node + * @np:		Pointer to the given device_node + * @stem:	Alias stem of the given device_node + * + * The function travels the lookup table to get alias id for the given + * device_node and alias stem.  It returns the alias id if find it. + * If not, dynamically creates one in the lookup table and returns it, + * or returns error code if fail to create. 
+ */ +int of_alias_get_id(struct device_node *np, const char *stem) +{ +	struct alias_prop *app; +	int id = 0; +	bool found = false; + +	mutex_lock(&of_aliases_mutex); +	list_for_each_entry(app, &aliases_lookup, link) { +		if (strcmp(app->stem, stem) != 0) +			continue; + +		if (np == app->np) { +			found = true; +			id = app->id; +			break; +		} + +		if (id <= app->id) +			id = app->id + 1; +	} + +	/* If an id is not found, then allocate a new one */ +	if (!found) { +		app = kzalloc(sizeof(*app) + strlen(stem) + 1, 4); +		if (!app) { +			id = -ENODEV; +			goto out; +		} +		of_alias_add(app, np, id, stem, strlen(stem)); +	} + + out: +	mutex_unlock(&of_aliases_mutex); + +	return id; +} +EXPORT_SYMBOL_GPL(of_alias_get_id); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 65200af29c52..13d6d3a96b31 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -707,10 +707,12 @@ void __init unflatten_device_tree(void)  	__unflatten_device_tree(initial_boot_params, &allnodes,  				early_init_dt_alloc_memory_arch); -	/* Get pointer to OF "/chosen" node for use everywhere */ +	/* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */  	of_chosen = of_find_node_by_path("/chosen");  	if (of_chosen == NULL)  		of_chosen = of_find_node_by_path("/chosen@0"); +	of_aliases = of_find_node_by_path("/aliases"); +	of_alias_scan();  }  #endif /* CONFIG_OF_EARLY_FLATTREE */ diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index a70fa89f76fd..220285760b68 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -110,7 +110,7 @@ static int post_dock_fixups(struct notifier_block *nb, unsigned long val,  } -static struct acpi_dock_ops acpiphp_dock_ops = { +static const struct acpi_dock_ops acpiphp_dock_ops = {  	.handler = handle_hotplug_event_func,  }; diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index bcae8dd41496..7789002bdd5c 100644 --- a/drivers/rtc/rtc-omap.c +++ 
b/drivers/rtc/rtc-omap.c @@ -368,7 +368,7 @@ static int __init omap_rtc_probe(struct platform_device *pdev)  		pr_info("%s: already running\n", pdev->name);  	/* force to 24 hour mode */ -	new_ctrl = reg & ~(OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP); +	new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);  	new_ctrl |= OMAP_RTC_CTRL_STOP;  	/* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE: diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 564ff4e0dbc4..8345fb457a40 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,5 +1,6 @@  config ISCSI_TARGET  	tristate "Linux-iSCSI.org iSCSI Target Mode Stack" +	depends on NET  	select CRYPTO  	select CRYPTO_CRC32C  	select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 14c81c4265bd..c24fb10de60b 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -120,7 +120,7 @@ struct iscsi_tiqn *iscsit_add_tiqn(unsigned char *buf)  	struct iscsi_tiqn *tiqn = NULL;  	int ret; -	if (strlen(buf) > ISCSI_IQN_LEN) { +	if (strlen(buf) >= ISCSI_IQN_LEN) {  		pr_err("Target IQN exceeds %d bytes\n",  				ISCSI_IQN_LEN);  		return ERR_PTR(-EINVAL); @@ -1857,7 +1857,7 @@ static int iscsit_handle_text_cmd(  	char *text_ptr, *text_in;  	int cmdsn_ret, niov = 0, rx_got, rx_size;  	u32 checksum = 0, data_crc = 0, payload_length; -	u32 padding = 0, text_length = 0; +	u32 padding = 0, pad_bytes = 0, text_length = 0;  	struct iscsi_cmd *cmd;  	struct kvec iov[3];  	struct iscsi_text *hdr; @@ -1896,7 +1896,7 @@ static int iscsit_handle_text_cmd(  		padding = ((-payload_length) & 3);  		if (padding != 0) { -			iov[niov].iov_base = cmd->pad_bytes; +			iov[niov].iov_base = &pad_bytes;  			iov[niov++].iov_len  = padding;  			rx_size += padding;  			pr_debug("Receiving %u additional bytes" @@ -1917,7 +1917,7 @@ static int iscsit_handle_text_cmd(  		if (conn->conn_ops->DataDigest) {  	
		iscsit_do_crypto_hash_buf(&conn->conn_rx_hash,  					text_in, text_length, -					padding, cmd->pad_bytes, +					padding, (u8 *)&pad_bytes,  					(u8 *)&data_crc);  			if (checksum != data_crc) { @@ -3468,7 +3468,12 @@ static inline void iscsit_thread_check_cpumask(  }  #else -#define iscsit_thread_get_cpumask(X) ({}) + +void iscsit_thread_get_cpumask(struct iscsi_conn *conn) +{ +	return; +} +  #define iscsit_thread_check_cpumask(X, Y, Z) ({})  #endif /* CONFIG_SMP */ diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index 32bb92c44450..f095e65b1ccf 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -181,7 +181,7 @@ struct se_tpg_np *lio_target_call_addnptotpg(  		return ERR_PTR(-EOVERFLOW);  	}  	memset(buf, 0, MAX_PORTAL_LEN + 1); -	snprintf(buf, MAX_PORTAL_LEN, "%s", name); +	snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);  	memset(&sockaddr, 0, sizeof(struct __kernel_sockaddr_storage)); diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index 713a4d23557a..4d087ac11067 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -978,7 +978,7 @@ struct iscsi_login *iscsi_target_init_negotiation(  		pr_err("Unable to allocate memory for struct iscsi_login.\n");  		iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,  				ISCSI_LOGIN_STATUS_NO_RESOURCES); -		goto out; +		return NULL;  	}  	login->req = kzalloc(ISCSI_HDR_LEN, GFP_KERNEL); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index c75a01a1c475..89760329d5d0 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1747,6 +1747,8 @@ int transport_generic_handle_cdb(  }  EXPORT_SYMBOL(transport_generic_handle_cdb); +static void transport_generic_request_failure(struct se_cmd *, +			struct se_device *, int, int);  /* 
  * Used by fabric module frontends to queue tasks directly.   * Many only be used from process context only @@ -1754,6 +1756,8 @@ EXPORT_SYMBOL(transport_generic_handle_cdb);  int transport_handle_cdb_direct(  	struct se_cmd *cmd)  { +	int ret; +  	if (!cmd->se_lun) {  		dump_stack();  		pr_err("cmd->se_lun is NULL\n"); @@ -1765,8 +1769,31 @@ int transport_handle_cdb_direct(  				" from interrupt context\n");  		return -EINVAL;  	} - -	return transport_generic_new_cmd(cmd); +	/* +	 * Set TRANSPORT_NEW_CMD state and cmd->t_transport_active=1 following +	 * transport_generic_handle_cdb*() -> transport_add_cmd_to_queue() +	 * in existing usage to ensure that outstanding descriptors are handled +	 * correctly during shutdown via transport_generic_wait_for_tasks() +	 * +	 * Also, we don't take cmd->t_state_lock here as we only expect +	 * this to be called for initial descriptor submission. +	 */ +	cmd->t_state = TRANSPORT_NEW_CMD; +	atomic_set(&cmd->t_transport_active, 1); +	/* +	 * transport_generic_new_cmd() is already handling QUEUE_FULL, +	 * so follow TRANSPORT_NEW_CMD processing thread context usage +	 * and call transport_generic_request_failure() if necessary.. 
+	 */ +	ret = transport_generic_new_cmd(cmd); +	if (ret == -EAGAIN) +		return 0; +	else if (ret < 0) { +		cmd->transport_error_status = ret; +		transport_generic_request_failure(cmd, NULL, 0, +				(cmd->data_direction != DMA_TO_DEVICE)); +	} +	return 0;  }  EXPORT_SYMBOL(transport_handle_cdb_direct); @@ -3324,7 +3351,7 @@ static int transport_generic_cmd_sequencer(  			goto out_invalid_cdb_field;  		} -		cmd->t_task_lba = get_unaligned_be16(&cdb[2]); +		cmd->t_task_lba = get_unaligned_be64(&cdb[2]);  		passthrough = (dev->transport->transport_type ==  				TRANSPORT_PLUGIN_PHBA_PDEV);  		/* diff --git a/drivers/target/tcm_fc/tcm_fc.h b/drivers/target/tcm_fc/tcm_fc.h index f7fff7ed63c3..bd4fe21a23b8 100644 --- a/drivers/target/tcm_fc/tcm_fc.h +++ b/drivers/target/tcm_fc/tcm_fc.h @@ -187,4 +187,9 @@ void ft_dump_cmd(struct ft_cmd *, const char *caller);  ssize_t ft_format_wwn(char *, size_t, u64); +/* + * Underlying HW specific helper function + */ +void ft_invl_hw_context(struct ft_cmd *); +  #endif /* __TCM_FC_H__ */ diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c index 09df38b4610c..5654dc22f7ae 100644 --- a/drivers/target/tcm_fc/tfc_cmd.c +++ b/drivers/target/tcm_fc/tfc_cmd.c @@ -320,6 +320,7 @@ static void ft_recv_seq(struct fc_seq *sp, struct fc_frame *fp, void *arg)  	default:  		pr_debug("%s: unhandled frame r_ctl %x\n",  		       __func__, fh->fh_r_ctl); +		ft_invl_hw_context(cmd);  		fc_frame_free(fp);  		transport_generic_free_cmd(&cmd->se_cmd, 0, 0);  		break; diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index 8e2a46ddcccb..c37f4cd96452 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c @@ -213,62 +213,49 @@ void ft_recv_write_data(struct ft_cmd *cmd, struct fc_frame *fp)  	if (!(ntoh24(fh->fh_f_ctl) & FC_FC_REL_OFF))  		goto drop; +	f_ctl = ntoh24(fh->fh_f_ctl); +	ep = fc_seq_exch(seq); +	lport = ep->lp; +	if (cmd->was_ddp_setup) { +		BUG_ON(!ep); +		
BUG_ON(!lport); +	} +  	/* -	 * Doesn't expect even single byte of payload. Payload +	 * Doesn't expect payload if DDP is setup. Payload  	 * is expected to be copied directly to user buffers -	 * due to DDP (Large Rx offload) feature, hence -	 * BUG_ON if BUF is non-NULL +	 * due to DDP (Large Rx offload),  	 */  	buf = fc_frame_payload_get(fp, 1); -	if (cmd->was_ddp_setup && buf) { -		pr_debug("%s: When DDP was setup, not expected to" -				 "receive frame with payload, Payload shall be" -				 "copied directly to buffer instead of coming " -				 "via. legacy receive queues\n", __func__); -		BUG_ON(buf); -	} +	if (buf) +		pr_err("%s: xid 0x%x, f_ctl 0x%x, cmd->sg %p, " +				"cmd->sg_cnt 0x%x. DDP was setup" +				" hence not expected to receive frame with " +				"payload, Frame will be dropped if " +				"'Sequence Initiative' bit in f_ctl is " +				"not set\n", __func__, ep->xid, f_ctl, +				cmd->sg, cmd->sg_cnt); +	/* + 	 * Invalidate HW DDP context if it was setup for respective + 	 * command. Invalidation of HW DDP context is required in both + 	 * situations (success and error).  + 	 */ +	ft_invl_hw_context(cmd);  	/* -	 * If ft_cmd indicated 'ddp_setup', in that case only the last frame -	 * should come with 'TSI bit being set'. If 'TSI bit is not set and if -	 * data frame appears here, means error condition. In both the cases -	 * release the DDP context (ddp_put) and in error case, as well -	 * initiate error recovery mechanism. +	 * If "Sequence Initiative (TSI)" bit set in f_ctl, means last +	 * write data frame is received successfully where payload is +	 * posted directly to user buffer and only the last frame's +	 * header is posted in receive queue. +	 * +	 * If "Sequence Initiative (TSI)" bit is not set, means error +	 * condition w.r.t. DDP, hence drop the packet and let explicit +	 * ABORTS from other end of exchange timer trigger the recovery.  	 
*/ -	ep = fc_seq_exch(seq); -	if (cmd->was_ddp_setup) { -		BUG_ON(!ep); -		lport = ep->lp; -		BUG_ON(!lport); -	} -	if (cmd->was_ddp_setup && ep->xid != FC_XID_UNKNOWN) { -		f_ctl = ntoh24(fh->fh_f_ctl); -		/* -		 * If TSI bit set in f_ctl, means last write data frame is -		 * received successfully where payload is posted directly -		 * to user buffer and only the last frame's header is posted -		 * in legacy receive queue -		 */ -		if (f_ctl & FC_FC_SEQ_INIT) { /* TSI bit set in FC frame */ -			cmd->write_data_len = lport->tt.ddp_done(lport, -								ep->xid); -			goto last_frame; -		} else { -			/* -			 * Updating the write_data_len may be meaningless at -			 * this point, but just in case if required in future -			 * for debugging or any other purpose -			 */ -			pr_err("%s: Received frame with TSI bit not" -					" being SET, dropping the frame, " -					"cmd->sg <%p>, cmd->sg_cnt <0x%x>\n", -					__func__, cmd->sg, cmd->sg_cnt); -			cmd->write_data_len = lport->tt.ddp_done(lport, -							      ep->xid); -			lport->tt.seq_exch_abort(cmd->seq, 0); -			goto drop; -		} -	} +	if (f_ctl & FC_FC_SEQ_INIT) +		goto last_frame; +	else +		goto drop;  	rel_off = ntohl(fh->fh_parm_offset);  	frame_len = fr_len(fp); @@ -331,3 +318,39 @@ last_frame:  drop:  	fc_frame_free(fp);  } + +/* + * Handle and cleanup any HW specific resources if + * received ABORTS, errors, timeouts. 
+ */ +void ft_invl_hw_context(struct ft_cmd *cmd) +{ +	struct fc_seq *seq = cmd->seq; +	struct fc_exch *ep = NULL; +	struct fc_lport *lport = NULL; + +	BUG_ON(!cmd); + +	/* Cleanup the DDP context in HW if DDP was setup */ +	if (cmd->was_ddp_setup && seq) { +		ep = fc_seq_exch(seq); +		if (ep) { +			lport = ep->lp; +			if (lport && (ep->xid <= lport->lro_xid)) +				/* +				 * "ddp_done" trigger invalidation of HW +				 * specific DDP context +				 */ +				cmd->write_data_len = lport->tt.ddp_done(lport, +								      ep->xid); + +				/* +				 * Resetting same variable to indicate HW's +				 * DDP context has been invalidated to avoid +				 * re_invalidation of same context (context is +				 * identified using ep->xid) +				 */ +				cmd->was_ddp_setup = 0; +		} +	} +} diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index bf7c687519ef..f7f71b2d3101 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -14,11 +14,7 @@ menuconfig THERMAL  	  If you want this support, you should say Y or M here.  config THERMAL_HWMON -	bool "Hardware monitoring support" +	bool  	depends on THERMAL  	depends on HWMON=y || HWMON=THERMAL -	help -	  The generic thermal sysfs driver's hardware monitoring support -	  requires a 2.10.7/3.0.2 or later lm-sensors userspace. - -	  Say Y if your user-space is new enough. 
+	default y diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 0b1c82ad6805..708f8e92771a 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -420,6 +420,29 @@ thermal_cooling_device_trip_point_show(struct device *dev,  /* hwmon sys I/F */  #include <linux/hwmon.h> + +/* thermal zone devices with the same type share one hwmon device */ +struct thermal_hwmon_device { +	char type[THERMAL_NAME_LENGTH]; +	struct device *device; +	int count; +	struct list_head tz_list; +	struct list_head node; +}; + +struct thermal_hwmon_attr { +	struct device_attribute attr; +	char name[16]; +}; + +/* one temperature input for each thermal zone */ +struct thermal_hwmon_temp { +	struct list_head hwmon_node; +	struct thermal_zone_device *tz; +	struct thermal_hwmon_attr temp_input;	/* hwmon sys attr */ +	struct thermal_hwmon_attr temp_crit;	/* hwmon sys attr */ +}; +  static LIST_HEAD(thermal_hwmon_list);  static ssize_t @@ -437,9 +460,10 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)  	int ret;  	struct thermal_hwmon_attr *hwmon_attr  			= container_of(attr, struct thermal_hwmon_attr, attr); -	struct thermal_zone_device *tz -			= container_of(hwmon_attr, struct thermal_zone_device, +	struct thermal_hwmon_temp *temp +			= container_of(hwmon_attr, struct thermal_hwmon_temp,  				       temp_input); +	struct thermal_zone_device *tz = temp->tz;  	ret = tz->ops->get_temp(tz, &temperature); @@ -455,9 +479,10 @@ temp_crit_show(struct device *dev, struct device_attribute *attr,  {  	struct thermal_hwmon_attr *hwmon_attr  			= container_of(attr, struct thermal_hwmon_attr, attr); -	struct thermal_zone_device *tz -			= container_of(hwmon_attr, struct thermal_zone_device, +	struct thermal_hwmon_temp *temp +			= container_of(hwmon_attr, struct thermal_hwmon_temp,  				       temp_crit); +	struct thermal_zone_device *tz = temp->tz;  	long temperature;  	int ret; @@ -469,22 +494,54 @@ temp_crit_show(struct 
device *dev, struct device_attribute *attr,  } -static int -thermal_add_hwmon_sysfs(struct thermal_zone_device *tz) +static struct thermal_hwmon_device * +thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)  {  	struct thermal_hwmon_device *hwmon; -	int new_hwmon_device = 1; -	int result;  	mutex_lock(&thermal_list_lock);  	list_for_each_entry(hwmon, &thermal_hwmon_list, node)  		if (!strcmp(hwmon->type, tz->type)) { -			new_hwmon_device = 0;  			mutex_unlock(&thermal_list_lock); -			goto register_sys_interface; +			return hwmon; +		} +	mutex_unlock(&thermal_list_lock); + +	return NULL; +} + +/* Find the temperature input matching a given thermal zone */ +static struct thermal_hwmon_temp * +thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon, +			  const struct thermal_zone_device *tz) +{ +	struct thermal_hwmon_temp *temp; + +	mutex_lock(&thermal_list_lock); +	list_for_each_entry(temp, &hwmon->tz_list, hwmon_node) +		if (temp->tz == tz) { +			mutex_unlock(&thermal_list_lock); +			return temp;  		}  	mutex_unlock(&thermal_list_lock); +	return NULL; +} + +static int +thermal_add_hwmon_sysfs(struct thermal_zone_device *tz) +{ +	struct thermal_hwmon_device *hwmon; +	struct thermal_hwmon_temp *temp; +	int new_hwmon_device = 1; +	int result; + +	hwmon = thermal_hwmon_lookup_by_type(tz); +	if (hwmon) { +		new_hwmon_device = 0; +		goto register_sys_interface; +	} +  	hwmon = kzalloc(sizeof(struct thermal_hwmon_device), GFP_KERNEL);  	if (!hwmon)  		return -ENOMEM; @@ -502,30 +559,36 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)  		goto free_mem;   register_sys_interface: -	tz->hwmon = hwmon; +	temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL); +	if (!temp) { +		result = -ENOMEM; +		goto unregister_name; +	} + +	temp->tz = tz;  	hwmon->count++; -	snprintf(tz->temp_input.name, THERMAL_NAME_LENGTH, +	snprintf(temp->temp_input.name, THERMAL_NAME_LENGTH,  		 "temp%d_input", hwmon->count); -	tz->temp_input.attr.attr.name = 
tz->temp_input.name; -	tz->temp_input.attr.attr.mode = 0444; -	tz->temp_input.attr.show = temp_input_show; -	sysfs_attr_init(&tz->temp_input.attr.attr); -	result = device_create_file(hwmon->device, &tz->temp_input.attr); +	temp->temp_input.attr.attr.name = temp->temp_input.name; +	temp->temp_input.attr.attr.mode = 0444; +	temp->temp_input.attr.show = temp_input_show; +	sysfs_attr_init(&temp->temp_input.attr.attr); +	result = device_create_file(hwmon->device, &temp->temp_input.attr);  	if (result) -		goto unregister_name; +		goto free_temp_mem;  	if (tz->ops->get_crit_temp) {  		unsigned long temperature;  		if (!tz->ops->get_crit_temp(tz, &temperature)) { -			snprintf(tz->temp_crit.name, THERMAL_NAME_LENGTH, +			snprintf(temp->temp_crit.name, THERMAL_NAME_LENGTH,  				"temp%d_crit", hwmon->count); -			tz->temp_crit.attr.attr.name = tz->temp_crit.name; -			tz->temp_crit.attr.attr.mode = 0444; -			tz->temp_crit.attr.show = temp_crit_show; -			sysfs_attr_init(&tz->temp_crit.attr.attr); +			temp->temp_crit.attr.attr.name = temp->temp_crit.name; +			temp->temp_crit.attr.attr.mode = 0444; +			temp->temp_crit.attr.show = temp_crit_show; +			sysfs_attr_init(&temp->temp_crit.attr.attr);  			result = device_create_file(hwmon->device, -						    &tz->temp_crit.attr); +						    &temp->temp_crit.attr);  			if (result)  				goto unregister_input;  		} @@ -534,13 +597,15 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)  	mutex_lock(&thermal_list_lock);  	if (new_hwmon_device)  		list_add_tail(&hwmon->node, &thermal_hwmon_list); -	list_add_tail(&tz->hwmon_node, &hwmon->tz_list); +	list_add_tail(&temp->hwmon_node, &hwmon->tz_list);  	mutex_unlock(&thermal_list_lock);  	return 0;   unregister_input: -	device_remove_file(hwmon->device, &tz->temp_input.attr); +	device_remove_file(hwmon->device, &temp->temp_input.attr); + free_temp_mem: +	kfree(temp);   unregister_name:  	if (new_hwmon_device) {  		device_remove_file(hwmon->device, &dev_attr_name); @@ -556,15 +621,30 @@ 
thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)  static void  thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)  { -	struct thermal_hwmon_device *hwmon = tz->hwmon; +	struct thermal_hwmon_device *hwmon; +	struct thermal_hwmon_temp *temp; + +	hwmon = thermal_hwmon_lookup_by_type(tz); +	if (unlikely(!hwmon)) { +		/* Should never happen... */ +		dev_dbg(&tz->device, "hwmon device lookup failed!\n"); +		return; +	} + +	temp = thermal_hwmon_lookup_temp(hwmon, tz); +	if (unlikely(!temp)) { +		/* Should never happen... */ +		dev_dbg(&tz->device, "temperature input lookup failed!\n"); +		return; +	} -	tz->hwmon = NULL; -	device_remove_file(hwmon->device, &tz->temp_input.attr); +	device_remove_file(hwmon->device, &temp->temp_input.attr);  	if (tz->ops->get_crit_temp) -		device_remove_file(hwmon->device, &tz->temp_crit.attr); +		device_remove_file(hwmon->device, &temp->temp_crit.attr);  	mutex_lock(&thermal_list_lock); -	list_del(&tz->hwmon_node); +	list_del(&temp->hwmon_node); +	kfree(temp);  	if (!list_empty(&hwmon->tz_list)) {  		mutex_unlock(&thermal_list_lock);  		return; diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 69407e72aac1..278aeaa92505 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -336,7 +336,7 @@ config BACKLIGHT_PCF50633  	  enable its driver.  
config BACKLIGHT_AAT2870 -	bool "AnalogicTech AAT2870 Backlight" +	tristate "AnalogicTech AAT2870 Backlight"  	depends on BACKLIGHT_CLASS_DEVICE && MFD_AAT2870_CORE  	help  	  If you have a AnalogicTech AAT2870 say Y to enable the diff --git a/drivers/video/backlight/aat2870_bl.c b/drivers/video/backlight/aat2870_bl.c index 4952a617563d..331f1ef1dad5 100644 --- a/drivers/video/backlight/aat2870_bl.c +++ b/drivers/video/backlight/aat2870_bl.c @@ -44,7 +44,7 @@ static inline int aat2870_brightness(struct aat2870_bl_driver_data *aat2870_bl,  	struct backlight_device *bd = aat2870_bl->bd;  	int val; -	val = brightness * aat2870_bl->max_current; +	val = brightness * (aat2870_bl->max_current - 1);  	val /= bd->props.max_brightness;  	return val; @@ -158,10 +158,10 @@ static int aat2870_bl_probe(struct platform_device *pdev)  	props.type = BACKLIGHT_RAW;  	bd = backlight_device_register("aat2870-backlight", &pdev->dev,  				       aat2870_bl, &aat2870_bl_ops, &props); -	if (!bd) { +	if (IS_ERR(bd)) {  		dev_err(&pdev->dev,  			"Failed allocate memory for backlight device\n"); -		ret = -ENOMEM; +		ret = PTR_ERR(bd);  		goto out_kfree;  	} @@ -175,7 +175,7 @@ static int aat2870_bl_probe(struct platform_device *pdev)  	else  		aat2870_bl->channels = AAT2870_BL_CH_ALL; -	if (pdata->max_brightness > 0) +	if (pdata->max_current > 0)  		aat2870_bl->max_current = pdata->max_current;  	else  		aat2870_bl->max_current = AAT2870_CURRENT_27_9; diff --git a/fs/Kconfig b/fs/Kconfig index 19891aab9c6e..9fe0b349f4cd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL  	select TMPFS_XATTR  	select GENERIC_ACL  	help -	  POSIX Access Control Lists (ACLs) support permissions for users and -	  groups beyond the owner/group/world scheme. 
+	  POSIX Access Control Lists (ACLs) support additional access rights +	  for users and groups beyond the standard owner/group/world scheme, +	  and this option selects support for ACLs specifically for tmpfs +	  filesystems. + +	  If you've selected TMPFS, it's possible that you'll also need +	  this option as there are a number of Linux distros that require +	  POSIX ACL support under /dev for certain features to work properly. +	  For example, some distros need this feature for ALSA-related /dev +	  files for sound to work properly.  In short, if you're not sure, +	  say Y.  	  To learn more about Access Control Lists, visit the POSIX ACLs for  	  Linux website <http://acl.bestbits.at/>. -	  If you don't know what Access Control Lists are, say N. -  config TMPFS_XATTR  	bool "Tmpfs extended attributes"  	depends on TMPFS diff --git a/fs/dcache.c b/fs/dcache.c index 2347cdb15abb..c83cae19161e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -795,6 +795,7 @@ relock:  /**   * prune_dcache_sb - shrink the dcache + * @sb: superblock   * @nr_to_scan: number of entries to try to free   *   * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e2d88baf91d3..4687fea0c00f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -124,7 +124,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)  {  	void *ret; -	ret = kmalloc(size, flags); +	ret = kzalloc(size, flags);  	if (!ret)  		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);  	return ret; diff --git a/fs/stack.c b/fs/stack.c index 4a6f7f440658..b4f2ab48a61f 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)  	 *  	 * We don't actually know what locking is used at the lower level;  	 * but if it's a filesystem that supports quotas, it will be using -	 * i_lock as in inode_add_bytes().  
tmpfs uses other locking, and -	 * its 32-bit is (just) able to exceed 2TB i_size with the aid of -	 * holes; but its i_blocks cannot carry into the upper long without -	 * almost 2TB swap - let's ignore that case. +	 * i_lock as in inode_add_bytes().  	 */  	if (sizeof(i_blocks) > sizeof(long))  		spin_lock(&src->i_lock); diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 3090471b2a5e..e49c36d38d7e 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -128,7 +128,7 @@ extern int is_dock_device(acpi_handle handle);  extern int register_dock_notifier(struct notifier_block *nb);  extern void unregister_dock_notifier(struct notifier_block *nb);  extern int register_hotplug_dock_device(acpi_handle handle, -					struct acpi_dock_ops *ops, +					const struct acpi_dock_ops *ops,  					void *context);  extern void unregister_hotplug_dock_device(acpi_handle handle);  #else diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 2ed0a8486c19..f554a9313b43 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@  /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION                 0x20110413 +#define ACPI_CA_VERSION                 0x20110623  #include "actypes.h"  #include "actbl.h" @@ -69,6 +69,7 @@ extern u32 acpi_gbl_trace_flags;  extern u32 acpi_gbl_enable_aml_debug_object;  extern u8 acpi_gbl_copy_dsdt_locally;  extern u8 acpi_gbl_truncate_io_addresses; +extern u8 acpi_gbl_disable_auto_repair;  extern u32 acpi_current_gpe_count;  extern struct acpi_table_fadt acpi_gbl_FADT; diff --git a/include/acpi/apei.h b/include/acpi/apei.h index e67b523a50e1..51a527d24a8a 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -18,6 +18,11 @@  extern int hest_disable;  extern int erst_disable; +#ifdef CONFIG_ACPI_APEI_GHES +extern int ghes_disable; +#else +#define ghes_disable 1 +#endif  #ifdef CONFIG_ACPI_APEI  void __init acpi_hest_init(void); diff --git 
a/include/acpi/processor.h b/include/acpi/processor.h index ba4928cae473..67055f180330 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -337,7 +337,7 @@ extern struct cpuidle_driver acpi_idle_driver;  /* in processor_thermal.c */  int acpi_processor_get_limit_info(struct acpi_processor *pr); -extern struct thermal_cooling_device_ops processor_cooling_ops; +extern const struct thermal_cooling_device_ops processor_cooling_ops;  #ifdef CONFIG_CPU_FREQ  void acpi_thermal_cpufreq_init(void);  void acpi_thermal_cpufreq_exit(void); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1deb2a73c2da..6001b4da39dd 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -238,7 +238,6 @@ extern int acpi_paddr_to_node(u64 start_addr, u64 size);  extern int pnpacpi_disabled;  #define PXM_INVAL	(-1) -#define NID_INVAL	(-1)  int acpi_check_resource_conflict(const struct resource *res); @@ -280,6 +279,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);  #define OSC_SB_CPUHP_OST_SUPPORT	8  #define OSC_SB_APEI_SUPPORT		16 +extern bool osc_sb_apei_support_acked; +  /* PCI defined _OSC bits */  /* _OSC DW1 Definition (OS Support Fields) */  #define OSC_EXT_PCI_CONFIG_SUPPORT		1 diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3bac44cce142..7ad634501e48 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -146,6 +146,7 @@ extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);  extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);  extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))  #define BITMAP_LAST_WORD_MASK(nbits)					\  (									\  	((nbits) % BITS_PER_LONG) ?					
\ diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 36719ead50e8..b51629e15cfc 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -122,6 +122,8 @@ struct cpuidle_driver {  };  #ifdef CONFIG_CPU_IDLE +extern void disable_cpuidle(void); +extern int cpuidle_idle_call(void);  extern int cpuidle_register_driver(struct cpuidle_driver *drv);  struct cpuidle_driver *cpuidle_get_driver(void); @@ -135,6 +137,8 @@ extern int cpuidle_enable_device(struct cpuidle_device *dev);  extern void cpuidle_disable_device(struct cpuidle_device *dev);  #else +static inline void disable_cpuidle(void) { } +static inline int cpuidle_idle_call(void) { return -ENODEV; }  static inline int cpuidle_register_driver(struct cpuidle_driver *drv)  {return -ENODEV; } diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 3ff060ac7810..c6f996f2abb6 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -25,10 +25,6 @@ struct fault_attr {  	unsigned long reject_end;  	unsigned long count; - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -	struct dentry *dir; -#endif  };  #define FAULT_ATTR_INITIALIZER {				\ @@ -45,19 +41,15 @@ bool should_fail(struct fault_attr *attr, ssize_t size);  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -int init_fault_attr_dentries(struct fault_attr *attr, const char *name); -void cleanup_fault_attr_dentries(struct fault_attr *attr); +struct dentry *fault_create_debugfs_attr(const char *name, +			struct dentry *parent, struct fault_attr *attr);  #else /* CONFIG_FAULT_INJECTION_DEBUG_FS */ -static inline int init_fault_attr_dentries(struct fault_attr *attr, -					  const char *name) -{ -	return -ENODEV; -} - -static inline void cleanup_fault_attr_dentries(struct fault_attr *attr) +static inline struct dentry *fault_create_debugfs_attr(const char *name, +			struct dentry *parent, struct fault_attr *attr)  { +	return ERR_PTR(-ENODEV);  }  #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git 
a/include/linux/genalloc.h b/include/linux/genalloc.h index 5bbebda78b02..5e98eeb2af3b 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -1,8 +1,26 @@  /* - * Basic general purpose allocator for managing special purpose memory - * not managed by the regular kmalloc/kfree interface. - * Uses for this includes on-device special memory, uncached memory - * etc. + * Basic general purpose allocator for managing special purpose + * memory, for example, memory that is not managed by the regular + * kmalloc/kfree interface.  Uses for this includes on-device special + * memory, uncached memory etc. + * + * It is safe to use the allocator in NMI handlers and other special + * unblockable contexts that could otherwise deadlock on locks.  This + * is implemented by using atomic operations and retries on any + * conflicts.  The disadvantage is that there may be livelocks in + * extreme cases.  For better scalability, one allocator can be used + * for each CPU. + * + * The lockless operation only works if there is enough memory + * available.  If new memory is added to the pool a lock has to be + * still taken.  So any user relying on locklessness has to ensure + * that sufficient memory is preallocated. + * + * The basic atomic operation of this allocator is cmpxchg on long. + * On architectures that don't have NMI-safe cmpxchg implementation, + * the allocator can NOT be used in NMI handler.  So code uses the + * allocator in NMI handler should depend on + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.   *   * This source code is licensed under the GNU General Public License,   * Version 2.  See the file COPYING for more details. @@ -15,7 +33,7 @@   *  General purpose special memory pool descriptor.   
*/  struct gen_pool { -	rwlock_t lock; +	spinlock_t lock;  	struct list_head chunks;	/* list of chunks in this pool */  	int min_alloc_order;		/* minimum allocation order */  }; @@ -24,8 +42,8 @@ struct gen_pool {   *  General purpose special memory pool chunk descriptor.   */  struct gen_pool_chunk { -	spinlock_t lock;  	struct list_head next_chunk;	/* next chunk in pool */ +	atomic_t avail;  	phys_addr_t phys_addr;		/* physical starting address of memory chunk */  	unsigned long start_addr;	/* starting address of memory chunk */  	unsigned long end_addr;		/* ending address of memory chunk */ @@ -56,4 +74,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr,  extern void gen_pool_destroy(struct gen_pool *);  extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);  extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); +extern void gen_pool_for_each_chunk(struct gen_pool *, +	void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); +extern size_t gen_pool_avail(struct gen_pool *); +extern size_t gen_pool_size(struct gen_pool *);  #endif /* __GENALLOC_H__ */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index cb4089254f01..3a76faf6a3ee 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -92,7 +92,7 @@ struct vm_area_struct;   */  #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 23	/* Room for 23 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 24	/* Room for N __GFP_FOO bits */  #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))  /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/idr.h b/include/linux/idr.h index 13a801f3d028..255491cf522e 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -146,6 +146,10 @@ void ida_remove(struct ida *ida, int id);  void ida_destroy(struct ida *ida);  void ida_init(struct ida *ida); +int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, +		 
  gfp_t gfp_mask); +void ida_simple_remove(struct ida *ida, unsigned int id); +  void __init idr_init_cache(void);  #endif /* __IDR_H__ */ diff --git a/include/linux/llist.h b/include/linux/llist.h new file mode 100644 index 000000000000..aa0c8b5b3cd0 --- /dev/null +++ b/include/linux/llist.h @@ -0,0 +1,126 @@ +#ifndef LLIST_H +#define LLIST_H +/* + * Lock-less NULL terminated single linked list + * + * If there are multiple producers and multiple consumers, llist_add + * can be used in producers and llist_del_all can be used in + * consumers.  They can work simultaneously without lock.  But + * llist_del_first can not be used here.  Because llist_del_first + * depends on list->first->next does not changed if list->first is not + * changed during its operation, but llist_del_first, llist_add, + * llist_add (or llist_del_all, llist_add, llist_add) sequence in + * another consumer may violate that. + * + * If there are multiple producers and one consumer, llist_add can be + * used in producers and llist_del_all or llist_del_first can be used + * in the consumer. + * + * This can be summarized as follow: + * + *           |   add    | del_first |  del_all + * add       |    -     |     -     |     - + * del_first |          |     L     |     L + * del_all   |          |           |     - + * + * Where "-" stands for no lock is needed, while "L" stands for lock + * is needed. + * + * The list entries deleted via llist_del_all can be traversed with + * traversing function such as llist_for_each etc.  But the list + * entries can not be traversed safely before deleted from the list. + * The order of deleted entries is from the newest to the oldest added + * one.  If you want to traverse from the oldest to the newest, you + * must reverse the order by yourself before traversing. + * + * The basic atomic operation of this list is cmpxchg on long.  On + * architectures that don't have NMI-safe cmpxchg implementation, the + * list can NOT be used in NMI handler.  
So code uses the list in NMI + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. + */ + +struct llist_head { +	struct llist_node *first; +}; + +struct llist_node { +	struct llist_node *next; +}; + +#define LLIST_HEAD_INIT(name)	{ NULL } +#define LLIST_HEAD(name)	struct llist_head name = LLIST_HEAD_INIT(name) + +/** + * init_llist_head - initialize lock-less list head + * @head:	the head for your lock-less list + */ +static inline void init_llist_head(struct llist_head *list) +{ +	list->first = NULL; +} + +/** + * llist_entry - get the struct of this entry + * @ptr:	the &struct llist_node pointer. + * @type:	the type of the struct this is embedded in. + * @member:	the name of the llist_node within the struct. + */ +#define llist_entry(ptr, type, member)		\ +	container_of(ptr, type, member) + +/** + * llist_for_each - iterate over some deleted entries of a lock-less list + * @pos:	the &struct llist_node to use as a loop cursor + * @node:	the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry.  If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each(pos, node)			\ +	for ((pos) = (node); pos; (pos) = (pos)->next) + +/** + * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type + * @pos:	the type * to use as a loop cursor. + * @node:	the fist entry of deleted list entries. + * @member:	the name of the llist_node with the struct. + * + * In general, some entries of the lock-less list can be traversed + * safely only after being removed from list, so start with an entry + * instead of list head. 
+ * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry.  If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_entry(pos, node, member)				\ +	for ((pos) = llist_entry((node), typeof(*(pos)), member);	\ +	     &(pos)->member != NULL;					\ +	     (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) + +/** + * llist_empty - tests whether a lock-less list is empty + * @head:	the list to test + * + * Not guaranteed to be accurate or up to date.  Just a quick way to + * test whether the list is empty without deleting something from the + * list. + */ +static inline int llist_empty(const struct llist_head *head) +{ +	return ACCESS_ONCE(head->first) == NULL; +} + +void llist_add(struct llist_node *new, struct llist_head *head); +void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, +		     struct llist_head *head); +struct llist_node *llist_del_first(struct llist_head *head); +struct llist_node *llist_del_all(struct llist_head *head); +#endif /* LLIST_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b96600786913..3b535db00a94 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -86,8 +86,6 @@ extern void mem_cgroup_uncharge_end(void);  extern void mem_cgroup_uncharge_page(struct page *page);  extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern int mem_cgroup_shmem_charge_fallback(struct page *page, -			struct mm_struct *mm, gfp_t gfp_mask);  extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); @@ -225,12 +223,6 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page)  {  } -static inline int mem_cgroup_shmem_charge_fallback(struct page *page, -			struct mm_struct 
*mm, gfp_t gfp_mask) -{ -	return 0; -} -  static inline void mem_cgroup_add_lru_list(struct page *page, int lru)  {  } diff --git a/include/linux/mfd/aat2870.h b/include/linux/mfd/aat2870.h index 89212df05622..f7316c29bdec 100644 --- a/include/linux/mfd/aat2870.h +++ b/include/linux/mfd/aat2870.h @@ -89,7 +89,7 @@ enum aat2870_id {  /* Backlight current magnitude (mA) */  enum aat2870_current { -	AAT2870_CURRENT_0_45, +	AAT2870_CURRENT_0_45 = 1,  	AAT2870_CURRENT_0_90,  	AAT2870_CURRENT_1_80,  	AAT2870_CURRENT_2_70, diff --git a/include/linux/mm.h b/include/linux/mm.h index 3172a1c0f08e..f2690cf49827 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1600,6 +1600,7 @@ enum mf_flags {  };  extern void memory_failure(unsigned long pfn, int trapno);  extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);  extern int unpoison_memory(unsigned long pfn);  extern int sysctl_memory_failure_early_kill;  extern int sysctl_memory_failure_recovery; diff --git a/include/linux/of.h b/include/linux/of.h index 0085bb01c041..bc3dc6399547 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -68,6 +68,7 @@ struct device_node {  /* Pointer for first entry in chain of all nodes. 
*/  extern struct device_node *allnodes;  extern struct device_node *of_chosen; +extern struct device_node *of_aliases;  extern rwlock_t devtree_lock;  static inline bool of_have_populated_dt(void) @@ -209,6 +210,9 @@ extern int of_device_is_available(const struct device_node *device);  extern const void *of_get_property(const struct device_node *node,  				const char *name,  				int *lenp); +#define for_each_property(pp, properties) \ +	for (pp = properties; pp != NULL; pp = pp->next) +  extern int of_n_addr_cells(struct device_node *np);  extern int of_n_size_cells(struct device_node *np);  extern const struct of_device_id *of_match_node( @@ -221,6 +225,10 @@ extern int of_parse_phandles_with_args(struct device_node *np,  	const char *list_name, const char *cells_name, int index,  	struct device_node **out_node, const void **out_args); +extern void *early_init_dt_alloc_memory_arch(u64 size, u64 align); +extern void of_alias_scan(void); +extern int of_alias_get_id(struct device_node *np, const char *stem); +  extern int of_machine_is_compatible(const char *compat);  extern int prom_add_property(struct device_node* np, struct property* prop); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index c84d900fbbb3..b74b74ffe0e7 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -97,7 +97,6 @@ extern void early_init_dt_check_for_initrd(unsigned long node);  extern int early_init_dt_scan_memory(unsigned long node, const char *uname,  				     int depth, void *data);  extern void early_init_dt_add_memory_arch(u64 base, u64 size); -extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);  extern u64 dt_mem_next_cell(int s, __be32 **cellp);  /* diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 23241c2fecce..9d4539c52e53 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -39,7 +39,15 @@   * when it is shrunk, before we rcu free the node. See shrink code for   * details.   
*/ -#define RADIX_TREE_INDIRECT_PTR	1 +#define RADIX_TREE_INDIRECT_PTR		1 +/* + * A common use of the radix tree is to store pointers to struct pages; + * but shmem/tmpfs needs also to store swap entries in the same tree: + * those are marked as exceptional entries to distinguish them. + * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. + */ +#define RADIX_TREE_EXCEPTIONAL_ENTRY	2 +#define RADIX_TREE_EXCEPTIONAL_SHIFT	2  #define radix_tree_indirect_to_ptr(ptr) \  	radix_tree_indirect_to_ptr((void __force *)(ptr)) @@ -174,6 +182,28 @@ static inline int radix_tree_deref_retry(void *arg)  }  /** + * radix_tree_exceptional_entry	- radix_tree_deref_slot gave exceptional entry? + * @arg:	value returned by radix_tree_deref_slot + * Returns:	0 if well-aligned pointer, non-0 if exceptional entry. + */ +static inline int radix_tree_exceptional_entry(void *arg) +{ +	/* Not unlikely because radix_tree_exception often tested first */ +	return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY; +} + +/** + * radix_tree_exception	- radix_tree_deref_slot returned either exception? + * @arg:	value returned by radix_tree_deref_slot + * Returns:	0 if well-aligned pointer, non-0 if either kind of exception. + */ +static inline int radix_tree_exception(void *arg) +{ +	return unlikely((unsigned long)arg & +		(RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY)); +} + +/**   * radix_tree_replace_slot	- replace item in a slot   * @pslot:	pointer to slot, returned by radix_tree_lookup_slot   * @item:	new item to store in the slot. 
@@ -194,8 +224,8 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);  unsigned int  radix_tree_gang_lookup(struct radix_tree_root *root, void **results,  			unsigned long first_index, unsigned int max_items); -unsigned int -radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, +unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, +			void ***results, unsigned long *indices,  			unsigned long first_index, unsigned int max_items);  unsigned long radix_tree_next_hole(struct radix_tree_root *root,  				unsigned long index, unsigned long max_scan); @@ -222,6 +252,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,  		unsigned long nr_to_tag,  		unsigned int fromtag, unsigned int totag);  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);  static inline void radix_tree_preload_end(void)  { diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index aa08fa8fd79b..9291ac3cc627 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -8,22 +8,15 @@  /* inode in-kernel data */ -#define SHMEM_NR_DIRECT 16 - -#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t)) -  struct shmem_inode_info {  	spinlock_t		lock;  	unsigned long		flags;  	unsigned long		alloced;	/* data pages alloced to file */ -	unsigned long		swapped;	/* subtotal assigned to swap */ -	unsigned long		next_index;	/* highest alloced index + 1 */ -	struct shared_policy	policy;		/* NUMA memory alloc policy */ -	struct page		*i_indirect;	/* top indirect blocks page */  	union { -		swp_entry_t	i_direct[SHMEM_NR_DIRECT]; /* first blocks */ -		char		inline_symlink[SHMEM_SYMLINK_INLINE_LEN]; +		unsigned long	swapped;	/* subtotal assigned to swap */ +		char		*symlink;	/* unswappable short symlink */  	}; +	struct shared_policy	policy;		/* NUMA memory alloc policy */  	struct list_head	swaplist;	
/* chain of maybes on swap */  	struct list_head	xattr_list;	/* list of shmem_xattr */  	struct inode		vfs_inode; @@ -49,7 +42,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)  /*   * Functions in mm/shmem.c called directly from elsewhere:   */ -extern int init_tmpfs(void); +extern int shmem_init(void);  extern int shmem_fill_super(struct super_block *sb, void *data, int silent);  extern struct file *shmem_file_setup(const char *name,  					loff_t size, unsigned long flags); @@ -59,8 +52,6 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,  					pgoff_t index, gfp_t gfp_mask);  extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);  extern int shmem_unuse(swp_entry_t entry, struct page *page); -extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, -					struct page **pagep, swp_entry_t *ent);  static inline struct page *shmem_read_mapping_page(  				struct address_space *mapping, pgoff_t index) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cd42e30b7c6e..2189d3ffc85d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -1,3 +1,8 @@ +#ifndef _LINUX_SWAPOPS_H +#define _LINUX_SWAPOPS_H + +#include <linux/radix-tree.h> +  /*   * swapcache pages are stored in the swapper_space radix tree.  
We want to   * get good packing density in that tree, so the index should be dense in @@ -76,6 +81,22 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)  	return __swp_entry_to_pte(arch_entry);  } +static inline swp_entry_t radix_to_swp_entry(void *arg) +{ +	swp_entry_t entry; + +	entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT; +	return entry; +} + +static inline void *swp_to_radix_entry(swp_entry_t entry) +{ +	unsigned long value; + +	value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT; +	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); +} +  #ifdef CONFIG_MIGRATION  static inline swp_entry_t make_migration_entry(struct page *page, int write)  { @@ -169,3 +190,5 @@ static inline int non_swap_entry(swp_entry_t entry)  	return 0;  }  #endif + +#endif /* _LINUX_SWAPOPS_H */ diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d3ec89fb4122..47b4a27e6e97 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -85,22 +85,6 @@ struct thermal_cooling_device {  				((long)t-2732+5)/10 : ((long)t-2732-5)/10)  #define CELSIUS_TO_KELVIN(t)	((t)*10+2732) -#if defined(CONFIG_THERMAL_HWMON) -/* thermal zone devices with the same type share one hwmon device */ -struct thermal_hwmon_device { -	char type[THERMAL_NAME_LENGTH]; -	struct device *device; -	int count; -	struct list_head tz_list; -	struct list_head node; -}; - -struct thermal_hwmon_attr { -	struct device_attribute attr; -	char name[16]; -}; -#endif -  struct thermal_zone_device {  	int id;  	char type[THERMAL_NAME_LENGTH]; @@ -120,12 +104,6 @@ struct thermal_zone_device {  	struct mutex lock;	/* protect cooling devices list */  	struct list_head node;  	struct delayed_work poll_queue; -#if defined(CONFIG_THERMAL_HWMON) -	struct list_head hwmon_node; -	struct thermal_hwmon_device *hwmon; -	struct thermal_hwmon_attr temp_input;	/* hwmon sys attr */ -	struct thermal_hwmon_attr temp_crit;	/* hwmon sys attr */ -#endif  };  /* Adding event notification support elements 
*/  #define THERMAL_GENL_FAMILY_NAME                "thermal_event" diff --git a/init/main.c b/init/main.c index d7211faed2ad..9c51ee7adf3d 100644 --- a/init/main.c +++ b/init/main.c @@ -369,9 +369,12 @@ static noinline void __init_refok rest_init(void)  	init_idle_bootup_task(current);  	preempt_enable_no_resched();  	schedule(); -	preempt_disable(); + +	/* At this point, we can enable user mode helper functionality */ +	usermodehelper_enable();  	/* Call into cpu_idle with preempt disabled */ +	preempt_disable();  	cpu_idle();  } @@ -715,7 +718,7 @@ static void __init do_basic_setup(void)  {  	cpuset_init_smp();  	usermodehelper_init(); -	init_tmpfs(); +	shmem_init();  	driver_init();  	init_irq_proc();  	do_ctors(); diff --git a/ipc/shm.c b/ipc/shm.c index 9fb044f3b345..b5bae9d945b6 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -294,7 +294,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)  void shm_destroy_orphaned(struct ipc_namespace *ns)  {  	down_write(&shm_ids(ns).rw_mutex); -	if (&shm_ids(ns).in_use) +	if (shm_ids(ns).in_use)  		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);  	up_write(&shm_ids(ns).rw_mutex);  } @@ -304,9 +304,12 @@ void exit_shm(struct task_struct *task)  {  	struct ipc_namespace *ns = task->nsproxy->ipc_ns; +	if (shm_ids(ns).in_use == 0) +		return; +  	/* Destroy all already created segments, but not mapped yet */  	down_write(&shm_ids(ns).rw_mutex); -	if (&shm_ids(ns).in_use) +	if (shm_ids(ns).in_use)  		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);  	up_write(&shm_ids(ns).rw_mutex);  } diff --git a/kernel/kmod.c b/kernel/kmod.c index 47613dfb7b28..ddc7644c1305 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -274,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work)   * (used for preventing user land processes from being created after the user   * land has been frozen during a system-wide hibernation or suspend operation).   
*/ -static int usermodehelper_disabled; +static int usermodehelper_disabled = 1;  /* Number of helpers running */  static atomic_t running_helpers = ATOMIC_INIT(0); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d1db2880d1cf..e19ce1454ee1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)  	if (!cpumask_subset(mask, cpu_possible_mask))  		return -EINVAL; -	s = NULL;  	if (isadd == REGISTER) {  		for_each_cpu(cpu, mask) { -			if (!s) -				s = kmalloc_node(sizeof(struct listener), -						 GFP_KERNEL, cpu_to_node(cpu)); +			s = kmalloc_node(sizeof(struct listener), +					GFP_KERNEL, cpu_to_node(cpu));  			if (!s)  				goto cleanup; +  			s->pid = pid; -			INIT_LIST_HEAD(&s->list);  			s->valid = 1;  			listeners = &per_cpu(listener_array, cpu);  			down_write(&listeners->sem); -			list_for_each_entry_safe(s2, tmp, &listeners->list, list) { -				if (s2->pid == pid) -					goto next_cpu; +			list_for_each_entry(s2, &listeners->list, list) { +				if (s2->pid == pid && s2->valid) +					goto exists;  			}  			list_add(&s->list, &listeners->list);  			s = NULL; -next_cpu: +exists:  			up_write(&listeners->sem); +			kfree(s); /* nop if NULL */  		} -		kfree(s);  		return 0;  	} diff --git a/lib/Kconfig b/lib/Kconfig index 32f3e5ae2be5..6c695ff9caba 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -276,4 +276,7 @@ config CORDIC  	  so its calculations are in fixed point. Modules can select this  	  when they require this function. Module will be called cordic. 
+config LLIST +	bool +  endmenu diff --git a/lib/Makefile b/lib/Makefile index 892f4e282ea1..6457af4a7caf 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -115,6 +115,8 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o  obj-$(CONFIG_CORDIC) += cordic.o +obj-$(CONFIG_LLIST) += llist.o +  hostprogs-y	:= gen_crc32table  clean-files	:= crc32table.h diff --git a/lib/bitmap.c b/lib/bitmap.c index 37ef4b048795..2f4412e4d071 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -271,8 +271,6 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)  }  EXPORT_SYMBOL(__bitmap_weight); -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) -  void bitmap_set(unsigned long *map, int start, int nr)  {  	unsigned long *p = map + BIT_WORD(start); diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 2577b121c7c1..f193b7796449 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -197,21 +197,15 @@ static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,  	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);  } -void cleanup_fault_attr_dentries(struct fault_attr *attr) -{ -	debugfs_remove_recursive(attr->dir); -} - -int init_fault_attr_dentries(struct fault_attr *attr, const char *name) +struct dentry *fault_create_debugfs_attr(const char *name, +			struct dentry *parent, struct fault_attr *attr)  {  	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;  	struct dentry *dir; -	dir = debugfs_create_dir(name, NULL); +	dir = debugfs_create_dir(name, parent);  	if (!dir) -		return -ENOMEM; - -	attr->dir = dir; +		return ERR_PTR(-ENOMEM);  	if (!debugfs_create_ul("probability", mode, dir, &attr->probability))  		goto fail; @@ -243,11 +237,11 @@ int init_fault_attr_dentries(struct fault_attr *attr, const char *name)  #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ -	return 0; +	return dir;  fail: -	debugfs_remove_recursive(attr->dir); +	debugfs_remove_recursive(dir); -	return -ENOMEM; +	return ERR_PTR(-ENOMEM);  }  #endif /* 
CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 577ddf805975..f352cc42f4f8 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -1,8 +1,26 @@  /* - * Basic general purpose allocator for managing special purpose memory - * not managed by the regular kmalloc/kfree interface. - * Uses for this includes on-device special memory, uncached memory - * etc. + * Basic general purpose allocator for managing special purpose + * memory, for example, memory that is not managed by the regular + * kmalloc/kfree interface.  Uses for this includes on-device special + * memory, uncached memory etc. + * + * It is safe to use the allocator in NMI handlers and other special + * unblockable contexts that could otherwise deadlock on locks.  This + * is implemented by using atomic operations and retries on any + * conflicts.  The disadvantage is that there may be livelocks in + * extreme cases.  For better scalability, one allocator can be used + * for each CPU. + * + * The lockless operation only works if there is enough memory + * available.  If new memory is added to the pool a lock has to be + * still taken.  So any user relying on locklessness has to ensure + * that sufficient memory is preallocated. + * + * The basic atomic operation of this allocator is cmpxchg on long. + * On architectures that don't have NMI-safe cmpxchg implementation, + * the allocator can NOT be used in NMI handler.  So code uses the + * allocator in NMI handler should depend on + * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.   
*   * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>   * @@ -13,8 +31,109 @@  #include <linux/slab.h>  #include <linux/module.h>  #include <linux/bitmap.h> +#include <linux/rculist.h> +#include <linux/interrupt.h>  #include <linux/genalloc.h> +static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) +{ +	unsigned long val, nval; + +	nval = *addr; +	do { +		val = nval; +		if (val & mask_to_set) +			return -EBUSY; +		cpu_relax(); +	} while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); + +	return 0; +} + +static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) +{ +	unsigned long val, nval; + +	nval = *addr; +	do { +		val = nval; +		if ((val & mask_to_clear) != mask_to_clear) +			return -EBUSY; +		cpu_relax(); +	} while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); + +	return 0; +} + +/* + * bitmap_set_ll - set the specified number of bits at the specified position + * @map: pointer to a bitmap + * @start: a bit position in @map + * @nr: number of bits to set + * + * Set @nr bits start from @start in @map lock-lessly. Several users + * can set/clear the same bitmap simultaneously without lock. If two + * users set the same bit, one user will return remain bits, otherwise + * return 0. 
+ */ +static int bitmap_set_ll(unsigned long *map, int start, int nr) +{ +	unsigned long *p = map + BIT_WORD(start); +	const int size = start + nr; +	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); +	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + +	while (nr - bits_to_set >= 0) { +		if (set_bits_ll(p, mask_to_set)) +			return nr; +		nr -= bits_to_set; +		bits_to_set = BITS_PER_LONG; +		mask_to_set = ~0UL; +		p++; +	} +	if (nr) { +		mask_to_set &= BITMAP_LAST_WORD_MASK(size); +		if (set_bits_ll(p, mask_to_set)) +			return nr; +	} + +	return 0; +} + +/* + * bitmap_clear_ll - clear the specified number of bits at the specified position + * @map: pointer to a bitmap + * @start: a bit position in @map + * @nr: number of bits to set + * + * Clear @nr bits start from @start in @map lock-lessly. Several users + * can set/clear the same bitmap simultaneously without lock. If two + * users clear the same bit, one user will return remain bits, + * otherwise return 0. + */ +static int bitmap_clear_ll(unsigned long *map, int start, int nr) +{ +	unsigned long *p = map + BIT_WORD(start); +	const int size = start + nr; +	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); +	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + +	while (nr - bits_to_clear >= 0) { +		if (clear_bits_ll(p, mask_to_clear)) +			return nr; +		nr -= bits_to_clear; +		bits_to_clear = BITS_PER_LONG; +		mask_to_clear = ~0UL; +		p++; +	} +	if (nr) { +		mask_to_clear &= BITMAP_LAST_WORD_MASK(size); +		if (clear_bits_ll(p, mask_to_clear)) +			return nr; +	} + +	return 0; +}  /**   * gen_pool_create - create a new special memory pool @@ -30,7 +149,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)  	pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);  	if (pool != NULL) { -		rwlock_init(&pool->lock); +		spin_lock_init(&pool->lock);  		INIT_LIST_HEAD(&pool->chunks);  		pool->min_alloc_order = min_alloc_order;  	} @@ -63,14 +182,14 @@ int 
gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy  	if (unlikely(chunk == NULL))  		return -ENOMEM; -	spin_lock_init(&chunk->lock);  	chunk->phys_addr = phys;  	chunk->start_addr = virt;  	chunk->end_addr = virt + size; +	atomic_set(&chunk->avail, size); -	write_lock(&pool->lock); -	list_add(&chunk->next_chunk, &pool->chunks); -	write_unlock(&pool->lock); +	spin_lock(&pool->lock); +	list_add_rcu(&chunk->next_chunk, &pool->chunks); +	spin_unlock(&pool->lock);  	return 0;  } @@ -85,19 +204,19 @@ EXPORT_SYMBOL(gen_pool_add_virt);   */  phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)  { -	struct list_head *_chunk;  	struct gen_pool_chunk *chunk; +	phys_addr_t paddr = -1; -	read_lock(&pool->lock); -	list_for_each(_chunk, &pool->chunks) { -		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); - -		if (addr >= chunk->start_addr && addr < chunk->end_addr) -			return chunk->phys_addr + addr - chunk->start_addr; +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { +		if (addr >= chunk->start_addr && addr < chunk->end_addr) { +			paddr = chunk->phys_addr + (addr - chunk->start_addr); +			break; +		}  	} -	read_unlock(&pool->lock); +	rcu_read_unlock(); -	return -1; +	return paddr;  }  EXPORT_SYMBOL(gen_pool_virt_to_phys); @@ -115,7 +234,6 @@ void gen_pool_destroy(struct gen_pool *pool)  	int order = pool->min_alloc_order;  	int bit, end_bit; -  	list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {  		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);  		list_del(&chunk->next_chunk); @@ -137,44 +255,50 @@ EXPORT_SYMBOL(gen_pool_destroy);   * @size: number of bytes to allocate from the pool   *   * Allocate the requested number of bytes from the specified pool. - * Uses a first-fit algorithm. + * Uses a first-fit algorithm. Can not be used in NMI handler on + * architectures without NMI-safe cmpxchg implementation.   
*/  unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)  { -	struct list_head *_chunk;  	struct gen_pool_chunk *chunk; -	unsigned long addr, flags; +	unsigned long addr = 0;  	int order = pool->min_alloc_order; -	int nbits, start_bit, end_bit; +	int nbits, start_bit = 0, end_bit, remain; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif  	if (size == 0)  		return 0;  	nbits = (size + (1UL << order) - 1) >> order; - -	read_lock(&pool->lock); -	list_for_each(_chunk, &pool->chunks) { -		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { +		if (size > atomic_read(&chunk->avail)) +			continue;  		end_bit = (chunk->end_addr - chunk->start_addr) >> order; - -		spin_lock_irqsave(&chunk->lock, flags); -		start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0, -						nbits, 0); -		if (start_bit >= end_bit) { -			spin_unlock_irqrestore(&chunk->lock, flags); +retry: +		start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, +						       start_bit, nbits, 0); +		if (start_bit >= end_bit)  			continue; +		remain = bitmap_set_ll(chunk->bits, start_bit, nbits); +		if (remain) { +			remain = bitmap_clear_ll(chunk->bits, start_bit, +						 nbits - remain); +			BUG_ON(remain); +			goto retry;  		}  		addr = chunk->start_addr + ((unsigned long)start_bit << order); - -		bitmap_set(chunk->bits, start_bit, nbits); -		spin_unlock_irqrestore(&chunk->lock, flags); -		read_unlock(&pool->lock); -		return addr; +		size = nbits << order; +		atomic_sub(size, &chunk->avail); +		break;  	} -	read_unlock(&pool->lock); -	return 0; +	rcu_read_unlock(); +	return addr;  }  EXPORT_SYMBOL(gen_pool_alloc); @@ -184,33 +308,95 @@ EXPORT_SYMBOL(gen_pool_alloc);   * @addr: starting address of memory to free back to pool   * @size: size in bytes of memory to free   * - * Free previously allocated special memory back to the specified pool. 
+ * Free previously allocated special memory back to the specified + * pool.  Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation.   */  void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)  { -	struct list_head *_chunk;  	struct gen_pool_chunk *chunk; -	unsigned long flags;  	int order = pool->min_alloc_order; -	int bit, nbits; +	int start_bit, nbits, remain; -	nbits = (size + (1UL << order) - 1) >> order; - -	read_lock(&pool->lock); -	list_for_each(_chunk, &pool->chunks) { -		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif +	nbits = (size + (1UL << order) - 1) >> order; +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {  		if (addr >= chunk->start_addr && addr < chunk->end_addr) {  			BUG_ON(addr + size > chunk->end_addr); -			spin_lock_irqsave(&chunk->lock, flags); -			bit = (addr - chunk->start_addr) >> order; -			while (nbits--) -				__clear_bit(bit++, chunk->bits); -			spin_unlock_irqrestore(&chunk->lock, flags); -			break; +			start_bit = (addr - chunk->start_addr) >> order; +			remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); +			BUG_ON(remain); +			size = nbits << order; +			atomic_add(size, &chunk->avail); +			rcu_read_unlock(); +			return;  		}  	} -	BUG_ON(nbits > 0); -	read_unlock(&pool->lock); +	rcu_read_unlock(); +	BUG();  }  EXPORT_SYMBOL(gen_pool_free); + +/** + * gen_pool_for_each_chunk - call func for every chunk of generic memory pool + * @pool:	the generic memory pool + * @func:	func to call + * @data:	additional data used by @func + * + * Call @func for every chunk of generic memory pool.  The @func is + * called with rcu_read_lock held. 
+ */ +void gen_pool_for_each_chunk(struct gen_pool *pool, +	void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data), +	void *data) +{ +	struct gen_pool_chunk *chunk; + +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) +		func(pool, chunk, data); +	rcu_read_unlock(); +} +EXPORT_SYMBOL(gen_pool_for_each_chunk); + +/** + * gen_pool_avail - get available free space of the pool + * @pool: pool to get available free space + * + * Return available free space of the specified pool. + */ +size_t gen_pool_avail(struct gen_pool *pool) +{ +	struct gen_pool_chunk *chunk; +	size_t avail = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) +		avail += atomic_read(&chunk->avail); +	rcu_read_unlock(); +	return avail; +} +EXPORT_SYMBOL_GPL(gen_pool_avail); + +/** + * gen_pool_size - get size in bytes of memory managed by the pool + * @pool: pool to get size + * + * Return size in bytes of memory managed by the pool. + */ +size_t gen_pool_size(struct gen_pool *pool) +{ +	struct gen_pool_chunk *chunk; +	size_t size = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) +		size += chunk->end_addr - chunk->start_addr; +	rcu_read_unlock(); +	return size; +} +EXPORT_SYMBOL_GPL(gen_pool_size); diff --git a/lib/idr.c b/lib/idr.c index e15502e8b21e..db040ce3fa73 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -34,8 +34,10 @@  #include <linux/err.h>  #include <linux/string.h>  #include <linux/idr.h> +#include <linux/spinlock.h>  static struct kmem_cache *idr_layer_cache; +static DEFINE_SPINLOCK(simple_ida_lock);  static struct idr_layer *get_from_free_list(struct idr *idp)  { @@ -926,6 +928,71 @@ void ida_destroy(struct ida *ida)  EXPORT_SYMBOL(ida_destroy);  /** + * ida_simple_get - get a new id. + * @ida: the (initialized) ida. 
+ * @start: the minimum id (inclusive, < 0x8000000) + * @end: the maximum id (exclusive, < 0x8000000 or 0) + * @gfp_mask: memory allocation flags + * + * Allocates an id in the range start <= id < end, or returns -ENOSPC. + * On memory allocation failure, returns -ENOMEM. + * + * Use ida_simple_remove() to get rid of an id. + */ +int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, +		   gfp_t gfp_mask) +{ +	int ret, id; +	unsigned int max; + +	BUG_ON((int)start < 0); +	BUG_ON((int)end < 0); + +	if (end == 0) +		max = 0x80000000; +	else { +		BUG_ON(end < start); +		max = end - 1; +	} + +again: +	if (!ida_pre_get(ida, gfp_mask)) +		return -ENOMEM; + +	spin_lock(&simple_ida_lock); +	ret = ida_get_new_above(ida, start, &id); +	if (!ret) { +		if (id > max) { +			ida_remove(ida, id); +			ret = -ENOSPC; +		} else { +			ret = id; +		} +	} +	spin_unlock(&simple_ida_lock); + +	if (unlikely(ret == -EAGAIN)) +		goto again; + +	return ret; +} +EXPORT_SYMBOL(ida_simple_get); + +/** + * ida_simple_remove - remove an allocated id. + * @ida: the (initialized) ida. + * @id: the id returned by ida_simple_get. + */ +void ida_simple_remove(struct ida *ida, unsigned int id) +{ +	BUG_ON((int)id < 0); +	spin_lock(&simple_ida_lock); +	ida_remove(ida, id); +	spin_unlock(&simple_ida_lock); +} +EXPORT_SYMBOL(ida_simple_remove); + +/**   * ida_init - initialize ida handle   * @ida:	ida handle   * diff --git a/lib/llist.c b/lib/llist.c new file mode 100644 index 000000000000..da445724fa1f --- /dev/null +++ b/lib/llist.c @@ -0,0 +1,129 @@ +/* + * Lock-less NULL terminated single linked list + * + * The basic atomic operation of this list is cmpxchg on long.  On + * architectures that don't have NMI-safe cmpxchg implementation, the + * list can NOT be used in NMI handler.  So code uses the list in NMI + * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. + * + * Copyright 2010,2011 Intel Corp. 
+ *   Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/llist.h> + +#include <asm/system.h> + +/** + * llist_add - add a new entry + * @new:	new entry to be added + * @head:	the head for your lock-less list + */ +void llist_add(struct llist_node *new, struct llist_head *head) +{ +	struct llist_node *entry, *old_entry; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif + +	entry = head->first; +	do { +		old_entry = entry; +		new->next = entry; +		cpu_relax(); +	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry); +} +EXPORT_SYMBOL_GPL(llist_add); + +/** + * llist_add_batch - add several linked entries in batch + * @new_first:	first entry in batch to be added + * @new_last:	last entry in batch to be added + * @head:	the head for your lock-less list + */ +void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, +		     struct llist_head *head) +{ +	struct llist_node *entry, *old_entry; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif + +	entry = head->first; +	do { +		old_entry = entry; +		new_last->next = entry; +		cpu_relax(); +	} while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); +} 
+EXPORT_SYMBOL_GPL(llist_add_batch); + +/** + * llist_del_first - delete the first entry of lock-less list + * @head:	the head for your lock-less list + * + * If list is empty, return NULL, otherwise, return the first entry + * deleted, this is the newest added one. + * + * Only one llist_del_first user can be used simultaneously with + * multiple llist_add users without lock.  Because otherwise + * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, + * llist_add) sequence in another user may change @head->first->next, + * but keep @head->first.  If multiple consumers are needed, please + * use llist_del_all or use lock between consumers. + */ +struct llist_node *llist_del_first(struct llist_head *head) +{ +	struct llist_node *entry, *old_entry, *next; + +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif + +	entry = head->first; +	do { +		if (entry == NULL) +			return NULL; +		old_entry = entry; +		next = entry->next; +		cpu_relax(); +	} while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); + +	return entry; +} +EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_del_all - delete all entries from lock-less list + * @head:	the head of lock-less list to delete all entries + * + * If list is empty, return NULL, otherwise, delete all entries and + * return the pointer to the first entry.  The order of entries + * deleted is from the newest to the oldest added one. 
+ */ +struct llist_node *llist_del_all(struct llist_head *head) +{ +#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG +	BUG_ON(in_nmi()); +#endif + +	return xchg(&head->first, NULL); +} +EXPORT_SYMBOL_GPL(llist_del_all); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7ea2e033d715..a2f9da59c197 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -823,8 +823,8 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root,  EXPORT_SYMBOL(radix_tree_prev_hole);  static unsigned int -__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, -	unsigned int max_items, unsigned long *next_index) +__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices, +	unsigned long index, unsigned int max_items, unsigned long *next_index)  {  	unsigned int nr_found = 0;  	unsigned int shift, height; @@ -857,12 +857,16 @@ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,  	/* Bottom level: grab some items */  	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { -		index++;  		if (slot->slots[i]) { -			results[nr_found++] = &(slot->slots[i]); -			if (nr_found == max_items) +			results[nr_found] = &(slot->slots[i]); +			if (indices) +				indices[nr_found] = index; +			if (++nr_found == max_items) { +				index++;  				goto out; +			}  		} +		index++;  	}  out:  	*next_index = index; @@ -918,8 +922,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,  		if (cur_index > max_index)  			break; -		slots_found = __lookup(node, (void ***)results + ret, cur_index, -					max_items - ret, &next_index); +		slots_found = __lookup(node, (void ***)results + ret, NULL, +				cur_index, max_items - ret, &next_index);  		nr_found = 0;  		for (i = 0; i < slots_found; i++) {  			struct radix_tree_node *slot; @@ -944,6 +948,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);   *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree   *	@root:		radix tree root   *	@results:	where the 
results of the lookup are placed + *	@indices:	where their indices should be placed (but usually NULL)   *	@first_index:	start the lookup from this key   *	@max_items:	place up to this many items at *results   * @@ -958,7 +963,8 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);   *	protection, radix_tree_deref_slot may fail requiring a retry.   */  unsigned int -radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, +radix_tree_gang_lookup_slot(struct radix_tree_root *root, +			void ***results, unsigned long *indices,  			unsigned long first_index, unsigned int max_items)  {  	unsigned long max_index; @@ -974,6 +980,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,  		if (first_index > 0)  			return 0;  		results[0] = (void **)&root->rnode; +		if (indices) +			indices[0] = 0;  		return 1;  	}  	node = indirect_to_ptr(node); @@ -987,8 +995,9 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,  		if (cur_index > max_index)  			break; -		slots_found = __lookup(node, results + ret, cur_index, -					max_items - ret, &next_index); +		slots_found = __lookup(node, results + ret, +				indices ? indices + ret : NULL, +				cur_index, max_items - ret, &next_index);  		ret += slots_found;  		if (next_index == 0)  			break; @@ -1194,6 +1203,98 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,  }  EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); +#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) +#include <linux/sched.h> /* for cond_resched() */ + +/* + * This linear search is at present only useful to shmem_unuse_inode(). 
+ */ +static unsigned long __locate(struct radix_tree_node *slot, void *item, +			      unsigned long index, unsigned long *found_index) +{ +	unsigned int shift, height; +	unsigned long i; + +	height = slot->height; +	shift = (height-1) * RADIX_TREE_MAP_SHIFT; + +	for ( ; height > 1; height--) { +		i = (index >> shift) & RADIX_TREE_MAP_MASK; +		for (;;) { +			if (slot->slots[i] != NULL) +				break; +			index &= ~((1UL << shift) - 1); +			index += 1UL << shift; +			if (index == 0) +				goto out;	/* 32-bit wraparound */ +			i++; +			if (i == RADIX_TREE_MAP_SIZE) +				goto out; +		} + +		shift -= RADIX_TREE_MAP_SHIFT; +		slot = rcu_dereference_raw(slot->slots[i]); +		if (slot == NULL) +			goto out; +	} + +	/* Bottom level: check items */ +	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { +		if (slot->slots[i] == item) { +			*found_index = index + i; +			index = 0; +			goto out; +		} +	} +	index += RADIX_TREE_MAP_SIZE; +out: +	return index; +} + +/** + *	radix_tree_locate_item - search through radix tree for item + *	@root:		radix tree root + *	@item:		item to be found + * + *	Returns index where item was found, or -1 if not found. + *	Caller must hold no lock (since this time-consuming function needs + *	to be preemptible), and must check afterwards if item is still there. 
+ */ +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ +	struct radix_tree_node *node; +	unsigned long max_index; +	unsigned long cur_index = 0; +	unsigned long found_index = -1; + +	do { +		rcu_read_lock(); +		node = rcu_dereference_raw(root->rnode); +		if (!radix_tree_is_indirect_ptr(node)) { +			rcu_read_unlock(); +			if (node == item) +				found_index = 0; +			break; +		} + +		node = indirect_to_ptr(node); +		max_index = radix_tree_maxindex(node->height); +		if (cur_index > max_index) { +			/* Leaving the loop: drop the RCU read lock taken at the top of this pass */ +			rcu_read_unlock(); +			break; +		} + +		cur_index = __locate(node, item, cur_index, &found_index); +		rcu_read_unlock(); +		cond_resched(); +	} while (cur_index != 0 && cur_index <= max_index); + +	return found_index; +} +#else +unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) +{ +	return -1; +} +#endif /* CONFIG_SHMEM && CONFIG_SWAP */  /**   *	radix_tree_shrink    -    shrink height of a radix tree to minimal diff --git a/mm/failslab.c b/mm/failslab.c index 1ce58c201dca..0dd7b8fec71c 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab);  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS  static int __init failslab_debugfs_init(void)  { +	struct dentry *dir;  	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; -	int err; -	err = init_fault_attr_dentries(&failslab.attr, "failslab"); -	if (err) -		return err; +	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); +	if (IS_ERR(dir)) +		return PTR_ERR(dir); -	if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir, +	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,  				&failslab.ignore_gfp_wait))  		goto fail; -	if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir, +	if (!debugfs_create_bool("cache-filter", mode, dir,  				&failslab.cache_filter))  		goto fail;  	return 0;  fail: -	cleanup_fault_attr_dentries(&failslab.attr); +	debugfs_remove_recursive(dir);  	return -ENOMEM;  } diff --git a/mm/filemap.c b/mm/filemap.c 
index 867d40222ec7..645a080ba4df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,7 +33,6 @@  #include <linux/cpuset.h>  #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */  #include <linux/memcontrol.h> -#include <linux/mm_inline.h> /* for page_is_file_cache() */  #include <linux/cleancache.h>  #include "internal.h" @@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,  	int error;  	VM_BUG_ON(!PageLocked(page)); +	VM_BUG_ON(PageSwapBacked(page));  	error = mem_cgroup_cache_charge(page, current->mm,  					gfp_mask & GFP_RECLAIM_MASK); @@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,  		if (likely(!error)) {  			mapping->nrpages++;  			__inc_zone_page_state(page, NR_FILE_PAGES); -			if (PageSwapBacked(page)) -				__inc_zone_page_state(page, NR_SHMEM);  			spin_unlock_irq(&mapping->tree_lock);  		} else {  			page->mapping = NULL; @@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,  {  	int ret; -	/* -	 * Splice_read and readahead add shmem/tmpfs pages into the page cache -	 * before shmem_readpage has a chance to mark them as SwapBacked: they -	 * need to go on the anon lru below, and mem_cgroup_cache_charge -	 * (called in add_to_page_cache) needs to know where they're going too. 
-	 */ -	if (mapping_cap_swap_backed(mapping)) -		SetPageSwapBacked(page); -  	ret = add_to_page_cache(page, mapping, offset, gfp_mask); -	if (ret == 0) { -		if (page_is_file_cache(page)) -			lru_cache_add_file(page); -		else -			lru_cache_add_anon(page); -	} +	if (ret == 0) +		lru_cache_add_file(page);  	return ret;  }  EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -714,9 +699,16 @@ repeat:  		page = radix_tree_deref_slot(pagep);  		if (unlikely(!page))  			goto out; -		if (radix_tree_deref_retry(page)) -			goto repeat; - +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) +				goto repeat; +			/* +			 * Otherwise, shmem/tmpfs must be storing a swap entry +			 * here as an exceptional entry: so return it without +			 * attempting to raise page count. +			 */ +			goto out; +		}  		if (!page_cache_get_speculative(page))  			goto repeat; @@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)  repeat:  	page = find_get_page(mapping, offset); -	if (page) { +	if (page && !radix_tree_exception(page)) {  		lock_page(page);  		/* Has the page been truncated? */  		if (unlikely(page->mapping != mapping)) { @@ -840,7 +832,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,  	rcu_read_lock();  restart:  	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, -				(void ***)pages, start, nr_pages); +				(void ***)pages, NULL, start, nr_pages);  	ret = 0;  	for (i = 0; i < nr_found; i++) {  		struct page *page; @@ -849,13 +841,22 @@ repeat:  		if (unlikely(!page))  			continue; -		/* -		 * This can only trigger when the entry at index 0 moves out -		 * of or back to the root: none yet gotten, safe to restart. 
-		 */ -		if (radix_tree_deref_retry(page)) { -			WARN_ON(start | i); -			goto restart; +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) { +				/* +				 * Transient condition which can only trigger +				 * when entry at index 0 moves out of or back +				 * to root: none yet gotten, safe to restart. +				 */ +				WARN_ON(start | i); +				goto restart; +			} +			/* +			 * Otherwise, shmem/tmpfs must be storing a swap entry +			 * here as an exceptional entry: so skip over it - +			 * we only reach this from invalidate_mapping_pages(). +			 */ +			continue;  		}  		if (!page_cache_get_speculative(page)) @@ -903,7 +904,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,  	rcu_read_lock();  restart:  	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, -				(void ***)pages, index, nr_pages); +				(void ***)pages, NULL, index, nr_pages);  	ret = 0;  	for (i = 0; i < nr_found; i++) {  		struct page *page; @@ -912,12 +913,22 @@ repeat:  		if (unlikely(!page))  			continue; -		/* -		 * This can only trigger when the entry at index 0 moves out -		 * of or back to the root: none yet gotten, safe to restart. -		 */ -		if (radix_tree_deref_retry(page)) -			goto restart; +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) { +				/* +				 * Transient condition which can only trigger +				 * when entry at index 0 moves out of or back +				 * to root: none yet gotten, safe to restart. +				 */ +				goto restart; +			} +			/* +			 * Otherwise, shmem/tmpfs must be storing a swap entry +			 * here as an exceptional entry: so stop looking for +			 * contiguous pages. +			 */ +			break; +		}  		if (!page_cache_get_speculative(page))  			goto repeat; @@ -977,12 +988,21 @@ repeat:  		if (unlikely(!page))  			continue; -		/* -		 * This can only trigger when the entry at index 0 moves out -		 * of or back to the root: none yet gotten, safe to restart. 
-		 */ -		if (radix_tree_deref_retry(page)) -			goto restart; +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) { +				/* +				 * Transient condition which can only trigger +				 * when entry at index 0 moves out of or back +				 * to root: none yet gotten, safe to restart. +				 */ +				goto restart; +			} +			/* +			 * This function is never used on a shmem/tmpfs +			 * mapping, so a swap entry won't be found here. +			 */ +			BUG(); +		}  		if (!page_cache_get_speculative(page))  			goto repeat; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f84d2351ddb..f4ec4e7ca4cd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@  #include <linux/limits.h>  #include <linux/mutex.h>  #include <linux/rbtree.h> -#include <linux/shmem_fs.h>  #include <linux/slab.h>  #include <linux/swap.h>  #include <linux/swapops.h> @@ -2873,30 +2872,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  		return 0;  	if (PageCompound(page))  		return 0; -	/* -	 * Corner case handling. This is called from add_to_page_cache() -	 * in usual. But some FS (shmem) precharges this page before calling it -	 * and call add_to_page_cache() with GFP_NOWAIT. -	 * -	 * For GFP_NOWAIT case, the page may be pre-charged before calling -	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call -	 * charge twice. (It works but has to pay a bit larger cost.) -	 * And when the page is SwapCache, it should take swap information -	 * into account. This is under lock_page() now. 
-	 */ -	if (!(gfp_mask & __GFP_WAIT)) { -		struct page_cgroup *pc; - -		pc = lookup_page_cgroup(page); -		if (!pc) -			return 0; -		lock_page_cgroup(pc); -		if (PageCgroupUsed(pc)) { -			unlock_page_cgroup(pc); -			return 0; -		} -		unlock_page_cgroup(pc); -	}  	if (unlikely(!mm))  		mm = &init_mm; @@ -3486,31 +3461,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,  	cgroup_release_and_wakeup_rmdir(&mem->css);  } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. - * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, -			    struct mm_struct *mm, -			    gfp_t gfp_mask) -{ -	struct mem_cgroup *mem; -	int ret; - -	if (mem_cgroup_disabled()) -		return 0; - -	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); -	if (!ret) -		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - -	return ret; -} -  #ifdef CONFIG_DEBUG_VM  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)  { @@ -5330,15 +5280,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,  		pgoff = pte_to_pgoff(ptent);  	/* page is moved even if it's not RSS of this task(page-faulted). */ -	if (!mapping_cap_swap_backed(mapping)) { /* normal file */ -		page = find_get_page(mapping, pgoff); -	} else { /* shmem/tmpfs file. we should take account of swap too. */ -		swp_entry_t ent; -		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); +	page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP +	/* shmem/tmpfs may report page out on swap: account for that too. 
*/ +	if (radix_tree_exceptional_entry(page)) { +		swp_entry_t swap = radix_to_swp_entry(page);  		if (do_swap_account) -			entry->val = ent.val; +			*entry = swap; +		page = find_get_page(&swapper_space, swap.val);  	} - +#endif  	return page;  } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..2b43ba051ac9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -53,6 +53,7 @@  #include <linux/hugetlb.h>  #include <linux/memory_hotplug.h>  #include <linux/mm_inline.h> +#include <linux/kfifo.h>  #include "internal.h"  int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)  	__memory_failure(pfn, trapno, 0);  } +#define MEMORY_FAILURE_FIFO_ORDER	4 +#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { +	unsigned long pfn; +	int trapno; +	int flags; +}; + +struct memory_failure_cpu { +	DECLARE_KFIFO(fifo, struct memory_failure_entry, +		      MEMORY_FAILURE_FIFO_SIZE); +	spinlock_t lock; +	struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); + +/** + * memory_failure_queue - Schedule handling memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling + * + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Can run in IRQ context. 
+ */ +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ +	struct memory_failure_cpu *mf_cpu; +	unsigned long proc_flags; +	struct memory_failure_entry entry = { +		.pfn =		pfn, +		.trapno =	trapno, +		.flags =	flags, +	}; + +	mf_cpu = &get_cpu_var(memory_failure_cpu); +	spin_lock_irqsave(&mf_cpu->lock, proc_flags); +	if (kfifo_put(&mf_cpu->fifo, &entry)) +		schedule_work_on(smp_processor_id(), &mf_cpu->work); +	else +		/* %#lx already emits the "0x" prefix, so none is written literally */ +		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", +		       pfn); +	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); +	put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ +	struct memory_failure_cpu *mf_cpu; +	struct memory_failure_entry entry = { 0, }; +	unsigned long proc_flags; +	int gotten; + +	mf_cpu = &__get_cpu_var(memory_failure_cpu); +	for (;;) { +		spin_lock_irqsave(&mf_cpu->lock, proc_flags); +		gotten = kfifo_get(&mf_cpu->fifo, &entry); +		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); +		if (!gotten) +			break; +		__memory_failure(entry.pfn, entry.trapno, entry.flags); +	} +} + +static int __init memory_failure_init(void) +{ +	struct memory_failure_cpu *mf_cpu; +	int cpu; + +	for_each_possible_cpu(cpu) { +		mf_cpu = &per_cpu(memory_failure_cpu, cpu); +		spin_lock_init(&mf_cpu->lock); +		INIT_KFIFO(mf_cpu->fifo); +		INIT_WORK(&mf_cpu->work, memory_failure_work_func); +	} + +	return 0; +} +core_initcall(memory_failure_init); +  /**   * unpoison_memory - Unpoison a previously poisoned page   * @pfn: Page number of the to be unpoisoned page diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c76..636a86876ff2 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)  	 * file will not get a swp_entry_t in its pte, but rather it is like  	 * any other file mapping (ie. 
marked !present and faulted in with  	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. -	 * -	 * However when tmpfs moves the page from pagecache and into swapcache, -	 * it is still in core, but the find_get_page below won't find it. -	 * No big deal, but make a note of it.  	 */  	page = find_get_page(mapping, pgoff); +#ifdef CONFIG_SWAP +	/* shmem/tmpfs may return swap: account for swapcache page too. */ +	if (radix_tree_exceptional_entry(page)) { +		swp_entry_t swap = radix_to_swp_entry(page); +		page = find_get_page(&swapper_space, swap.val); +	} +#endif  	if (page) {  		present = PageUptodate(page);  		page_cache_release(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1dbcf8888f14..6e8ecb6e021c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1409,14 +1409,11 @@ static int __init fail_page_alloc_debugfs(void)  {  	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;  	struct dentry *dir; -	int err; -	err = init_fault_attr_dentries(&fail_page_alloc.attr, -				       "fail_page_alloc"); -	if (err) -		return err; - -	dir = fail_page_alloc.attr.dir; +	dir = fault_create_debugfs_attr("fail_page_alloc", NULL, +					&fail_page_alloc.attr); +	if (IS_ERR(dir)) +		return PTR_ERR(dir);  	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,  				&fail_page_alloc.ignore_gfp_wait)) @@ -1430,7 +1427,7 @@ static int __init fail_page_alloc_debugfs(void)  	return 0;  fail: -	cleanup_fault_attr_dentries(&fail_page_alloc.attr); +	debugfs_remove_recursive(dir);  	return -ENOMEM;  } diff --git a/mm/shmem.c b/mm/shmem.c index 5cc21f8b4cd3..32f6763f16fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -6,7 +6,8 @@   *		 2000-2001 Christoph Rohland   *		 2000-2001 SAP AG   *		 2002 Red Hat Inc. - * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2011 Hugh Dickins. + * Copyright (C) 2011 Google Inc.   * Copyright (C) 2002-2005 VERITAS Software Corporation.   
* Copyright (C) 2004 Andi Kleen, SuSE Labs   * @@ -28,7 +29,6 @@  #include <linux/file.h>  #include <linux/mm.h>  #include <linux/module.h> -#include <linux/percpu_counter.h>  #include <linux/swap.h>  static struct vfsmount *shm_mnt; @@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt;  #include <linux/shmem_fs.h>  #include <linux/writeback.h>  #include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/percpu_counter.h>  #include <linux/splice.h>  #include <linux/security.h>  #include <linux/swapops.h> @@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt;  #include <linux/magic.h>  #include <asm/uaccess.h> -#include <asm/div64.h>  #include <asm/pgtable.h> -/* - * The maximum size of a shmem/tmpfs file is limited by the maximum size of - * its triple-indirect swap vector - see illustration at shmem_swp_entry(). - * - * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, - * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum - * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, - * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. - * - * We use / and * instead of shifts in the definitions below, so that the swap - * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. 
- */ -#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) - -#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) - -#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) -#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) -  #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)  #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) -/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ -#define SHMEM_PAGEIN	 VM_READ -#define SHMEM_TRUNCATE	 VM_WRITE - -/* Definition to limit shmem_truncate's steps between cond_rescheds */ -#define LATENCY_LIMIT	 64 -  /* Pretend that each entry is of this size in directory's i_size */  #define BOGO_DIRENT_SIZE 20 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 +  struct shmem_xattr {  	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */  	char *name;		/* xattr name */ @@ -107,7 +83,7 @@ struct shmem_xattr {  	char value[0];  }; -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +/* Flag allocation requirements to shmem_getpage */  enum sgp_type {  	SGP_READ,	/* don't exceed i_size, don't allocate page */  	SGP_CACHE,	/* don't exceed i_size, may allocate page */ @@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,  			mapping_gfp_mask(inode->i_mapping), fault_type);  } -static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) -{ -	/* -	 * The above definition of ENTRIES_PER_PAGE, and the use of -	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: -	 * might be reconsidered if it ever diverges from PAGE_SIZE. 
-	 * -	 * Mobility flags are masked out as swap vectors cannot move -	 */ -	return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, -				PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static inline void shmem_dir_free(struct page *page) -{ -	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static struct page **shmem_dir_map(struct page *page) -{ -	return (struct page **)kmap_atomic(page, KM_USER0); -} - -static inline void shmem_dir_unmap(struct page **dir) -{ -	kunmap_atomic(dir, KM_USER0); -} - -static swp_entry_t *shmem_swp_map(struct page *page) -{ -	return (swp_entry_t *)kmap_atomic(page, KM_USER1); -} - -static inline void shmem_swp_balance_unmap(void) -{ -	/* -	 * When passing a pointer to an i_direct entry, to code which -	 * also handles indirect entries and so will shmem_swp_unmap, -	 * we must arrange for the preempt count to remain in balance. -	 * What kmap_atomic of a lowmem page does depends on config -	 * and architecture, so pretend to kmap_atomic some lowmem page. -	 */ -	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1); -} - -static inline void shmem_swp_unmap(swp_entry_t *entry) -{ -	kunmap_atomic(entry, KM_USER1); -} -  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)  {  	return sb->s_fs_info; @@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {  static LIST_HEAD(shmem_swaplist);  static DEFINE_MUTEX(shmem_swaplist_mutex); -static void shmem_free_blocks(struct inode *inode, long pages) -{ -	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); -	if (sbinfo->max_blocks) { -		percpu_counter_add(&sbinfo->used_blocks, -pages); -		inode->i_blocks -= pages*BLOCKS_PER_PAGE; -	} -} -  static int shmem_reserve_inode(struct super_block *sb)  {  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb); @@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)  }  /** - * shmem_recalc_inode - recalculate the size of an inode + * shmem_recalc_inode - recalculate the block usage of an inode   * 
@inode: inode to recalc   *   * We have to calculate the free blocks since the mm can drop @@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)  	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;  	if (freed > 0) { +		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); +		if (sbinfo->max_blocks) +			percpu_counter_add(&sbinfo->used_blocks, -freed);  		info->alloced -= freed; +		inode->i_blocks -= freed * BLOCKS_PER_PAGE;  		shmem_unacct_blocks(info->flags, freed); -		shmem_free_blocks(inode, freed);  	}  } -/** - * shmem_swp_entry - find the swap vector position in the info structure - * @info:  info structure for the inode - * @index: index of the page to find - * @page:  optional page to add to the structure. Has to be preset to - *         all zeros - * - * If there is no space allocated yet it will return NULL when - * page is NULL, else it will use the page for the needed block, - * setting it to NULL on return to indicate that it has been used. - * - * The swap vector is organized the following way: - * - * There are SHMEM_NR_DIRECT entries directly stored in the - * shmem_inode_info structure. So small files do not need an addional - * allocation. - * - * For pages with index > SHMEM_NR_DIRECT there is the pointer - * i_indirect which points to a page which holds in the first half - * doubly indirect blocks, in the second half triple indirect blocks: - * - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the - * following layout (for SHMEM_NR_DIRECT == 16): - * - * i_indirect -> dir --> 16-19 - * 	      |	     +-> 20-23 - * 	      | - * 	      +-->dir2 --> 24-27 - * 	      |	       +-> 28-31 - * 	      |	       +-> 32-35 - * 	      |	       +-> 36-39 - * 	      | - * 	      +-->dir3 --> 40-43 - * 	       	       +-> 44-47 - * 	      	       +-> 48-51 - * 	      	       +-> 52-55 +/* + * Replace item expected in radix tree by a new item, while holding tree lock.   
*/ -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) -{ -	unsigned long offset; -	struct page **dir; -	struct page *subdir; - -	if (index < SHMEM_NR_DIRECT) { -		shmem_swp_balance_unmap(); -		return info->i_direct+index; -	} -	if (!info->i_indirect) { -		if (page) { -			info->i_indirect = *page; -			*page = NULL; -		} -		return NULL;			/* need another page */ -	} - -	index -= SHMEM_NR_DIRECT; -	offset = index % ENTRIES_PER_PAGE; -	index /= ENTRIES_PER_PAGE; -	dir = shmem_dir_map(info->i_indirect); - -	if (index >= ENTRIES_PER_PAGE/2) { -		index -= ENTRIES_PER_PAGE/2; -		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; -		index %= ENTRIES_PER_PAGE; -		subdir = *dir; -		if (!subdir) { -			if (page) { -				*dir = *page; -				*page = NULL; -			} -			shmem_dir_unmap(dir); -			return NULL;		/* need another page */ -		} -		shmem_dir_unmap(dir); -		dir = shmem_dir_map(subdir); -	} +static int shmem_radix_tree_replace(struct address_space *mapping, +			pgoff_t index, void *expected, void *replacement) +{ +	void **pslot; +	void *item = NULL; + +	VM_BUG_ON(!expected); +	pslot = radix_tree_lookup_slot(&mapping->page_tree, index); +	if (pslot) +		item = radix_tree_deref_slot_protected(pslot, +							&mapping->tree_lock); +	if (item != expected) +		return -ENOENT; +	if (replacement) +		radix_tree_replace_slot(pslot, replacement); +	else +		radix_tree_delete(&mapping->page_tree, index); +	return 0; +} -	dir += index; -	subdir = *dir; -	if (!subdir) { -		if (!page || !(subdir = *page)) { -			shmem_dir_unmap(dir); -			return NULL;		/* need a page */ +/* + * Like add_to_page_cache_locked, but error if expected item has gone. 
+ */ +static int shmem_add_to_page_cache(struct page *page, +				   struct address_space *mapping, +				   pgoff_t index, gfp_t gfp, void *expected) +{ +	int error = 0; + +	VM_BUG_ON(!PageLocked(page)); +	VM_BUG_ON(!PageSwapBacked(page)); + +	if (!expected) +		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); +	if (!error) { +		page_cache_get(page); +		page->mapping = mapping; +		page->index = index; + +		spin_lock_irq(&mapping->tree_lock); +		if (!expected) +			error = radix_tree_insert(&mapping->page_tree, +							index, page); +		else +			error = shmem_radix_tree_replace(mapping, index, +							expected, page); +		if (!error) { +			mapping->nrpages++; +			__inc_zone_page_state(page, NR_FILE_PAGES); +			__inc_zone_page_state(page, NR_SHMEM); +			spin_unlock_irq(&mapping->tree_lock); +		} else { +			page->mapping = NULL; +			spin_unlock_irq(&mapping->tree_lock); +			page_cache_release(page);  		} -		*dir = subdir; -		*page = NULL; +		if (!expected) +			radix_tree_preload_end();  	} -	shmem_dir_unmap(dir); -	return shmem_swp_map(subdir) + offset; +	if (error) +		mem_cgroup_uncharge_cache_page(page); +	return error;  } -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +/* + * Like delete_from_page_cache, but substitutes swap for page. + */ +static void shmem_delete_from_page_cache(struct page *page, void *radswap)  { -	long incdec = value? 
1: -1; +	struct address_space *mapping = page->mapping; +	int error; -	entry->val = value; -	info->swapped += incdec; -	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { -		struct page *page = kmap_atomic_to_page(entry); -		set_page_private(page, page_private(page) + incdec); -	} +	spin_lock_irq(&mapping->tree_lock); +	error = shmem_radix_tree_replace(mapping, page->index, page, radswap); +	page->mapping = NULL; +	mapping->nrpages--; +	__dec_zone_page_state(page, NR_FILE_PAGES); +	__dec_zone_page_state(page, NR_SHMEM); +	spin_unlock_irq(&mapping->tree_lock); +	page_cache_release(page); +	BUG_ON(error);  } -/** - * shmem_swp_alloc - get the position of the swap entry for the page. - * @info:	info structure for the inode - * @index:	index of the page to find - * @sgp:	check and recheck i_size? skip allocation? - * @gfp:	gfp mask to use for any page allocation - * - * If the entry does not exist, allocate it. +/* + * Like find_get_pages, but collecting swap entries as well as pages.   */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, -			unsigned long index, enum sgp_type sgp, gfp_t gfp) -{ -	struct inode *inode = &info->vfs_inode; -	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); -	struct page *page = NULL; -	swp_entry_t *entry; - -	if (sgp != SGP_WRITE && -	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) -		return ERR_PTR(-EINVAL); - -	while (!(entry = shmem_swp_entry(info, index, &page))) { -		if (sgp == SGP_READ) -			return shmem_swp_map(ZERO_PAGE(0)); -		/* -		 * Test used_blocks against 1 less max_blocks, since we have 1 data -		 * page (and perhaps indirect index pages) yet to allocate: -		 * a waste to allocate index if we cannot allocate data. 
-		 */ -		if (sbinfo->max_blocks) { -			if (percpu_counter_compare(&sbinfo->used_blocks, -						sbinfo->max_blocks - 1) >= 0) -				return ERR_PTR(-ENOSPC); -			percpu_counter_inc(&sbinfo->used_blocks); -			inode->i_blocks += BLOCKS_PER_PAGE; +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, +					pgoff_t start, unsigned int nr_pages, +					struct page **pages, pgoff_t *indices) +{ +	unsigned int i; +	unsigned int ret; +	unsigned int nr_found; + +	rcu_read_lock(); +restart: +	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, +				(void ***)pages, indices, start, nr_pages); +	ret = 0; +	for (i = 0; i < nr_found; i++) { +		struct page *page; +repeat: +		page = radix_tree_deref_slot((void **)pages[i]); +		if (unlikely(!page)) +			continue; +		if (radix_tree_exception(page)) { +			if (radix_tree_deref_retry(page)) +				goto restart; +			/* +			 * Otherwise, we must be storing a swap entry +			 * here as an exceptional entry: so return it +			 * without attempting to raise page count. +			 */ +			goto export;  		} +		if (!page_cache_get_speculative(page)) +			goto repeat; -		spin_unlock(&info->lock); -		page = shmem_dir_alloc(gfp); -		spin_lock(&info->lock); - -		if (!page) { -			shmem_free_blocks(inode, 1); -			return ERR_PTR(-ENOMEM); -		} -		if (sgp != SGP_WRITE && -		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { -			entry = ERR_PTR(-EINVAL); -			break; +		/* Has the page moved? 
*/ +		if (unlikely(page != *((void **)pages[i]))) { +			page_cache_release(page); +			goto repeat;  		} -		if (info->next_index <= index) -			info->next_index = index + 1; -	} -	if (page) { -		/* another task gave its page, or truncated the file */ -		shmem_free_blocks(inode, 1); -		shmem_dir_free(page); -	} -	if (info->next_index <= index && !IS_ERR(entry)) -		info->next_index = index + 1; -	return entry; +export: +		indices[ret] = indices[i]; +		pages[ret] = page; +		ret++; +	} +	if (unlikely(!ret && nr_found)) +		goto restart; +	rcu_read_unlock(); +	return ret;  } -/** - * shmem_free_swp - free some swap entries in a directory - * @dir:        pointer to the directory - * @edir:       pointer after last entry of the directory - * @punch_lock: pointer to spinlock when needed for the holepunch case +/* + * Remove swap entry from radix tree, free the swap and its page cache.   */ -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, -						spinlock_t *punch_lock) -{ -	spinlock_t *punch_unlock = NULL; -	swp_entry_t *ptr; -	int freed = 0; - -	for (ptr = dir; ptr < edir; ptr++) { -		if (ptr->val) { -			if (unlikely(punch_lock)) { -				punch_unlock = punch_lock; -				punch_lock = NULL; -				spin_lock(punch_unlock); -				if (!ptr->val) -					continue; -			} -			free_swap_and_cache(*ptr); -			*ptr = (swp_entry_t){0}; -			freed++; -		} -	} -	if (punch_unlock) -		spin_unlock(punch_unlock); -	return freed; -} - -static int shmem_map_and_free_swp(struct page *subdir, int offset, -		int limit, struct page ***dir, spinlock_t *punch_lock) -{ -	swp_entry_t *ptr; -	int freed = 0; - -	ptr = shmem_swp_map(subdir); -	for (; offset < limit; offset += LATENCY_LIMIT) { -		int size = limit - offset; -		if (size > LATENCY_LIMIT) -			size = LATENCY_LIMIT; -		freed += shmem_free_swp(ptr+offset, ptr+offset+size, -							punch_lock); -		if (need_resched()) { -			shmem_swp_unmap(ptr); -			if (*dir) { -				shmem_dir_unmap(*dir); -				*dir = NULL; -			} -			cond_resched(); -			ptr = 
shmem_swp_map(subdir); -		} -	} -	shmem_swp_unmap(ptr); -	return freed; +static int shmem_free_swap(struct address_space *mapping, +			   pgoff_t index, void *radswap) +{ +	int error; + +	spin_lock_irq(&mapping->tree_lock); +	error = shmem_radix_tree_replace(mapping, index, radswap, NULL); +	spin_unlock_irq(&mapping->tree_lock); +	if (!error) +		free_swap_and_cache(radix_to_swp_entry(radswap)); +	return error;  } -static void shmem_free_pages(struct list_head *next) +/* + * Pagevec may contain swap entries, so shuffle up pages before releasing. + */ +static void shmem_pagevec_release(struct pagevec *pvec)  { -	struct page *page; -	int freed = 0; - -	do { -		page = container_of(next, struct page, lru); -		next = next->next; -		shmem_dir_free(page); -		freed++; -		if (freed >= LATENCY_LIMIT) { -			cond_resched(); -			freed = 0; -		} -	} while (next); +	int i, j; + +	for (i = 0, j = 0; i < pagevec_count(pvec); i++) { +		struct page *page = pvec->pages[i]; +		if (!radix_tree_exceptional_entry(page)) +			pvec->pages[j++] = page; +	} +	pvec->nr = j; +	pagevec_release(pvec);  } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +/* + * Remove range of pages and swap entries from radix tree, and free them. 
+ */ +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)  { +	struct address_space *mapping = inode->i_mapping;  	struct shmem_inode_info *info = SHMEM_I(inode); -	unsigned long idx; -	unsigned long size; -	unsigned long limit; -	unsigned long stage; -	unsigned long diroff; -	struct page **dir; -	struct page *topdir; -	struct page *middir; -	struct page *subdir; -	swp_entry_t *ptr; -	LIST_HEAD(pages_to_free); -	long nr_pages_to_free = 0; +	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); +	pgoff_t end = (lend >> PAGE_CACHE_SHIFT); +	struct pagevec pvec; +	pgoff_t indices[PAGEVEC_SIZE];  	long nr_swaps_freed = 0; -	int offset; -	int freed; -	int punch_hole; -	spinlock_t *needs_lock; -	spinlock_t *punch_lock; -	unsigned long upper_limit; +	pgoff_t index; +	int i; -	truncate_inode_pages_range(inode->i_mapping, start, end); +	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); -	inode->i_ctime = inode->i_mtime = CURRENT_TIME; -	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -	if (idx >= info->next_index) -		return; +	pagevec_init(&pvec, 0); +	index = start; +	while (index <= end) { +		pvec.nr = shmem_find_get_pages_and_swap(mapping, index, +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, +							pvec.pages, indices); +		if (!pvec.nr) +			break; +		mem_cgroup_uncharge_start(); +		for (i = 0; i < pagevec_count(&pvec); i++) { +			struct page *page = pvec.pages[i]; -	spin_lock(&info->lock); -	info->flags |= SHMEM_TRUNCATE; -	if (likely(end == (loff_t) -1)) { -		limit = info->next_index; -		upper_limit = SHMEM_MAX_INDEX; -		info->next_index = idx; -		needs_lock = NULL; -		punch_hole = 0; -	} else { -		if (end + 1 >= inode->i_size) {	/* we may free a little more */ -			limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> -							PAGE_CACHE_SHIFT; -			upper_limit = SHMEM_MAX_INDEX; -		} else { -			limit = (end + 1) >> PAGE_CACHE_SHIFT; -			upper_limit = 
limit; -		} -		needs_lock = &info->lock; -		punch_hole = 1; -	} +			index = indices[i]; +			if (index > end) +				break; + +			if (radix_tree_exceptional_entry(page)) { +				nr_swaps_freed += !shmem_free_swap(mapping, +								index, page); +				continue; +			} -	topdir = info->i_indirect; -	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { -		info->i_indirect = NULL; -		nr_pages_to_free++; -		list_add(&topdir->lru, &pages_to_free); +			if (!trylock_page(page)) +				continue; +			if (page->mapping == mapping) { +				VM_BUG_ON(PageWriteback(page)); +				truncate_inode_page(mapping, page); +			} +			unlock_page(page); +		} +		shmem_pagevec_release(&pvec); +		mem_cgroup_uncharge_end(); +		cond_resched(); +		index++;  	} -	spin_unlock(&info->lock); -	if (info->swapped && idx < SHMEM_NR_DIRECT) { -		ptr = info->i_direct; -		size = limit; -		if (size > SHMEM_NR_DIRECT) -			size = SHMEM_NR_DIRECT; -		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); +	if (partial) { +		struct page *page = NULL; +		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); +		if (page) { +			zero_user_segment(page, partial, PAGE_CACHE_SIZE); +			set_page_dirty(page); +			unlock_page(page); +			page_cache_release(page); +		}  	} -	/* -	 * If there are no indirect blocks or we are punching a hole -	 * below indirect blocks, nothing to be done. 
-	 */ -	if (!topdir || limit <= SHMEM_NR_DIRECT) -		goto done2; +	index = start; +	for ( ; ; ) { +		cond_resched(); +		pvec.nr = shmem_find_get_pages_and_swap(mapping, index, +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, +							pvec.pages, indices); +		if (!pvec.nr) { +			if (index == start) +				break; +			index = start; +			continue; +		} +		if (index == start && indices[0] > end) { +			shmem_pagevec_release(&pvec); +			break; +		} +		mem_cgroup_uncharge_start(); +		for (i = 0; i < pagevec_count(&pvec); i++) { +			struct page *page = pvec.pages[i]; -	/* -	 * The truncation case has already dropped info->lock, and we're safe -	 * because i_size and next_index have already been lowered, preventing -	 * access beyond.  But in the punch_hole case, we still need to take -	 * the lock when updating the swap directory, because there might be -	 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or -	 * shmem_writepage.  However, whenever we find we can remove a whole -	 * directory page (not at the misaligned start or end of the range), -	 * we first NULLify its pointer in the level above, and then have no -	 * need to take the lock when updating its contents: needs_lock and -	 * punch_lock (either pointing to info->lock or NULL) manage this. -	 */ +			index = indices[i]; +			if (index > end) +				break; -	upper_limit -= SHMEM_NR_DIRECT; -	limit -= SHMEM_NR_DIRECT; -	idx = (idx > SHMEM_NR_DIRECT)? 
(idx - SHMEM_NR_DIRECT): 0; -	offset = idx % ENTRIES_PER_PAGE; -	idx -= offset; - -	dir = shmem_dir_map(topdir); -	stage = ENTRIES_PER_PAGEPAGE/2; -	if (idx < ENTRIES_PER_PAGEPAGE/2) { -		middir = topdir; -		diroff = idx/ENTRIES_PER_PAGE; -	} else { -		dir += ENTRIES_PER_PAGE/2; -		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; -		while (stage <= idx) -			stage += ENTRIES_PER_PAGEPAGE; -		middir = *dir; -		if (*dir) { -			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % -				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; -			if (!diroff && !offset && upper_limit >= stage) { -				if (needs_lock) { -					spin_lock(needs_lock); -					*dir = NULL; -					spin_unlock(needs_lock); -					needs_lock = NULL; -				} else -					*dir = NULL; -				nr_pages_to_free++; -				list_add(&middir->lru, &pages_to_free); +			if (radix_tree_exceptional_entry(page)) { +				nr_swaps_freed += !shmem_free_swap(mapping, +								index, page); +				continue;  			} -			shmem_dir_unmap(dir); -			dir = shmem_dir_map(middir); -		} else { -			diroff = 0; -			offset = 0; -			idx = stage; -		} -	} -	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { -		if (unlikely(idx == stage)) { -			shmem_dir_unmap(dir); -			dir = shmem_dir_map(topdir) + -			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; -			while (!*dir) { -				dir++; -				idx += ENTRIES_PER_PAGEPAGE; -				if (idx >= limit) -					goto done1; -			} -			stage = idx + ENTRIES_PER_PAGEPAGE; -			middir = *dir; -			if (punch_hole) -				needs_lock = &info->lock; -			if (upper_limit >= stage) { -				if (needs_lock) { -					spin_lock(needs_lock); -					*dir = NULL; -					spin_unlock(needs_lock); -					needs_lock = NULL; -				} else -					*dir = NULL; -				nr_pages_to_free++; -				list_add(&middir->lru, &pages_to_free); +			lock_page(page); +			if (page->mapping == mapping) { +				VM_BUG_ON(PageWriteback(page)); +				truncate_inode_page(mapping, page);  			} -			shmem_dir_unmap(dir); -			cond_resched(); -			dir = shmem_dir_map(middir); -			diroff 
= 0; -		} -		punch_lock = needs_lock; -		subdir = dir[diroff]; -		if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { -			if (needs_lock) { -				spin_lock(needs_lock); -				dir[diroff] = NULL; -				spin_unlock(needs_lock); -				punch_lock = NULL; -			} else -				dir[diroff] = NULL; -			nr_pages_to_free++; -			list_add(&subdir->lru, &pages_to_free); -		} -		if (subdir && page_private(subdir) /* has swap entries */) { -			size = limit - idx; -			if (size > ENTRIES_PER_PAGE) -				size = ENTRIES_PER_PAGE; -			freed = shmem_map_and_free_swp(subdir, -					offset, size, &dir, punch_lock); -			if (!dir) -				dir = shmem_dir_map(middir); -			nr_swaps_freed += freed; -			if (offset || punch_lock) { -				spin_lock(&info->lock); -				set_page_private(subdir, -					page_private(subdir) - freed); -				spin_unlock(&info->lock); -			} else -				BUG_ON(page_private(subdir) != freed); +			unlock_page(page);  		} -		offset = 0; -	} -done1: -	shmem_dir_unmap(dir); -done2: -	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { -		/* -		 * Call truncate_inode_pages again: racing shmem_unuse_inode -		 * may have swizzled a page in from swap since -		 * truncate_pagecache or generic_delete_inode did it, before we -		 * lowered next_index.  Also, though shmem_getpage checks -		 * i_size before adding to cache, no recheck after: so fix the -		 * narrow window there too. -		 */ -		truncate_inode_pages_range(inode->i_mapping, start, end); +		shmem_pagevec_release(&pvec); +		mem_cgroup_uncharge_end(); +		index++;  	}  	spin_lock(&info->lock); -	info->flags &= ~SHMEM_TRUNCATE;  	info->swapped -= nr_swaps_freed; -	if (nr_pages_to_free) -		shmem_free_blocks(inode, nr_pages_to_free);  	shmem_recalc_inode(inode);  	spin_unlock(&info->lock); -	/* -	 * Empty swap vector directory pages to be freed? 
-	 */ -	if (!list_empty(&pages_to_free)) { -		pages_to_free.prev->next = NULL; -		shmem_free_pages(pages_to_free.next); -	} +	inode->i_ctime = inode->i_mtime = CURRENT_TIME;  }  EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {  		loff_t oldsize = inode->i_size;  		loff_t newsize = attr->ia_size; -		struct page *page = NULL; -		if (newsize < oldsize) { -			/* -			 * If truncating down to a partial page, then -			 * if that page is already allocated, hold it -			 * in memory until the truncation is over, so -			 * truncate_partial_page cannot miss it were -			 * it assigned to swap. -			 */ -			if (newsize & (PAGE_CACHE_SIZE-1)) { -				(void) shmem_getpage(inode, -					newsize >> PAGE_CACHE_SHIFT, -						&page, SGP_READ, NULL); -				if (page) -					unlock_page(page); -			} -			/* -			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can -			 * detect if any pages might have been added to cache -			 * after truncate_inode_pages.  But we needn't bother -			 * if it's being fully truncated to zero-length: the -			 * nrpages check is efficient enough in that case. 
-			 */ -			if (newsize) { -				struct shmem_inode_info *info = SHMEM_I(inode); -				spin_lock(&info->lock); -				info->flags &= ~SHMEM_PAGEIN; -				spin_unlock(&info->lock); -			} -		}  		if (newsize != oldsize) {  			i_size_write(inode, newsize);  			inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)  			/* unmap again to remove racily COWed private pages */  			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);  		} -		if (page) -			page_cache_release(page);  	}  	setattr_copy(inode, attr); @@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)  			list_del_init(&info->swaplist);  			mutex_unlock(&shmem_swaplist_mutex);  		} -	} +	} else +		kfree(info->symlink);  	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {  		kfree(xattr->name); @@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)  	end_writeback(inode);  } -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) -{ -	swp_entry_t *ptr; - -	for (ptr = dir; ptr < edir; ptr++) { -		if (ptr->val == entry.val) -			return ptr - dir; -	} -	return -1; -} - -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +/* + * If swap found in inode, free it and move page from swapcache to filecache. 
+ */ +static int shmem_unuse_inode(struct shmem_inode_info *info, +			     swp_entry_t swap, struct page *page)  { -	struct address_space *mapping; -	unsigned long idx; -	unsigned long size; -	unsigned long limit; -	unsigned long stage; -	struct page **dir; -	struct page *subdir; -	swp_entry_t *ptr; -	int offset; +	struct address_space *mapping = info->vfs_inode.i_mapping; +	void *radswap; +	pgoff_t index;  	int error; -	idx = 0; -	ptr = info->i_direct; -	spin_lock(&info->lock); -	if (!info->swapped) { -		list_del_init(&info->swaplist); -		goto lost2; -	} -	limit = info->next_index; -	size = limit; -	if (size > SHMEM_NR_DIRECT) -		size = SHMEM_NR_DIRECT; -	offset = shmem_find_swp(entry, ptr, ptr+size); -	if (offset >= 0) { -		shmem_swp_balance_unmap(); -		goto found; -	} -	if (!info->i_indirect) -		goto lost2; - -	dir = shmem_dir_map(info->i_indirect); -	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; - -	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { -		if (unlikely(idx == stage)) { -			shmem_dir_unmap(dir-1); -			if (cond_resched_lock(&info->lock)) { -				/* check it has not been truncated */ -				if (limit > info->next_index) { -					limit = info->next_index; -					if (idx >= limit) -						goto lost2; -				} -			} -			dir = shmem_dir_map(info->i_indirect) + -			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; -			while (!*dir) { -				dir++; -				idx += ENTRIES_PER_PAGEPAGE; -				if (idx >= limit) -					goto lost1; -			} -			stage = idx + ENTRIES_PER_PAGEPAGE; -			subdir = *dir; -			shmem_dir_unmap(dir); -			dir = shmem_dir_map(subdir); -		} -		subdir = *dir; -		if (subdir && page_private(subdir)) { -			ptr = shmem_swp_map(subdir); -			size = limit - idx; -			if (size > ENTRIES_PER_PAGE) -				size = ENTRIES_PER_PAGE; -			offset = shmem_find_swp(entry, ptr, ptr+size); -			shmem_swp_unmap(ptr); -			if (offset >= 0) { -				shmem_dir_unmap(dir); -				ptr = shmem_swp_map(subdir); -				goto found; -			} -		} -	} -lost1: -	
shmem_dir_unmap(dir-1); -lost2: -	spin_unlock(&info->lock); -	return 0; -found: -	idx += offset; -	ptr += offset; +	radswap = swp_to_radix_entry(swap); +	index = radix_tree_locate_item(&mapping->page_tree, radswap); +	if (index == -1) +		return 0;  	/*  	 * Move _head_ to start search for next from here.  	 * But be careful: shmem_evict_inode checks list_empty without taking  	 * mutex, and there's an instant in list_move_tail when info->swaplist -	 * would appear empty, if it were the only one on shmem_swaplist.  We -	 * could avoid doing it if inode NULL; or use this minor optimization. +	 * would appear empty, if it were the only one on shmem_swaplist.  	 */  	if (shmem_swaplist.next != &info->swaplist)  		list_move_tail(&shmem_swaplist, &info->swaplist); @@ -968,29 +598,34 @@ found:  	 * but also to hold up shmem_evict_inode(): so inode cannot be freed  	 * beneath us (pagelock doesn't help until the page is in pagecache).  	 */ -	mapping = info->vfs_inode.i_mapping; -	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); +	error = shmem_add_to_page_cache(page, mapping, index, +						GFP_NOWAIT, radswap);  	/* which does mem_cgroup_uncharge_cache_page on error */  	if (error != -ENOMEM) { +		/* +		 * Truncation and eviction use free_swap_and_cache(), which +		 * only does trylock page: if we raced, best clean up here. +		 */  		delete_from_swap_cache(page);  		set_page_dirty(page); -		info->flags |= SHMEM_PAGEIN; -		shmem_swp_set(info, ptr, 0); -		swap_free(entry); +		if (!error) { +			spin_lock(&info->lock); +			info->swapped--; +			spin_unlock(&info->lock); +			swap_free(swap); +		}  		error = 1;	/* not an error, but entry was found */  	} -	shmem_swp_unmap(ptr); -	spin_unlock(&info->lock);  	return error;  }  /* - * shmem_unuse() search for an eventually swapped out shmem page. + * Search through swapped inodes to find and replace swap by page.   
*/ -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page)  { -	struct list_head *p, *next; +	struct list_head *this, *next;  	struct shmem_inode_info *info;  	int found = 0;  	int error; @@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)  	 * Charge page using GFP_KERNEL while we can wait, before taking  	 * the shmem_swaplist_mutex which might hold up shmem_writepage().  	 * Charged back to the user (not to caller) when swap account is used. -	 * add_to_page_cache() will be called with GFP_NOWAIT.  	 */  	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);  	if (error)  		goto out; -	/* -	 * Try to preload while we can wait, to not make a habit of -	 * draining atomic reserves; but don't latch on to this cpu, -	 * it's okay if sometimes we get rescheduled after this. -	 */ -	error = radix_tree_preload(GFP_KERNEL); -	if (error) -		goto uncharge; -	radix_tree_preload_end(); +	/* No radix_tree_preload: swap entry keeps a place for page in tree */  	mutex_lock(&shmem_swaplist_mutex); -	list_for_each_safe(p, next, &shmem_swaplist) { -		info = list_entry(p, struct shmem_inode_info, swaplist); -		found = shmem_unuse_inode(info, entry, page); +	list_for_each_safe(this, next, &shmem_swaplist) { +		info = list_entry(this, struct shmem_inode_info, swaplist); +		if (info->swapped) +			found = shmem_unuse_inode(info, swap, page); +		else +			list_del_init(&info->swaplist);  		cond_resched();  		if (found)  			break;  	}  	mutex_unlock(&shmem_swaplist_mutex); -uncharge:  	if (!found)  		mem_cgroup_uncharge_cache_page(page);  	if (found < 0) @@ -1041,10 +669,10 @@ out:  static int shmem_writepage(struct page *page, struct writeback_control *wbc)  {  	struct shmem_inode_info *info; -	swp_entry_t *entry, swap;  	struct address_space *mapping; -	unsigned long index;  	struct inode *inode; +	swp_entry_t swap; +	pgoff_t index;  	BUG_ON(!PageLocked(page));  	mapping = page->mapping; @@ -1073,50 
+701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)  	/*  	 * Add inode to shmem_unuse()'s list of swapped-out inodes, -	 * if it's not already there.  Do it now because we cannot take -	 * mutex while holding spinlock, and must do so before the page -	 * is moved to swap cache, when its pagelock no longer protects +	 * if it's not already there.  Do it now before the page is +	 * moved to swap cache, when its pagelock no longer protects  	 * the inode from eviction.  But don't unlock the mutex until -	 * we've taken the spinlock, because shmem_unuse_inode() will -	 * prune a !swapped inode from the swaplist under both locks. +	 * we've incremented swapped, because shmem_unuse_inode() will +	 * prune a !swapped inode from the swaplist under this mutex.  	 */  	mutex_lock(&shmem_swaplist_mutex);  	if (list_empty(&info->swaplist))  		list_add_tail(&info->swaplist, &shmem_swaplist); -	spin_lock(&info->lock); -	mutex_unlock(&shmem_swaplist_mutex); - -	if (index >= info->next_index) { -		BUG_ON(!(info->flags & SHMEM_TRUNCATE)); -		goto unlock; -	} -	entry = shmem_swp_entry(info, index, NULL); -	if (entry->val) { -		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */ -		free_swap_and_cache(*entry); -		shmem_swp_set(info, entry, 0); -	} -	shmem_recalc_inode(inode); -  	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { -		delete_from_page_cache(page); -		shmem_swp_set(info, entry, swap.val); -		shmem_swp_unmap(entry);  		swap_shmem_alloc(swap); +		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + +		spin_lock(&info->lock); +		info->swapped++; +		shmem_recalc_inode(inode);  		spin_unlock(&info->lock); + +		mutex_unlock(&shmem_swaplist_mutex);  		BUG_ON(page_mapped(page));  		swap_writepage(page, wbc);  		return 0;  	} -	shmem_swp_unmap(entry); -unlock: -	spin_unlock(&info->lock); -	/* -	 * add_to_swap_cache() doesn't return -EEXIST, so we can safely -	 * clear SWAP_HAS_CACHE flag. 
-	 */ +	mutex_unlock(&shmem_swaplist_mutex);  	swapcache_free(swap, NULL);  redirty:  	set_page_dirty(page); @@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)  }  #endif /* CONFIG_TMPFS */ -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, -			struct shmem_inode_info *info, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +			struct shmem_inode_info *info, pgoff_t index)  {  	struct mempolicy mpol, *spol;  	struct vm_area_struct pvma; -	struct page *page;  	spol = mpol_cond_copy(&mpol, -				mpol_shared_policy_lookup(&info->policy, idx)); +			mpol_shared_policy_lookup(&info->policy, index));  	/* Create a pseudo vma that just contains the policy */  	pvma.vm_start = 0; -	pvma.vm_pgoff = idx; +	pvma.vm_pgoff = index;  	pvma.vm_ops = NULL;  	pvma.vm_policy = spol; -	page = swapin_readahead(entry, gfp, &pvma, 0); -	return page; +	return swapin_readahead(swap, gfp, &pvma, 0);  }  static struct page *shmem_alloc_page(gfp_t gfp, -			struct shmem_inode_info *info, unsigned long idx) +			struct shmem_inode_info *info, pgoff_t index)  {  	struct vm_area_struct pvma;  	/* Create a pseudo vma that just contains the policy */  	pvma.vm_start = 0; -	pvma.vm_pgoff = idx; +	pvma.vm_pgoff = index;  	pvma.vm_ops = NULL; -	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); +	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);  	/*  	 * alloc_page_vma() will drop the shared policy reference @@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,  }  #else /* !CONFIG_NUMA */  #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)  {  }  #endif /* CONFIG_TMPFS */ -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, -			struct shmem_inode_info *info, unsigned long idx) +static inline struct page 
*shmem_swapin(swp_entry_t swap, gfp_t gfp, +			struct shmem_inode_info *info, pgoff_t index)  { -	return swapin_readahead(entry, gfp, NULL, 0); +	return swapin_readahead(swap, gfp, NULL, 0);  }  static inline struct page *shmem_alloc_page(gfp_t gfp, -			struct shmem_inode_info *info, unsigned long idx) +			struct shmem_inode_info *info, pgoff_t index)  {  	return alloc_page(gfp);  } @@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)   * vm. If we swap it in we mark it dirty since we also free the swap   * entry since a page cannot live in both the swap and page cache   */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,  	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)  {  	struct address_space *mapping = inode->i_mapping; -	struct shmem_inode_info *info = SHMEM_I(inode); +	struct shmem_inode_info *info;  	struct shmem_sb_info *sbinfo;  	struct page *page; -	struct page *prealloc_page = NULL; -	swp_entry_t *entry;  	swp_entry_t swap;  	int error; -	int ret; +	int once = 0; -	if (idx >= SHMEM_MAX_INDEX) +	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))  		return -EFBIG;  repeat: -	page = find_lock_page(mapping, idx); -	if (page) { +	swap.val = 0; +	page = find_lock_page(mapping, index); +	if (radix_tree_exceptional_entry(page)) { +		swap = radix_to_swp_entry(page); +		page = NULL; +	} + +	if (sgp != SGP_WRITE && +	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { +		error = -EINVAL; +		goto failed; +	} + +	if (page || (sgp == SGP_READ && !swap.val)) {  		/*  		 * Once we can get the page lock, it must be uptodate:  		 * if there were an error in reading back from swap,  		 * the page would not be inserted into the filecache.  		 
*/ -		BUG_ON(!PageUptodate(page)); -		goto done; +		BUG_ON(page && !PageUptodate(page)); +		*pagep = page; +		return 0;  	}  	/* -	 * Try to preload while we can wait, to not make a habit of -	 * draining atomic reserves; but don't latch on to this cpu. +	 * Fast cache lookup did not find it: +	 * bring it back from swap or allocate.  	 */ -	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); -	if (error) -		goto out; -	radix_tree_preload_end(); - -	if (sgp != SGP_READ && !prealloc_page) { -		prealloc_page = shmem_alloc_page(gfp, info, idx); -		if (prealloc_page) { -			SetPageSwapBacked(prealloc_page); -			if (mem_cgroup_cache_charge(prealloc_page, -					current->mm, GFP_KERNEL)) { -				page_cache_release(prealloc_page); -				prealloc_page = NULL; -			} -		} -	} - -	spin_lock(&info->lock); -	shmem_recalc_inode(inode); -	entry = shmem_swp_alloc(info, idx, sgp, gfp); -	if (IS_ERR(entry)) { -		spin_unlock(&info->lock); -		error = PTR_ERR(entry); -		goto out; -	} -	swap = *entry; +	info = SHMEM_I(inode); +	sbinfo = SHMEM_SB(inode->i_sb);  	if (swap.val) {  		/* Look it up and read it in.. 
*/  		page = lookup_swap_cache(swap);  		if (!page) { -			shmem_swp_unmap(entry); -			spin_unlock(&info->lock);  			/* here we actually do the io */  			if (fault_type)  				*fault_type |= VM_FAULT_MAJOR; -			page = shmem_swapin(swap, gfp, info, idx); +			page = shmem_swapin(swap, gfp, info, index);  			if (!page) { -				spin_lock(&info->lock); -				entry = shmem_swp_alloc(info, idx, sgp, gfp); -				if (IS_ERR(entry)) -					error = PTR_ERR(entry); -				else { -					if (entry->val == swap.val) -						error = -ENOMEM; -					shmem_swp_unmap(entry); -				} -				spin_unlock(&info->lock); -				if (error) -					goto out; -				goto repeat; +				error = -ENOMEM; +				goto failed;  			} -			wait_on_page_locked(page); -			page_cache_release(page); -			goto repeat;  		}  		/* We have to do this with page locked to prevent races */ -		if (!trylock_page(page)) { -			shmem_swp_unmap(entry); -			spin_unlock(&info->lock); -			wait_on_page_locked(page); -			page_cache_release(page); -			goto repeat; -		} -		if (PageWriteback(page)) { -			shmem_swp_unmap(entry); -			spin_unlock(&info->lock); -			wait_on_page_writeback(page); -			unlock_page(page); -			page_cache_release(page); -			goto repeat; -		} +		lock_page(page);  		if (!PageUptodate(page)) { -			shmem_swp_unmap(entry); -			spin_unlock(&info->lock); -			unlock_page(page); -			page_cache_release(page);  			error = -EIO; -			goto out; +			goto failed;  		} - -		error = add_to_page_cache_locked(page, mapping, -						 idx, GFP_NOWAIT); -		if (error) { -			shmem_swp_unmap(entry); -			spin_unlock(&info->lock); -			if (error == -ENOMEM) { -				/* -				 * reclaim from proper memory cgroup and -				 * call memcg's OOM if needed. 
-				 */ -				error = mem_cgroup_shmem_charge_fallback( -						page, current->mm, gfp); -				if (error) { -					unlock_page(page); -					page_cache_release(page); -					goto out; -				} -			} -			unlock_page(page); -			page_cache_release(page); -			goto repeat; +		wait_on_page_writeback(page); + +		/* Someone may have already done it for us */ +		if (page->mapping) { +			if (page->mapping == mapping && +			    page->index == index) +				goto done; +			error = -EEXIST; +			goto failed;  		} -		info->flags |= SHMEM_PAGEIN; -		shmem_swp_set(info, entry, 0); -		shmem_swp_unmap(entry); -		delete_from_swap_cache(page); +		error = mem_cgroup_cache_charge(page, current->mm, +						gfp & GFP_RECLAIM_MASK); +		if (!error) +			error = shmem_add_to_page_cache(page, mapping, index, +						gfp, swp_to_radix_entry(swap)); +		if (error) +			goto failed; + +		spin_lock(&info->lock); +		info->swapped--; +		shmem_recalc_inode(inode);  		spin_unlock(&info->lock); + +		delete_from_swap_cache(page);  		set_page_dirty(page);  		swap_free(swap); -	} else if (sgp == SGP_READ) { -		shmem_swp_unmap(entry); -		page = find_get_page(mapping, idx); -		if (page && !trylock_page(page)) { -			spin_unlock(&info->lock); -			wait_on_page_locked(page); -			page_cache_release(page); -			goto repeat; +	} else { +		if (shmem_acct_block(info->flags)) { +			error = -ENOSPC; +			goto failed;  		} -		spin_unlock(&info->lock); - -	} else if (prealloc_page) { -		shmem_swp_unmap(entry); -		sbinfo = SHMEM_SB(inode->i_sb);  		if (sbinfo->max_blocks) {  			if (percpu_counter_compare(&sbinfo->used_blocks, -						sbinfo->max_blocks) >= 0 || -			    shmem_acct_block(info->flags)) -				goto nospace; +						sbinfo->max_blocks) >= 0) { +				error = -ENOSPC; +				goto unacct; +			}  			percpu_counter_inc(&sbinfo->used_blocks); -			inode->i_blocks += BLOCKS_PER_PAGE; -		} else if (shmem_acct_block(info->flags)) -			goto nospace; - -		page = prealloc_page; -		prealloc_page = NULL; - -		entry = shmem_swp_alloc(info, idx, 
sgp, gfp); -		if (IS_ERR(entry)) -			error = PTR_ERR(entry); -		else { -			swap = *entry; -			shmem_swp_unmap(entry);  		} -		ret = error || swap.val; -		if (ret) -			mem_cgroup_uncharge_cache_page(page); -		else -			ret = add_to_page_cache_lru(page, mapping, -						idx, GFP_NOWAIT); -		/* -		 * At add_to_page_cache_lru() failure, -		 * uncharge will be done automatically. -		 */ -		if (ret) { -			shmem_unacct_blocks(info->flags, 1); -			shmem_free_blocks(inode, 1); -			spin_unlock(&info->lock); -			page_cache_release(page); -			if (error) -				goto out; -			goto repeat; + +		page = shmem_alloc_page(gfp, info, index); +		if (!page) { +			error = -ENOMEM; +			goto decused;  		} -		info->flags |= SHMEM_PAGEIN; +		SetPageSwapBacked(page); +		__set_page_locked(page); +		error = mem_cgroup_cache_charge(page, current->mm, +						gfp & GFP_RECLAIM_MASK); +		if (!error) +			error = shmem_add_to_page_cache(page, mapping, index, +						gfp, NULL); +		if (error) +			goto decused; +		lru_cache_add_anon(page); + +		spin_lock(&info->lock);  		info->alloced++; +		inode->i_blocks += BLOCKS_PER_PAGE; +		shmem_recalc_inode(inode);  		spin_unlock(&info->lock); +  		clear_highpage(page);  		flush_dcache_page(page);  		SetPageUptodate(page);  		if (sgp == SGP_DIRTY)  			set_page_dirty(page); - -	} else { -		spin_unlock(&info->lock); -		error = -ENOMEM; -		goto out;  	}  done: -	*pagep = page; -	error = 0; -out: -	if (prealloc_page) { -		mem_cgroup_uncharge_cache_page(prealloc_page); -		page_cache_release(prealloc_page); +	/* Perhaps the file has been truncated since we checked */ +	if (sgp != SGP_WRITE && +	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { +		error = -EINVAL; +		goto trunc;  	} -	return error; +	*pagep = page; +	return 0; -nospace:  	/* -	 * Perhaps the page was brought in from swap between find_lock_page -	 * and taking info->lock?  
We allow for that at add_to_page_cache_lru, -	 * but must also avoid reporting a spurious ENOSPC while working on a -	 * full tmpfs. +	 * Error recovery.  	 */ -	page = find_get_page(mapping, idx); +trunc: +	ClearPageDirty(page); +	delete_from_page_cache(page); +	spin_lock(&info->lock); +	info->alloced--; +	inode->i_blocks -= BLOCKS_PER_PAGE;  	spin_unlock(&info->lock); +decused: +	if (sbinfo->max_blocks) +		percpu_counter_add(&sbinfo->used_blocks, -1); +unacct: +	shmem_unacct_blocks(info->flags, 1); +failed: +	if (swap.val && error != -EINVAL) { +		struct page *test = find_get_page(mapping, index); +		if (test && !radix_tree_exceptional_entry(test)) +			page_cache_release(test); +		/* Have another try if the entry has changed */ +		if (test != swp_to_radix_entry(swap)) +			error = -EEXIST; +	}  	if (page) { +		unlock_page(page);  		page_cache_release(page); +	} +	if (error == -ENOSPC && !once++) { +		info = SHMEM_I(inode); +		spin_lock(&info->lock); +		shmem_recalc_inode(inode); +		spin_unlock(&info->lock);  		goto repeat;  	} -	error = -ENOSPC; -	goto out; +	if (error == -EEXIST) +		goto repeat; +	return error;  }  static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  	int error;  	int ret = VM_FAULT_LOCKED; -	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) -		return VM_FAULT_SIGBUS; -  	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);  	if (error)  		return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  }  #ifdef CONFIG_NUMA -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)  { -	struct inode *i = vma->vm_file->f_path.dentry->d_inode; -	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); +	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);  }  static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,  					  unsigned long addr)  { -	struct inode *i = vma->vm_file->f_path.dentry->d_inode; -	unsigned long idx; +	struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +	pgoff_t index; -	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; -	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); +	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; +	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);  }  #endif @@ -1593,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode  #ifdef CONFIG_TMPFS  static const struct inode_operations shmem_symlink_inode_operations; -static const struct inode_operations shmem_symlink_inline_operations; +static const struct inode_operations shmem_short_symlink_operations;  static int  shmem_write_begin(struct file *file, struct address_space *mapping, @@ -1626,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_  {  	struct inode *inode = filp->f_path.dentry->d_inode;  	struct address_space *mapping = inode->i_mapping; -	unsigned long index, offset; +	pgoff_t index; +	unsigned long offset;  	enum sgp_type sgp = SGP_READ;  	/* @@ -1642,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_  	for (;;) {  		struct page *page = NULL; -		unsigned 
long end_index, nr, ret; +		pgoff_t end_index; +		unsigned long nr, ret;  		loff_t i_size = i_size_read(inode);  		end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1880,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)  	buf->f_namelen = NAME_MAX;  	if (sbinfo->max_blocks) {  		buf->f_blocks = sbinfo->max_blocks; -		buf->f_bavail = buf->f_bfree = -				sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); +		buf->f_bavail = +		buf->f_bfree  = sbinfo->max_blocks - +				percpu_counter_sum(&sbinfo->used_blocks);  	}  	if (sbinfo->max_inodes) {  		buf->f_files = sbinfo->max_inodes; @@ -2055,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s  	info = SHMEM_I(inode);  	inode->i_size = len-1; -	if (len <= SHMEM_SYMLINK_INLINE_LEN) { -		/* do it inline */ -		memcpy(info->inline_symlink, symname, len); -		inode->i_op = &shmem_symlink_inline_operations; +	if (len <= SHORT_SYMLINK_LEN) { +		info->symlink = kmemdup(symname, len, GFP_KERNEL); +		if (!info->symlink) { +			iput(inode); +			return -ENOMEM; +		} +		inode->i_op = &shmem_short_symlink_operations;  	} else {  		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);  		if (error) { @@ -2081,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s  	return 0;  } -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)  { -	nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); +	nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);  	return NULL;  }  static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)  {  	struct page *page = NULL; -	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); -	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); +	int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); +	nd_set_link(nd, error ? 
ERR_PTR(error) : kmap(page));  	if (page)  		unlock_page(page);  	return page; @@ -2202,7 +1760,6 @@ out:  	return err;  } -  static const struct xattr_handler *shmem_xattr_handlers[] = {  #ifdef CONFIG_TMPFS_POSIX_ACL  	&generic_acl_access_handler, @@ -2332,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)  }  #endif /* CONFIG_TMPFS_XATTR */ -static const struct inode_operations shmem_symlink_inline_operations = { +static const struct inode_operations shmem_short_symlink_operations = {  	.readlink	= generic_readlink, -	.follow_link	= shmem_follow_link_inline, +	.follow_link	= shmem_follow_short_symlink,  #ifdef CONFIG_TMPFS_XATTR  	.setxattr	= shmem_setxattr,  	.getxattr	= shmem_getxattr, @@ -2534,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)  	if (config.max_inodes < inodes)  		goto out;  	/* -	 * Those tests also disallow limited->unlimited while any are in -	 * use, so i_blocks will always be zero when max_blocks is zero; +	 * Those tests disallow limited->unlimited while any are in use;  	 * but we must separately disallow unlimited->limited, because  	 * in that case we have no record of how much is already in use.  	 
*/ @@ -2627,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)  		goto failed;  	sbinfo->free_inodes = sbinfo->max_inodes; -	sb->s_maxbytes = SHMEM_MAX_BYTES; +	sb->s_maxbytes = MAX_LFS_FILESIZE;  	sb->s_blocksize = PAGE_CACHE_SIZE;  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;  	sb->s_magic = TMPFS_MAGIC; @@ -2662,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;  static struct inode *shmem_alloc_inode(struct super_block *sb)  { -	struct shmem_inode_info *p; -	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); -	if (!p) +	struct shmem_inode_info *info; +	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); +	if (!info)  		return NULL; -	return &p->vfs_inode; +	return &info->vfs_inode;  } -static void shmem_i_callback(struct rcu_head *head) +static void shmem_destroy_callback(struct rcu_head *head)  {  	struct inode *inode = container_of(head, struct inode, i_rcu);  	INIT_LIST_HEAD(&inode->i_dentry); @@ -2678,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)  static void shmem_destroy_inode(struct inode *inode)  { -	if ((inode->i_mode & S_IFMT) == S_IFREG) { -		/* only struct inode is valid if it's an inline symlink */ +	if ((inode->i_mode & S_IFMT) == S_IFREG)  		mpol_free_shared_policy(&SHMEM_I(inode)->policy); -	} -	call_rcu(&inode->i_rcu, shmem_i_callback); +	call_rcu(&inode->i_rcu, shmem_destroy_callback);  } -static void init_once(void *foo) +static void shmem_init_inode(void *foo)  { -	struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - -	inode_init_once(&p->vfs_inode); +	struct shmem_inode_info *info = foo; +	inode_init_once(&info->vfs_inode);  } -static int init_inodecache(void) +static int shmem_init_inodecache(void)  {  	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",  				sizeof(struct shmem_inode_info), -				0, SLAB_PANIC, init_once); +				0, SLAB_PANIC, shmem_init_inode);  	return 0;  } -static void destroy_inodecache(void) +static void 
shmem_destroy_inodecache(void)  {  	kmem_cache_destroy(shmem_inode_cachep);  } @@ -2797,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {  #endif  }; -  static struct dentry *shmem_mount(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *data)  {  	return mount_nodev(fs_type, flags, data, shmem_fill_super);  } -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "tmpfs",  	.mount		= shmem_mount,  	.kill_sb	= kill_litter_super,  }; -int __init init_tmpfs(void) +int __init shmem_init(void)  {  	int error; @@ -2819,18 +2371,18 @@ int __init init_tmpfs(void)  	if (error)  		goto out4; -	error = init_inodecache(); +	error = shmem_init_inodecache();  	if (error)  		goto out3; -	error = register_filesystem(&tmpfs_fs_type); +	error = register_filesystem(&shmem_fs_type);  	if (error) {  		printk(KERN_ERR "Could not register tmpfs\n");  		goto out2;  	} -	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, -				tmpfs_fs_type.name, NULL); +	shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, +				 shmem_fs_type.name, NULL);  	if (IS_ERR(shm_mnt)) {  		error = PTR_ERR(shm_mnt);  		printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2839,9 +2391,9 @@ int __init init_tmpfs(void)  	return 0;  out1: -	unregister_filesystem(&tmpfs_fs_type); +	unregister_filesystem(&shmem_fs_type);  out2: -	destroy_inodecache(); +	shmem_destroy_inodecache();  out3:  	bdi_destroy(&shmem_backing_dev_info);  out4: @@ -2849,45 +2401,6 @@ out4:  	return error;  } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched - * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. 
- */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, -					struct page **pagep, swp_entry_t *ent) -{ -	swp_entry_t entry = { .val = 0 }, *ptr; -	struct page *page = NULL; -	struct shmem_inode_info *info = SHMEM_I(inode); - -	if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) -		goto out; - -	spin_lock(&info->lock); -	ptr = shmem_swp_entry(info, pgoff, NULL); -#ifdef CONFIG_SWAP -	if (ptr && ptr->val) { -		entry.val = ptr->val; -		page = find_get_page(&swapper_space, entry.val); -	} else -#endif -		page = find_get_page(inode->i_mapping, pgoff); -	if (ptr) -		shmem_swp_unmap(ptr); -	spin_unlock(&info->lock); -out: -	*pagep = page; -	*ent = entry; -} -#endif -  #else /* !CONFIG_SHMEM */  /* @@ -2901,23 +2414,23 @@ out:  #include <linux/ramfs.h> -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = {  	.name		= "tmpfs",  	.mount		= ramfs_mount,  	.kill_sb	= kill_litter_super,  }; -int __init init_tmpfs(void) +int __init shmem_init(void)  { -	BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); +	BUG_ON(register_filesystem(&shmem_fs_type) != 0); -	shm_mnt = kern_mount(&tmpfs_fs_type); +	shm_mnt = kern_mount(&shmem_fs_type);  	BUG_ON(IS_ERR(shm_mnt));  	return 0;  } -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page)  {  	return 0;  } @@ -2927,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)  	return 0;  } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)  { -	truncate_inode_pages_range(inode->i_mapping, start, end); +	truncate_inode_pages_range(inode->i_mapping, lstart, lend);  }  EXPORT_SYMBOL_GPL(shmem_truncate_range); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched 
- * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, -					struct page **pagep, swp_entry_t *ent) -{ -	struct page *page = NULL; - -	if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) -		goto out; -	page = find_get_page(inode->i_mapping, pgoff); -out: -	*pagep = page; -	*ent = (swp_entry_t){ .val = 0 }; -} -#endif -  #define shmem_vm_ops				generic_file_vm_ops  #define shmem_file_operations			ramfs_file_operations  #define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)  #define shmem_acct_size(flags, size)		0  #define shmem_unacct_size(flags, size)		do {} while (0) -#define SHMEM_MAX_BYTES				MAX_LFS_FILESIZE  #endif /* CONFIG_SHMEM */ @@ -2987,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags  	if (IS_ERR(shm_mnt))  		return (void *)shm_mnt; -	if (size < 0 || size > SHMEM_MAX_BYTES) +	if (size < 0 || size > MAX_LFS_FILESIZE)  		return ERR_PTR(-EINVAL);  	if (shmem_acct_size(flags, size)) diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b8c33907242..17bc224bce68 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,  	/*  	 * Find out how many pages are allowed for a single swap -	 * device. There are two limiting factors: 1) the number of -	 * bits for the swap offset in the swp_entry_t type and -	 * 2) the number of bits in the a swap pte as defined by -	 * the different architectures. In order to find the -	 * largest possible bit mask a swap entry with swap type 0 +	 * device. 
There are three limiting factors: 1) the number +	 * of bits for the swap offset in the swp_entry_t type, and +	 * 2) the number of bits in the swap pte as defined by the +	 * the different architectures, and 3) the number of free bits +	 * in an exceptional radix_tree entry. In order to find the +	 * largest possible bit mask, a swap entry with swap type 0  	 * and swap offset ~0UL is created, encoded to a swap pte, -	 * decoded to a swp_entry_t again and finally the swap +	 * decoded to a swp_entry_t again, and finally the swap  	 * offset is extracted. This will mask all the bits from  	 * the initial ~0UL mask that can't be encoded in either  	 * the swp_entry_t or the architecture definition of a -	 * swap pte. +	 * swap pte.  Then the same is done for a radix_tree entry.  	 */  	maxpages = swp_offset(pte_to_swp_entry( -			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; +			swp_entry_to_pte(swp_entry(0, ~0UL)))); +	maxpages = swp_offset(radix_to_swp_entry( +			swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; +  	if (maxpages > swap_header->info.last_page) {  		maxpages = swap_header->info.last_page + 1;  		/* p->max is an unsigned int: don't overflow it */ diff --git a/mm/truncate.c b/mm/truncate.c index 232eb2736a79..b40ac6d4e86e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,  	unsigned long count = 0;  	int i; +	/* +	 * Note: this function may get called on a shmem/tmpfs mapping: +	 * pagevec_lookup() might then return 0 prematurely (because it +	 * got a gangful of swap entries); but it's hardly worth worrying +	 * about - it can rarely have anything to free from such a mapping +	 * (most pages are dirty), and already skips over any difficulties. 
+	 */ +  	pagevec_init(&pvec, 0);  	while (index <= end && pagevec_lookup(&pvec, mapping, index,  			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6d8ef4a3a9b5..8b2d37b59c9e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -128,34 +128,34 @@ unsigned long long get_msr(int cpu, off_t offset)  void print_header(void)  {  	if (show_pkg) -		fprintf(stderr, "pkg "); +		fprintf(stderr, "pk");  	if (show_core) -		fprintf(stderr, "core"); +		fprintf(stderr, " cr");  	if (show_cpu)  		fprintf(stderr, " CPU");  	if (do_nhm_cstates) -		fprintf(stderr, "   %%c0 "); +		fprintf(stderr, "    %%c0 ");  	if (has_aperf) -		fprintf(stderr, "  GHz"); +		fprintf(stderr, " GHz");  	fprintf(stderr, "  TSC");  	if (do_nhm_cstates) -		fprintf(stderr, "   %%c1 "); +		fprintf(stderr, "    %%c1");  	if (do_nhm_cstates) -		fprintf(stderr, "   %%c3 "); +		fprintf(stderr, "    %%c3");  	if (do_nhm_cstates) -		fprintf(stderr, "   %%c6 "); +		fprintf(stderr, "    %%c6");  	if (do_snb_cstates) -		fprintf(stderr, "   %%c7 "); +		fprintf(stderr, "    %%c7");  	if (do_snb_cstates) -		fprintf(stderr, "  %%pc2 "); +		fprintf(stderr, "  %%pc2");  	if (do_nhm_cstates) -		fprintf(stderr, "  %%pc3 "); +		fprintf(stderr, "  %%pc3");  	if (do_nhm_cstates) -		fprintf(stderr, "  %%pc6 "); +		fprintf(stderr, "  %%pc6");  	if (do_snb_cstates) -		fprintf(stderr, "  %%pc7 "); +		fprintf(stderr, "  %%pc7");  	if (extra_msr_offset) -		fprintf(stderr, "       MSR 0x%x ", extra_msr_offset); +		fprintf(stderr, "        MSR 0x%x ", extra_msr_offset);  	putc('\n', stderr);  } @@ -194,14 +194,14 @@ void print_cnt(struct counters *p)  	/* topology columns, print blanks on 1st (average) line */  	if (p == cnt_average) {  		if (show_pkg) -			fprintf(stderr, "    "); +			fprintf(stderr, " ");  		if (show_core)  			fprintf(stderr, "    ");  		if (show_cpu)  			fprintf(stderr, "  
  ");  	} else {  		if (show_pkg) -			fprintf(stderr, "%4d", p->pkg); +			fprintf(stderr, "%d", p->pkg);  		if (show_core)  			fprintf(stderr, "%4d", p->core);  		if (show_cpu) @@ -241,22 +241,22 @@ void print_cnt(struct counters *p)  		if (!skip_c1)  			fprintf(stderr, "%7.2f", 100.0 * p->c1/p->tsc);  		else -			fprintf(stderr, "   ****"); +			fprintf(stderr, "  ****");  	}  	if (do_nhm_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->c3/p->tsc); +		fprintf(stderr, " %6.2f", 100.0 * p->c3/p->tsc);  	if (do_nhm_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->c6/p->tsc); +		fprintf(stderr, " %6.2f", 100.0 * p->c6/p->tsc);  	if (do_snb_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->c7/p->tsc); +		fprintf(stderr, " %6.2f", 100.0 * p->c7/p->tsc);  	if (do_snb_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->pc2/p->tsc); +		fprintf(stderr, " %5.2f", 100.0 * p->pc2/p->tsc);  	if (do_nhm_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->pc3/p->tsc); +		fprintf(stderr, " %5.2f", 100.0 * p->pc3/p->tsc);  	if (do_nhm_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->pc6/p->tsc); +		fprintf(stderr, " %5.2f", 100.0 * p->pc6/p->tsc);  	if (do_snb_cstates) -		fprintf(stderr, "%7.2f", 100.0 * p->pc7/p->tsc); +		fprintf(stderr, " %5.2f", 100.0 * p->pc7/p->tsc);  	if (extra_msr_offset)  		fprintf(stderr, "  0x%016llx", p->extra_msr);  	putc('\n', stderr); diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 2618ef2ba31f..33c5c7ee148f 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -137,7 +137,6 @@ void cmdline(int argc, char **argv)  void validate_cpuid(void)  {  	unsigned int eax, ebx, ecx, edx, max_level; -	char brand[16];  	unsigned int fms, family, model, stepping;  	eax = ebx = ecx = edx = 0; @@ -160,8 +159,8 @@ void validate_cpuid(void)  		model += ((fms >> 16) & 0xf) << 4;  	if 
(verbose > 1) -		printf("CPUID %s %d levels family:model:stepping " -			"0x%x:%x:%x (%d:%d:%d)\n", brand, max_level, +		printf("CPUID %d levels family:model:stepping " +			"0x%x:%x:%x (%d:%d:%d)\n", max_level,  			family, model, stepping, family, model, stepping);  	if (!(edx & (1 << 5))) { | 
