From c62d81bcfe82526cc3da10cf4fc63faad368bc60 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sun, 20 Sep 2009 23:28:04 +0200 Subject: mtd: use bbm.h in nand.h This consolidates common code in nand.h and bbm.h. The comments and data structures were the same, this keeps the comment from nand.h as it fits 80 columns, while the one in bbm.h did not. Signed-off-by: Alessandro Rubini Signed-off-by: David Woodhouse --- include/linux/mtd/bbm.h | 35 +++++++++++----------- include/linux/mtd/nand.h | 76 +----------------------------------------------- 2 files changed, 19 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h index fff8c53e5434..9c3757c5759d 100644 --- a/include/linux/mtd/bbm.h +++ b/include/linux/mtd/bbm.h @@ -19,22 +19,21 @@ /** * struct nand_bbt_descr - bad block table descriptor - * @options: options for this descriptor - * @pages: the page(s) where we find the bbt, used with - * option BBT_ABSPAGE when bbt is searched, - * then we store the found bbts pages here. - * Its an array and supports up to 8 chips now - * @offs: offset of the pattern in the oob area of the page - * @veroffs: offset of the bbt version counter in the oob area of the page - * @version: version read from the bbt page during scan - * @len: length of the pattern, if 0 no pattern check is performed - * @maxblocks: maximum number of blocks to search for a bbt. This - * number of blocks is reserved at the end of the device - * where the tables are written. - * @reserved_block_code: if non-0, this pattern denotes a reserved - * (rather than bad) block in the stored bbt - * @pattern: pattern to identify bad block table or factory marked - * good / bad blocks, can be NULL, if len = 0 + * @options: options for this descriptor + * @pages: the page(s) where we find the bbt, used with option BBT_ABSPAGE + * when bbt is searched, then we store the found bbts pages here. 
+ * Its an array and supports up to 8 chips now + * @offs: offset of the pattern in the oob area of the page + * @veroffs: offset of the bbt version counter in the oob are of the page + * @version: version read from the bbt page during scan + * @len: length of the pattern, if 0 no pattern check is performed + * @maxblocks: maximum number of blocks to search for a bbt. This number of + * blocks is reserved at the end of the device where the tables are + * written. + * @reserved_block_code: if non-0, this pattern denotes a reserved (rather than + * bad) block in the stored bbt + * @pattern: pattern to identify bad block table or factory marked good / + * bad blocks, can be NULL, if len = 0 * * Descriptor for the bad block table marker and the descriptor for the * pattern which identifies good and bad blocks. The assumption is made @@ -90,7 +89,9 @@ struct nand_bbt_descr { /* * Constants for oob configuration */ -#define ONENAND_BADBLOCK_POS 0 +#define NAND_SMALL_BADBLOCK_POS 5 +#define NAND_LARGE_BADBLOCK_POS 0 +#define ONENAND_BADBLOCK_POS 0 /* * Bad block scanning errors diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7a232a9bdd62..d87ada538d17 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -21,6 +21,7 @@ #include #include #include +#include struct mtd_info; /* Scan and identify a NAND device */ @@ -470,75 +471,6 @@ struct nand_manufacturers { extern struct nand_flash_dev nand_flash_ids[]; extern struct nand_manufacturers nand_manuf_ids[]; -/** - * struct nand_bbt_descr - bad block table descriptor - * @options: options for this descriptor - * @pages: the page(s) where we find the bbt, used with option BBT_ABSPAGE - * when bbt is searched, then we store the found bbts pages here. 
- * Its an array and supports up to 8 chips now - * @offs: offset of the pattern in the oob area of the page - * @veroffs: offset of the bbt version counter in the oob are of the page - * @version: version read from the bbt page during scan - * @len: length of the pattern, if 0 no pattern check is performed - * @maxblocks: maximum number of blocks to search for a bbt. This number of - * blocks is reserved at the end of the device where the tables are - * written. - * @reserved_block_code: if non-0, this pattern denotes a reserved (rather than - * bad) block in the stored bbt - * @pattern: pattern to identify bad block table or factory marked good / - * bad blocks, can be NULL, if len = 0 - * - * Descriptor for the bad block table marker and the descriptor for the - * pattern which identifies good and bad blocks. The assumption is made - * that the pattern and the version count are always located in the oob area - * of the first block. - */ -struct nand_bbt_descr { - int options; - int pages[NAND_MAX_CHIPS]; - int offs; - int veroffs; - uint8_t version[NAND_MAX_CHIPS]; - int len; - int maxblocks; - int reserved_block_code; - uint8_t *pattern; -}; - -/* Options for the bad block table descriptors */ - -/* The number of bits used per block in the bbt on the device */ -#define NAND_BBT_NRBITS_MSK 0x0000000F -#define NAND_BBT_1BIT 0x00000001 -#define NAND_BBT_2BIT 0x00000002 -#define NAND_BBT_4BIT 0x00000004 -#define NAND_BBT_8BIT 0x00000008 -/* The bad block table is in the last good block of the device */ -#define NAND_BBT_LASTBLOCK 0x00000010 -/* The bbt is at the given page, else we must scan for the bbt */ -#define NAND_BBT_ABSPAGE 0x00000020 -/* The bbt is at the given page, else we must scan for the bbt */ -#define NAND_BBT_SEARCH 0x00000040 -/* bbt is stored per chip on multichip devices */ -#define NAND_BBT_PERCHIP 0x00000080 -/* bbt has a version counter at offset veroffs */ -#define NAND_BBT_VERSION 0x00000100 -/* Create a bbt if none axists */ -#define 
NAND_BBT_CREATE 0x00000200 -/* Search good / bad pattern through all pages of a block */ -#define NAND_BBT_SCANALLPAGES 0x00000400 -/* Scan block empty during good / bad block scan */ -#define NAND_BBT_SCANEMPTY 0x00000800 -/* Write bbt if neccecary */ -#define NAND_BBT_WRITE 0x00001000 -/* Read and write back block contents when writing bbt */ -#define NAND_BBT_SAVECONTENT 0x00002000 -/* Search good / bad pattern on the first and the second page */ -#define NAND_BBT_SCAN2NDPAGE 0x00004000 - -/* The maximum number of blocks to scan for a bbt */ -#define NAND_BBT_SCAN_MAXBLOCKS 4 - extern int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd); extern int nand_update_bbt(struct mtd_info *mtd, loff_t offs); extern int nand_default_bbt(struct mtd_info *mtd); @@ -548,12 +480,6 @@ extern int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, size_t * retlen, uint8_t * buf); -/* -* Constants for oob configuration -*/ -#define NAND_SMALL_BADBLOCK_POS 5 -#define NAND_LARGE_BADBLOCK_POS 0 - /** * struct platform_nand_chip - chip level device structure * @nr_chips: max. number of chips to scan for -- cgit v1.2.3 From 30631cb82d5c6c662d5ec682beaa834c1f9f0987 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sun, 20 Sep 2009 23:28:14 +0200 Subject: mtd: unify status enum from three headers nand.h, onenand.h and flashchip.h defined enumeration types for chip status using the same symbolic names. This prevented a board file from including more than one of them. In particular, no nand and onenand platform devices could live in the same file. This patch augments flashchip.h with a few status values in order to cover all cases, so nand.h and onenand.h can use flstate_t without declaring their own status enum. 
Signed-off-by: Alessandro Rubini Signed-off-by: David Woodhouse --- include/linux/mtd/flashchip.h | 7 +++++++ include/linux/mtd/nand.h | 17 ++--------------- include/linux/mtd/onenand.h | 19 ++----------------- 3 files changed, 11 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h index d4f38c5fd44e..f350a4879f75 100644 --- a/include/linux/mtd/flashchip.h +++ b/include/linux/mtd/flashchip.h @@ -38,6 +38,13 @@ typedef enum { FL_XIP_WHILE_ERASING, FL_XIP_WHILE_WRITING, FL_SHUTDOWN, + /* These 2 come from nand_state_t, which has been unified here */ + FL_READING, + FL_CACHEDPRG, + /* These 2 come from onenand_state_t, which has been unified here */ + FL_RESETING, + FL_OTPING, + FL_UNKNOWN } flstate_t; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index d87ada538d17..2476078a032f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -21,6 +21,7 @@ #include #include #include +#include #include struct mtd_info; @@ -203,20 +204,6 @@ typedef enum { #define NAND_CI_CHIPNR_MSK 0x03 #define NAND_CI_CELLTYPE_MSK 0x0C -/* - * nand_state_t - chip states - * Enumeration for NAND flash chip state - */ -typedef enum { - FL_READY, - FL_READING, - FL_WRITING, - FL_ERASING, - FL_SYNCING, - FL_CACHEDPRG, - FL_PM_SUSPENDED, -} nand_state_t; - /* Keep gcc happy */ struct nand_chip; @@ -403,7 +390,7 @@ struct nand_chip { uint8_t cellinfo; int badblockpos; - nand_state_t state; + flstate_t state; uint8_t *oob_poi; struct nand_hw_control *controller; diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 4e49f3350678..f57e29e17bb0 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -25,22 +26,6 @@ extern int onenand_scan(struct mtd_info *mtd, int max_chips); /* Free resources held by the OneNAND device */ extern void onenand_release(struct mtd_info *mtd); -/* - 
* onenand_state_t - chip states - * Enumeration for OneNAND flash chip state - */ -typedef enum { - FL_READY, - FL_READING, - FL_WRITING, - FL_ERASING, - FL_SYNCING, - FL_LOCKING, - FL_RESETING, - FL_OTPING, - FL_PM_SUSPENDED, -} onenand_state_t; - /** * struct onenand_bufferram - OneNAND BufferRAM Data * @blockpage: block & page address in BufferRAM @@ -137,7 +122,7 @@ struct onenand_chip { spinlock_t chip_lock; wait_queue_head_t wq; - onenand_state_t state; + flstate_t state; unsigned char *page_buf; unsigned char *oob_buf; -- cgit v1.2.3 From dc7a08166f3a5f23e79e839a8a88849bd3397c32 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 27 Oct 2009 14:41:35 -0400 Subject: nfs: new subdir Documentation/filesystems/nfs We're adding enough nfs documentation that it may as well have its own subdirectory. Acked-by: Randy Dunlap Signed-off-by: J. Bruce Fields --- Documentation/filesystems/00-INDEX | 10 +- Documentation/filesystems/Exporting | 147 -------------- Documentation/filesystems/nfs-rdma.txt | 271 ------------------------- Documentation/filesystems/nfs.txt | 98 --------- Documentation/filesystems/nfs/00-INDEX | 12 ++ Documentation/filesystems/nfs/Exporting | 147 ++++++++++++++ Documentation/filesystems/nfs/nfs-rdma.txt | 271 +++++++++++++++++++++++++ Documentation/filesystems/nfs/nfs.txt | 98 +++++++++ Documentation/filesystems/nfs/nfs41-server.txt | 222 ++++++++++++++++++++ Documentation/filesystems/nfs/nfsroot.txt | 270 ++++++++++++++++++++++++ Documentation/filesystems/nfs41-server.txt | 222 -------------------- Documentation/filesystems/nfsroot.txt | 270 ------------------------ Documentation/filesystems/porting | 2 +- Documentation/kernel-parameters.txt | 6 +- fs/cifs/export.c | 2 +- fs/exportfs/expfs.c | 2 +- fs/isofs/export.c | 2 +- fs/nfs/Kconfig | 2 +- include/linux/exportfs.h | 2 +- net/ipv4/Kconfig | 6 +- net/ipv4/ipconfig.c | 2 +- 21 files changed, 1035 insertions(+), 1029 deletions(-) delete mode 100644 Documentation/filesystems/Exporting 
delete mode 100644 Documentation/filesystems/nfs-rdma.txt delete mode 100644 Documentation/filesystems/nfs.txt create mode 100644 Documentation/filesystems/nfs/00-INDEX create mode 100644 Documentation/filesystems/nfs/Exporting create mode 100644 Documentation/filesystems/nfs/nfs-rdma.txt create mode 100644 Documentation/filesystems/nfs/nfs.txt create mode 100644 Documentation/filesystems/nfs/nfs41-server.txt create mode 100644 Documentation/filesystems/nfs/nfsroot.txt delete mode 100644 Documentation/filesystems/nfs41-server.txt delete mode 100644 Documentation/filesystems/nfsroot.txt (limited to 'include/linux') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621ee5599..482151c883a5 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -1,7 +1,5 @@ 00-INDEX - this file (info on some of the filesystems supported by linux). -Exporting - - explanation of how to make filesystems exportable. Locking - info on locking rules as they pertain to Linux VFS. 9p.txt @@ -66,12 +64,8 @@ mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. -nfs41-server.txt - - info on the Linux server implementation of NFSv4 minor version 1. -nfs-rdma.txt - - how to install and setup the Linux NFS/RDMA client and server software. -nfsroot.txt - - short guide on setting up a diskless box with NFS root filesystem. +nfs/ + - nfs-related documentation. nilfs2.txt - info and mount options for the NILFS2 filesystem. ntfs.txt diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting deleted file mode 100644 index 87019d2b5981..000000000000 --- a/Documentation/filesystems/Exporting +++ /dev/null @@ -1,147 +0,0 @@ - -Making Filesystems Exportable -============================= - -Overview --------- - -All filesystem operations require a dentry (or two) as a starting -point. 
Local applications have a reference-counted hold on suitable -dentries via open file descriptors or cwd/root. However remote -applications that access a filesystem via a remote filesystem protocol -such as NFS may not be able to hold such a reference, and so need a -different way to refer to a particular dentry. As the alternative -form of reference needs to be stable across renames, truncates, and -server-reboot (among other things, though these tend to be the most -problematic), there is no simple answer like 'filename'. - -The mechanism discussed here allows each filesystem implementation to -specify how to generate an opaque (outside of the filesystem) byte -string for any dentry, and how to find an appropriate dentry for any -given opaque byte string. -This byte string will be called a "filehandle fragment" as it -corresponds to part of an NFS filehandle. - -A filesystem which supports the mapping between filehandle fragments -and dentries will be termed "exportable". - - - -Dcache Issues -------------- - -The dcache normally contains a proper prefix of any given filesystem -tree. This means that if any filesystem object is in the dcache, then -all of the ancestors of that filesystem object are also in the dcache. -As normal access is by filename this prefix is created naturally and -maintained easily (by each object maintaining a reference count on -its parent). - -However when objects are included into the dcache by interpreting a -filehandle fragment, there is no automatic creation of a path prefix -for the object. This leads to two related but distinct features of -the dcache that are not needed for normal filesystem access. - -1/ The dcache must sometimes contain objects that are not part of the - proper prefix. i.e that are not connected to the root. 
-2/ The dcache must be prepared for a newly found (via ->lookup) directory - to already have a (non-connected) dentry, and must be able to move - that dentry into place (based on the parent and name in the - ->lookup). This is particularly needed for directories as - it is a dcache invariant that directories only have one dentry. - -To implement these features, the dcache has: - -a/ A dentry flag DCACHE_DISCONNECTED which is set on - any dentry that might not be part of the proper prefix. - This is set when anonymous dentries are created, and cleared when a - dentry is noticed to be a child of a dentry which is in the proper - prefix. - -b/ A per-superblock list "s_anon" of dentries which are the roots of - subtrees that are not in the proper prefix. These dentries, as - well as the proper prefix, need to be released at unmount time. As - these dentries will not be hashed, they are linked together on the - d_hash list_head. - -c/ Helper routines to allocate anonymous dentries, and to help attach - loose directory dentries at lookup time. They are: - d_alloc_anon(inode) will return a dentry for the given inode. - If the inode already has a dentry, one of those is returned. - If it doesn't, a new anonymous (IS_ROOT and - DCACHE_DISCONNECTED) dentry is allocated and attached. - In the case of a directory, care is taken that only one dentry - can ever be attached. - d_splice_alias(inode, dentry) will make sure that there is a - dentry with the same name and parent as the given dentry, and - which refers to the given inode. - If the inode is a directory and already has a dentry, then that - dentry is d_moved over the given dentry. - If the passed dentry gets attached, care is taken that this is - mutually exclusive to a d_alloc_anon operation. - If the passed dentry is used, NULL is returned, else the used - dentry is returned. This corresponds to the calling pattern of - ->lookup. 
- - -Filesystem Issues ------------------ - -For a filesystem to be exportable it must: - - 1/ provide the filehandle fragment routines described below. - 2/ make sure that d_splice_alias is used rather than d_add - when ->lookup finds an inode for a given parent and name. - Typically the ->lookup routine will end with a: - - return d_splice_alias(inode, dentry); - } - - - - A file system implementation declares that instances of the filesystem -are exportable by setting the s_export_op field in the struct -super_block. This field must point to a "struct export_operations" -struct which has the following members: - - encode_fh (optional) - Takes a dentry and creates a filehandle fragment which can later be used - to find or create a dentry for the same object. The default - implementation creates a filehandle fragment that encodes a 32bit inode - and generation number for the inode encoded, and if necessary the - same information for the parent. - - fh_to_dentry (mandatory) - Given a filehandle fragment, this should find the implied object and - create a dentry for it (possibly with d_alloc_anon). - - fh_to_parent (optional but strongly recommended) - Given a filehandle fragment, this should find the parent of the - implied object and create a dentry for it (possibly with d_alloc_anon). - May fail if the filehandle fragment is too small. - - get_parent (optional but strongly recommended) - When given a dentry for a directory, this should return a dentry for - the parent. Quite possibly the parent dentry will have been allocated - by d_alloc_anon. The default get_parent function just returns an error - so any filehandle lookup that requires finding a parent will fail. - ->lookup("..") is *not* used as a default as it can leave ".." entries - in the dcache which are too messy to work with. 
- - get_name (optional) - When given a parent dentry and a child dentry, this should find a name - in the directory identified by the parent dentry, which leads to the - object identified by the child dentry. If no get_name function is - supplied, a default implementation is provided which uses vfs_readdir - to find potential names, and matches inode numbers to find the correct - match. - - -A filehandle fragment consists of an array of 1 or more 4byte words, -together with a one byte "type". -The decode_fh routine should not depend on the stated size that is -passed to it. This size may be larger than the original filehandle -generated by encode_fh, in which case it will have been padded with -nuls. Rather, the encode_fh routine should choose a "type" which -indicates the decode_fh how much of the filehandle is valid, and how -it should be interpreted. diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt deleted file mode 100644 index e386f7e4bcee..000000000000 --- a/Documentation/filesystems/nfs-rdma.txt +++ /dev/null @@ -1,271 +0,0 @@ -################################################################################ -# # -# NFS/RDMA README # -# # -################################################################################ - - Author: NetApp and Open Grid Computing - Date: May 29, 2008 - -Table of Contents -~~~~~~~~~~~~~~~~~ - - Overview - - Getting Help - - Installation - - Check RDMA and NFS Setup - - NFS/RDMA Setup - -Overview -~~~~~~~~ - - This document describes how to install and setup the Linux NFS/RDMA client - and server software. - - The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server - was first included in the following release, Linux 2.6.25. - - In our testing, we have obtained excellent performance results (full 10Gbit - wire bandwidth at minimal client CPU) under many workloads. 
The code passes - the full Connectathon test suite and operates over both Infiniband and iWARP - RDMA adapters. - -Getting Help -~~~~~~~~~~~~ - - If you get stuck, you can ask questions on the - - nfs-rdma-devel@lists.sourceforge.net - - mailing list. - -Installation -~~~~~~~~~~~~ - - These instructions are a step by step guide to building a machine for - use with NFS/RDMA. - - - Install an RDMA device - - Any device supported by the drivers in drivers/infiniband/hw is acceptable. - - Testing has been performed using several Mellanox-based IB cards, the - Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter. - - - Install a Linux distribution and tools - - The first kernel release to contain both the NFS/RDMA client and server was - Linux 2.6.25 Therefore, a distribution compatible with this and subsequent - Linux kernel release should be installed. - - The procedures described in this document have been tested with - distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). - - - Install nfs-utils-1.1.2 or greater on the client - - An NFS/RDMA mount point can be obtained by using the mount.nfs command in - nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils - version with support for NFS/RDMA mounts, but for various reasons we - recommend using nfs-utils-1.1.2 or greater). To see which version of - mount.nfs you are using, type: - - $ /sbin/mount.nfs -V - - If the version is less than 1.1.2 or the command does not exist, - you should install the latest version of nfs-utils. - - Download the latest package from: - - http://www.kernel.org/pub/linux/utils/nfs - - Uncompress the package and follow the installation instructions. 
- - If you will not need the idmapper and gssd executables (you do not need - these to create an NFS/RDMA enabled mount command), the installation - process can be simplified by disabling these features when running - configure: - - $ ./configure --disable-gss --disable-nfsv4 - - To build nfs-utils you will need the tcp_wrappers package installed. For - more information on this see the package's README and INSTALL files. - - After building the nfs-utils package, there will be a mount.nfs binary in - the utils/mount directory. This binary can be used to initiate NFS v2, v3, - or v4 mounts. To initiate a v4 mount, the binary must be called - mount.nfs4. The standard technique is to create a symlink called - mount.nfs4 to mount.nfs. - - This mount.nfs binary should be installed at /sbin/mount.nfs as follows: - - $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs - - In this location, mount.nfs will be invoked automatically for NFS mounts - by the system mount command. - - NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed - on the NFS client machine. You do not need this specific version of - nfs-utils on the server. Furthermore, only the mount.nfs command from - nfs-utils-1.1.2 is needed on the client. - - - Install a Linux kernel with NFS/RDMA - - The NFS/RDMA client and server are both included in the mainline Linux - kernel version 2.6.25 and later. This and other versions of the 2.6 Linux - kernel can be found at: - - ftp://ftp.kernel.org/pub/linux/kernel/v2.6/ - - Download the sources and place them in an appropriate location. - - - Configure the RDMA stack - - Make sure your kernel configuration has RDMA support enabled. Under - Device Drivers -> InfiniBand support, update the kernel configuration - to enable InfiniBand support [NOTE: the option name is misleading. Enabling - InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)]. - - Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) 
or - iWARP adapter support (amso, cxgb3, etc.). - - If you are using InfiniBand, be sure to enable IP-over-InfiniBand support. - - - Configure the NFS client and server - - Your kernel configuration must also have NFS file system support and/or - NFS server support enabled. These and other NFS related configuration - options can be found under File Systems -> Network File Systems. - - - Build, install, reboot - - The NFS/RDMA code will be enabled automatically if NFS and RDMA - are turned on. The NFS/RDMA client and server are configured via the hidden - SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The - value of SUNRPC_XPRT_RDMA will be: - - - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client - and server will not be built - - M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M, - in this case the NFS/RDMA client and server will be built as modules - - Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client - and server will be built into the kernel - - Therefore, if you have followed the steps above and turned no NFS and RDMA, - the NFS/RDMA client and server will be built. - - Build a new kernel, install it, boot it. - -Check RDMA and NFS Setup -~~~~~~~~~~~~~~~~~~~~~~~~ - - Before configuring the NFS/RDMA software, it is a good idea to test - your new kernel to ensure that the kernel is working correctly. - In particular, it is a good idea to verify that the RDMA stack - is functioning as expected and standard NFS over TCP/IP and/or UDP/IP - is working properly. - - - Check RDMA Setup - - If you built the RDMA components as modules, load them at - this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel - card: - - $ modprobe ib_mthca - $ modprobe ib_ipoib - - If you are using InfiniBand, make sure there is a Subnet Manager (SM) - running on the network. If your IB switch has an embedded SM, you can - use it. 
Otherwise, you will need to run an SM, such as OpenSM, on one - of your end nodes. - - If an SM is running on your network, you should see the following: - - $ cat /sys/class/infiniband/driverX/ports/1/state - 4: ACTIVE - - where driverX is mthca0, ipath5, ehca3, etc. - - To further test the InfiniBand software stack, use IPoIB (this - assumes you have two IB hosts named host1 and host2): - - host1$ ifconfig ib0 a.b.c.x - host2$ ifconfig ib0 a.b.c.y - host1$ ping a.b.c.y - host2$ ping a.b.c.x - - For other device types, follow the appropriate procedures. - - - Check NFS Setup - - For the NFS components enabled above (client and/or server), - test their functionality over standard Ethernet using TCP/IP or UDP/IP. - -NFS/RDMA Setup -~~~~~~~~~~~~~~ - - We recommend that you use two machines, one to act as the client and - one to act as the server. - - One time configuration: - - - On the server system, configure the /etc/exports file and - start the NFS/RDMA server. - - Exports entries with the following formats have been tested: - - /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) - /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) - - The IP address(es) is(are) the client's IPoIB address for an InfiniBand - HCA or the cleint's iWARP address(es) for an RNIC. - - NOTE: The "insecure" option must be used because the NFS/RDMA client does - not use a reserved port. 
- - Each time a machine boots: - - - Load and configure the RDMA drivers - - For InfiniBand using a Mellanox adapter: - - $ modprobe ib_mthca - $ modprobe ib_ipoib - $ ifconfig ib0 a.b.c.d - - NOTE: use unique addresses for the client and server - - - Start the NFS server - - If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in - kernel config), load the RDMA transport module: - - $ modprobe svcrdma - - Regardless of how the server was built (module or built-in), start the - server: - - $ /etc/init.d/nfs start - - or - - $ service nfs start - - Instruct the server to listen on the RDMA transport: - - $ echo rdma 20049 > /proc/fs/nfsd/portlist - - - On the client system - - If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in - kernel config), load the RDMA client module: - - $ modprobe xprtrdma.ko - - Regardless of how the client was built (module or built-in), use this - command to mount the NFS/RDMA server: - - $ mount -o rdma,port=20049 :/ /mnt - - To verify that the mount is using RDMA, run "cat /proc/mounts" and check - the "proto" field for the given mount. - - Congratulations! You're using NFS/RDMA! diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs.txt deleted file mode 100644 index f50f26ce6cd0..000000000000 --- a/Documentation/filesystems/nfs.txt +++ /dev/null @@ -1,98 +0,0 @@ - -The NFS client -============== - -The NFS version 2 protocol was first documented in RFC1094 (March 1989). -Since then two more major releases of NFS have been published, with NFSv3 -being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April -2003). - -The Linux NFS client currently supports all the above published versions, -and work is in progress on adding support for minor version 1 of the NFSv4 -protocol. 
- -The purpose of this document is to provide information on some of the -upcall interfaces that are used in order to provide the NFS client with -some of the information that it requires in order to fully comply with -the NFS spec. - -The DNS resolver -================ - -NFSv4 allows for one server to refer the NFS client to data that has been -migrated onto another server by means of the special "fs_locations" -attribute. See - http://tools.ietf.org/html/rfc3530#section-6 -and - http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 - -The fs_locations information can take the form of either an ip address and -a path, or a DNS hostname and a path. The latter requires the NFS client to -do a DNS lookup in order to mount the new volume, and hence the need for an -upcall to allow userland to provide this service. - -Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual -/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps: - - (1) The process checks the dns_resolve cache to see if it contains a - valid entry. If so, it returns that entry and exits. - - (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' - (may be changed using the 'nfs.cache_getent' kernel boot parameter) - is run, with two arguments: - - the cache name, "dns_resolve" - - the hostname to resolve - - (3) After looking up the corresponding ip address, the helper script - writes the result into the rpc_pipefs pseudo-file - '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel' - in the following (text) format: - - " \n" - - Where is in the usual IPv4 (123.456.78.90) or IPv6 - (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format. - is identical to the second argument of the helper - script, and is the 'time to live' of this cache entry (in - units of seconds). - - Note: If is invalid, say the string "0", then a negative - entry is created, which will cause the kernel to treat the hostname - as having no valid DNS translation. 
- - - - -A basic sample /sbin/nfs_cache_getent -===================================== - -#!/bin/bash -# -ttl=600 -# -cut=/usr/bin/cut -getent=/usr/bin/getent -rpc_pipefs=/var/lib/nfs/rpc_pipefs -# -die() -{ - echo "Usage: $0 cache_name entry_name" - exit 1 -} - -[ $# -lt 2 ] && die -cachename="$1" -cache_path=${rpc_pipefs}/cache/${cachename}/channel - -case "${cachename}" in - dns_resolve) - name="$2" - result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )" - [ -z "${result}" ] && result="0" - ;; - *) - die - ;; -esac -echo "${result} ${name} ${ttl}" >${cache_path} - diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX new file mode 100644 index 000000000000..6ff3d212027b --- /dev/null +++ b/Documentation/filesystems/nfs/00-INDEX @@ -0,0 +1,12 @@ +00-INDEX + - this file (nfs-related documentation). +Exporting + - explanation of how to make filesystems exportable. +nfs.txt + - nfs client, and DNS resolution for fs_locations. +nfs41-server.txt + - info on the Linux server implementation of NFSv4 minor version 1. +nfs-rdma.txt + - how to install and setup the Linux NFS/RDMA client and server software +nfsroot.txt + - short guide on setting up a diskless box with NFS root filesystem. diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting new file mode 100644 index 000000000000..87019d2b5981 --- /dev/null +++ b/Documentation/filesystems/nfs/Exporting @@ -0,0 +1,147 @@ + +Making Filesystems Exportable +============================= + +Overview +-------- + +All filesystem operations require a dentry (or two) as a starting +point. Local applications have a reference-counted hold on suitable +dentries via open file descriptors or cwd/root. However remote +applications that access a filesystem via a remote filesystem protocol +such as NFS may not be able to hold such a reference, and so need a +different way to refer to a particular dentry. 
As the alternative +form of reference needs to be stable across renames, truncates, and +server-reboot (among other things, though these tend to be the most +problematic), there is no simple answer like 'filename'. + +The mechanism discussed here allows each filesystem implementation to +specify how to generate an opaque (outside of the filesystem) byte +string for any dentry, and how to find an appropriate dentry for any +given opaque byte string. +This byte string will be called a "filehandle fragment" as it +corresponds to part of an NFS filehandle. + +A filesystem which supports the mapping between filehandle fragments +and dentries will be termed "exportable". + + + +Dcache Issues +------------- + +The dcache normally contains a proper prefix of any given filesystem +tree. This means that if any filesystem object is in the dcache, then +all of the ancestors of that filesystem object are also in the dcache. +As normal access is by filename this prefix is created naturally and +maintained easily (by each object maintaining a reference count on +its parent). + +However when objects are included into the dcache by interpreting a +filehandle fragment, there is no automatic creation of a path prefix +for the object. This leads to two related but distinct features of +the dcache that are not needed for normal filesystem access. + +1/ The dcache must sometimes contain objects that are not part of the + proper prefix. i.e that are not connected to the root. +2/ The dcache must be prepared for a newly found (via ->lookup) directory + to already have a (non-connected) dentry, and must be able to move + that dentry into place (based on the parent and name in the + ->lookup). This is particularly needed for directories as + it is a dcache invariant that directories only have one dentry. + +To implement these features, the dcache has: + +a/ A dentry flag DCACHE_DISCONNECTED which is set on + any dentry that might not be part of the proper prefix. 
+ This is set when anonymous dentries are created, and cleared when a + dentry is noticed to be a child of a dentry which is in the proper + prefix. + +b/ A per-superblock list "s_anon" of dentries which are the roots of + subtrees that are not in the proper prefix. These dentries, as + well as the proper prefix, need to be released at unmount time. As + these dentries will not be hashed, they are linked together on the + d_hash list_head. + +c/ Helper routines to allocate anonymous dentries, and to help attach + loose directory dentries at lookup time. They are: + d_alloc_anon(inode) will return a dentry for the given inode. + If the inode already has a dentry, one of those is returned. + If it doesn't, a new anonymous (IS_ROOT and + DCACHE_DISCONNECTED) dentry is allocated and attached. + In the case of a directory, care is taken that only one dentry + can ever be attached. + d_splice_alias(inode, dentry) will make sure that there is a + dentry with the same name and parent as the given dentry, and + which refers to the given inode. + If the inode is a directory and already has a dentry, then that + dentry is d_moved over the given dentry. + If the passed dentry gets attached, care is taken that this is + mutually exclusive to a d_alloc_anon operation. + If the passed dentry is used, NULL is returned, else the used + dentry is returned. This corresponds to the calling pattern of + ->lookup. + + +Filesystem Issues +----------------- + +For a filesystem to be exportable it must: + + 1/ provide the filehandle fragment routines described below. + 2/ make sure that d_splice_alias is used rather than d_add + when ->lookup finds an inode for a given parent and name. + Typically the ->lookup routine will end with a: + + return d_splice_alias(inode, dentry); + } + + + + A file system implementation declares that instances of the filesystem +are exportable by setting the s_export_op field in the struct +super_block. 
This field must point to a "struct export_operations" +struct which has the following members: + + encode_fh (optional) + Takes a dentry and creates a filehandle fragment which can later be used + to find or create a dentry for the same object. The default + implementation creates a filehandle fragment that encodes a 32bit inode + and generation number for the inode encoded, and if necessary the + same information for the parent. + + fh_to_dentry (mandatory) + Given a filehandle fragment, this should find the implied object and + create a dentry for it (possibly with d_alloc_anon). + + fh_to_parent (optional but strongly recommended) + Given a filehandle fragment, this should find the parent of the + implied object and create a dentry for it (possibly with d_alloc_anon). + May fail if the filehandle fragment is too small. + + get_parent (optional but strongly recommended) + When given a dentry for a directory, this should return a dentry for + the parent. Quite possibly the parent dentry will have been allocated + by d_alloc_anon. The default get_parent function just returns an error + so any filehandle lookup that requires finding a parent will fail. + ->lookup("..") is *not* used as a default as it can leave ".." entries + in the dcache which are too messy to work with. + + get_name (optional) + When given a parent dentry and a child dentry, this should find a name + in the directory identified by the parent dentry, which leads to the + object identified by the child dentry. If no get_name function is + supplied, a default implementation is provided which uses vfs_readdir + to find potential names, and matches inode numbers to find the correct + match. + + +A filehandle fragment consists of an array of 1 or more 4byte words, +together with a one byte "type". +The decode_fh routine should not depend on the stated size that is +passed to it. 
This size may be larger than the original filehandle +generated by encode_fh, in which case it will have been padded with +nuls. Rather, the encode_fh routine should choose a "type" which +indicates the decode_fh how much of the filehandle is valid, and how +it should be interpreted. diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt new file mode 100644 index 000000000000..e386f7e4bcee --- /dev/null +++ b/Documentation/filesystems/nfs/nfs-rdma.txt @@ -0,0 +1,271 @@ +################################################################################ +# # +# NFS/RDMA README # +# # +################################################################################ + + Author: NetApp and Open Grid Computing + Date: May 29, 2008 + +Table of Contents +~~~~~~~~~~~~~~~~~ + - Overview + - Getting Help + - Installation + - Check RDMA and NFS Setup + - NFS/RDMA Setup + +Overview +~~~~~~~~ + + This document describes how to install and setup the Linux NFS/RDMA client + and server software. + + The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server + was first included in the following release, Linux 2.6.25. + + In our testing, we have obtained excellent performance results (full 10Gbit + wire bandwidth at minimal client CPU) under many workloads. The code passes + the full Connectathon test suite and operates over both Infiniband and iWARP + RDMA adapters. + +Getting Help +~~~~~~~~~~~~ + + If you get stuck, you can ask questions on the + + nfs-rdma-devel@lists.sourceforge.net + + mailing list. + +Installation +~~~~~~~~~~~~ + + These instructions are a step by step guide to building a machine for + use with NFS/RDMA. + + - Install an RDMA device + + Any device supported by the drivers in drivers/infiniband/hw is acceptable. + + Testing has been performed using several Mellanox-based IB cards, the + Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter. 
+ + - Install a Linux distribution and tools + + The first kernel release to contain both the NFS/RDMA client and server was + Linux 2.6.25. Therefore, a distribution compatible with this and subsequent + Linux kernel releases should be installed. + + The procedures described in this document have been tested with + distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). + + - Install nfs-utils-1.1.2 or greater on the client + + An NFS/RDMA mount point can be obtained by using the mount.nfs command in + nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils + version with support for NFS/RDMA mounts, but for various reasons we + recommend using nfs-utils-1.1.2 or greater). To see which version of + mount.nfs you are using, type: + + $ /sbin/mount.nfs -V + + If the version is less than 1.1.2 or the command does not exist, + you should install the latest version of nfs-utils. + + Download the latest package from: + + http://www.kernel.org/pub/linux/utils/nfs + + Uncompress the package and follow the installation instructions. + + If you will not need the idmapper and gssd executables (you do not need + these to create an NFS/RDMA enabled mount command), the installation + process can be simplified by disabling these features when running + configure: + + $ ./configure --disable-gss --disable-nfsv4 + + To build nfs-utils you will need the tcp_wrappers package installed. For + more information on this see the package's README and INSTALL files. + + After building the nfs-utils package, there will be a mount.nfs binary in + the utils/mount directory. This binary can be used to initiate NFS v2, v3, + or v4 mounts. To initiate a v4 mount, the binary must be called + mount.nfs4. The standard technique is to create a symlink called + mount.nfs4 to mount.nfs.
+ + This mount.nfs binary should be installed at /sbin/mount.nfs as follows: + + $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs + + In this location, mount.nfs will be invoked automatically for NFS mounts + by the system mount command. + + NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed + on the NFS client machine. You do not need this specific version of + nfs-utils on the server. Furthermore, only the mount.nfs command from + nfs-utils-1.1.2 is needed on the client. + + - Install a Linux kernel with NFS/RDMA + + The NFS/RDMA client and server are both included in the mainline Linux + kernel version 2.6.25 and later. This and other versions of the 2.6 Linux + kernel can be found at: + + ftp://ftp.kernel.org/pub/linux/kernel/v2.6/ + + Download the sources and place them in an appropriate location. + + - Configure the RDMA stack + + Make sure your kernel configuration has RDMA support enabled. Under + Device Drivers -> InfiniBand support, update the kernel configuration + to enable InfiniBand support [NOTE: the option name is misleading. Enabling + InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)]. + + Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or + iWARP adapter support (amso, cxgb3, etc.). + + If you are using InfiniBand, be sure to enable IP-over-InfiniBand support. + + - Configure the NFS client and server + + Your kernel configuration must also have NFS file system support and/or + NFS server support enabled. These and other NFS related configuration + options can be found under File Systems -> Network File Systems. + + - Build, install, reboot + + The NFS/RDMA code will be enabled automatically if NFS and RDMA + are turned on. The NFS/RDMA client and server are configured via the hidden + SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. 
The + value of SUNRPC_XPRT_RDMA will be: + + - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client + and server will not be built + - M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M, + in this case the NFS/RDMA client and server will be built as modules + - Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client + and server will be built into the kernel + + Therefore, if you have followed the steps above and turned on NFS and RDMA, + the NFS/RDMA client and server will be built. + + Build a new kernel, install it, boot it. + +Check RDMA and NFS Setup +~~~~~~~~~~~~~~~~~~~~~~~~ + + Before configuring the NFS/RDMA software, it is a good idea to test + your new kernel to ensure that the kernel is working correctly. + In particular, it is a good idea to verify that the RDMA stack + is functioning as expected and standard NFS over TCP/IP and/or UDP/IP + is working properly. + + - Check RDMA Setup + + If you built the RDMA components as modules, load them at + this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel + card: + + $ modprobe ib_mthca + $ modprobe ib_ipoib + + If you are using InfiniBand, make sure there is a Subnet Manager (SM) + running on the network. If your IB switch has an embedded SM, you can + use it. Otherwise, you will need to run an SM, such as OpenSM, on one + of your end nodes. + + If an SM is running on your network, you should see the following: + + $ cat /sys/class/infiniband/driverX/ports/1/state + 4: ACTIVE + + where driverX is mthca0, ipath5, ehca3, etc. + + To further test the InfiniBand software stack, use IPoIB (this + assumes you have two IB hosts named host1 and host2): + + host1$ ifconfig ib0 a.b.c.x + host2$ ifconfig ib0 a.b.c.y + host1$ ping a.b.c.y + host2$ ping a.b.c.x + + For other device types, follow the appropriate procedures.
+ + - Check NFS Setup + + For the NFS components enabled above (client and/or server), + test their functionality over standard Ethernet using TCP/IP or UDP/IP. + +NFS/RDMA Setup +~~~~~~~~~~~~~~ + + We recommend that you use two machines, one to act as the client and + one to act as the server. + + One time configuration: + + - On the server system, configure the /etc/exports file and + start the NFS/RDMA server. + + Exports entries with the following formats have been tested: + + /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) + /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) + + The IP address(es) is(are) the client's IPoIB address for an InfiniBand + HCA or the client's iWARP address(es) for an RNIC. + + NOTE: The "insecure" option must be used because the NFS/RDMA client does + not use a reserved port. + + Each time a machine boots: + + - Load and configure the RDMA drivers + + For InfiniBand using a Mellanox adapter: + + $ modprobe ib_mthca + $ modprobe ib_ipoib + $ ifconfig ib0 a.b.c.d + + NOTE: use unique addresses for the client and server + + - Start the NFS server + + If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA transport module: + + $ modprobe svcrdma + + Regardless of how the server was built (module or built-in), start the + server: + + $ /etc/init.d/nfs start + + or + + $ service nfs start + + Instruct the server to listen on the RDMA transport: + + $ echo rdma 20049 > /proc/fs/nfsd/portlist + + - On the client system + + If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA client module: + + $ modprobe xprtrdma.ko + + Regardless of how the client was built (module or built-in), use this + command to mount the NFS/RDMA server: + + $ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt + + To verify that the mount is using RDMA, run "cat /proc/mounts" and check + the "proto" field for the given mount. + + Congratulations!
You're using NFS/RDMA! diff --git a/Documentation/filesystems/nfs/nfs.txt b/Documentation/filesystems/nfs/nfs.txt new file mode 100644 index 000000000000..f50f26ce6cd0 --- /dev/null +++ b/Documentation/filesystems/nfs/nfs.txt @@ -0,0 +1,98 @@ + +The NFS client +============== + +The NFS version 2 protocol was first documented in RFC1094 (March 1989). +Since then two more major releases of NFS have been published, with NFSv3 +being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April +2003). + +The Linux NFS client currently supports all the above published versions, +and work is in progress on adding support for minor version 1 of the NFSv4 +protocol. + +The purpose of this document is to provide information on some of the +upcall interfaces that are used in order to provide the NFS client with +some of the information that it requires in order to fully comply with +the NFS spec. + +The DNS resolver +================ + +NFSv4 allows for one server to refer the NFS client to data that has been +migrated onto another server by means of the special "fs_locations" +attribute. See + http://tools.ietf.org/html/rfc3530#section-6 +and + http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 + +The fs_locations information can take the form of either an ip address and +a path, or a DNS hostname and a path. The latter requires the NFS client to +do a DNS lookup in order to mount the new volume, and hence the need for an +upcall to allow userland to provide this service. + +Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual +/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps: + + (1) The process checks the dns_resolve cache to see if it contains a + valid entry. If so, it returns that entry and exits. 
+ + (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' + (may be changed using the 'nfs.cache_getent' kernel boot parameter) + is run, with two arguments: + - the cache name, "dns_resolve" + - the hostname to resolve + + (3) After looking up the corresponding ip address, the helper script + writes the result into the rpc_pipefs pseudo-file + '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel' + in the following (text) format: + + "<ip address> <hostname> <ttl>\n" + + Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6 + (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format. + <hostname> is identical to the second argument of the helper + script, and <ttl> is the 'time to live' of this cache entry (in + units of seconds). + + Note: If <ip address> is invalid, say the string "0", then a negative + entry is created, which will cause the kernel to treat the hostname + as having no valid DNS translation. + + + + +A basic sample /sbin/nfs_cache_getent +===================================== + +#!/bin/bash +# +ttl=600 +# +cut=/usr/bin/cut +getent=/usr/bin/getent +rpc_pipefs=/var/lib/nfs/rpc_pipefs +# +die() +{ + echo "Usage: $0 cache_name entry_name" + exit 1 +} + +[ $# -lt 2 ] && die +cachename="$1" +cache_path=${rpc_pipefs}/cache/${cachename}/channel + +case "${cachename}" in + dns_resolve) + name="$2" + result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )" + [ -z "${result}" ] && result="0" + ;; + *) + die + ;; +esac +echo "${result} ${name} ${ttl}" >${cache_path} + diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt new file mode 100644 index 000000000000..1bd0d0c05171 --- /dev/null +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -0,0 +1,222 @@ +NFSv4.1 Server Implementation + +Server support for minorversion 1 can be controlled using the +/proc/fs/nfsd/versions control file. The string output returned +by reading this file will contain either "+4.1" or "-4.1" +correspondingly.
+ +Currently, server support for minorversion 1 is disabled by default. +It can be enabled at run time by writing the string "+4.1" to +the /proc/fs/nfsd/versions control file. Note that to write this +control file, the nfsd service must be taken down. Use your user-mode +nfs-utils to set this up; see rpc.nfsd(8) + +(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and +"-4", respectively. Therefore, code meant to work on both new and old +kernels must turn 4.1 on or off *before* turning support for version 4 +on or off; rpc.nfsd does this correctly.) + +The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based +on the latest NFSv4.1 Internet Draft: +http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 + +From the many new features in NFSv4.1 the current implementation +focuses on the mandatory-to-implement NFSv4.1 Sessions, providing +"exactly once" semantics and better control and throttling of the +resources allocated for each client. + +Other NFSv4.1 features, Parallel NFS operations in particular, +are still under development out of tree. +See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design +for more information. + +The current implementation is intended for developers only: while it +does support ordinary file operations on clients we have tested against +(including the linux client), it is incomplete in ways which may limit +features unexpectedly, cause known bugs in rare cases, or cause +interoperability problems with future clients. Known issues: + + - gss support is questionable: currently mounts with kerberos + from a linux client are possible, but we aren't really + conformant with the spec (for example, we don't use kerberos + on the backchannel correctly). + - no trunking support: no clients currently take advantage of + trunking, but this is a mandatory feature, and its use is + recommended to clients in a number of places. (E.g. 
to ensure + timely renewal in case an existing connection's retry timeouts + have gotten too long; see section 8.3 of the draft.) + Therefore, lack of this feature may cause future clients to + fail. + - Incomplete backchannel support: incomplete backchannel gss + support and no support for BACKCHANNEL_CTL mean that + callbacks (hence delegations and layouts) may not be + available and clients confused by the incomplete + implementation may fail. + - Server reboot recovery is unsupported; if the server reboots, + clients may fail. + - We do not support SSV, which provides security for shared + client-server state (thus preventing unauthorized tampering + with locks and opens, for example). It is mandatory for + servers to support this, though no clients use it yet. + - Mandatory operations which we do not support, such as + DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and + TEST_STATEID, are not currently used by clients, but will be + (and the spec recommends their uses in common cases), and + clients should not be expected to know how to recover from the + case where they are not supported. This will eventually cause + interoperability failures. + +In addition, some limitations are inherited from the current NFSv4 +implementation: + + - Incomplete delegation enforcement: if a file is renamed or + unlinked, a client holding a delegation may continue to + indefinitely allow opens of the file under the old name. + +The table below, taken from the NFSv4.1 document, lists +the operations that are mandatory to implement (REQ), optional +(OPT), and NFSv4.0 operations that are required not to implement (MNI) +in minor version 1. The first column indicates the operations that +are not supported yet by the linux server implementation. + +The OPTIONAL features identified and their abbreviations are as follows: + pNFS Parallel NFS + FDELG File Delegations + DDELG Directory Delegations + +The following abbreviations indicate the linux server implementation status. 
+ I Implemented NFSv4.1 operations. + NS Not Supported. + NS* unimplemented optional feature. + P pNFS features implemented out of tree. + PNS pNFS features that are not supported yet (out of tree). + +Operations + + +----------------------+------------+--------------+----------------+ + | Operation | REQ, REC, | Feature | Definition | + | | OPT, or | (REQ, REC, | | + | | MNI | or OPT) | | + +----------------------+------------+--------------+----------------+ + | ACCESS | REQ | | Section 18.1 | +NS | BACKCHANNEL_CTL | REQ | | Section 18.33 | +NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 | + | CLOSE | REQ | | Section 18.2 | + | COMMIT | REQ | | Section 18.3 | + | CREATE | REQ | | Section 18.4 | +I | CREATE_SESSION | REQ | | Section 18.36 | +NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 | + | DELEGRETURN | OPT | FDELG, | Section 18.6 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS | DESTROY_CLIENTID | REQ | | Section 18.50 | +I | DESTROY_SESSION | REQ | | Section 18.37 | +I | EXCHANGE_ID | REQ | | Section 18.35 | +NS | FREE_STATEID | REQ | | Section 18.38 | + | GETATTR | REQ | | Section 18.7 | +P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | +P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | + | GETFH | REQ | | Section 18.8 | +NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | +P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | +P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | +P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | + | LINK | OPT | | Section 18.9 | + | LOCK | REQ | | Section 18.10 | + | LOCKT | REQ | | Section 18.11 | + | LOCKU | REQ | | Section 18.12 | + | LOOKUP | REQ | | Section 18.13 | + | LOOKUPP | REQ | | Section 18.14 | + | NVERIFY | REQ | | Section 18.15 | + | OPEN | REQ | | Section 18.16 | +NS*| OPENATTR | OPT | | Section 18.17 | + | OPEN_CONFIRM | MNI | | N/A | + | OPEN_DOWNGRADE | REQ | | Section 18.18 | + | PUTFH | REQ | | Section 18.19 | + | PUTPUBFH | REQ | | Section 18.20 | + | PUTROOTFH | REQ | | 
Section 18.21 | + | READ | REQ | | Section 18.22 | + | READDIR | REQ | | Section 18.23 | + | READLINK | OPT | | Section 18.24 | +NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RELEASE_LOCKOWNER | MNI | | N/A | + | REMOVE | REQ | | Section 18.25 | + | RENAME | REQ | | Section 18.26 | + | RENEW | MNI | | N/A | + | RESTOREFH | REQ | | Section 18.27 | + | SAVEFH | REQ | | Section 18.28 | + | SECINFO | REQ | | Section 18.29 | +NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, | + | | | layout (REQ) | Section 13.12 | +I | SEQUENCE | REQ | | Section 18.46 | + | SETATTR | REQ | | Section 18.30 | + | SETCLIENTID | MNI | | N/A | + | SETCLIENTID_CONFIRM | MNI | | N/A | +NS | SET_SSV | REQ | | Section 18.47 | +NS | TEST_STATEID | REQ | | Section 18.48 | + | VERIFY | REQ | | Section 18.31 | +NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 | + | WRITE | REQ | | Section 18.32 | + +Callback Operations + + +-------------------------+-----------+-------------+---------------+ + | Operation | REQ, REC, | Feature | Definition | + | | OPT, or | (REQ, REC, | | + | | MNI | or OPT) | | + +-------------------------+-----------+-------------+---------------+ + | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | +P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | +NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | +P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | +NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | +NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | + | CB_RECALL | OPT | FDELG, | Section 20.2 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS | CB_RECALL_SLOT | REQ | | Section 20.8 | +NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 | + | | | (REQ) | | +I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | + | | | DDELG, pNFS | | + | | | (REQ) 
| | + +-------------------------+-----------+-------------+---------------+ + +Implementation notes: + +DELEGPURGE: +* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or + CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that + persist across client reboots). Thus we need not implement this for + now. + +EXCHANGE_ID: +* only SP4_NONE state protection supported +* implementation ids are ignored + +CREATE_SESSION: +* backchannel attributes are ignored +* backchannel security parameters are ignored + +SEQUENCE: +* no support for dynamic slot table renegotiation (optional) + +nfsv4.1 COMPOUND rules: +The following cases aren't supported yet: +* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION, + DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. +* DESTROY_SESSION MUST be the final operation in the COMPOUND request. + +Nonstandard compound limitations: +* No support for a sessions fore channel RPC compound that requires both a + ca_maxrequestsize request and a ca_maxresponsesize reply, so we may + fail to live up to the promise we made in CREATE_SESSION fore channel + negotiation. +* No more than one IO operation (read, write, readdir) allowed per + compound. diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt new file mode 100644 index 000000000000..3ba0b945aaf8 --- /dev/null +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -0,0 +1,270 @@ +Mounting the root filesystem via NFS (nfsroot) +=============================================== + +Written 1996 by Gero Kuhlmann +Updated 1997 by Martin Mares +Updated 2006 by Nico Schottelius +Updated 2006 by Horms + + + +In order to use a diskless system, such as an X-terminal or printer server +for example, it is necessary for the root filesystem to be present on a +non-disk device. 
This may be an initramfs (see Documentation/filesystems/ +ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/initrd.txt) or a +filesystem mounted via NFS. The following text describes how to use NFS +for the root filesystem. For the rest of this text 'client' means the +diskless system, and 'server' means the NFS server. + + + + +1.) Enabling nfsroot capabilities + ----------------------------- + +In order to use nfsroot, NFS client support needs to be selected as +built-in during configuration. Once this has been selected, the nfsroot +option will become available, which should also be selected. + +In the networking options, kernel level autoconfiguration can be selected, +along with the types of autoconfiguration to support. Selecting all of +DHCP, BOOTP and RARP is safe. + + + + +2.) Kernel command line + ------------------- + +When the kernel has been loaded by a boot loader (see below) it needs to be +told what root fs device to use. And in the case of nfsroot, where to find +both the server and the name of the directory on the server to mount as root. +This can be established using the following kernel command line parameters: + + +root=/dev/nfs + + This is necessary to enable the pseudo-NFS-device. Note that it's not a + real device but just a synonym to tell the kernel to use NFS instead of + a real device. + + +nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] + + If the `nfsroot' parameter is NOT given on the command line, + the default "/tftpboot/%s" will be used. + + <server-ip> Specifies the IP address of the NFS server. + The default address is determined by the `ip' parameter + (see below). This parameter allows the use of different + servers for IP autoconfiguration and NFS. + + <root-dir> Name of the directory on the server to mount as root. + If there is a "%s" token in the string, it will be + replaced by the ASCII-representation of the client's + IP address. + + <nfs-options> Standard NFS options. All options are separated by commas.
+ The following defaults are used: + port = as given by server portmap daemon + rsize = 4096 + wsize = 4096 + timeo = 7 + retrans = 3 + acregmin = 3 + acregmax = 60 + acdirmin = 30 + acdirmax = 60 + flags = hard, nointr, noposix, cto, ac + + +ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> + + This parameter tells the kernel how to configure IP addresses of devices + and also how to set up the IP routing table. It was originally called + `nfsaddrs', but now the boot-time IP configuration works independently of + NFS, so it was renamed to `ip' and the old name remained as an alias for + compatibility reasons. + + If this parameter is missing from the kernel command line, all fields are + assumed to be empty, and the defaults mentioned below apply. In general + this means that the kernel tries to configure everything using + autoconfiguration. + + The <autoconf> parameter can appear alone as the value to the `ip' + parameter (without all the ':' characters before). If the value is + "ip=off" or "ip=none", no autoconfiguration will take place, otherwise + autoconfiguration will take place. The most common way to use this + is "ip=dhcp". + + <client-ip> IP address of the client. + + Default: Determined using autoconfiguration. + + <server-ip> IP address of the NFS server. If RARP is used to determine + the client address and this parameter is NOT empty only + replies from the specified server are accepted. + + Only required for NFS root. That is autoconfiguration + will not be triggered if it is missing and NFS root is not + in operation. + + Default: Determined using autoconfiguration. + The address of the autoconfiguration server is used. + + <gw-ip> IP address of a gateway if the server is on a different subnet. + + Default: Determined using autoconfiguration. + + <netmask> Netmask for local network interface. If unspecified + the netmask is derived from the client IP address assuming + classful addressing. + + Default: Determined using autoconfiguration. + + <hostname> Name of the client.
May be supplied by autoconfiguration, + but its absence will not trigger autoconfiguration. + + Default: Client IP address is used in ASCII notation. + + <device> Name of network device to use. + + Default: If the host only has one device, it is used. + Otherwise the device is determined using + autoconfiguration. This is done by sending + autoconfiguration requests out of all devices, + and using the device that received the first reply. + + <autoconf> Method to use for autoconfiguration. In the case of options + which specify multiple autoconfiguration protocols, + requests are sent using all protocols, and the first one + to reply is used. + + Only autoconfiguration protocols that have been compiled + into the kernel will be used, regardless of the value of + this option. + + off or none: don't use autoconfiguration + (do static IP assignment instead) + on or any: use any protocol available in the kernel + (default) + dhcp: use DHCP + bootp: use BOOTP + rarp: use RARP + both: use both BOOTP and RARP but not DHCP + (old option kept for backwards compatibility) + + Default: any + + + + +3.) Boot Loader + ---------- + +To get the kernel into memory different approaches can be used. +They depend on various facilities being available: + + +3.1) Booting from a floppy using syslinux + + When building kernels, an easy way to create a boot floppy that uses + syslinux is to use the zdisk or bzdisk make targets which use zimage + and bzimage images respectively. Both targets accept the + FDARGS parameter which can be used to set the kernel command line. + + e.g. + make bzdisk FDARGS="root=/dev/nfs" + + Note that the user running this command will need to have + access to the floppy drive device, /dev/fd0 + + For more information on syslinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + + N.B: Previously it was possible to write a kernel directly to + a floppy using dd, configure the boot device using rdev, and + boot using the resulting floppy.
Linux no longer supports this + method of booting. + +3.2) Booting from a cdrom using isolinux + + When building kernels, an easy way to create a bootable cdrom that + uses isolinux is to use the isoimage target which uses a bzimage + image. Like zdisk and bzdisk, this target accepts the FDARGS + parameter which can be used to set the kernel command line. + + e.g. + make isoimage FDARGS="root=/dev/nfs" + + The resulting iso image will be arch/<ARCH>/boot/image.iso + This can be written to a cdrom using a variety of tools including + cdrecord. + + e.g. + cdrecord dev=ATAPI:1,0,0 arch/i386/boot/image.iso + + For more information on isolinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + +3.2) Using LILO + When using LILO all the necessary command line parameters may be + specified using the 'append=' directive in the LILO configuration + file. + + However, to use the 'root=' directive you also need to create + a dummy root device, which may be removed after LILO is run. + + mknod /dev/boot255 c 0 255 + + For information on configuring LILO, please refer to its documentation. + +3.3) Using GRUB + When using GRUB, kernel parameters are simply appended after the kernel + specification: kernel <kernel> <parameters> + +3.4) Using loadlin + loadlin may be used to boot Linux from a DOS command prompt without + requiring a local hard disk to mount as root. This has not been + thoroughly tested by the authors of this document, but in general + it should be possible to configure the kernel command line similarly + to the configuration of LILO. + + Please refer to the loadlin documentation for further information. + +3.5) Using a boot ROM + This is probably the most elegant way of booting a diskless client. + With a boot ROM the kernel is loaded using the TFTP protocol. The + authors of this document are not aware of any commercial boot + ROMs that support booting Linux over the network.
However, there + are two free implementations of a boot ROM, netboot-nfs and + etherboot, both of which are available on sunsite.unc.edu, and both + of which contain everything you need to boot a diskless Linux client. + +3.6) Using pxelinux + Pxelinux may be used to boot linux using the PXE boot loader + which is present on many modern network cards. + + When using pxelinux, the kernel image is specified using + "kernel <relative-path-below /tftpboot>". The nfsroot parameters + are passed to the kernel by adding them to the "append" line. + It is common to use serial console in conjunction with pxelinux, + see Documentation/serial-console.txt for more information. + + For more information on pxelinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + + + + +4.) Credits + ------- + + The nfsroot code in the kernel and the RARP support have been written + by Gero Kuhlmann . + + The rest of the IP layer autoconfiguration code has been written + by Martin Mares . + + In order to write the initial version of nfsroot I would like to thank + Jens-Uwe Mager for his help. diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt deleted file mode 100644 index 1bd0d0c05171..000000000000 --- a/Documentation/filesystems/nfs41-server.txt +++ /dev/null @@ -1,222 +0,0 @@ -NFSv4.1 Server Implementation - -Server support for minorversion 1 can be controlled using the -/proc/fs/nfsd/versions control file. The string output returned -by reading this file will contain either "+4.1" or "-4.1" -correspondingly. - -Currently, server support for minorversion 1 is disabled by default. -It can be enabled at run time by writing the string "+4.1" to -the /proc/fs/nfsd/versions control file. Note that to write this -control file, the nfsd service must be taken down. Use your user-mode -nfs-utils to set this up; see rpc.nfsd(8) - -(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and -"-4", respectively.
Therefore, code meant to work on both new and old -kernels must turn 4.1 on or off *before* turning support for version 4 -on or off; rpc.nfsd does this correctly.) - -The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based -on the latest NFSv4.1 Internet Draft: -http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 - -From the many new features in NFSv4.1 the current implementation -focuses on the mandatory-to-implement NFSv4.1 Sessions, providing -"exactly once" semantics and better control and throttling of the -resources allocated for each client. - -Other NFSv4.1 features, Parallel NFS operations in particular, -are still under development out of tree. -See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design -for more information. - -The current implementation is intended for developers only: while it -does support ordinary file operations on clients we have tested against -(including the linux client), it is incomplete in ways which may limit -features unexpectedly, cause known bugs in rare cases, or cause -interoperability problems with future clients. Known issues: - - - gss support is questionable: currently mounts with kerberos - from a linux client are possible, but we aren't really - conformant with the spec (for example, we don't use kerberos - on the backchannel correctly). - - no trunking support: no clients currently take advantage of - trunking, but this is a mandatory feature, and its use is - recommended to clients in a number of places. (E.g. to ensure - timely renewal in case an existing connection's retry timeouts - have gotten too long; see section 8.3 of the draft.) - Therefore, lack of this feature may cause future clients to - fail. - - Incomplete backchannel support: incomplete backchannel gss - support and no support for BACKCHANNEL_CTL mean that - callbacks (hence delegations and layouts) may not be - available and clients confused by the incomplete - implementation may fail. 
- - Server reboot recovery is unsupported; if the server reboots, - clients may fail. - - We do not support SSV, which provides security for shared - client-server state (thus preventing unauthorized tampering - with locks and opens, for example). It is mandatory for - servers to support this, though no clients use it yet. - - Mandatory operations which we do not support, such as - DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and - TEST_STATEID, are not currently used by clients, but will be - (and the spec recommends their uses in common cases), and - clients should not be expected to know how to recover from the - case where they are not supported. This will eventually cause - interoperability failures. - -In addition, some limitations are inherited from the current NFSv4 -implementation: - - - Incomplete delegation enforcement: if a file is renamed or - unlinked, a client holding a delegation may continue to - indefinitely allow opens of the file under the old name. - -The table below, taken from the NFSv4.1 document, lists -the operations that are mandatory to implement (REQ), optional -(OPT), and NFSv4.0 operations that are required not to implement (MNI) -in minor version 1. The first column indicates the operations that -are not supported yet by the linux server implementation. - -The OPTIONAL features identified and their abbreviations are as follows: - pNFS Parallel NFS - FDELG File Delegations - DDELG Directory Delegations - -The following abbreviations indicate the linux server implementation status. - I Implemented NFSv4.1 operations. - NS Not Supported. - NS* unimplemented optional feature. - P pNFS features implemented out of tree. - PNS pNFS features that are not supported yet (out of tree). 
- -Operations - - +----------------------+------------+--------------+----------------+ - | Operation | REQ, REC, | Feature | Definition | - | | OPT, or | (REQ, REC, | | - | | MNI | or OPT) | | - +----------------------+------------+--------------+----------------+ - | ACCESS | REQ | | Section 18.1 | -NS | BACKCHANNEL_CTL | REQ | | Section 18.33 | -NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 | - | CLOSE | REQ | | Section 18.2 | - | COMMIT | REQ | | Section 18.3 | - | CREATE | REQ | | Section 18.4 | -I | CREATE_SESSION | REQ | | Section 18.36 | -NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 | - | DELEGRETURN | OPT | FDELG, | Section 18.6 | - | | | DDELG, pNFS | | - | | | (REQ) | | -NS | DESTROY_CLIENTID | REQ | | Section 18.50 | -I | DESTROY_SESSION | REQ | | Section 18.37 | -I | EXCHANGE_ID | REQ | | Section 18.35 | -NS | FREE_STATEID | REQ | | Section 18.38 | - | GETATTR | REQ | | Section 18.7 | -P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | -P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | - | GETFH | REQ | | Section 18.8 | -NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | -P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | -P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | -P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | - | LINK | OPT | | Section 18.9 | - | LOCK | REQ | | Section 18.10 | - | LOCKT | REQ | | Section 18.11 | - | LOCKU | REQ | | Section 18.12 | - | LOOKUP | REQ | | Section 18.13 | - | LOOKUPP | REQ | | Section 18.14 | - | NVERIFY | REQ | | Section 18.15 | - | OPEN | REQ | | Section 18.16 | -NS*| OPENATTR | OPT | | Section 18.17 | - | OPEN_CONFIRM | MNI | | N/A | - | OPEN_DOWNGRADE | REQ | | Section 18.18 | - | PUTFH | REQ | | Section 18.19 | - | PUTPUBFH | REQ | | Section 18.20 | - | PUTROOTFH | REQ | | Section 18.21 | - | READ | REQ | | Section 18.22 | - | READDIR | REQ | | Section 18.23 | - | READLINK | OPT | | Section 18.24 | -NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | - | RELEASE_LOCKOWNER 
| MNI | | N/A | - | REMOVE | REQ | | Section 18.25 | - | RENAME | REQ | | Section 18.26 | - | RENEW | MNI | | N/A | - | RESTOREFH | REQ | | Section 18.27 | - | SAVEFH | REQ | | Section 18.28 | - | SECINFO | REQ | | Section 18.29 | -NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, | - | | | layout (REQ) | Section 13.12 | -I | SEQUENCE | REQ | | Section 18.46 | - | SETATTR | REQ | | Section 18.30 | - | SETCLIENTID | MNI | | N/A | - | SETCLIENTID_CONFIRM | MNI | | N/A | -NS | SET_SSV | REQ | | Section 18.47 | -NS | TEST_STATEID | REQ | | Section 18.48 | - | VERIFY | REQ | | Section 18.31 | -NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 | - | WRITE | REQ | | Section 18.32 | - -Callback Operations - - +-------------------------+-----------+-------------+---------------+ - | Operation | REQ, REC, | Feature | Definition | - | | OPT, or | (REQ, REC, | | - | | MNI | or OPT) | | - +-------------------------+-----------+-------------+---------------+ - | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | -P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | -NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | -P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | -NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | -NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | - | CB_RECALL | OPT | FDELG, | Section 20.2 | - | | | DDELG, pNFS | | - | | | (REQ) | | -NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 | - | | | DDELG, pNFS | | - | | | (REQ) | | -NS | CB_RECALL_SLOT | REQ | | Section 20.8 | -NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 | - | | | (REQ) | | -I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 | - | | | DDELG, pNFS | | - | | | (REQ) | | -NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | - | | | DDELG, pNFS | | - | | | (REQ) | | - +-------------------------+-----------+-------------+---------------+ - -Implementation notes: - -DELEGPURGE: -* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or - 
CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that - persist across client reboots). Thus we need not implement this for - now. - -EXCHANGE_ID: -* only SP4_NONE state protection supported -* implementation ids are ignored - -CREATE_SESSION: -* backchannel attributes are ignored -* backchannel security parameters are ignored - -SEQUENCE: -* no support for dynamic slot table renegotiation (optional) - -nfsv4.1 COMPOUND rules: -The following cases aren't supported yet: -* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION, - DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. -* DESTROY_SESSION MUST be the final operation in the COMPOUND request. - -Nonstandard compound limitations: -* No support for a sessions fore channel RPC compound that requires both a - ca_maxrequestsize request and a ca_maxresponsesize reply, so we may - fail to live up to the promise we made in CREATE_SESSION fore channel - negotiation. -* No more than one IO operation (read, write, readdir) allowed per - compound. diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt deleted file mode 100644 index 3ba0b945aaf8..000000000000 --- a/Documentation/filesystems/nfsroot.txt +++ /dev/null @@ -1,270 +0,0 @@ -Mounting the root filesystem via NFS (nfsroot) -=============================================== - -Written 1996 by Gero Kuhlmann -Updated 1997 by Martin Mares -Updated 2006 by Nico Schottelius -Updated 2006 by Horms - - - -In order to use a diskless system, such as an X-terminal or printer server -for example, it is necessary for the root filesystem to be present on a -non-disk device. This may be an initramfs (see Documentation/filesystems/ -ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/initrd.txt) or a -filesystem mounted via NFS. The following text describes on how to use NFS -for the root filesystem. For the rest of this text 'client' means the -diskless system, and 'server' means the NFS server. - - - - -1.) 
Enabling nfsroot capabilities - ----------------------------- - -In order to use nfsroot, NFS client support needs to be selected as -built-in during configuration. Once this has been selected, the nfsroot -option will become available, which should also be selected. - -In the networking options, kernel level autoconfiguration can be selected, -along with the types of autoconfiguration to support. Selecting all of -DHCP, BOOTP and RARP is safe. - - - - -2.) Kernel command line - ------------------- - -When the kernel has been loaded by a boot loader (see below) it needs to be -told what root fs device to use. And in the case of nfsroot, where to find -both the server and the name of the directory on the server to mount as root. -This can be established using the following kernel command line parameters: - - -root=/dev/nfs - - This is necessary to enable the pseudo-NFS-device. Note that it's not a - real device but just a synonym to tell the kernel to use NFS instead of - a real device. - - -nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] - - If the `nfsroot' parameter is NOT given on the command line, - the default "/tftpboot/%s" will be used. - - <server-ip> Specifies the IP address of the NFS server. - The default address is determined by the `ip' parameter - (see below). This parameter allows the use of different - servers for IP autoconfiguration and NFS. - - <root-dir> Name of the directory on the server to mount as root. - If there is a "%s" token in the string, it will be - replaced by the ASCII-representation of the client's - IP address. - - <nfs-options> Standard NFS options. All options are separated by commas. - The following defaults are used: - port = as given by server portmap daemon - rsize = 4096 - wsize = 4096 - timeo = 7 - retrans = 3 - acregmin = 3 - acregmax = 60 - acdirmin = 30 - acdirmax = 60 - flags = hard, nointr, noposix, cto, ac - - -ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> - - This parameter tells the kernel how to configure IP addresses of devices - and also how to set up the IP routing table.
It was originally called - `nfsaddrs', but now the boot-time IP configuration works independently of - NFS, so it was renamed to `ip' and the old name remained as an alias for - compatibility reasons. - - If this parameter is missing from the kernel command line, all fields are - assumed to be empty, and the defaults mentioned below apply. In general - this means that the kernel tries to configure everything using - autoconfiguration. - - The <autoconf> parameter can appear alone as the value to the `ip' - parameter (without all the ':' characters before). If the value is - "ip=off" or "ip=none", no autoconfiguration will take place, otherwise - autoconfiguration will take place. The most common way to use this - is "ip=dhcp". - - <client-ip> IP address of the client. - - Default: Determined using autoconfiguration. - - <server-ip> IP address of the NFS server. If RARP is used to determine - the client address and this parameter is NOT empty only - replies from the specified server are accepted. - - Only required for NFS root. That is autoconfiguration - will not be triggered if it is missing and NFS root is not - in operation. - - Default: Determined using autoconfiguration. - The address of the autoconfiguration server is used. - - <gw-ip> IP address of a gateway if the server is on a different subnet. - - Default: Determined using autoconfiguration. - - <netmask> Netmask for local network interface. If unspecified - the netmask is derived from the client IP address assuming - classful addressing. - - Default: Determined using autoconfiguration. - - <hostname> Name of the client. May be supplied by autoconfiguration, - but its absence will not trigger autoconfiguration. - - Default: Client IP address is used in ASCII notation. - - <device> Name of network device to use. - - Default: If the host only has one device, it is used. - Otherwise the device is determined using - autoconfiguration. This is done by sending - autoconfiguration requests out of all devices, - and using the device that received the first reply.
- - <autoconf> Method to use for autoconfiguration. In the case of options - which specify multiple autoconfiguration protocols, - requests are sent using all protocols, and the first one - to reply is used. - - Only autoconfiguration protocols that have been compiled - into the kernel will be used, regardless of the value of - this option. - - off or none: don't use autoconfiguration - (do static IP assignment instead) - on or any: use any protocol available in the kernel - (default) - dhcp: use DHCP - bootp: use BOOTP - rarp: use RARP - both: use both BOOTP and RARP but not DHCP - (old option kept for backwards compatibility) - - Default: any - - - - -3.) Boot Loader - ---------- - -To get the kernel into memory different approaches can be used. -They depend on various facilities being available: - - -3.1) Booting from a floppy using syslinux - - When building kernels, an easy way to create a boot floppy that uses - syslinux is to use the zdisk or bzdisk make targets which use zimage - and bzimage images respectively. Both targets accept the - FDARGS parameter which can be used to set the kernel command line. - - e.g. - make bzdisk FDARGS="root=/dev/nfs" - - Note that the user running this command will need to have - access to the floppy drive device, /dev/fd0 - - For more information on syslinux, including how to create bootdisks - for prebuilt kernels, see http://syslinux.zytor.com/ - - N.B: Previously it was possible to write a kernel directly to - a floppy using dd, configure the boot device using rdev, and - boot using the resulting floppy. Linux no longer supports this - method of booting. - -3.2) Booting from a cdrom using isolinux - - When building kernels, an easy way to create a bootable cdrom that - uses isolinux is to use the isoimage target which uses a bzimage - image. Like zdisk and bzdisk, this target accepts the FDARGS - parameter which can be used to set the kernel command line. - - e.g.
- make isoimage FDARGS="root=/dev/nfs" - - The resulting iso image will be arch/<ARCH>/boot/image.iso - This can be written to a cdrom using a variety of tools including - cdrecord. - - e.g. - cdrecord dev=ATAPI:1,0,0 arch/i386/boot/image.iso - - For more information on isolinux, including how to create bootdisks - for prebuilt kernels, see http://syslinux.zytor.com/ - -3.2) Using LILO - When using LILO all the necessary command line parameters may be - specified using the 'append=' directive in the LILO configuration - file. - - However, to use the 'root=' directive you also need to create - a dummy root device, which may be removed after LILO is run. - - mknod /dev/boot255 c 0 255 - - For information on configuring LILO, please refer to its documentation. - -3.3) Using GRUB - When using GRUB, kernel parameter are simply appended after the kernel - specification: kernel <kernel> <parameters> - -3.4) Using loadlin - loadlin may be used to boot Linux from a DOS command prompt without - requiring a local hard disk to mount as root. This has not been - thoroughly tested by the authors of this document, but in general - it should be possible configure the kernel command line similarly - to the configuration of LILO. - - Please refer to the loadlin documentation for further information. - -3.5) Using a boot ROM - This is probably the most elegant way of booting a diskless client. - With a boot ROM the kernel is loaded using the TFTP protocol. The - authors of this document are not aware of any no commercial boot - ROMs that support booting Linux over the network. However, there - are two free implementations of a boot ROM, netboot-nfs and - etherboot, both of which are available on sunsite.unc.edu, and both - of which contain everything you need to boot a diskless Linux client. - -3.6) Using pxelinux - Pxelinux may be used to boot linux using the PXE boot loader - which is present on many modern network cards. - - When using pxelinux, the kernel image is specified using - "kernel <relative-path-below /tftpboot>".
The nfsroot parameters - are passed to the kernel by adding them to the "append" line. - It is common to use serial console in conjunction with pxeliunx, - see Documentation/serial-console.txt for more information. - - For more information on isolinux, including how to create bootdisks - for prebuilt kernels, see http://syslinux.zytor.com/ - - - - -4.) Credits - ------- - - The nfsroot code in the kernel and the RARP support have been written - by Gero Kuhlmann . - - The rest of the IP layer autoconfiguration code has been written - by Martin Mares . - - In order to write the initial version of nfsroot I would like to thank - Jens-Uwe Mager for his help. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 92b888d540a6..a7e9746ee7ea 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now. New super_block field "struct export_operations *s_export_op" for explicit support for exporting, e.g. via NFS. The structure is fully documented at its declaration in include/linux/fs.h, and in -Documentation/filesystems/Exporting. +Documentation/filesystems/nfs/Exporting. Briefly it allows for the definition of decode_fh and encode_fh operations to encode and decode filehandles, and allows the filesystem to use diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9107b387e91f..dab0f04b4264 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1017,7 +1017,7 @@ and is between 256 and 4096 characters. It is defined in the file No delay ip= [IP_PNP] - See Documentation/filesystems/nfsroot.txt. + See Documentation/filesystems/nfs/nfsroot.txt. ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards See comment before ip2_setup() in @@ -1538,10 +1538,10 @@ and is between 256 and 4096 characters. It is defined in the file going to be removed in 2.6.29. 
nfsaddrs= [NFS] - See Documentation/filesystems/nfsroot.txt. + See Documentation/filesystems/nfs/nfsroot.txt. nfsroot= [NFS] nfs root filesystem for disk-less boxes. - See Documentation/filesystems/nfsroot.txt. + See Documentation/filesystems/nfs/nfsroot.txt. nfs.callback_tcpport= [NFS] set the TCP port on which the NFSv4 callback diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 75949d6a5f1b..6177f7cca16a 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/Exporting + * See Documentation/filesystems/nfs/Exporting * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 197c7db583c7..e9e175949a63 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -6,7 +6,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/Exporting. + * take a look at Documentation/filesystems/nfs/Exporting. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/isofs/export.c b/fs/isofs/export.c index e81a30593ba9..ed752cb38474 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -9,7 +9,7 @@ * * The following files are helpful: * - * Documentation/filesystems/Exporting + * Documentation/filesystems/nfs/Exporting * fs/exportfs/expfs.c. */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 2a77bc25d5af..59e5673b4597 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -90,7 +90,7 @@ config ROOT_NFS If you want your system to mount its root file system via NFS, choose Y here. This is common practice for managing systems without local permanent storage. For details, read - <file:Documentation/filesystems/nfsroot.txt>. + <file:Documentation/filesystems/nfs/nfsroot.txt>. Most people say N here.
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 27e772cefb6a..dc12f416a49f 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -97,7 +97,7 @@ struct fid { * @get_name: find the name for a given inode in a given directory * @get_parent: find the parent of a given directory * - * See Documentation/filesystems/Exporting for details on how to use + * See Documentation/filesystems/nfs/Exporting for details on how to use * this interface correctly. * * encode_fh: diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 70491d9035eb..0c94a1ac2946 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -166,7 +166,7 @@ config IP_PNP_DHCP If unsure, say Y. Note that if you want to use DHCP, a DHCP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_BOOTP bool "IP: BOOTP support" @@ -181,7 +181,7 @@ config IP_PNP_BOOTP does BOOTP itself, providing all necessary information on the kernel command line, you can say N here. If unsure, say Y. Note that if you want to use BOOTP, a BOOTP server must be operating on your network. - Read <file:Documentation/filesystems/nfsroot.txt> for details. + Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_RARP bool "IP: RARP support" @@ -194,7 +194,7 @@ config IP_PNP_RARP older protocol which is being obsoleted by BOOTP and DHCP), say Y here. Note that if you want to use RARP, a RARP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. # not yet ready.. # bool ' IP: ARP support' CONFIG_IP_PNP_ARP diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f8d04c256454..7dcbf4706099 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1447,7 +1447,7 @@ late_initcall(ip_auto_config); /* * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel - * command line parameter. See Documentation/filesystems/nfsroot.txt. + * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
*/ static int __init ic_proto_name(char *name) { -- cgit v1.2.3 From c017b4be3e84176cab10eca5e6c4faeb8cfc6f3e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2009 13:33:09 +0000 Subject: kmemleak: Simplify the kmemleak_scan_area() function prototype This function was taking non-necessary arguments which can be determined by kmemleak. The patch also modifies the calling sites. Signed-off-by: Catalin Marinas Cc: Pekka Enberg Cc: Christoph Lameter Cc: Rusty Russell --- include/linux/kmemleak.h | 6 ++---- kernel/module.c | 7 ++----- mm/kmemleak.c | 49 +++++++++++++++++++++--------------------------- mm/slab.c | 4 ++-- 4 files changed, 27 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 3c7497d46ee9..99d9a6766f7e 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -32,8 +32,7 @@ extern void kmemleak_padding(const void *ptr, unsigned long offset, size_t size) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; -extern void kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp) __ref; +extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, @@ -84,8 +83,7 @@ static inline void kmemleak_not_leak(const void *ptr) static inline void kmemleak_ignore(const void *ptr) { } -static inline void kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp) +static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { } static inline void kmemleak_erase(void **ptr) diff --git a/kernel/module.c b/kernel/module.c index 8b7d8805819d..1eb952097077 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2043,9 +2043,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 
unsigned int i; /* only scan the sections containing data */ - kmemleak_scan_area(mod->module_core, (unsigned long)mod - - (unsigned long)mod->module_core, - sizeof(struct module), GFP_KERNEL); + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < hdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -2054,8 +2052,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) continue; - kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - - (unsigned long)mod->module_core, + kmemleak_scan_area((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size, GFP_KERNEL); } } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8bf765c4f58d..96106358e042 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -119,8 +119,8 @@ /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; - unsigned long offset; - size_t length; + unsigned long start; + size_t size; }; #define KMEMLEAK_GREY 0 @@ -241,8 +241,6 @@ struct early_log { const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ int min_count; /* minimum reference count */ - unsigned long offset; /* scan area offset */ - size_t length; /* scan area length */ unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -720,14 +718,13 @@ static void make_black_object(unsigned long ptr) * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. 
*/ -static void add_scan_area(unsigned long ptr, unsigned long offset, - size_t length, gfp_t gfp) +static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct kmemleak_scan_area *area; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", ptr); @@ -741,7 +738,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } spin_lock_irqsave(&object->lock, flags); - if (offset + length > object->size) { + if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -749,8 +746,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } INIT_HLIST_NODE(&area->node); - area->offset = offset; - area->length = length; + area->start = ptr; + area->size = size; hlist_add_head(&area->node, &object->area_list); out_unlock: @@ -786,7 +783,7 @@ static void object_no_scan(unsigned long ptr) * processed later once kmemleak is fully initialized. 
*/ static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) + int min_count) { unsigned long flags; struct early_log *log; @@ -808,8 +805,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, log->ptr = ptr; log->size = size; log->min_count = min_count; - log->offset = offset; - log->length = length; if (op_type == KMEMLEAK_ALLOC) log->trace_len = __save_stack_trace(log->trace); crt_early_log++; @@ -858,7 +853,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); + log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -873,7 +868,7 @@ void __ref kmemleak_free(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -888,7 +883,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); + log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -903,7 +898,7 @@ void __ref kmemleak_not_leak(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -919,22 +914,21 @@ void __ref kmemleak_ignore(const void *ptr) if 
(atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - add_scan_area((unsigned long)ptr, offset, length, gfp); + add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); + log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -948,7 +942,7 @@ void __ref kmemleak_no_scan(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1075,9 +1069,9 @@ static void scan_object(struct kmemleak_object *object) } } else hlist_for_each_entry(area, elem, &object->area_list, node) - scan_block((void *)(object->pointer + area->offset), - (void *)(object->pointer + area->offset - + area->length), object, 0); + scan_block((void *)area->start, + (void *)(area->start + area->size), + object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -1642,8 +1636,7 @@ void __init kmemleak_init(void) kmemleak_ignore(log->ptr); break; case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->offset, log->length, - GFP_KERNEL); + kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); break; case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); diff --git a/mm/slab.c b/mm/slab.c index 646db3085193..d2713a944ebd 
100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2584,8 +2584,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, * kmemleak does not treat the ->s_mem pointer as a reference * to the object. Otherwise we will not report the leak. */ - kmemleak_scan_area(slabp, offsetof(struct slab, list), - sizeof(struct list_head), local_flags); + kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + local_flags); if (!slabp) return NULL; } else { -- cgit v1.2.3 From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro.
Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- arch/blackfin/mach-common/entry.S | 4 ++-- arch/cris/arch-v10/kernel/entry.S | 2 +- arch/cris/arch-v32/mm/mmu.S | 2 +- arch/ia64/include/asm/percpu.h | 4 ++-- arch/ia64/kernel/ia64_ksyms.c | 4 ++-- arch/ia64/mm/discontig.c | 2 +- arch/microblaze/include/asm/entry.h | 2 +- arch/parisc/lib/fixup.S | 8 +++---- arch/powerpc/platforms/pseries/hvCall.S | 2 +- arch/sparc/kernel/nmi.c | 6 +++--- arch/sparc/kernel/rtrap_64.S | 8 +++---- arch/x86/include/asm/percpu.h | 37 +++++++++++++++------------------ arch/x86/include/asm/system.h | 8 +++---- arch/x86/kernel/apic/nmi.c | 6 +++--- arch/x86/kernel/head_32.S | 6 +++--- arch/x86/kernel/vmlinux.lds.S | 4 ++-- arch/x86/xen/xen-asm_32.S | 4 ++-- include/asm-generic/percpu.h | 12 +++++------ include/linux/percpu-defs.h | 18 ++++++---------- include/linux/percpu.h | 5 ++--- include/linux/vmstat.h | 8 +++---- kernel/rcutorture.c | 8 +++---- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_functions_graph.c | 4 ++-- 24 files changed, 80 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 1e7cac23e25f..a3ea7e9fe43b 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -835,8 +835,8 @@ ENDPROC(_resume) ENTRY(_ret_from_exception) #ifdef CONFIG_IPIPE - p2.l = _per_cpu__ipipe_percpu_domain; - p2.h = _per_cpu__ipipe_percpu_domain; + p2.l = _ipipe_percpu_domain; + p2.h = _ipipe_percpu_domain; r0.l = _ipipe_root; r0.h = _ipipe_root; r2 = [p2]; diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index 2c18d08cd913..c52bef39e250 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -358,7 +358,7 @@ mmu_bus_fault: 1: btstq 12, $r1 ; Refill? 
bpl 2f lsrq 24, $r1 ; Get PGD index (bit 24-31) - move.d [per_cpu__current_pgd], $r0 ; PGD for the current process + move.d [current_pgd], $r0 ; PGD for the current process move.d [$r0+$r1.d], $r0 ; Get PMD beq 2f nop diff --git a/arch/cris/arch-v32/mm/mmu.S b/arch/cris/arch-v32/mm/mmu.S index 2238d154bde3..f125d912e140 100644 --- a/arch/cris/arch-v32/mm/mmu.S +++ b/arch/cris/arch-v32/mm/mmu.S @@ -115,7 +115,7 @@ #ifdef CONFIG_SMP move $s7, $acr ; PGD #else - move.d per_cpu__current_pgd, $acr ; PGD + move.d current_pgd, $acr ; PGD #endif ; Look up PMD in PGD lsrq 24, $r0 ; Get PMD index into PGD (bit 24-31) diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h index 30cf46534dd2..f7c00a5e0e2b 100644 --- a/arch/ia64/include/asm/percpu.h +++ b/arch/ia64/include/asm/percpu.h @@ -9,7 +9,7 @@ #define PERCPU_ENOUGH_ROOM PERCPU_PAGE_SIZE #ifdef __ASSEMBLY__ -# define THIS_CPU(var) (per_cpu__##var) /* use this to mark accesses to per-CPU variables... */ +# define THIS_CPU(var) (var) /* use this to mark accesses to per-CPU variables... */ #else /* !__ASSEMBLY__ */ @@ -39,7 +39,7 @@ extern void *per_cpu_init(void); * On the positive side, using __ia64_per_cpu_var() instead of __get_cpu_var() is slightly * more efficient. 
*/ -#define __ia64_per_cpu_var(var) per_cpu__##var +#define __ia64_per_cpu_var(var) var #include diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 461b99902bf6..7f4a0ed24152 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -30,9 +30,9 @@ EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic #endif #include -EXPORT_SYMBOL(per_cpu__ia64_cpu_info); +EXPORT_SYMBOL(ia64_cpu_info); #ifdef CONFIG_SMP -EXPORT_SYMBOL(per_cpu__local_per_cpu_offset); +EXPORT_SYMBOL(local_per_cpu_offset); #endif #include diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 19c4b2195dce..8d586d1e2515 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -459,7 +459,7 @@ static void __init initialize_pernode_data(void) cpu = 0; node = node_cpuid[cpu].nid; cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start + - ((char *)&per_cpu__ia64_cpu_info - __per_cpu_start)); + ((char *)&ia64_cpu_info - __per_cpu_start)); cpu0_cpu_info->node_data = mem_data[node].node_data; } #endif /* CONFIG_SMP */ diff --git a/arch/microblaze/include/asm/entry.h b/arch/microblaze/include/asm/entry.h index 61abbd232640..ec89f2ad0fe1 100644 --- a/arch/microblaze/include/asm/entry.h +++ b/arch/microblaze/include/asm/entry.h @@ -21,7 +21,7 @@ * places */ -#define PER_CPU(var) per_cpu__##var +#define PER_CPU(var) var # ifndef __ASSEMBLY__ DECLARE_PER_CPU(unsigned int, KSP); /* Saved kernel stack pointer */ diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S index d172d4245cdc..f8c45cc2947d 100644 --- a/arch/parisc/lib/fixup.S +++ b/arch/parisc/lib/fixup.S @@ -36,8 +36,8 @@ #endif /* t2 = &__per_cpu_offset[smp_processor_id()]; */ LDREGX \t2(\t1),\t2 - addil LT%per_cpu__exception_data,%r27 - LDREG RT%per_cpu__exception_data(%r1),\t1 + addil LT%exception_data,%r27 + LDREG RT%exception_data(%r1),\t1 /* t1 = &__get_cpu_var(exception_data) */ add,l \t1,\t2,\t1 /* t1 = t1->fault_ip */ @@ 
-46,8 +46,8 @@ #else .macro get_fault_ip t1 t2 /* t1 = &__get_cpu_var(exception_data) */ - addil LT%per_cpu__exception_data,%r27 - LDREG RT%per_cpu__exception_data(%r1),\t2 + addil LT%exception_data,%r27 + LDREG RT%exception_data(%r1),\t2 /* t1 = t2->fault_ip */ LDREG EXCDATA_IP(\t2), \t1 .endm diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index c1427b3634ec..580f789cae7f 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -55,7 +55,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ /* calculate address of stat structure r4 = opcode */ \ srdi r4,r4,2; /* index into array */ \ mulli r4,r4,HCALL_STAT_SIZE; \ - LOAD_REG_ADDR(r7, per_cpu__hcall_stats); \ + LOAD_REG_ADDR(r7, hcall_stats); \ add r4,r4,r7; \ ld r7,PACA_DATA_OFFSET(r13); /* per cpu offset */ \ add r4,r4,r7; \ diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index f30f4a1ead23..2ad288ff99a4 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -112,13 +112,13 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs) touched = 1; } if (!touched && __get_cpu_var(last_irq_sum) == sum) { - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 30 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 30 * nmi_hz) die_nmi("BUG: NMI Watchdog detected LOCKUP", regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } if (__get_cpu_var(wd_enabled)) { write_pic(picl_value(nmi_hz)); diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S index fd3cee4d117c..1ddec403f512 100644 --- a/arch/sparc/kernel/rtrap_64.S +++ b/arch/sparc/kernel/rtrap_64.S @@ -149,11 +149,11 @@ rtrap_nmi: ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1 rtrap_irq: rtrap: #ifndef CONFIG_SMP - sethi %hi(per_cpu____cpu_data), %l0 - lduw 
[%l0 + %lo(per_cpu____cpu_data)], %l1 + sethi %hi(__cpu_data), %l0 + lduw [%l0 + %lo(__cpu_data)], %l1 #else - sethi %hi(per_cpu____cpu_data), %l0 - or %l0, %lo(per_cpu____cpu_data), %l0 + sethi %hi(__cpu_data), %l0 + or %l0, %lo(__cpu_data), %l0 lduw [%l0 + %g5], %l1 #endif cmp %l1, 0 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 0c44196b78ac..4c170ccc72ed 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -25,19 +25,18 @@ */ #ifdef CONFIG_SMP #define PER_CPU(var, reg) \ - __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ - lea per_cpu__##var(reg), reg -#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var + __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \ + lea var(reg), reg +#define PER_CPU_VAR(var) %__percpu_seg:var #else /* ! SMP */ -#define PER_CPU(var, reg) \ - __percpu_mov_op $per_cpu__##var, reg -#define PER_CPU_VAR(var) per_cpu__##var +#define PER_CPU(var, reg) __percpu_mov_op $var, reg +#define PER_CPU_VAR(var) var #endif /* SMP */ #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var #else -#define INIT_PER_CPU_VAR(var) per_cpu__##var +#define INIT_PER_CPU_VAR(var) var #endif #else /* ...!ASSEMBLY */ @@ -60,12 +59,12 @@ * There also must be an entry in vmlinux_64.lds.S */ #define DECLARE_INIT_PER_CPU(var) \ - extern typeof(per_cpu_var(var)) init_per_cpu_var(var) + extern typeof(var) init_per_cpu_var(var) #ifdef CONFIG_X86_64_SMP #define init_per_cpu_var(var) init_per_cpu__##var #else -#define init_per_cpu_var(var) per_cpu_var(var) +#define init_per_cpu_var(var) var #endif /* For arch-specific code, we can use direct single-insn ops (they @@ -142,16 +141,14 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. 
*/ -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ - "m" (per_cpu__##var)) -#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ - "p" (&per_cpu__##var)) -#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) -#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) -#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) -#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) -#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) -#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_read(var) percpu_from_op("mov", var, "m" (var)) +#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define percpu_write(var, val) percpu_to_op("mov", var, val) +#define percpu_add(var, val) percpu_to_op("add", var, val) +#define percpu_sub(var, val) percpu_to_op("sub", var, val) +#define percpu_and(var, val) percpu_to_op("and", var, val) +#define percpu_or(var, val) percpu_to_op("or", var, val) +#define percpu_xor(var, val) percpu_to_op("xor", var, val) #define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -236,7 +233,7 @@ do { \ ({ \ int old__; \ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (per_cpu__##var) \ + : "=r" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index f08f97374892..de10c19d9558 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, "movl %P[task_canary](%[next]), %%ebx\n\t" \ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ - , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) + , [stack_canary] "=m" (stack_canary.canary) #define 
__switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -113,7 +113,7 @@ do { \ "movq %P[task_canary](%%rsi),%%r8\n\t" \ "movq %%r8,"__percpu_arg([gs_canary])"\n\t" #define __switch_canary_oparam \ - , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) + , [gs_canary] "=m" (irq_stack_union.stack_canary) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -134,7 +134,7 @@ do { \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ "jnz ret_from_fork\n\t" \ RESTORE_CONTEXT \ : "=a" (last) \ @@ -144,7 +144,7 @@ do { \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (per_cpu_var(current_task)) \ + [current_task] "m" (current_task) \ __switch_canary_iparam \ : "memory", "cc" __EXTRA_CLOBBER) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index e631cc4416f7..45404379d173 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. 
*/ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b1..fd39eaf83b84 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -438,8 +438,8 @@ is386: movl $2,%ecx # set MP */ cmpb $0,ready jne 1f - movl $per_cpu__gdt_page,%eax - movl $per_cpu__stack_canary,%ecx + movl $gdt_page,%eax + movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -702,7 +702,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + .long gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..ecb92717c412 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -312,7 +312,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); @@ -323,7 +323,7 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), +. 
= ASSERT((irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 88e15deb8b82..22a2093b5862 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -90,9 +90,9 @@ ENTRY(xen_iret) GET_THREAD_INFO(%eax) movl TI_cpu(%eax), %eax movl __per_cpu_offset(,%eax,4), %eax - mov per_cpu__xen_vcpu(%eax), %eax + mov xen_vcpu(%eax), %eax #else - movl per_cpu__xen_vcpu, %eax + movl xen_vcpu, %eax #endif /* check IF state we're restoring */ diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 8087b90d4673..ca6f0491412b 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -50,11 +50,11 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; * offset. */ #define per_cpu(var, cpu) \ - (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu))) + (*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu))) #define __get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset)) + (*SHIFT_PERCPU_PTR(&(var), my_cpu_offset)) #define __raw_get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)) + (*SHIFT_PERCPU_PTR(&(var), __my_cpu_offset)) #define this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, my_cpu_offset) #define __this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset) @@ -66,9 +66,9 @@ extern void setup_per_cpu_areas(void); #else /* ! 
SMP */ -#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) -#define __get_cpu_var(var) per_cpu_var(var) -#define __raw_get_cpu_var(var) per_cpu_var(var) +#define per_cpu(var, cpu) (*((void)(cpu), &(var))) +#define __get_cpu_var(var) (var) +#define __raw_get_cpu_var(var) (var) #define this_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) #define __this_cpu_ptr(ptr) this_cpu_ptr(ptr) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5a5d6ce4bd55..ee99f6c2cdcd 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -1,12 +1,6 @@ #ifndef _LINUX_PERCPU_DEFS_H #define _LINUX_PERCPU_DEFS_H -/* - * Determine the real variable name from the name visible in the - * kernel sources. - */ -#define per_cpu_var(var) per_cpu__##var - /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the @@ -56,24 +50,24 @@ */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ - extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \ - __typeof__(type) per_cpu__##name + __typeof__(type) name #else /* * Normal declaration and definition macros. */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ - extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \ - __typeof__(type) per_cpu__##name + __typeof__(type) name #endif /* @@ -137,8 +131,8 @@ /* * Intermodule exports for per-CPU variables. 
*/ -#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) -#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var) #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 522f421ec213..e12410e55e05 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -182,7 +182,7 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #ifndef percpu_read # define percpu_read(var) \ ({ \ - typeof(per_cpu_var(var)) __tmp_var__; \ + typeof(var) __tmp_var__; \ __tmp_var__ = get_cpu_var(var); \ put_cpu_var(var); \ __tmp_var__; \ @@ -253,8 +253,7 @@ do { \ /* * Optimized manipulation for memory allocated through the per cpu - * allocator or for addresses of per cpu variables (can be determined - * using per_cpu_var(xx). + * allocator or for addresses of per cpu variables. * * These operation guarantee exclusivity of access for other operations * on the *same* processor. 
The assumption is that per cpu data is only diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d85889710f9b..3e489fda11a1 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -76,22 +76,22 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); static inline void __count_vm_event(enum vm_event_item item) { - __this_cpu_inc(per_cpu_var(vm_event_states).event[item]); + __this_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { - this_cpu_inc(per_cpu_var(vm_event_states).event[item]); + this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { - __this_cpu_add(per_cpu_var(vm_event_states).event[item], delta); + __this_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { - this_cpu_add(per_cpu_var(vm_event_states).event[item], delta); + this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 178967b6434e..e339ab349121 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -731,13 +731,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -786,13 +786,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... 
*/ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85a5ed70b5b2..b808177af816 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -91,12 +91,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } @@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 90a6daa10962..8614e3241ff8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -176,7 +176,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +240,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer 
= tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, -- cgit v1.2.3 From f7b64fe806029e0a0454df132eec3c5ab576102c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: make access macros universal Now that per_cpu__ prefix is gone, there's no distinction between static and dynamic percpu variables. Make get_cpu_var() take dynamic percpu variables and ensure that all macros have parentheses around the parameter evaluation and evaluate the variable parameter only once such that any expression which evaluates to percpu address can be used safely. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e12410e55e05..f965f833a643 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -27,10 +27,13 @@ * we force a syntax error here if it isn't. 
*/ #define get_cpu_var(var) (*({ \ - extern int simple_identifier_##var(void); \ preempt_disable(); \ &__get_cpu_var(var); })) -#define put_cpu_var(var) preempt_enable() + +#define put_cpu_var(var) do { \ + (void)(var); \ + preempt_enable(); \ +} while (0) #ifdef CONFIG_SMP @@ -182,17 +185,19 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #ifndef percpu_read # define percpu_read(var) \ ({ \ - typeof(var) __tmp_var__; \ - __tmp_var__ = get_cpu_var(var); \ - put_cpu_var(var); \ - __tmp_var__; \ + typeof(var) *pr_ptr__ = &(var); \ + typeof(var) pr_ret__; \ + pr_ret__ = get_cpu_var(*pr_ptr__); \ + put_cpu_var(*pr_ptr__); \ + pr_ret__; \ }) #endif #define __percpu_generic_to_op(var, val, op) \ do { \ - get_cpu_var(var) op val; \ - put_cpu_var(var); \ + typeof(var) *pgto_ptr__ = &(var); \ + get_cpu_var(*pgto_ptr__) op val; \ + put_cpu_var(*pgto_ptr__); \ } while (0) #ifndef percpu_write @@ -304,7 +309,7 @@ do { \ #define _this_cpu_generic_to_op(pcp, val, op) \ do { \ preempt_disable(); \ - *__this_cpu_ptr(&pcp) op val; \ + *__this_cpu_ptr(&(pcp)) op val; \ preempt_enable(); \ } while (0) -- cgit v1.2.3 From e0fdb0e050eae331046385643618f12452aa7e73 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: add __percpu for sparse. We have to make __kernel "__attribute__((address_space(0)))" so we can cast to it. tj: * put_cpu_var() update. * Annotations added to dynamic allocator interface. 
Signed-off-by: Rusty Russell Cc: Al Viro Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 4 +++- include/linux/compiler.h | 4 +++- include/linux/percpu-defs.h | 2 +- include/linux/percpu.h | 18 +++++++++++------- 4 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index ca6f0491412b..fded453fd25c 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -41,7 +41,9 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; * Only S390 provides its own means of moving the pointer. */ #ifndef SHIFT_PERCPU_PTR -#define SHIFT_PERCPU_PTR(__p, __offset) RELOC_HIDE((__p), (__offset)) +/* Weird cast keeps both GCC and sparse happy. */ +#define SHIFT_PERCPU_PTR(__p, __offset) \ + RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) #endif /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 04fb5135b4e1..abba8045c6ef 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -5,7 +5,7 @@ #ifdef __CHECKER__ # define __user __attribute__((noderef, address_space(1))) -# define __kernel /* default address space */ +# define __kernel __attribute__((address_space(0))) # define __safe __attribute__((safe)) # define __force __attribute__((force)) # define __nocast __attribute__((nocast)) @@ -15,6 +15,7 @@ # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) +# define __percpu __attribute__((noderef, address_space(3))) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else @@ -32,6 +33,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __acquire(x) (void)0 # define __release(x) (void)0 # define __cond_lock(x,c) (c) +# define __percpu #endif #ifdef __KERNEL__ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ee99f6c2cdcd..0fa0cb524250 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -12,7 +12,7 @@ * that section. */ #define __PCPU_ATTRS(sec) \ - __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ PER_CPU_ATTRIBUTES #define __PCPU_DUMMY_ATTRS \ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index f965f833a643..2c0d31a3f6b6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -30,8 +30,12 @@ preempt_disable(); \ &__get_cpu_var(var); })) +/* + * The weird & is necessary because sparse considers (void)(var) to be + * a direct dereference of percpu variable (var). 
+ */ #define put_cpu_var(var) do { \ - (void)(var); \ + (void)&(var); \ preempt_enable(); \ } while (0) @@ -130,9 +134,9 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) -extern void *__alloc_reserved_percpu(size_t size, size_t align); -extern void *__alloc_percpu(size_t size, size_t align); -extern void free_percpu(void *__pdata); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void __percpu *__pdata); #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void __init setup_per_cpu_areas(void); @@ -142,7 +146,7 @@ extern void __init setup_per_cpu_areas(void); #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size, size_t align) +static inline void __percpu *__alloc_percpu(size_t size, size_t align) { /* * Can't easily make larger alignment work with kmalloc. WARN @@ -153,7 +157,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) return kzalloc(size, GFP_KERNEL); } -static inline void free_percpu(void *p) +static inline void free_percpu(void __percpu *p) { kfree(p); } @@ -168,7 +172,7 @@ static inline void *pcpu_lpage_remapped(void *kaddr) #endif /* CONFIG_SMP */ #define alloc_percpu(type) \ - (typeof(type) *)__alloc_percpu(sizeof(type), __alignof__(type)) + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) /* * Optional methods for optimized non-lvalue per-cpu variable access. -- cgit v1.2.3 From 545695fb41da117928ab946067a42d9e15fd009d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: make accessors check for percpu pointer in sparse The previous patch made sparse warn about percpu variables being used directly without going through percpu accessors. 
This patch implements the other half - checking whether non percpu variable is passed into percpu accessors. Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: Al Viro --- include/asm-generic/percpu.h | 6 ++++-- include/linux/percpu-defs.h | 20 ++++++++++++++++++-- include/linux/percpu.h | 2 ++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index fded453fd25c..04f91c2d3f7b 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -42,8 +42,10 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; */ #ifndef SHIFT_PERCPU_PTR /* Weird cast keeps both GCC and sparse happy. */ -#define SHIFT_PERCPU_PTR(__p, __offset) \ - RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) +#define SHIFT_PERCPU_PTR(__p, __offset) ({ \ + __verify_pcpu_ptr((__p)); \ + RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \ +}) #endif /* diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0fa0cb524250..1fa36eb54b6a 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -18,6 +18,16 @@ #define __PCPU_DUMMY_ATTRS \ __attribute__((section(".discard"), unused)) +/* + * Macro which verifies @ptr is a percpu pointer without evaluating + * @ptr. This is to be used in percpu accessors to verify that the + * input parameter is a percpu pointer. + */ +#define __verify_pcpu_ptr(ptr) do { \ + void __percpu *__vpp_verify = (typeof(ptr))NULL; \ + (void)__vpp_verify; \ +} while (0) + /* * s390 and alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external @@ -129,10 +139,16 @@ __aligned(PAGE_SIZE) /* - * Intermodule exports for per-CPU variables. + * Intermodule exports for per-CPU variables. sparse forgets about + * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to + * noop if __CHECKER__. 
*/ +#ifndef __CHECKER__ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var) - +#else +#define EXPORT_PER_CPU_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) +#endif #endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2c0d31a3f6b6..42878f0cd0e2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -237,6 +237,7 @@ extern void __bad_size_call_parameter(void); #define __pcpu_size_call_return(stem, variable) \ ({ typeof(variable) pscr_ret__; \ + __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable);break; \ case 2: pscr_ret__ = stem##2(variable);break; \ @@ -250,6 +251,7 @@ extern void __bad_size_call_parameter(void); #define __pcpu_size_call(stem, variable, ...) \ do { \ + __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: stem##1(variable, __VA_ARGS__);break; \ case 2: stem##2(variable, __VA_ARGS__);break; \ -- cgit v1.2.3 From 8c10cbdb4af642d9a2efb45ea89251aaab905360 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 19 Oct 2009 12:04:53 +0200 Subject: nfsd: use STATEID_FMT and STATEID_VAL for printing stateids Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 46 ++++++++++++++++------------------------------ include/linux/nfsd/state.h | 7 +++++++ 2 files changed, 23 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 42dab9587afe..c8b621a120cd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2416,11 +2416,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); - dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", - dp->dl_stateid.si_boot, - dp->dl_stateid.si_stateownerid, - dp->dl_stateid.si_fileid, - dp->dl_stateid.si_generation); + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2510,9 +2507,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; - dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", - stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2678,9 +2674,8 @@ STALE_STATEID(stateid_t *stateid) { if (time_after((unsigned long)boot_time, (unsigned long)stateid->si_boot)) { - dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2692,9 +2687,8 @@ EXPIRED_STATEID(stateid_t *stateid) if (time_before((unsigned long)boot_time, ((unsigned long)stateid->si_boot)) && time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { - dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n", - 
stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: expired stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2708,9 +2702,8 @@ stateid_error_map(stateid_t *stateid) if (EXPIRED_STATEID(stateid)) return nfserr_expired; - dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: bad stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return nfserr_bad_stateid; } @@ -2896,10 +2889,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct svc_fh *current_fh = &cstate->current_fh; __be32 status; - dprintk("NFSD: preprocess_seqid_op: seqid=%d " - "stateid = (%08x/%08x/%08x/%08x)\n", seqid, - stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, - stateid->si_generation); + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); *stpp = NULL; *sopp = NULL; @@ -3031,12 +3022,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); - dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " - "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid, - stp->st_stateid.si_boot, - stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, - stp->st_stateid.si_generation); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); nfsd4_create_clid_dir(sop->so_client); out: @@ -3295,9 +3282,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid) struct nfs4_file *fp; struct nfs4_delegation *dl; - dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", - stid->si_boot, stid->si_stateownerid, - stid->si_fileid, stid->si_generation); + dprintk("NFSD: %s: stateid=" STATEID_FMT 
"\n", __func__, + STATEID_VAL(stid)); fp = find_file(ino); if (!fp) diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index b38d11324189..5aadf8aa3a97 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -60,6 +60,13 @@ typedef struct { #define si_stateownerid si_opaque.so_stateownerid #define si_fileid si_opaque.so_fileid +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + struct nfsd4_cb_sequence { /* args/res */ u32 cbs_minorversion; -- cgit v1.2.3 From 417608c20a4c8397bc5307d949ec01ea0a0dd8e5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 12 Nov 2009 11:19:44 -0800 Subject: IB/mlx4: Remove limitation on LSO header size Current code has a limitation: an LSO header is not allowed to cross a 64 byte boundary. This patch removes this limitation by setting the WQE RR for large headers thus allowing LSO headers of any size. The extra buffer reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes, assuming this is reasonable upper limit for header length. Also, this patch will cause IB_DEVICE_UD_TSO to be set only for HCA FW versions that set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher. 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/qp.c | 24 ++++++++++++------------ drivers/net/mlx4/fw.c | 1 + include/linux/mlx4/device.h | 1 + 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3cb3f47a10b8..e596537ff353 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev->dev->caps.max_gso_sz) + if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 518d561970aa..847030c89a8d 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -54,7 +54,8 @@ enum { /* * Largest possible UD header: send with GRH and immediate data. */ - MLX4_IB_UD_HEADER_SIZE = 72 + MLX4_IB_UD_HEADER_SIZE = 72, + MLX4_IB_LSO_HEADER_SPARE = 128, }; struct mlx4_ib_sqp { @@ -67,7 +68,8 @@ struct mlx4_ib_sqp { }; enum { - MLX4_IB_MIN_SQ_STRIDE = 6 + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, }; static const __be32 mlx4_ib_opcode[] = { @@ -261,7 +263,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) case IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags & MLX4_IB_QP_LSO) ? 64 : 0); + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); case IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); @@ -1466,16 +1468,12 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz) + __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); - /* - * This is a temporary limitation and will be removed in - * a forthcoming FW release: - */ - if (unlikely(halign > 64)) - return -EINVAL; + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) @@ -1521,6 +1519,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; int i; spin_lock_irqsave(&qp->sq.lock, flags); @@ -1529,6 +1528,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; + blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { err = -ENOMEM; @@ -1615,7 +1615,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_datagram_seg) / 16; if (wr->opcode == IB_WR_LSO) { - err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz); + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); if (unlikely(err)) { *bad_wr = wr; goto out; @@ -1686,7 +1686,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + (ind & qp->sq.wqe_cnt ? 
cpu_to_be32(1 << 31) : 0) | blh; stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 3c16602172fc..7194be3a2894 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -90,6 +90,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags) [ 9] = "Q_Key violation counter", [10] = "VMM", [12] = "DPDP", + [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", [18] = "Atomic ops support", diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ce7cc6c7bcbb..e92d1bfdb330 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -61,6 +61,7 @@ enum { MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8, MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9, MLX4_DEV_CAP_FLAG_DPDP = 1 << 12, + MLX4_DEV_CAP_FLAG_BLH = 1 << 15, MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16, MLX4_DEV_CAP_FLAG_APM = 1 << 17, MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, -- cgit v1.2.3 From 0a3adadee42f2865bb867b8c5f4955b7def9baad Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 4 Nov 2009 18:12:35 -0500 Subject: nfsd: make fs/nfsd/vfs.h for common includes None of this stuff is used outside nfsd, so move it out of the common linux include directory. Actually, probably none of the stuff in include/linux/nfsd/nfsd.h really belongs there, so later we may remove that file entirely. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/lockd.c | 1 + fs/nfsd/nfs2acl.c | 1 + fs/nfsd/nfs3acl.c | 1 + fs/nfsd/nfs3proc.c | 1 + fs/nfsd/nfs4proc.c | 1 + fs/nfsd/nfs4recover.c | 1 + fs/nfsd/nfs4state.c | 1 + fs/nfsd/nfs4xdr.c | 1 + fs/nfsd/nfsfh.c | 1 + fs/nfsd/nfsproc.c | 1 + fs/nfsd/nfssvc.c | 1 + fs/nfsd/vfs.c | 1 + fs/nfsd/vfs.h | 98 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfsd/nfsd.h | 87 +---------------------------------------- 14 files changed, 111 insertions(+), 86 deletions(-) create mode 100644 fs/nfsd/vfs.h (limited to 'include/linux') diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index b2786a5f9afe..812bc64874f6 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -16,6 +16,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_LOCKD diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index e2a17f0a96a7..38c883d48b02 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -14,6 +14,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define RETURN_STATUS(st) { resp->status = (st); return (st); } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index ff73596eb550..526d85a65f76 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -13,6 +13,7 @@ #include #include #include +#include "vfs.h" #define RETURN_STATUS(st) { resp->status = (st); return (st); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a713c418a922..1a259d313e14 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -25,6 +25,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bebc0c2e1b0a..60a93cdefef5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -48,6 +48,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b5348405046b..c7a6b245c7ad 100644 --- a/fs/nfsd/nfs4recover.c +++ 
b/fs/nfsd/nfs4recover.c @@ -47,6 +47,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c8b621a120cd..850960e5d626 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -56,6 +56,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0fbd50cee1f6..db0fc55670b3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -57,6 +57,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 01965b2f3a76..d0d8a217a3ea 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -22,6 +22,7 @@ #include #include #include +#include "vfs.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_FH diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index c5393d1b8955..6c967e1ba37b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -24,6 +24,7 @@ #include #include #include +#include "vfs.h" typedef struct svc_rqst svc_rqst; typedef struct svc_buf svc_buf; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 67ea83eedd43..2944b31dcbe6 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -35,6 +35,7 @@ #include #include #include +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_SVC diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 638573968dcf..a7038ede671a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -56,6 +56,7 @@ #endif /* CONFIG_NFSD_V4 */ #include #include +#include "vfs.h" #include diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 000000000000..b8011fd2fcab --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +/* + * Flags for nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define 
NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, + struct nfs4_acl *); +int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp, int *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file **); +void nfsd_close(struct file *); +__be32 nfsd_read(struct svc_rqst *, 
struct svc_fh *, struct file *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +int nfsd_notify_change(struct inode *, struct iattr *); +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); +int nfsd_sync_dir(struct dentry *dp); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); +int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); +#endif + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 510ffdd5020e..e4518d090a8c 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -25,30 +25,10 @@ */ #define NFSD_SUPPORTED_MINOR_VERSION 1 -/* - * Flags for nfsd_permission - */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define 
NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 - -#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) -#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) - -/* - * Callback function for readdir - */ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; -typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + extern struct svc_program nfsd_program; extern struct svc_version nfsd_version2, nfsd_version3, @@ -73,69 +53,6 @@ int nfsd_nrpools(void); int nfsd_get_nrthreads(int n, int *); int nfsd_set_nrthreads(int n, int *); -/* nfsd/vfs.c */ -int fh_lock_parent(struct svc_fh *, struct dentry *); -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); -int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, - struct svc_export **expp); -__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, - const char *, unsigned int, struct svc_fh *); -__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, - const char *, unsigned int, - struct svc_export **, struct dentry **); -__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, - struct iattr *, int, time_t); -#ifdef CONFIG_NFSD_V4 -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, - struct nfs4_acl *); -int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); -#endif /* CONFIG_NFSD_V4 */ -__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - int type, dev_t rdev, struct svc_fh *res); -#ifdef CONFIG_NFSD_V3 -__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); -__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, - loff_t, unsigned long); -#endif /* 
CONFIG_NFSD_V3 */ -__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, - int, struct file **); -void nfsd_close(struct file *); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, - loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long *, int *); -__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, - char *, int *); -__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, - char *name, int len, char *path, int plen, - struct svc_fh *res, struct iattr *); -__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, - char *, int, struct svc_fh *); -__be32 nfsd_rename(struct svc_rqst *, - struct svc_fh *, char *, int, - struct svc_fh *, char *, int); -__be32 nfsd_remove(struct svc_rqst *, - struct svc_fh *, char *, int); -__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, - char *name, int len); -int nfsd_truncate(struct svc_rqst *, struct svc_fh *, - unsigned long size); -__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, - loff_t *, struct readdir_cd *, filldir_t); -__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, - struct kstatfs *, int access); - -int nfsd_notify_change(struct inode *, struct iattr *); -__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, - struct dentry *, int); -int nfsd_sync_dir(struct dentry *dp); - #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL extern struct svc_version nfsd_acl_version2; @@ -147,8 +64,6 @@ extern struct svc_version nfsd_acl_version3; #else #define nfsd_acl_version3 NULL #endif -struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); -int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); #endif enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; -- cgit v1.2.3 From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 20 Nov 2009 20:13:39 +0100 Subject: [LogFS] 
add new flash file system This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/logfs.txt | 241 ++++ fs/Kconfig | 1 + fs/Makefile | 1 + fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 ++ fs/logfs/dev_bdev.c | 263 ++++ fs/logfs/dev_mtd.c | 253 ++++ fs/logfs/dir.c | 818 +++++++++++++ fs/logfs/file.c | 263 ++++ fs/logfs/gc.c | 730 ++++++++++++ fs/logfs/inode.c | 417 +++++++ fs/logfs/journal.c | 879 ++++++++++++++ fs/logfs/logfs.h | 722 +++++++++++ fs/logfs/logfs_abi.h | 627 ++++++++++ fs/logfs/readwrite.c | 2246 +++++++++++++++++++++++++++++++++++ fs/logfs/segment.c | 924 ++++++++++++++ fs/logfs/super.c | 634 ++++++++++ include/linux/btree-128.h | 109 ++ include/linux/btree-type.h | 147 +++ include/linux/btree.h | 243 ++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/btree.c | 797 +++++++++++++ 25 files changed, 10446 insertions(+) create mode 100644 Documentation/filesystems/logfs.txt create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c create mode 100644 include/linux/btree-128.h create mode 100644 include/linux/btree-type.h create mode 100644 include/linux/btree.h create mode 100644 lib/btree.c (limited to 'include/linux') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621ee5599..d362aa543b27 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -62,6 +62,8 
@@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. +logfs.txt + - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..e64c94ba401a --- /dev/null +++ b/Documentation/filesystems/logfs.txt @@ -0,0 +1,241 @@ + +The LogFS Flash Filesystem +========================== + +Specification +============= + +Superblocks +----------- + +Two superblocks exist at the beginning and end of the filesystem. +Each superblock is 256 Bytes large, with another 3840 Bytes reserved +for future purposes, making a total of 4096 Bytes. + +Superblock locations may differ for MTD and block devices. On MTD the +first non-bad block contains a superblock in the first 4096 Bytes and +the last non-bad block contains a superblock in the last 4096 Bytes. +On block devices, the first 4096 Bytes of the device contain the first +superblock and the last aligned 4096 Byte-block contains the second +superblock. + +For the most part, the superblocks can be considered read-only. They +are written only to correct errors detected within the superblocks, +move the journal and change the filesystem parameters through tunefs. +As a result, the superblock does not contain any fields that require +constant updates, like the amount of free space, etc. + +Segments +-------- + +The space in the device is split up into equal-sized segments. +Segments are the primary write unit of LogFS. Within each segment, +writes happen from front (low addresses) to back (high addresses). If +only a partial segment has been written, the segment number, the +current position within and optionally a write buffer are stored in +the journal. + +Segments are erased as a whole.
Therefore Garbage Collection may be +required to completely free a segment before doing so. + +Journal +-------- + +The journal contains all global information about the filesystem that +is subject to frequent change. At mount time, it has to be scanned +for the most recent commit entry, which contains a list of pointers to +all currently valid entries. + +Object Store +------------ + +All space except for the superblocks and journal is part of the object +store. Each segment contains a segment header and a number of +objects, each consisting of the object header and the payload. +Objects are either inodes, directory entries (dentries), file data +blocks or indirect blocks. + +Levels +------ + +Garbage collection (GC) may fail if all data is written +indiscriminately. One requirement of GC is that data is separated +roughly according to the distance between the tree root and the data. +Effectively that means all file data is on level 0, indirect blocks +are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, +respectively. Inode file data is on level 6 for the inodes and 7-11 +for indirect blocks. + +Each segment contains objects of a single level only. As a result, +each level requires its own separate segment to be open for writing. + +Inode File +---------- + +All inodes are stored in a special file, the inode file. Single +exception is the inode file's inode (master inode) which for obvious +reasons is stored in the journal instead. Instead of data blocks, the +leaf nodes of the inode files are inodes. + +Aliases +------- + +Writes in LogFS are done by means of a wandering tree. A naïve +implementation would require that for each write of a block, all +parent blocks are written as well, since the block pointers have +changed. Such an implementation would not be very efficient. + +In LogFS, the block pointer changes are cached in the journal by means +of alias entries.
Each alias consists of its logical address - inode +number, block index, level and child number (index into block) - and +the changed data. Any 8-byte word can be changed in this manner. + +Currently aliases are used for block pointers, file size, file used +bytes and the height of an inode's indirect tree. + +Segment Aliases +--------------- + +Related to regular aliases, these are used to handle bad blocks. +Initially, bad blocks are handled by moving the affected segment +content to a spare segment and noting this move in the journal with a +segment alias, a simple (to, from) tuple. GC will later empty this +segment and the alias can be removed again. This is used on MTD only. + +Vim +--- + +By cleverly predicting the life time of data, it is possible to +separate long-living data from short-living data and thereby reduce +the GC overhead later. Each type of distinct life expectancy (vim) can +have a separate segment open for writing. Each (level, vim) tuple can +be open just once. If an open segment with unknown vim is encountered +at mount time, it is closed and ignored henceforth. + +Indirect Tree +------------- + +Inodes in LogFS are similar to FFS-style filesystems with direct and +indirect block pointers. One difference is that LogFS uses a single +indirect pointer that can be either a 1x, 2x, etc. indirect pointer. +A height field in the inode defines the height of the indirect tree +and thereby the indirection of the pointer. + +Another difference is the addressing of indirect blocks. In LogFS, +the first 16 pointers in the first indirect block are left empty, +corresponding to the 16 direct pointers in the inode. In ext2 (maybe +others as well) the first pointer in the first indirect block +corresponds to logical block 12, skipping the 12 direct pointers. +So where ext2 is using arithmetic to better utilize space, LogFS keeps +arithmetic simple and uses compression to save space.
+ +Compression +----------- + +Both file data and metadata can be compressed. Compression for file +data can be enabled with chattr +c and disabled with chattr -c. Doing +so has no effect on existing data, but new data will be stored +accordingly. New inodes will inherit the compression flag of the +parent directory. + +Metadata is always compressed. However, the space accounting ignores +this and charges for the uncompressed size. Failing to do so could +result in GC failures when, after moving some data, indirect blocks +compress worse than previously. Even on a 100% full medium, GC may +not consume any extra space, so the compression gains are lost space +to the user. + +However, they are not lost space to the filesystem internals. By +cheating the user for those bytes, the filesystem gained some slack +space and GC will run less often and faster. + +Garbage Collection and Wear Leveling +------------------------------------ + +Garbage collection is invoked whenever the number of free segments +falls below a threshold. The best (known) candidate is picked based +on the least amount of valid data contained in the segment. All +remaining valid data is copied elsewhere, thereby invalidating it. + +The GC code also checks for aliases and writes them back if their +number gets too large. + +Wear leveling is done by occasionally picking a suboptimal segment for +garbage collection. If a stale segment's erase count is significantly +lower than the active segments' erase counts, it will be picked. Wear +leveling is rate limited, so it will never monopolize the device for +more than one segment worth at a time. + +Values for "occasionally", "significantly lower" are compile time +constants. + +Hashed directories +------------------ + +To satisfy efficient lookup(), directory entries are hashed and +located based on the hash. In order to both support large directories +and not be overly inefficient for small directories, several hash +tables of increasing size are used.
For each table, the hash value +modulo the table size gives the table index. + +Table sizes are chosen to limit the number of indirect blocks with a +fully populated table to 0, 1, 2 or 3 respectively. So the first +table contains 16 entries, the second 512-16, etc. + +The last table is special in several ways. First its size depends on +the effective 32bit limit on telldir/seekdir cookies. Since logfs +uses the upper half of the address space for indirect blocks, the size +is limited to 2^31. Secondly the table contains hash buckets with 16 +entries each. + +Using single-entry buckets would result in birthday "attacks". At +just 2^16 used entries, hash collisions would be likely (P >= 0.5). +My math skills are insufficient to do the combinatorics for the 17x +collisions necessary to overflow a bucket, but testing showed that in +10,000 runs the lowest directory fill before a bucket overflow was +188,057,130 entries with an average of 315,149,915 entries. So for +directory sizes of up to a million, bucket overflows should be +virtually impossible under normal circumstances. + +With carefully chosen filenames, it is obviously possible to cause an +overflow with just 21 entries (4 higher tables + 16 entries + 1). So +there may be a security concern if a malicious user has write access +to a directory. + +Open For Discussion +=================== + +Device Address Space +-------------------- + +A device address space is used for caching. Both block devices and +MTD provide functions to either read a single page or write a segment. +Partial segments may be written for data integrity, but where possible +complete segments are written for performance on simple block device +flash media. + +Meta Inodes +----------- + +Inodes are stored in the inode file, which is just a regular file for +most purposes. At umount time, however, the inode file needs to +remain open until all dirty inodes are written.
So +generic_shutdown_super() may not close this inode, but shouldn't +complain about remaining inodes due to the inode file either. Same +goes for mapping inode of the device address space. + +Currently logfs uses a hack that essentially copies part of fs/inode.c +code over. A general solution would be preferred. + +Indirect block mapping +---------------------- + +With compression, the block device (or mapping inode) cannot be used +to cache indirect blocks. Some other place is required. Currently +logfs uses the top half of each inode's address space. The low 8TB +(on 32bit) are filled with file data, the high 8TB are used for +indirect blocks. + +One problem is that 16TB files created on 64bit systems actually have +data in the top 8TB. But files >16TB would cause problems anyway, so +only the limit has changed. diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a5..7405f071be67 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -177,6 +177,7 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index af6d04700d9c..c3633aa46911 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 000000000000..daf9a9b32dd3 --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. 
+ In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 000000000000..4820027787ee --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 000000000000..44bbfd249abc --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err 
= zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 000000000000..58a057b6e1af --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + generic_unplug_device(bdev_get_queue(bdev)); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 
0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = 
bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = to >> PAGE_SHIFT; + int i, nr_pages = len >> PAGE_SHIFT; + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + for (i = 0; i < nr_pages; i++) { + page = find_get_page(mapping, index + i); + if (page) { + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + } + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page 
*bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct super_block *sb) +{ + close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + struct block_device *bdev; + + bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + } + + return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 000000000000..68e99d046c23 --- /dev/null +++ 
b/fs/logfs/dev_mtd.c @@ -0,0 +1,253 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an excercise in futility! 
+ */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = 
super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. 
+ */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 000000000000..89104e6f81c4 --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,818 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in seperate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. 
Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. 
+ */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferrably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. 
Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + switch (round) { + case 0: + return hash % I0_BLOCKS; + case 1: + return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS); + case 2: + return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS); + case 3: + return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS); + case 4 ... 19: + return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void 
abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) + return -ENOENT; + if (IS_ERR(page)) + return PTR_ERR(page); + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. 
I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = 
page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. 
+ */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. 
+ */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, NULL); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + atomic_inc(&inode->i_count); + inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static 
int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. 
remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. 
attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. 
*/ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .permission = logfs_permission, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 000000000000..370f367a933e --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 
2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? */ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? 
ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. 
"A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct logfs_super *super = logfs_super(sb); + + /* FIXME: write anchor */ + super->s_devops->sync(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + if (attr->ia_valid & ATTR_SIZE) + err = logfs_truncate(inode, attr->ia_size); + attr->ia_valid &= ~ATTR_SIZE; + + if (!err) + err = inode_change_ok(inode, attr); + if (!err) + 
err = inode_setattr(inode, attr); + return err; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 000000000000..b3656c44190e --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,730 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. 
+ */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. 
Object headers + * are counted, the segment header is not. + */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + 
len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have 
several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate 
*cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. 
+ */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + /* + * first_in_list() is applied to s_low_list[i] for every i down from + * max_dist, so max_dist must not exceed the last list index. + * logfs_init_gc() only sets up the lists visited by for_each_area() + * (presumed to be indices 0 .. LOGFS_NO_AREAS - 1 - confirm in + * logfs.h), hence the clamp to LOGFS_NO_AREAS - 1. + */ + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno, dist); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. 
We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. + */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(super->s_master_inode); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. 
+ */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. 
+ * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. + * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + /* + * NOTE(review): the guard above checks s_reserve_list.count, but the + * candidate below is taken from s_free_list and dereferenced without + * a NULL check. rb_first() returns NULL for an empty tree, so this + * presumably relies on s_free_list being non-empty here - confirm, + * or confirm whether s_reserve_list was the intended list. + */ + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. 
+ */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(super->s_master_inode); + __logfs_gc_pass(sb, logfs_super(sb)->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + struct logfs_object_header oh; + u32 segno = area->a_segno; + u32 ofs = area->a_used_bytes; + __be32 crc; + int err; + + if (!area->a_is_open) + return 0; + + for (ofs = area->a_used_bytes; + ofs <= super->s_segsize - sizeof(oh); + ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { + err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); + if (err) + return err; + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); + if (crc != oh.crc) { + printk(KERN_INFO "interrupted header at %llx\n", + dev_ofs(sb, segno, ofs)); + return 0; + } + } + if (ofs != area->a_used_bytes) { + printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", + ofs - area->a_used_bytes, + dev_ofs(sb, segno, area->a_used_bytes)); + area->a_used_bytes = ofs; + } + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 
0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c new file mode 100644 index 000000000000..6d08b3762641 --- /dev/null +++ b/fs/logfs/inode.c @@ -0,0 +1,417 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. 
+ * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * whether the inode in question might be in I_FREEING state. Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. 
+ * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + kmem_cache_free(logfs_inode_cache, li); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_nlink = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode 
*logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with an inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or whether other inodes have leaked as well. + * + * Our attempt at solving this is logfs_new_meta_inode() below. Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggestions for + * improvement are welcome. + * + * Returns the new inode, or ERR_PTR(-ENOMEM) if logfs_alloc_inode() fails. + */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = logfs_alloc_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_sb = sb; + + /* This is a blatant copy of alloc_inode code. We'd need alloc_inode + * to be nonstatic, alas. 
*/ + { + struct address_space * const mapping = &inode->i_data; + + mapping->a_ops = &logfs_reg_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + inode->i_mapping = mapping; + inode->i_nlink = 1; + } + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + destroy_meta_inode(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, int do_sync) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +void destroy_meta_inode(struct inode *inode) +{ + if (inode) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + logfs_clear_inode(inode); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); + } +} + +/* called with inode_lock held */ +static void logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = 
LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + /* FIXME: write anchor */ + logfs_super(sb)->s_devops->sync(sb); + return 0; +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .clear_inode = logfs_clear_inode, + .delete_inode = logfs_delete_inode, + .destroy_inode = logfs_destroy_inode, + .drop_inode = logfs_drop_inode, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new 
file mode 100644 index 000000000000..7a023dbba9f8 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,879 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. 
+ */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct 
logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +/* + * Restore the in-memory state of one area from the journal entry @a. + * Returns 0 on success, or -EIO if a->gc_level is >= LOGFS_NO_AREAS or + * a->vim is not VIM_DEFAULT. + */ +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + /* Index s_area[] only after gc_level has been range-checked above */ + area = super->s_area[a->gc_level]; + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + logfs_buf_recover(area, ofs, NULL, 0); + return 0; +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct 
super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len; + int i, err; + + /* search for 
most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments 
in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_JOURNAL; + sh.level = 0; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + /* This causes a bug in segment.c. Not yet. 
*/ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = ALIGN(sizeof(sh), 16); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_version = cpu_to_be16(++super->s_last_version); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'A'; + jh->h_pad[2] = 'T'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + 
logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. + */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = 
cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t 
compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + BUG_ON(len > sb->s_blocksize); + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, + int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, 
__be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. 
+ */ +void logfs_write_anchor(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. 
+ */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + } + /* Manually move journal_area */ + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(super->s_master_inode); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = 
kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + destroy_meta_inode(super->s_master_inode); + super->s_master_inode = NULL; + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 000000000000..e3082abe9e3b --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,722 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Private header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include +#include +#include +#include +#include +#include +#include +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) 
do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. 
*/ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_SEG_ALIAS 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +/* + * SUBLEVEL(level) yields level - 1 as a level_t. The comparison against + * LEVEL(1) is cast to void and its result discarded; presumably it only + * exists so that sparse checks the argument really is a level_t - TODO confirm + */ +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->a_segno with the number of a free segment + * @get_erase_count: fill area->a_erase_count (needs area->a_segno) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +/** + * struct logfs_device_ops - device access operations + * + * @find_first_sb: presumably returns the page holding the first superblock + * copy and stores its device offset in *ofs - TODO confirm + * @find_last_sb: presumably the same for the last superblock copy - TODO + * confirm + * @write_sb: presumably writes the given superblock page - TODO confirm + * @readpage: read one page (mm page) + * @writeseg: write one segment.
may be a partial segment + * @erase: erase part of the device + * @sync: NOTE(review): looks like this waits for outstanding writes to reach + * the device (the journal code calls it before and after writing the + * commit entry) - confirm against the device backends + * @put_device: presumably releases the underlying device - TODO confirm + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len); + void (*sync)(struct super_block *sb); + void (*put_device)(struct super_block *sb); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free or low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary.
+ */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @datalen: data length (presumably mirrors h_datalen of the on-medium + * journal header - TODO confirm) + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @state: one of enum transaction_state + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @gc_level: GC level of the block + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages.
But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. + */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +/* Callback type handed to logfs_block_ops->write_alias; write_alias_journal() has this signature */ +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +/* Operations on a struct logfs_block, reached through logfs_block->ops */ +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + gc_level_t (*block_level)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanding bios */ + long s_flags; /* LOGFS_SB_FLAG_* bits */ + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /*
remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. 
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[64]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. 
+ * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. + */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int 
logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg); +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void destroy_meta_inode(struct inode *inode); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long 
flags); +void logfs_delete_inode(struct inode *inode); +void logfs_clear_inode(struct inode *inode); + +/* journal.c */ +void logfs_write_anchor(struct inode *inode); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void 
logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline void logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct 
address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct 
logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has separate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 000000000000..5d3782ddecc8 --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,627 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Public header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_ABI_H +#define FS_LOGFS_LOGFS_ABI_H + +/* For out-of-kernel compiles */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(condition) /**/ +#endif + +#define SIZE_CHECK(type, size) \ +static inline void check_##type(void) \ +{ \ + BUILD_BUG_ON(sizeof(struct type) != (size)); \ +} + +/* + * Throughout the logfs code, we're constantly dealing with blocks at + * various positions or offsets. To remove confusion, we strictly + * distinguish between a "position" - the logical position within a + * file and an "offset" - the physical location within the device.
+ * + * Any usage of the term offset for a logical location or position for + * a physical one is a bug and should get fixed. + */ + +/* + * Blocks are allocated in one of several segments depending on their + * level. The following levels are used: + * 0 - regular data block + * 1 - i1 indirect blocks + * 2 - i2 indirect blocks + * 3 - i3 indirect blocks + * 4 - i4 indirect blocks + * 5 - i5 indirect blocks + * 6 - ifile data blocks + * 7 - ifile i1 indirect blocks + * 8 - ifile i2 indirect blocks + * 9 - ifile i3 indirect blocks + * 10 - ifile i4 indirect blocks + * 11 - ifile i5 indirect blocks + * Potential levels to be used in the future: + * 12 - gc recycled blocks, long-lived data + * 13 - replacement blocks, short-lived data + * + * Levels 1-11 are necessary for robust gc operations and help separate + * short-lived metadata from longer-lived file data. In the future, + * file data should get separated into several segments based on simple + * heuristics. Old data recycled during gc operation is expected to be + * long-lived. New data is of uncertain life expectancy. New data + * used to replace older blocks in existing files is expected to be + * short-lived. + */ + + +/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ +#define LOGFS_MAGIC 0xb21f205ac97e8168ull +#define LOGFS_MAGIC_U32 0xc97e8168u + +/* + * Various blocksize related macros. Blocksize is currently fixed at 4KiB. + * Sooner or later that should become configurable and the macros replaced + * by something superblock-dependent. Pointers in indirect blocks are and + * will remain 64bit. + * + * LOGFS_BLOCKSIZE - self-explaining + * LOGFS_BLOCK_FACTOR - number of pointers per indirect block + * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts + */ +#define LOGFS_BLOCKSIZE (4096ull) +#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) +#define LOGFS_BLOCK_BITS (9) + +/* + * Number of blocks at various levels of indirection.
There are 16 direct + * block pointers plus a single indirect pointer. + */ +#define I0_BLOCKS (16) +#define I1_BLOCKS LOGFS_BLOCK_FACTOR +#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) +#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) +#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) +#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) + +#define INDIRECT_INDEX I0_BLOCKS +#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) + +/* + * Sizes at which files require another level of indirection. Files smaller + * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, + * similar to ext2 fast symlinks. + * + * Data at a position smaller than LOGFS_I0_SIZE is accessed through the + * direct pointers, else through the 1x indirect pointer and so forth. + */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. pure_ofs() masks the flag off again. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) ((ofs) & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to separate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress.
+ * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. 
+ */ +#define LOGFS_OBJECT_HEADERSIZE (0x1c) +#define LOGFS_SEGMENT_HEADERSIZE (0x18) +#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) +#define LOGFS_SEGMENT_RESERVE \ + (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) + +/* + * Segment types: + * SEG_SUPER - superblock segment + * SEG_JOURNAL - journal segment + * SEG_OSTORE - object store segment + */ +enum { + SEG_SUPER = 0x01, + SEG_JOURNAL = 0x02, + SEG_OSTORE = 0x03, +}; + +/** + * struct logfs_segment_header - per-segment header in the ostore + * + * @crc: crc32 of header (there is no data) + * @pad: unused, must be 0 + * @type: segment type, see above + * @level: GC level for all objects in this segment + * @segno: segment number + * @ec: erase count for this segment + * @gec: global erase count at time of writing + */ +struct logfs_segment_header { + __be32 crc; + __be16 pad; + __u8 type; + __u8 level; + __be32 segno; + __be32 ec; + __be64 gec; +}; + +SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); + +/** + * struct logfs_disk_super - on-medium superblock + * + * @ds_magic: magic number, must equal LOGFS_MAGIC + * @ds_crc: crc32 of structure starting with the next field + * @ds_ifile_levels: maximum number of levels for ifile + * @ds_iblock_levels: maximum number of levels for regular files + * @ds_data_levels: number of seperate levels for data + * @pad0: reserved, must be 0 + * @ds_feature_incompat: incompatible filesystem features + * @ds_feature_ro_compat: read-only compatible filesystem features + * @ds_feature_compat: compatible filesystem features + * @ds_flags: flags + * @ds_segment_shift: log2 of segment size + * @ds_block_shift: log2 of block size + * @ds_write_shift: log2 of write size + * @pad1: reserved, must be 0 + * @ds_journal_seg: segments used by primary journal + * @ds_root_reserve: bytes reserved for the superuser + * @ds_speed_reserve: bytes reserved to speed up GC + * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks + * @pad2: reserved, must
be 0 + * @pad3: reserved, must be 0 + * + * Contains only read-only fields. Read-write fields like the amount of used + * space is tracked in the dynamic superblock, which is stored in the journal. + */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. 
They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. + * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * 
@namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_version: unnormalized version of journal entry + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __be16 h_version; + __u8 h_compr; + __u8 h_pad[3]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. 
+ * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to separate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follows this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_parent: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; +
u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segments with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16.
First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT - erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_OBJ_ALIAS - object aliases + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c new file mode 100644 index 000000000000..1dbe6e8cccec --- /dev/null +++ b/fs/logfs/readwrite.c @@ -0,0 +1,2246 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * + * Actually contains eight sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix,
I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. + */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. 
+ */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = 
timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + +static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. 
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_write_mutex.  We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ */
+static void logfs_get_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	/* Annotate the page before we possibly sleep on s_write_mutex. */
+	if (page)
+		prelock_page(sb, page, lock);
+	if (!lock)
+		return;
+
+	mutex_lock(&super->s_write_mutex);
+	logfs_gc_pass(sb);
+	/*
+	 * FIXME: We also have to check for shadowed space
+	 * and mempool fill grade
+	 */
+}
+
+/*
+ * Counterpart of logfs_get_wblocks(): drop the pre-lock annotation of @page
+ * (if any) and, when @lock is non-zero, release s_write_mutex.
+ */
+static void logfs_put_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	/*
+	 * Order matters - PG_pre_locked must be cleared before s_write_mutex
+	 * is released or we could race against another task.
+	 */
+	if (page)
+		preunlock_page(sb, page, lock);
+	if (!lock)
+		return;
+
+	mutex_unlock(&super->s_write_mutex);
+}
+
+/*
+ * Look up (or create) the page cache page for block @bix at @level of
+ * @inode.  Pair with logfs_put_read_page(), which unlocks and releases it.
+ */
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	return find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+}
+
+/* Unlock @page and drop the reference taken by logfs_get_read_page(). */
+static void logfs_put_read_page(struct page *page)
+{
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+/*
+ * Acquire @page for the write path.  Either we get the page lock ourselves
+ * or the current holder has marked the page PG_pre_locked, in which case the
+ * page may be used without owning the lock.
+ */
+static void logfs_lock_write_page(struct page *page)
+{
+	int spins = 0;
+
+	for (;;) {
+		if (likely(trylock_page(page)))
+			break;
+
+		if (spins++ > 0x1000) {
+			/* Has been observed once so far... */
+			printk(KERN_ERR "stack at %p\n", &spins);
+			BUG();
+		}
+
+		/*
+		 * Holder of page lock is waiting for us, it is safe to use
+		 * this page.
+		 */
+		if (PagePreLocked(page))
+			break;
+
+		/*
+		 * Some other process has this page locked and has nothing to
+		 * do with us.  Wait for it to finish.
+		 */
+		schedule();
+	}
+	BUG_ON(!PageLocked(page));
+}
+
+/*
+ * Get the page for block @bix at @level of @inode for writing.  An existing
+ * page is acquired through logfs_lock_write_page(); otherwise a new page is
+ * allocated and added to the page cache.  Returns NULL on failure.
+ */
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = logfs_pack_index(bix, level);
+	struct page *page;
+	int err;
+
+	for (;;) {
+		page = find_get_page(mapping, index);
+		if (page) {
+			logfs_lock_write_page(page);
+			break;
+		}
+
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return NULL;
+
+		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+		if (likely(!err))
+			break;
+
+		page_cache_release(page);
+		/* Somebody else inserted the page meanwhile - look it up. */
+		if (err != -EEXIST)
+			return NULL;
+	}
+	BUG_ON(!PageLocked(page));
+	return page;
+}
+
+/* Unlock @page unless it is pre-locked, i.e. the lock belongs to its owner. */
+static void logfs_unlock_write_page(struct page *page)
+{
+	if (PagePreLocked(page))
+		return;
+	unlock_page(page);
+}
+
+/* Undo logfs_get_write_page(): conditional unlock plus reference drop. */
+static void logfs_put_write_page(struct page *page)
+{
+	logfs_unlock_write_page(page);
+	page_cache_release(page);
+}
+
+/* Dispatch to the read or write flavour of page lookup depending on @rw. */
+static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
+		int rw)
+{
+	if (rw == READ)
+		return logfs_get_read_page(inode, bix, level);
+	return logfs_get_write_page(inode, bix, level);
+}
+
+/* Release a page obtained from logfs_get_page() with the same @rw. */
+static void logfs_put_page(struct page *page, int rw)
+{
+	if (rw == READ) {
+		logfs_put_read_page(page);
+		return;
+	}
+	logfs_put_write_page(page);
+}
+
+/*
+ * Return the @no bits wide field of @val found after skipping @skip such
+ * fields from the low end.  @no must not be 0, the shifts would be by 64.
+ */
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+	const int unused = 64 - no;
+	u64 field = val >> (skip * no);
+
+	/* Shift the wanted bits to the top and back to clear the rest. */
+	field <<= unused;
+	field >>= unused;
+	return field;
+}
+
+/* Child slot of block index @val on the level given by @skip. */
+static unsigned long get_bits(u64 val, level_t skip)
+{
+	int fields = (__force int)skip;
+
+	return __get_bits(val, fields, LOGFS_BLOCK_BITS);
+}
+
+/* Initialise both btrees of @tree on top of the per-sb btree mempool. */
+static inline void init_shadow_tree(struct super_block *sb,
+		struct shadow_tree *tree)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_init_mempool64(&tree->new, super->s_btree_pool);
+	btree_init_mempool64(&tree->old, super->s_btree_pool);
+}
+
+/* ->write_block for page-backed blocks: write the page out synchronously. */
+static void indirect_write_block(struct logfs_block *block)
+{
+	struct page *page = block->page;
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	logfs_lock_write_page(page);
+	ret = logfs_write_buf(inode, page, 0);
+	logfs_unlock_write_page(page);
+	/*
+	 * This needs some rework.  Unless you want your filesystem to run
+	 * completely synchronously (you don't), the filesystem will always
+	 * report writes as 'successful' before the actual work has been
+	 * done.  The actual work gets done here and this is where any errors
+	 * will show up.  And there isn't much we can do about it, really.
+	 *
+	 * Some attempts to fix the errors (move from bad blocks, retry io,...)
+	 * have already been done, so anything left should be either a broken
+	 * device or a bug somewhere in logfs itself.  Being relatively new,
+	 * the odds currently favor a bug, so for now the line below isn't
+	 * entirely tasteless.
+	 */
+	BUG_ON(ret);
+}
+
+/* ->write_block for inode-backed blocks. */
+static void inode_write_block(struct logfs_block *block)
+{
+	struct inode *inode = block->inode;
+	int ret;
+
+	/* The master inode is written through the anchor. */
+	if (inode->i_ino == LOGFS_INO_MASTER) {
+		logfs_write_anchor(inode);
+		return;
+	}
+
+	ret = __logfs_write_inode(inode, 0);
+	/* see indirect_write_block comment */
+	BUG_ON(ret);
+}
+
+/* ->block_level for inode-backed blocks; not valid for the master inode. */
+static gc_level_t inode_block_level(struct logfs_block *block)
+{
+	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
+	return GC_LEVEL(LOGFS_MAX_LEVELS);
+}
+
+/* ->block_level for page-backed blocks, derived from the page index. */
+static gc_level_t indirect_block_level(struct logfs_block *block)
+{
+	struct page *page = block->page;
+	struct inode *inode = page->mapping->host;
+	u64 bix;
+	level_t level;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	return expand_level(inode->i_ino, level);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning.  I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+/*
+ * Build the first 64 bits of the on-disk inode (mode, height, flags) in
+ * big-endian form.
+ */
+static __be64 inode_val0(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 val;
+
+	/*
+	 * Explicit shifting generates good code, but must match the format
+	 * of the structure.  Add some paranoia just in case.
+	 */
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
+
+	val =	(u64)inode->i_mode << 48 |
+		(u64)li->li_height << 40 |
+		(u64)li->li_flags;
+	return cpu_to_be64(val);
+}
+
+/*
+ * ->write_alias for inode-backed blocks.  Calls @write_one_alias once for
+ * every bit set in the block's alias map, passing the current value of the
+ * corresponding inode field.  Returns 0 or the first error reported by
+ * @write_one_alias.
+ */
+static int inode_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	struct inode *inode = block->inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned long pos;
+	u64 ino, bix;
+	__be64 val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
+			return 0;
+
+		switch (pos) {
+		case INODE_HEIGHT_OFS:
+			val = inode_val0(inode);
+			break;
+		case INODE_USED_OFS:
+			val = cpu_to_be64(li->li_used_bytes);
+			break;
+		case INODE_SIZE_OFS:
+			val = cpu_to_be64(i_size_read(inode));
+			break;
+		case INODE_POINTER_OFS ...
+			INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
+			val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
+			break;
+		default:
+			BUG();
+		}
+
+		/* Inodes are addressed as level 0 blocks of the master inode. */
+		ino = LOGFS_INO_MASTER;
+		bix = inode->i_ino;
+		level = LEVEL(0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+/*
+ * ->write_alias for page-backed blocks.  Calls @write_one_alias once for
+ * every bit set in the block's alias map, passing the raw (big-endian)
+ * pointer stored in that slot of the page.  Returns 0 or the first error.
+ */
+static int indirect_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	unsigned long pos;
+	struct page *page = block->page;
+	u64 ino, bix;
+	__be64 *child, val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			return 0;
+
+		ino = page->mapping->host->i_ino;
+		logfs_unpack_index(page->index, &bix, &level);
+		child = kmap_atomic(page, KM_USER0);
+		val = child[pos];
+		kunmap_atomic(child, KM_USER0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+/*
+ * Write the aliases of every block on the superblock's object alias list
+ * to the journal.  Stops at and returns the first error.
+ */
+int logfs_write_obj_aliases_pagecache(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int err;
+
+	list_for_each_entry(block, &super->s_object_alias, alias_list) {
+		err = block->ops->write_alias(sb, block, write_alias_journal);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Unlink @block from the alias list and return it to the block mempool.
+ * The block's item list must already be empty.
+ */
+void __free_block(struct super_block *sb, struct logfs_block *block)
+{
+	BUG_ON(!list_empty(&block->item_list));
+	list_del(&block->alias_list);
+	mempool_free(block, logfs_super(sb)->s_block_pool);
+}
+
+/* ->free_block for inode-backed blocks: detach from the inode first. */
+static void inode_free_block(struct super_block *sb, struct logfs_block *block)
+{
+	struct inode *inode = block->inode;
+
+	logfs_inode(inode)->li_block = NULL;
+	__free_block(sb, block);
+}
+
+/* ->free_block for page-backed blocks: detach from the page first. */
+static void indirect_free_block(struct super_block *sb,
+		struct logfs_block *block)
+{
+	ClearPagePrivate(block->page);
+	block->page->private = 0;
+	__free_block(sb, block);
+}
+
+
+static struct logfs_block_ops inode_block_ops = {
+	.write_block = inode_write_block,
+	.block_level = inode_block_level,
+	.free_block = inode_free_block,
+	.write_alias = inode_write_alias,
+};
+
+struct logfs_block_ops indirect_block_ops = {
+	.write_block = indirect_write_block,
+	.block_level = indirect_block_level,
+	.free_block = indirect_free_block,
+	.write_alias = indirect_write_alias,
+};
+
+/*
+ * Allocate a zeroed struct logfs_block from the per-sb mempool and record
+ * its identity (@ino, @bix, @level).  The caller attaches it to an inode or
+ * page and sets ->ops.
+ */
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+
+	block = mempool_alloc(super->s_block_pool, GFP_NOFS);
+	memset(block, 0, sizeof(*block));
+	INIT_LIST_HEAD(&block->alias_list);
+	INIT_LIST_HEAD(&block->item_list);
+	block->sb = sb;
+	block->ino = ino;
+	block->bix = bix;
+	block->level = level;
+	return block;
+}
+
+/* Make sure @inode has a logfs_block attached; no-op if it already has. */
+static void alloc_inode_block(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block;
+
+	if (li->li_block)
+		return;
+
+	block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
+	block->inode = inode;
+	li->li_block = block;
+	block->ops = &inode_block_ops;
+}
+
+/*
+ * (Re)compute block->partial (non-zero pointers) and block->full (pointers
+ * flagged LOGFS_FULLY_POPULATED) from the pointer @array of an indirect
+ * page.  With @page_is_empty set the array is not scanned.
+ */
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty)
+{
+	u64 ptr;
+	int i, start;
+
+	block->partial = 0;
+	block->full = 0;
+	start = 0;
+	if (page->index < first_indirect_block()) {
+		/* Counters are pointless on level 0 */
+		return;
+	}
+	if (page->index == first_indirect_block()) {
+		/* Skip unused pointers */
+		start = I0_BLOCKS;
+		block->full = I0_BLOCKS;
+	}
+	if (!page_is_empty) {
+		for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
+			ptr = be64_to_cpu(array[i]);
+			if (ptr)
+				block->partial++;
+			if (ptr & LOGFS_FULLY_POPULATED)
+				block->full++;
+		}
+	}
+}
+
+/*
+ * Attach a logfs_block to @page via page->private unless the page already
+ * has PG_private set.
+ */
+static void alloc_data_block(struct inode *inode, struct page *page)
+{
+	struct logfs_block *block;
+	u64 bix;
+	level_t level;
+
+	if (PagePrivate(page))
+		return;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
+	block->page = page;
+	SetPagePrivate(page);
+	page->private = (unsigned long)block;
+	block->ops = &indirect_block_ops;
+}
+
+/*
+ * Like alloc_data_block(), but additionally initialise the full/partial
+ * counters from the page contents.
+ */
+static void alloc_indirect_block(struct inode *inode, struct page *page,
+		int page_is_empty)
+{
+	struct logfs_block *block;
+	__be64 *array;
+
+	if (PagePrivate(page))
+		return;
+
+	alloc_data_block(inode, page);
+
+	block = logfs_block(page);
+	array = kmap_atomic(page, KM_USER0);
+	initialize_block_counters(page, block, array, page_is_empty);
+	kunmap_atomic(array, KM_USER0);
+}
+
+/*
+ * Store @ptr in slot @index of the indirect @page, mark the page uptodate
+ * and keep the block's full/partial counters in sync with the change.
+ */
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+	struct logfs_block *block = logfs_block(page);
+	__be64 *array;
+	u64 oldptr;
+
+	BUG_ON(!block);
+	array = kmap_atomic(page, KM_USER0);
+	oldptr = be64_to_cpu(array[index]);
+	array[index] = cpu_to_be64(ptr);
+	kunmap_atomic(array, KM_USER0);
+	SetPageUptodate(page);
+
+	block->full += !!(ptr & LOGFS_FULLY_POPULATED)
+		- !!(oldptr & LOGFS_FULLY_POPULATED);
+	block->partial += !!ptr - !!oldptr;
+}
+
+/* Read slot @index of the indirect @page in CPU byte order. */
+static u64 block_get_pointer(struct page *page, int index)
+{
+	__be64 *block;
+	u64 ptr;
+
+	block = kmap_atomic(page, KM_USER0);
+	ptr = be64_to_cpu(block[index]);
+	kunmap_atomic(block, KM_USER0);
+	return ptr;
+}
+
+/* A hole reads as zeroes.  Always returns 0. */
+static int logfs_read_empty(struct page *page)
+{
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	return 0;
+}
+
+/* Read a page whose pointer is embedded in the inode itself. */
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+	u64 block;
+
+	block = li->li_data[index];
+	if (!block)
+		return logfs_read_empty(page);
+
+	return logfs_segment_read(inode, page, block, index, 0);
+}
+
+/*
+ * Read @page by walking the indirect tree from the top level down to the
+ * level of @page.  Intermediate pages are obtained and released according
+ * to @rw_context.  Missing pointers along the way make the page read as
+ * empty.  Returns 0 or a negative error.
+ */
+static int logfs_read_loop(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level, target_level;
+	int ret;
+	struct page *ipage;
+
+	logfs_unpack_index(page->index, &bix, &target_level);
+	if (!bofs)
+		return logfs_read_empty(page);
+
+	if (bix >= maxbix(li->li_height))
+		return logfs_read_empty(page);
+
+	for (level = LEVEL(li->li_height);
+			(__force u8)level > (__force u8)target_level;
+			level = SUBLEVEL(level)){
+		ipage = logfs_get_page(inode, bix, level, rw_context);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			/*
+			 * Release the page the same way it was obtained.  In
+			 * write context the page may be pre-locked by its
+			 * owner and must not be unlocked unconditionally.
+			 */
+			logfs_put_page(ipage, rw_context);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_page(ipage, rw_context);
+		if (!bofs)
+			return logfs_read_empty(page);
+	}
+
+	return logfs_segment_read(inode, page, bofs, bix, 0);
+}
+
+/* Read @page either from an embedded pointer or through the tree. */
+static int logfs_read_block(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	pgoff_t index = page->index;
+
+	if (index < I0_BLOCKS)
+		return logfs_read_direct(inode, page);
+	return logfs_read_loop(inode, page, rw_context);
+}
+
+/*
+ * Walk the indirect tree for block @bix.  Returns 1 if a pointer exists on
+ * every level, 0 if the block does not exist and a negative error if a
+ * page could not be obtained or read.
+ */
+static int logfs_exist_loop(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret;
+	struct page *ipage;
+
+	if (!bofs)
+		return 0;
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		ipage = logfs_get_read_page(inode, bix, level);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_read_page(ipage);
+		if (!bofs)
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Does block @bix of @inode exist?  See logfs_exist_loop() for returns. */
+int logfs_exist_block(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS)
+		return !!li->li_data[bix];
+	return logfs_exist_loop(inode, bix);
+}
+
+/*
+ * Scan the embedded pointers starting at @bix for the next data block
+ * (@data != 0) or hole (@data == 0).  Returns I0_BLOCKS if none is found.
+ */
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	for (; bix < I0_BLOCKS; bix++)
+		if (data ^ (li->li_data[bix] == 0))
+			return bix;
+	return I0_BLOCKS;
+}
+
+/*
+ * Walk the indirect tree from the top level down, looking for the next data
+ * block (@data != 0) or hole (@data == 0) at or after @bix.  Returns the
+ * block index reached; if a page cannot be obtained or read the search
+ * simply stops and the current index is returned.
+ */
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	__be64 *rblock;
+	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret, slot;
+	struct page *page;
+
+	BUG_ON(!bofs);
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		/*
+		 * Number of blocks covered by one slot on this level.  Do the
+		 * shift in 64 bits, an int shift overflows on high levels.
+		 */
+		increment = (u64)1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
+		page = logfs_get_read_page(inode, bix, level);
+		if (!page)
+			return bix;
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(page);
+			return bix;
+		}
+
+		slot = get_bits(bix, SUBLEVEL(level));
+		rblock = kmap_atomic(page, KM_USER0);
+		while (slot < LOGFS_BLOCK_FACTOR) {
+			if (data && (rblock[slot] != 0))
+				break;
+			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+				break;
+			slot++;
+			/* Advance to the first block of the next slot. */
+			bix += increment;
+			bix &= ~(increment - 1);
+		}
+		if (slot >= LOGFS_BLOCK_FACTOR) {
+			kunmap_atomic(rblock, KM_USER0);
+			logfs_put_read_page(page);
+			return bix;
+		}
+		bofs = be64_to_cpu(rblock[slot]);
+		kunmap_atomic(rblock, KM_USER0);
+		logfs_put_read_page(page);
+		if (!bofs) {
+			BUG_ON(data);
+			return bix;
+		}
+	}
+	return bix;
+}
+
+/**
+ * logfs_seek_hole - find next hole starting at a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next hole.  If the file doesn't contain any further holes, the
+ * block address next to eof is returned instead.
+ */
+u64 logfs_seek_hole(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 0);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (!li->li_data[INDIRECT_INDEX])
+		return bix;
+	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
+		bix = maxbix(li->li_height);
+	else {
+		bix = seek_holedata_loop(inode, bix, 0);
+		if (bix < maxbix(li->li_height))
+			return bix;
+		/* Should not happen anymore.  But if some port writes semi-
+		 * corrupt images (as this one used to) we might run into it.
+		 */
+		WARN_ON_ONCE(bix == maxbix(li->li_height));
+	}
+
+	return bix;
+}
+
+/* Unclamped worker for logfs_seek_data(). */
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 1);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (bix < maxbix(li->li_height)) {
+		if (!li->li_data[INDIRECT_INDEX])
+			bix = maxbix(li->li_height);
+		else
+			return seek_holedata_loop(inode, bix, 1);
+	}
+
+	return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next data block.  If the file doesn't contain any further data
+ * blocks, the last block in the file is returned instead.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 ret, end;
+
+	ret = __logfs_seek_data(inode, bix);
+	end = i_size_read(inode) >> sb->s_blocksize_bits;
+	if (ret >= end)
+		ret = max(bix, end);
+	return ret;
+}
+
+/* Does the embedded pointer for @bix still point at @ofs? */
+static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
+{
+	return pure_ofs(li->li_data[bix]) == ofs;
+}
+
+/*
+ * Follow the pointers leading to @bix, starting with @bofs at the top
+ * level.  Returns 1 as soon as a pointer equal to @ofs is found, 0 if the
+ * walk ends (or fails to read) without finding one.
+ */
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
+		u64 ofs, u64 bofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	level_t level;
+	int ret;
+	struct page *page;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
+		page = logfs_get_write_page(inode, bix, level);
+		BUG_ON(!page);
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_write_page(page);
+			return 0;
+		}
+
+		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_write_page(page);
+		if (!bofs)
+			return 0;
+
+		if (pure_ofs(bofs) == ofs)
+			return 1;
+	}
+	return 0;
+}
+
+/* Validity check for blocks reached through the indirect tree. */
+static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+
+	if (!bofs)
+		return 0;
+
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	if (pure_ofs(bofs) == ofs)
+		return 1;
+
+	return __logfs_is_valid_loop(inode, bix, ofs, bofs);
+}
+
+/* Is the object at @ofs still referenced as block @bix of @inode? */
+static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* Unlinked inode with no user besides us: nothing in it is valid. */
+	if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
+		return 0;
+
+	if (bix < I0_BLOCKS)
+		return logfs_is_valid_direct(li, bix, ofs);
+	return logfs_is_valid_loop(inode, bix, ofs);
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ * @sb:		superblock
+ * @ofs:	block physical offset
+ * @ino:	block inode number
+ * @bix:	block index
+ * @gc_level:	block level (not used by this function)
+ *
+ * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
+ * become invalid once the journal is written.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	int ret, cookie;
+
+	/* Umount closes a segment with free blocks remaining.  Those
+	 * blocks are by definition invalid. */
+	if (ino == -1)
+		return 0;
+
+	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	if (IS_ERR(inode))
+		goto invalid;
+
+	ret = __logfs_is_valid_block(inode, bix, ofs);
+	logfs_safe_iput(inode, cookie);
+	if (ret)
+		return ret;
+
+invalid:
+	/* Block is nominally invalid, but may still sit in the shadow tree,
+	 * waiting for a journal commit.
+	 */
+	if (btree_lookup64(&super->s_shadow_tree.old, ofs))
+		return 2;
+	return 0;
+}
+
+/*
+ * Read @page and update its uptodate/error flags according to the result.
+ * As the name says, the page lock is not touched here.
+ */
+int logfs_readpage_nolock(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	ret = logfs_read_block(inode, page, READ);
+
+	if (ret) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	flush_dcache_page(page);
+
+	return ret;
+}
+
+/*
+ * Check whether @bytes more bytes can be written.  Nothing is recorded
+ * here.  Returns 0 or -ENOSPC; the last s_root_reserve bytes are only
+ * available with CAP_SYS_RESOURCE.
+ */
+static int logfs_reserve_bytes(struct inode *inode, int bytes)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	u64 available = super->s_free_bytes + super->s_dirty_free_bytes
+			- super->s_dirty_used_bytes - super->s_dirty_pages;
+
+	if (!bytes)
+		return 0;
+
+	if (available < bytes)
+		return -ENOSPC;
+
+	if (available < bytes + super->s_root_reserve &&
+			!capable(CAP_SYS_RESOURCE))
+		return -ENOSPC;
+
+	return 0;
+}
+
+/*
+ * Reserve space for a later write of @page and account it in
+ * s_dirty_pages.  A page that already carries a reservation is left alone.
+ * Returns 0 or -ENOSPC.
+ */
+int get_page_reserve(struct inode *inode, struct page *page)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	int ret;
+
+	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+		return 0;
+
+	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
+	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	if (!ret) {
+		alloc_data_block(inode, page);
+		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+	}
+	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
+	return ret;
+}
+
+/*
+ * We are protected by write lock.  Push victims up to superblock level
+ * and release transaction when appropriate.
+ */
+/* FIXME: This is currently called from the wrong spots.
*/
+static void logfs_handle_transaction(struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	struct logfs_super *sbi = logfs_super(inode->i_sb);
+
+	/* Nothing attached to the block - nothing to do. */
+	if (!ta)
+		return;
+
+	/* The transaction is being consumed, detach it from the block. */
+	logfs_inode(inode)->li_block->ta = NULL;
+
+	if (inode->i_ino != LOGFS_INO_MASTER) {
+		/*
+		 * FIXME: Yes, this needs more thought.  The idea was to just
+		 * remember the transaction until the inode is written:
+		 *	BUG_ON(logfs_inode(inode)->li_transaction);
+		 *	logfs_inode(inode)->li_transaction = ta;
+		 */
+		BUG();
+		return;
+	}
+
+	/*
+	 * First steps publish the victim / rename position in the
+	 * superblock, final steps verify them, clear them again and free
+	 * the transaction.
+	 */
+	switch (ta->state) {
+	case CREATE_1: /* fall through */
+	case UNLINK_1:
+		BUG_ON(sbi->s_victim_ino);
+		sbi->s_victim_ino = ta->ino;
+		break;
+
+	case CREATE_2: /* fall through */
+	case UNLINK_2:
+		BUG_ON(sbi->s_victim_ino != ta->ino);
+		sbi->s_victim_ino = 0;
+		/* transaction ends here - free it */
+		kfree(ta);
+		break;
+
+	case CROSS_RENAME_1:
+		BUG_ON(sbi->s_rename_dir);
+		BUG_ON(sbi->s_rename_pos);
+		sbi->s_rename_dir = ta->dir;
+		sbi->s_rename_pos = ta->pos;
+		break;
+
+	case CROSS_RENAME_2:
+		BUG_ON(sbi->s_rename_dir != ta->dir);
+		BUG_ON(sbi->s_rename_pos != ta->pos);
+		sbi->s_rename_dir = 0;
+		sbi->s_rename_pos = 0;
+		kfree(ta);
+		break;
+
+	case TARGET_RENAME_1:
+		BUG_ON(sbi->s_rename_dir);
+		BUG_ON(sbi->s_rename_pos);
+		BUG_ON(sbi->s_victim_ino);
+		sbi->s_rename_dir = ta->dir;
+		sbi->s_rename_pos = ta->pos;
+		sbi->s_victim_ino = ta->ino;
+		break;
+
+	case TARGET_RENAME_2:
+		/* The victim stays recorded until TARGET_RENAME_3. */
+		BUG_ON(sbi->s_rename_dir != ta->dir);
+		BUG_ON(sbi->s_rename_pos != ta->pos);
+		BUG_ON(sbi->s_victim_ino != ta->ino);
+		sbi->s_rename_dir = 0;
+		sbi->s_rename_pos = 0;
+		break;
+
+	case TARGET_RENAME_3:
+		BUG_ON(sbi->s_rename_dir);
+		BUG_ON(sbi->s_rename_pos);
+		BUG_ON(sbi->s_victim_ino != ta->ino);
+		sbi->s_victim_ino = 0;
+		kfree(ta);
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Not strictly a reservation, but rather a check that we still have enough
+ * space to satisfy the write.
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+	int bytes = blocks * LOGFS_MAX_OBJECTSIZE;
+
+	return logfs_reserve_bytes(inode, bytes);
+}
+
+/* State handed up and down the tree while a single block is written. */
+struct write_control {
+	u64 ofs;	/* in: old offset of the block, out: new offset */
+	long flags;	/* WF_* flags */
+};
+
+/*
+ * Allocate a zeroed shadow for block (@bix, @level) of @inode.  @old_ofs is
+ * recorded with the LOGFS_FULLY_POPULATED flag stripped.
+ */
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
+		level_t level, u64 old_ofs)
+{
+	struct logfs_shadow *shadow;
+
+	shadow = mempool_alloc(logfs_super(inode->i_sb)->s_shadow_pool,
+			GFP_NOFS);
+	memset(shadow, 0, sizeof(*shadow));
+
+	shadow->ino = inode->i_ino;
+	shadow->bix = bix;
+	shadow->gc_level = expand_level(inode->i_ino, level);
+	shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+	return shadow;
+}
+
+/* Give @shadow back to the shadow mempool of @inode's superblock. */
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+	mempool_free(shadow, logfs_super(inode->i_sb)->s_shadow_pool);
+}
+
+/**
+ * fill_shadow_tree - Propagate shadow tree changes due to a write
+ * @inode:	Inode owning the page
+ * @page:	Struct page that was written
+ * @shadow:	Shadow for the current write
+ *
+ * Writes in logfs can result in two semi-valid objects.  The old object
+ * is still valid as long as it can be reached by following pointers on
+ * the medium.  Only when writes propagate all the way up to the journal
+ * has the new object safely replaced the old one.
+ *
+ * To handle this problem, a struct logfs_shadow is used to represent
+ * every single write.  It is attached to the indirect block, which is
+ * marked dirty.  When the indirect block is written, its shadows are
+ * handed up to the next indirect block (or inode).  Ultimately they
+ * will reach the master inode and be freed upon journal commit.
+ *
+ * This function handles a single step in the propagation.  It adds the
+ * shadow for the current write to the tree, along with any shadows in
+ * the page's tree, in case it was an indirect block.
If a page is
+ * written, the inode parameter is left NULL, if an inode is written,
+ * the page parameter is left NULL.
+ *
+ * NOTE(review): the body below dereferences both @inode and @page without
+ * checking them, so the sentence above looks stale - confirm with callers.
+ */
+static void fill_shadow_tree(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct logfs_super *sbi = logfs_super(inode->i_sb);
+	struct shadow_tree *tree = &sbi->s_shadow_tree;
+	struct logfs_block *block = logfs_block(page);
+
+	if (PagePrivate(page)) {
+		/* The block goes away, and its aliases with it. */
+		if (block->alias_map)
+			sbi->s_no_object_aliases -= bitmap_weight(
+					block->alias_map, LOGFS_BLOCK_FACTOR);
+		logfs_handle_transaction(inode, block->ta);
+		block->ops->free_block(inode->i_sb, block);
+	}
+
+	if (!shadow)
+		return;
+
+	/* Index by the old offset if there is one, else by the new offset. */
+	if (shadow->old_ofs)
+		btree_insert64(&tree->old, shadow->old_ofs, shadow,
+				GFP_NOFS);
+	else
+		btree_insert64(&tree->new, shadow->new_ofs, shadow,
+				GFP_NOFS);
+
+	sbi->s_dirty_used_bytes += shadow->new_len;
+	sbi->s_dirty_free_bytes += shadow->old_len;
+}
+
+/*
+ * Record that slot @child_no of @block changed and needs an object alias,
+ * and move the block to the tail of the superblock's alias list.
+ */
+static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
+		long child_no)
+{
+	struct logfs_super *sbi = logfs_super(sb);
+
+	/* Aliases in the master inode are pointless. */
+	if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER)
+		return;
+
+	/* Count every slot only once. */
+	if (!test_bit(child_no, block->alias_map)) {
+		set_bit(child_no, block->alias_map);
+		sbi->s_no_object_aliases++;
+	}
+	list_move_tail(&block->alias_list, &sbi->s_object_alias);
+}
+
+/*
+ * Object aliases can and often do change the size and occupied space of a
+ * file.  So not only do we have to change the pointers, we also have to
+ * change inode->i_size and li->li_used_bytes.  Which is done by setting
+ * another two object aliases for the inode itself.
+ */
+static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_inode *li;
+
+	/* Same length before and after - the accounting does not change. */
+	if (shadow->new_len == shadow->old_len)
+		return;
+
+	li = logfs_inode(inode);
+	alloc_inode_block(inode);
+	li->li_used_bytes += shadow->new_len - shadow->old_len;
+	__logfs_set_blocks(inode);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
+}
+
+/*
+ * Write and/or delete a single block, depending on wc->flags.  On entry
+ * wc->ofs holds the old offset of the block (0 if there is none); on
+ * success it holds the new one, with LOGFS_FULLY_POPULATED set where
+ * appropriate.  Returns 0, -ENOSPC or the error of the segment write.
+ */
+static int logfs_write_i0(struct inode *inode, struct page *page,
+		struct write_control *wc)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	level_t level;
+	int is_full = 1;
+	int err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Only blocks without an old copy consume additional space. */
+	if (wc->ofs == 0 && logfs_reserve_blocks(inode, 1))
+		return -ENOSPC;
+
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+	if (wc->flags & WF_WRITE)
+		err = logfs_segment_write(inode, page, shadow);
+	if (wc->flags & WF_DELETE)
+		logfs_segment_delete(inode, shadow);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	set_iused(inode, shadow);
+	if (level != 0) {
+		/* Sample the counter now, fill_shadow_tree frees the block. */
+		alloc_indirect_block(inode, page, 0);
+		is_full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
+	}
+	fill_shadow_tree(inode, page, shadow);
+
+	wc->ofs = shadow->new_ofs;
+	if (wc->ofs && is_full)
+		wc->ofs |= LOGFS_FULLY_POPULATED;
+	return 0;
+}
+
+/* Write a block whose pointer is embedded in the inode itself. */
+static int logfs_write_direct(struct inode *inode, struct page *page,
+		long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	const pgoff_t slot = page->index;
+	struct write_control wc = {
+		.ofs = li->li_data[slot],
+		.flags = flags,
+	};
+	int err;
+
+	alloc_inode_block(inode);
+
+	err = logfs_write_i0(inode, page, &wc);
+	if (err)
+		return err;
+
+	li->li_data[slot] = wc.ofs;
+	logfs_set_alias(inode->i_sb, li->li_block, slot + INODE_POINTER_OFS);
+	return 0;
+}
+
+/*
+ * Would the parent's pointer to @page have to change?  Returns 1 if the
+ * empty or fully-populated state encoded in @ofs no longer matches the
+ * counters of the block attached to @page, 0 otherwise.
+ */
+static int ptr_change(u64 ofs, struct page *page)
+{
+	struct logfs_block *block = logfs_block(page);
+	int was_empty = ofs == 0;
+	int is_empty = block->partial == 0;
+	int was_full, is_full;
+
+	if (was_empty != is_empty)
+		return 1;
+
+	/* The !! is necessary to shrink result to int */
+	was_full = !!(ofs & LOGFS_FULLY_POPULATED);
+	is_full = block->full == LOGFS_BLOCK_FACTOR;
+	return was_full != is_full;
+}
+
+/*
+ * One level of the recursive write: fetch the indirect page for @bix at
+ * @level, write the child below it (recursing until @target_level is
+ * reached), store the child's new pointer and finally either rewrite the
+ * indirect page or just record an alias for the changed slot.
+ */
+static int __logfs_write_rec(struct inode *inode, struct page *page,
+		struct write_control *this_wc,
+		pgoff_t bix, level_t target_level, level_t level)
+{
+	int child_no = get_bits(bix, SUBLEVEL(level));
+	struct write_control child_wc = {
+		.flags = this_wc->flags,
+	};
+	struct page *ipage;
+	int page_empty = 0;
+	int ret;
+
+	ipage = logfs_get_write_page(inode, bix, level);
+	if (!ipage)
+		return -ENOMEM;
+
+	if (this_wc->ofs) {
+		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+		if (ret)
+			goto out;
+	} else if (!PageUptodate(ipage)) {
+		page_empty = 1;
+		logfs_read_empty(ipage);
+	}
+
+	child_wc.ofs = block_get_pointer(ipage, child_no);
+
+	if ((__force u8)level-1 > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &child_wc, bix,
+				target_level, SUBLEVEL(level));
+	else
+		ret = logfs_write_i0(inode, page, &child_wc);
+	if (ret)
+		goto out;
+
+	alloc_indirect_block(inode, ipage, page_empty);
+	block_set_pointer(ipage, child_no, child_wc.ofs);
+	/* FIXME: first condition seems superfluous */
+	if (child_wc.ofs || logfs_block(ipage)->partial)
+		this_wc->flags |= WF_WRITE;
+	/*
+	 * the condition on this_wc->ofs ensures that we won't consume extra
+	 * space for indirect blocks in the future, which we cannot reserve
+	 */
+	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
+		ret = logfs_write_i0(inode, ipage, this_wc);
+	else
+		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
+out:
+	logfs_put_write_page(ipage);
+	return ret;
+}
+
+/*
+ * Write @page through the indirect tree, starting at the inode's indirect
+ * pointer, and update that pointer (plus its alias) if it changed.
+ */
+static int logfs_write_rec(struct inode *inode, struct page *page,
+		pgoff_t bix, level_t target_level, long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[INDIRECT_INDEX],
+		.flags = flags,
+	};
+	int ret;
+
+	alloc_inode_block(inode);
+
+	if (li->li_height > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
+				LEVEL(li->li_height));
+	else
+		ret = logfs_write_i0(inode, page, &wc);
+	if (ret)
+		return ret;
+
+	if (li->li_data[INDIRECT_INDEX] == wc.ofs)
+		return ret;
+
+	li->li_data[INDIRECT_INDEX] = wc.ofs;
+	logfs_set_alias(inode->i_sb, li->li_block,
+			INDIRECT_INDEX + INODE_POINTER_OFS);
+	return ret;
+}
+
+/* Attach transaction @ta to the block of @inode, allocating it if needed. */
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	alloc_inode_block(inode);
+	logfs_inode(inode)->li_block->ta = ta;
+}
+
+/* Detach whatever transaction is attached to @inode's block; @ta is unused. */
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	struct logfs_block *block = logfs_inode(inode)->li_block;
+
+	if (!block || !block->ta)
+		return;
+	block->ta = NULL;
+}
+
+/*
+ * Add indirect levels on top of the tree until it is at least @level high
+ * and covers block index @bix.  Each new top block gets the old top as its
+ * first pointer.  Returns 0, -ENOMEM or a write error.
+ */
+static int grow_inode(struct inode *inode, u64 bix, level_t level)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u8 height = (__force u8)level;
+	struct write_control wc = {
+		.flags = WF_WRITE,
+	};
+	struct page *top;
+	int err;
+
+	BUG_ON(height > 5 || li->li_height > 5);
+	while (height > li->li_height || bix >= maxbix(li->li_height)) {
+		top = logfs_get_write_page(inode, I0_BLOCKS + 1,
+				LEVEL(li->li_height + 1));
+		if (!top)
+			return -ENOMEM;
+
+		logfs_read_empty(top);
+		alloc_indirect_block(inode, top, 1);
+		block_set_pointer(top, 0, li->li_data[INDIRECT_INDEX]);
+		err = logfs_write_i0(inode, top, &wc);
+		logfs_put_write_page(top);
+		if (err)
+			return err;
+
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+		/* The next level starts without an old copy again. */
+		wc.ofs = 0;
+		li->li_height++;
+		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
+	}
+	return 0;
+}
+
+/* Worker for logfs_write_buf(); see there for the locking. */
+static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	pgoff_t index = page->index;
+	u64 bix;
+	level_t level;
+	int err;
+
+	/* A rewrite both writes the new copy and deletes the old one. */
+	flags |= WF_WRITE | WF_DELETE;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	logfs_unpack_index(index, &bix, &level);
+	/* The reservation taken for this page is being used up now. */
+	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
+
+	if (index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+
+	bix = adjust_bix(bix, level);
+	err = grow_inode(inode, bix, level);
+	if (err)
+		return err;
+	return logfs_write_rec(inode, page, bix, level, flags);
+}
+
+/*
+ * Write @page of @inode to the medium.  WF_LOCK in @flags decides whether
+ * s_write_mutex is taken around the write.
+ */
+int logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	const int lock = flags & WF_LOCK;
+	int ret;
+
+	logfs_get_wblocks(sb, page, lock);
+	ret = __logfs_write_buf(inode, page, flags);
+	logfs_put_wblocks(sb, page, lock);
+	return ret;
+}
+
+/* Worker for logfs_delete(): same paths as a write, but with WF_DELETE only. */
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+	const long flags = WF_DELETE;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	if (page->index >= I0_BLOCKS)
+		return logfs_write_rec(inode, page, page->index, 0, flags);
+	return logfs_write_direct(inode, page, flags);
+}
+
+/*
+ * Delete block @index of @inode.  @shadow_tree is not used by this
+ * function.  Returns 0, -ENOMEM or the error of the underlying write path.
+ */
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page = logfs_get_read_page(inode, index, 0);
+	int ret;
+
+	if (!page)
+		return -ENOMEM;
+
+	/* Non-zero lock argument: run under s_write_mutex. */
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_read_page(page);
+
+	return ret;
+}
+
+/* Rewrite cannot mark the inode dirty but has to write it immediately.
*/ +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? */ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * 
LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if 
(!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + if (!err) + err = vmtruncate(inode, size); + + /* I don't trust error recovery yet. 
*/ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino 
== LOGFS_INO_MASTER); + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +/* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ +void logfs_clear_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? 
*/ + err = logfs_write_buf(master_inode, page, 0); + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + se->ec_level = 
cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. 
+ */ +void logfs_delete_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All caller of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. 
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + destroy_meta_inode(super->s_segfile_inode); + if (super->s_block_pool) + mempool_destroy(super->s_block_pool); + if (super->s_shadow_pool) + mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 000000000000..5f58b74516ca --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,924 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + 
* Copyright (c) 2005-2008 Joern Engel + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. + */ +#include "logfs.h" + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && 
!use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + SetPageUptodate(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len == PAGE_SIZE) { + /* The math in this function can surely use some love */ + len = 0; + } + if (len) { + BUG_ON(area->a_used_bytes >= super->s_segsize); + + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } + + if (!final) + return; + + area->a_used_bytes += len; + for ( ; area->a_used_bytes < super->s_segsize; + area->a_used_bytes += PAGE_SIZE) { + /* Memset another page */ + index++; + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page), 0xff, PAGE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + } +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. 
+ */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static gc_level_t btree_block_level(struct logfs_block *block) +{ + return expand_level(block->ino, block->level); +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .block_level = btree_block_level, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, 
bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int __logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + 
LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. 
+ */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. 
+ */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet 
annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, 
be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block index + * @level: block level + * + * Returns 0 on success or a negative errno. 
+ */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_object_header h; + u16 len; + int err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +static void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct 
logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). 
+ */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long 
l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + destroy_meta_inode(super->s_mapping_inode); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 000000000000..d128a2c1c8d1 --- 
/dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,634 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include +#include +#include +#include + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x 
%8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header 
*sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = 
seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + /* If neither is valid now, something's wrong. 
Didn't we properly + * check them before?!? */ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +{ + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) + goto fail; + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) + goto fail2; + + log_super("LogFS: Finished mounting\n"); + simple_set_mnt(mnt, sb); + return 0; + +fail2: + iput(rootdir); +fail: + iput(logfs_super(sb)->s_master_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(first)) { + 
page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EIO; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + 
+static int logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(super->s_master_inode); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. 
+ */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + super->s_devops->put_device(sb); + mempool_destroy(super->s_btree_pool); + mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt) +{ + struct logfs_super *super; + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto err0; + + super->s_mtd = mtd; + super->s_bdev = bdev; + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) + goto err0; + + if (sb->s_root) { + /* Device is already in use */ + err = 0; + simple_set_mnt(mnt, sb); + goto err0; + } + + super->s_devops = devops; + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. 
+ */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb, mnt); + if (err) + goto err1; + return 0; + +err1: + up_write(&sb->s_umount); + deactivate_super(sb); + return err; +err0: + kfree(super); + //devops->put_device(sb); + return err; +} + +static int logfs_get_sb(struct file_system_type *type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + ulong mtdnr; + + if (!devname) + return logfs_get_sb_bdev(type, flags, devname, mnt); + if (strncmp(devname, "mtd", 3)) + return logfs_get_sb_bdev(type, flags, devname, mnt); + + { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + return -EINVAL; + } + + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .get_sb = logfs_get_sb, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel "); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/include/linux/btree-128.h b/include/linux/btree-128.h new file mode 100644 index 000000000000..0b3414c4c928 --- /dev/null +++ b/include/linux/btree-128.h @@ -0,0 +1,109 @@ +extern 
struct btree_geo btree_geo128; + +struct btree_head128 { struct btree_head h; }; + +static inline void btree_init_mempool128(struct btree_head128 *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int btree_init128(struct btree_head128 *head) +{ + return btree_init(&head->h); +} + +static inline void btree_destroy128(struct btree_head128 *head) +{ + btree_destroy(&head->h); +} + +static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_get_prev128(struct btree_head128 *head, + u64 *k1, u64 *k2) +{ + u64 key[2] = {*k1, *k2}; + void *val; + + val = btree_get_prev(&head->h, &btree_geo128, + (unsigned long *)&key); + *k1 = key[0]; + *k2 = key[1]; + return val; +} + +static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2, + void *val, gfp_t gfp) +{ + u64 key[2] = {k1, k2}; + return btree_insert(&head->h, &btree_geo128, + (unsigned long *)&key, val, gfp); +} + +static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2, + void *val) +{ + u64 key[2] = {k1, k2}; + return btree_update(&head->h, &btree_geo128, + (unsigned long *)&key, val); +} + +static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2) +{ + u64 key[2] = {k1, k2}; + return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key); +} + +static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2) +{ + u64 key[2]; + void *val; + + val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]); + if (val) { + *k1 = key[0]; + *k2 = key[1]; + } + + return val; +} + +static inline int btree_merge128(struct btree_head128 *target, + struct btree_head128 *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, &btree_geo128, gfp); +} + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, 
void *__func); + +typedef void (*visitor128_t)(void *elem, unsigned long opaque, + u64 key1, u64 key2, size_t index); + +static inline size_t btree_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +static inline size_t btree_grim_visitor128(struct btree_head128 *head, + unsigned long opaque, + visitor128_t func2) +{ + return btree_grim_visitor(&head->h, &btree_geo128, opaque, + visitor128, func2); +} + +#define btree_for_each_safe128(head, k1, k2, val) \ + for (val = btree_last128(head, &k1, &k2); \ + val; \ + val = btree_get_prev128(head, &k1, &k2)) + diff --git a/include/linux/btree-type.h b/include/linux/btree-type.h new file mode 100644 index 000000000000..9a1147ef8563 --- /dev/null +++ b/include/linux/btree-type.h @@ -0,0 +1,147 @@ +#define __BTREE_TP(pfx, type, sfx) pfx ## type ## sfx +#define _BTREE_TP(pfx, type, sfx) __BTREE_TP(pfx, type, sfx) +#define BTREE_TP(pfx) _BTREE_TP(pfx, BTREE_TYPE_SUFFIX,) +#define BTREE_FN(name) BTREE_TP(btree_ ## name) +#define BTREE_TYPE_HEAD BTREE_TP(struct btree_head) +#define VISITOR_FN BTREE_TP(visitor) +#define VISITOR_FN_T _BTREE_TP(visitor, BTREE_TYPE_SUFFIX, _t) + +BTREE_TYPE_HEAD { + struct btree_head h; +}; + +static inline void BTREE_FN(init_mempool)(BTREE_TYPE_HEAD *head, + mempool_t *mempool) +{ + btree_init_mempool(&head->h, mempool); +} + +static inline int BTREE_FN(init)(BTREE_TYPE_HEAD *head) +{ + return btree_init(&head->h); +} + +static inline void BTREE_FN(destroy)(BTREE_TYPE_HEAD *head) +{ + btree_destroy(&head->h); +} + +static inline int BTREE_FN(merge)(BTREE_TYPE_HEAD *target, + BTREE_TYPE_HEAD *victim, + gfp_t gfp) +{ + return btree_merge(&target->h, &victim->h, BTREE_TYPE_GEO, gfp); +} + +#if (BITS_PER_LONG > BTREE_TYPE_BITS) +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_lookup(&head->h, BTREE_TYPE_GEO, 
&_key); +} + +static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val, gfp_t gfp) +{ + unsigned long _key = key; + return btree_insert(&head->h, BTREE_TYPE_GEO, &_key, val, gfp); +} + +static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val) +{ + unsigned long _key = key; + return btree_update(&head->h, BTREE_TYPE_GEO, &_key, val); +} + +static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + unsigned long _key = key; + return btree_remove(&head->h, BTREE_TYPE_GEO, &_key); +} + +static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key; + void *val = btree_last(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} + +static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + unsigned long _key = *key; + void *val = btree_get_prev(&head->h, BTREE_TYPE_GEO, &_key); + if (val) + *key = _key; + return val; +} +#else +static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + return btree_lookup(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key); +} + +static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val, gfp_t gfp) +{ + return btree_insert(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, + val, gfp); +} + +static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key, + void *val) +{ + return btree_update(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, val); +} + +static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key) +{ + return btree_remove(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key); +} + +static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + return btree_last(&head->h, BTREE_TYPE_GEO, (unsigned long *)key); +} + +static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key) +{ + return btree_get_prev(&head->h, BTREE_TYPE_GEO, 
(unsigned long *)key); +} +#endif + +void VISITOR_FN(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func); + +typedef void (*VISITOR_FN_T)(void *elem, unsigned long opaque, + BTREE_KEYTYPE key, size_t index); + +static inline size_t BTREE_FN(visitor)(BTREE_TYPE_HEAD *head, + unsigned long opaque, + VISITOR_FN_T func2) +{ + return btree_visitor(&head->h, BTREE_TYPE_GEO, opaque, + visitorl, func2); +} + +static inline size_t BTREE_FN(grim_visitor)(BTREE_TYPE_HEAD *head, + unsigned long opaque, + VISITOR_FN_T func2) +{ + return btree_grim_visitor(&head->h, BTREE_TYPE_GEO, opaque, + visitorl, func2); +} + +#undef VISITOR_FN +#undef VISITOR_FN_T +#undef __BTREE_TP +#undef _BTREE_TP +#undef BTREE_TP +#undef BTREE_FN +#undef BTREE_TYPE_HEAD +#undef BTREE_TYPE_SUFFIX +#undef BTREE_TYPE_GEO +#undef BTREE_KEYTYPE +#undef BTREE_TYPE_BITS diff --git a/include/linux/btree.h b/include/linux/btree.h new file mode 100644 index 000000000000..65b5bb058324 --- /dev/null +++ b/include/linux/btree.h @@ -0,0 +1,243 @@ +#ifndef BTREE_H +#define BTREE_H + +#include +#include + +/** + * DOC: B+Tree basics + * + * A B+Tree is a data structure for looking up arbitrary (currently allowing + * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure + * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not + * use binary search to find the key on lookups. + * + * Each B+Tree consists of a head, that contains bookkeeping information and + * a variable number (starting with zero) nodes. Each node contains the keys + * and pointers to sub-nodes, or, for leaf nodes, the keys and values for the + * tree entries. + * + * Each node in this implementation has the following layout: + * [key1, key2, ..., keyN] [val1, val2, ..., valN] + * + * Each key here is an array of unsigned longs, geo->no_longs in total. The + * number of keys and values (N) is geo->no_pairs. 
+ */ + +/** + * struct btree_head - btree head + * + * @node: the first node in the tree + * @mempool: mempool used for node allocations + * @height: current of the tree + */ +struct btree_head { + unsigned long *node; + mempool_t *mempool; + int height; +}; + +/* btree geometry */ +struct btree_geo; + +/** + * btree_alloc - allocate function for the mempool + * @gfp_mask: gfp mask for the allocation + * @pool_data: unused + */ +void *btree_alloc(gfp_t gfp_mask, void *pool_data); + +/** + * btree_free - free function for the mempool + * @element: the element to free + * @pool_data: unused + */ +void btree_free(void *element, void *pool_data); + +/** + * btree_init_mempool - initialise a btree with given mempool + * + * @head: the btree head to initialise + * @mempool: the mempool to use + * + * When this function is used, there is no need to destroy + * the mempool. + */ +void btree_init_mempool(struct btree_head *head, mempool_t *mempool); + +/** + * btree_init - initialise a btree + * + * @head: the btree head to initialise + * + * This function allocates the memory pool that the + * btree needs. Returns zero or a negative error code + * (-%ENOMEM) when memory allocation fails. + * + */ +int __must_check btree_init(struct btree_head *head); + +/** + * btree_destroy - destroy mempool + * + * @head: the btree head to destroy + * + * This function destroys the internal memory pool, use only + * when using btree_init(), not with btree_init_mempool(). + */ +void btree_destroy(struct btree_head *head); + +/** + * btree_lookup - look up a key in the btree + * + * @head: the btree to look in + * @geo: the btree geometry + * @key: the key to look up + * + * This function returns the value for the given key, or %NULL. 
+ */ +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_insert - insert an entry into the btree + * + * @head: the btree to add to + * @geo: the btree geometry + * @key: the key to add (must not already be present) + * @val: the value to add (must not be %NULL) + * @gfp: allocation flags for node allocations + * + * This function returns 0 if the item could be added, or an + * error code if it failed (may fail due to memory pressure). + */ +int __must_check btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp); +/** + * btree_update - update an entry in the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to update + * @val: the value to change it to (must not be %NULL) + * + * This function returns 0 if the update was successful, or + * -%ENOENT if the key could not be found. + */ +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val); +/** + * btree_remove - remove an entry from the btree + * + * @head: the btree to update + * @geo: the btree geometry + * @key: the key to remove + * + * This function returns the removed entry, or %NULL if the key + * could not be found. + */ +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_merge - merge two btrees + * + * @target: the tree that gets all the entries + * @victim: the tree that gets merged into @target + * @geo: the btree geometry + * @gfp: allocation flags + * + * The two trees @target and @victim may not contain the same keys, + * that is a bug and triggers a BUG(). This function returns zero + * if the trees were merged successfully, and may return a failure + * when memory allocation fails, in which case both trees might have + * been partially merged, i.e. some entries have been moved from + * @victim to @target. 
+ */ +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp); + +/** + * btree_last - get last entry in btree + * + * @head: btree head + * @geo: btree geometry + * @key: last key + * + * Returns the last entry in the btree, and sets @key to the key + * of that entry; returns NULL if the tree is empty, in that case + * key is not changed. + */ +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + +/** + * btree_get_prev - get previous entry + * + * @head: btree head + * @geo: btree geometry + * @key: pointer to key + * + * The function returns the next item right before the value pointed to by + * @key, and updates @key with its key, or returns %NULL when there is no + * entry with a key smaller than the given key. + */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *key); + + +/* internal use, use btree_visitor{l,32,64,128} */ +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2); + +/* internal use, use btree_grim_visitor{l,32,64,128} */ +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2); + + +#include + +extern struct btree_geo btree_geo32; +#define BTREE_TYPE_SUFFIX l +#define BTREE_TYPE_BITS BITS_PER_LONG +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE unsigned long +#include + +#define btree_for_each_safel(head, key, val) \ + for (val = btree_lastl(head, &key); \ + val; \ + val = btree_get_prevl(head, &key)) + +#define BTREE_TYPE_SUFFIX 32 +#define BTREE_TYPE_BITS 32 +#define BTREE_TYPE_GEO &btree_geo32 +#define BTREE_KEYTYPE u32 +#include + +#define btree_for_each_safe32(head, key, val) \ + for (val = 
btree_last32(head, &key); \ + val; \ + val = btree_get_prev32(head, &key)) + +extern struct btree_geo btree_geo64; +#define BTREE_TYPE_SUFFIX 64 +#define BTREE_TYPE_BITS 64 +#define BTREE_TYPE_GEO &btree_geo64 +#define BTREE_KEYTYPE u64 +#include + +#define btree_for_each_safe64(head, key, val) \ + for (val = btree_last64(head, &key); \ + val; \ + val = btree_get_prev64(head, &key)) + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d3839c..277fbfb233b9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -156,6 +156,9 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +config BTREE + boolean + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index 2e78277eff9d..cff82612e98b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,6 +41,7 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/btree.c b/lib/btree.c new file mode 100644 index 000000000000..41859a820218 --- /dev/null +++ b/lib/btree.c @@ -0,0 +1,797 @@ +/* + * lib/btree.c - Simple In-memory B+Tree + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2007-2008 Joern Engel + * Bits and pieces stolen from Peter Zijlstra's code, which is + * Copyright 2007, Red Hat Inc. Peter Zijlstra + * GPLv2 + * + * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch + * + * A relatively simple B+Tree implementation. I have written it as a learning + * excercise to understand how B+Trees work. Turned out to be useful as well. + * + * B+Trees can be used similar to Linux radix trees (which don't have anything + * in common with textbook radix trees, beware). 
Prerequisite for them working + * well is that access to a random tree node is much faster than a large number + * of operations within each node. + * + * Disks have fulfilled the prerequisite for a long time. More recently DRAM + * has gained similar properties, as memory access times, when measured in cpu + * cycles, have increased. Cacheline sizes have increased as well, which also + * helps B+Trees. + * + * Compared to radix trees, B+Trees are more efficient when dealing with a + * sparsely populated address space. Between 25% and 50% of the memory is + * occupied with valid pointers. When densely populated, radix trees contain + * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2% + * pointers. + * + * This particular implementation stores pointers identified by a long value. + * Storing NULL pointers is illegal, lookup will return NULL when no entry + * was found. + * + * A trick was used that is not commonly found in textbooks. The lowest + * values are to the right, not to the left. All used slots within a node + * are on the left, all unused slots contain NUL values. Most operations + * simply loop once over all slots and terminate on the first NUL. + */ + +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define NODESIZE MAX(L1_CACHE_BYTES, 128) + +struct btree_geo { + int keylen; + int no_pairs; + int no_longs; +}; + +struct btree_geo btree_geo32 = { + .keylen = 1, + .no_pairs = NODESIZE / sizeof(long) / 2, + .no_longs = NODESIZE / sizeof(long) / 2, +}; +EXPORT_SYMBOL_GPL(btree_geo32); + +#define LONG_PER_U64 (64 / BITS_PER_LONG) +struct btree_geo btree_geo64 = { + .keylen = LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64), + .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo64); + +struct btree_geo btree_geo128 = { + .keylen = 2 * LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64), + .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo128); + +static struct kmem_cache *btree_cachep; + +void *btree_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmem_cache_alloc(btree_cachep, gfp_mask); +} +EXPORT_SYMBOL_GPL(btree_alloc); + +void btree_free(void *element, void *pool_data) +{ + kmem_cache_free(btree_cachep, element); +} +EXPORT_SYMBOL_GPL(btree_free); + +static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp) +{ + unsigned long *node; + + node = mempool_alloc(head->mempool, gfp); + memset(node, 0, NODESIZE); + return node; +} + +static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (l1[i] < l2[i]) + return -1; + if (l1[i] > l2[i]) + return 1; + } + return 0; +} + +static unsigned long *longcpy(unsigned long *dest, const unsigned long *src, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + dest[i] = src[i]; + return dest; +} + +static unsigned long *longset(unsigned long *s, unsigned long c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + s[i] = c; + return s; +} + +static void dec_key(struct btree_geo *geo, unsigned long *key) +{ + unsigned long val; + int i; + + for (i = 
geo->keylen - 1; i >= 0; i--) { + val = key[i]; + key[i] = val - 1; + if (val) + break; + } +} + +static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n) +{ + return &node[n * geo->keylen]; +} + +static void *bval(struct btree_geo *geo, unsigned long *node, int n) +{ + return (void *)node[geo->no_longs + n]; +} + +static void setkey(struct btree_geo *geo, unsigned long *node, int n, + unsigned long *key) +{ + longcpy(bkey(geo, node, n), key, geo->keylen); +} + +static void setval(struct btree_geo *geo, unsigned long *node, int n, + void *val) +{ + node[geo->no_longs + n] = (unsigned long) val; +} + +static void clearpair(struct btree_geo *geo, unsigned long *node, int n) +{ + longset(bkey(geo, node, n), 0, geo->keylen); + node[geo->no_longs + n] = 0; +} + +static inline void __btree_init(struct btree_head *head) +{ + head->node = NULL; + head->height = 0; +} + +void btree_init_mempool(struct btree_head *head, mempool_t *mempool) +{ + __btree_init(head); + head->mempool = mempool; +} +EXPORT_SYMBOL_GPL(btree_init_mempool); + +int btree_init(struct btree_head *head) +{ + __btree_init(head); + head->mempool = mempool_create(0, btree_alloc, btree_free, NULL); + if (!head->mempool) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(btree_init); + +void btree_destroy(struct btree_head *head) +{ + mempool_destroy(head->mempool); + head->mempool = NULL; +} +EXPORT_SYMBOL_GPL(btree_destroy); + +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) + node = bval(geo, node, 0); + + longcpy(key, bkey(geo, node, 0), geo->keylen); + return bval(geo, node, 0); +} +EXPORT_SYMBOL_GPL(btree_last); + +static int keycmp(struct btree_geo *geo, unsigned long *node, int pos, + unsigned long *key) +{ + return longcmp(bkey(geo, node, pos), key, geo->keylen); +} + +static int keyzero(struct btree_geo 
*geo, unsigned long *key) +{ + int i; + + for (i = 0; i < geo->keylen; i++) + if (key[i]) + return 0; + + return 1; +} + +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return NULL; + node = bval(geo, node, i); + if (!node) + return NULL; + } + + if (!node) + return NULL; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) + return bval(geo, node, i); + return NULL; +} +EXPORT_SYMBOL_GPL(btree_lookup); + +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return -ENOENT; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return -ENOENT; + node = bval(geo, node, i); + if (!node) + return -ENOENT; + } + + if (!node) + return -ENOENT; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) { + setval(geo, node, i, val); + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(btree_update); + +/* + * Usually this function is quite similar to normal lookup. But the key of + * a parent node may be smaller than the smallest key of all its siblings. + * In such a case we cannot just return NULL, as we have only proven that no + * key smaller than __key, but larger than this parent key exists. + * So we set __key to the parent key and retry. We have to use the smallest + * such parent key, which is the last parent key we encountered. 
+ */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *__key) +{ + int i, height; + unsigned long *node, *oldnode; + unsigned long *retry_key = NULL, key[geo->keylen]; + + if (keyzero(geo, __key)) + return NULL; + + if (head->height == 0) + return NULL; +retry: + longcpy(key, __key, geo->keylen); + dec_key(geo, key); + + node = head->node; + for (height = head->height ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + goto miss; + oldnode = node; + node = bval(geo, node, i); + if (!node) + goto miss; + retry_key = bkey(geo, oldnode, i); + } + + if (!node) + goto miss; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) { + if (bval(geo, node, i)) { + longcpy(__key, bkey(geo, node, i), geo->keylen); + return bval(geo, node, i); + } else + goto miss; + } + } +miss: + if (retry_key) { + __key = retry_key; + retry_key = NULL; + goto retry; + } + return NULL; +} + +static int getpos(struct btree_geo *geo, unsigned long *node, + unsigned long *key) +{ + int i; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) + break; + } + return i; +} + +static int getfill(struct btree_geo *geo, unsigned long *node, int start) +{ + int i; + + for (i = start; i < geo->no_pairs; i++) + if (!bval(geo, node, i)) + break; + return i; +} + +/* + * locate the correct leaf node in the btree + */ +static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node = head->node; + int i, height; + + for (height = head->height; height > level; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + + if ((i == geo->no_pairs) || !bval(geo, node, i)) { + /* right-most key is too large, update it */ + /* FIXME: If the right-most key on higher levels is + * always zero, this wouldn't be necessary. 
*/ + i--; + setkey(geo, node, i, key); + } + BUG_ON(i < 0); + node = bval(geo, node, i); + } + BUG_ON(!node); + return node; +} + +static int btree_grow(struct btree_head *head, struct btree_geo *geo, + gfp_t gfp) +{ + unsigned long *node; + int fill; + + node = btree_node_alloc(head, gfp); + if (!node) + return -ENOMEM; + if (head->node) { + fill = getfill(geo, head->node, 0); + setkey(geo, node, 0, bkey(geo, head->node, fill - 1)); + setval(geo, node, 0, head->node); + } + head->node = node; + head->height++; + return 0; +} + +static void btree_shrink(struct btree_head *head, struct btree_geo *geo) +{ + unsigned long *node; + int fill; + + if (head->height <= 1) + return; + + node = head->node; + fill = getfill(geo, node, 0); + BUG_ON(fill > 1); + head->node = bval(geo, node, 0); + head->height--; + mempool_free(node, head->mempool); +} + +static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, int level, + gfp_t gfp) +{ + unsigned long *node; + int i, pos, fill, err; + + BUG_ON(!val); + if (head->height < level) { + err = btree_grow(head, geo, gfp); + if (err) + return err; + } + +retry: + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + /* two identical keys are not allowed */ + BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0); + + if (fill == geo->no_pairs) { + /* need to split node */ + unsigned long *new; + + new = btree_node_alloc(head, gfp); + if (!new) + return -ENOMEM; + err = btree_insert_level(head, geo, + bkey(geo, node, fill / 2 - 1), + new, level + 1, gfp); + if (err) { + mempool_free(new, head->mempool); + return err; + } + for (i = 0; i < fill / 2; i++) { + setkey(geo, new, i, bkey(geo, node, i)); + setval(geo, new, i, bval(geo, node, i)); + setkey(geo, node, i, bkey(geo, node, i + fill / 2)); + setval(geo, node, i, bval(geo, node, i + fill / 2)); + clearpair(geo, node, i + fill / 2); + } + if (fill & 1) { + setkey(geo, node, 
i, bkey(geo, node, fill - 1)); + setval(geo, node, i, bval(geo, node, fill - 1)); + clearpair(geo, node, fill - 1); + } + goto retry; + } + BUG_ON(fill >= geo->no_pairs); + + /* shift and insert */ + for (i = fill; i > pos; i--) { + setkey(geo, node, i, bkey(geo, node, i - 1)); + setval(geo, node, i, bval(geo, node, i - 1)); + } + setkey(geo, node, pos, key); + setval(geo, node, pos, val); + + return 0; +} + +int btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp) +{ + return btree_insert_level(head, geo, key, val, 1, gfp); +} +EXPORT_SYMBOL_GPL(btree_insert); + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level); +static void merge(struct btree_head *head, struct btree_geo *geo, int level, + unsigned long *left, int lfill, + unsigned long *right, int rfill, + unsigned long *parent, int lpos) +{ + int i; + + for (i = 0; i < rfill; i++) { + /* Move all keys to the left */ + setkey(geo, left, lfill + i, bkey(geo, right, i)); + setval(geo, left, lfill + i, bval(geo, right, i)); + } + /* Exchange left and right child in parent */ + setval(geo, parent, lpos, right); + setval(geo, parent, lpos + 1, left); + /* Remove left (formerly right) child from parent */ + btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1); + mempool_free(right, head->mempool); +} + +static void rebalance(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level, unsigned long *child, int fill) +{ + unsigned long *parent, *left = NULL, *right = NULL; + int i, no_left, no_right; + + if (fill == 0) { + /* Because we don't steal entries from a neigbour, this case + * can happen. Parent node contains a single child, this + * node, so merging with a sibling never happens. 
+ */ + btree_remove_level(head, geo, key, level + 1); + mempool_free(child, head->mempool); + return; + } + + parent = find_level(head, geo, key, level + 1); + i = getpos(geo, parent, key); + BUG_ON(bval(geo, parent, i) != child); + + if (i > 0) { + left = bval(geo, parent, i - 1); + no_left = getfill(geo, left, 0); + if (fill + no_left <= geo->no_pairs) { + merge(head, geo, level, + left, no_left, + child, fill, + parent, i - 1); + return; + } + } + if (i + 1 < getfill(geo, parent, i)) { + right = bval(geo, parent, i + 1); + no_right = getfill(geo, right, 0); + if (fill + no_right <= geo->no_pairs) { + merge(head, geo, level, + child, fill, + right, no_right, + parent, i); + return; + } + } + /* + * We could also try to steal one entry from the left or right + * neighbor. By not doing so we changed the invariant from + * "all nodes are at least half full" to "no two neighboring + * nodes can be merged". Which means that the average fill of + * all nodes is still half or better. + */ +} + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node; + int i, pos, fill; + void *ret; + + if (level > head->height) { + /* we recursed all the way up */ + head->height = 0; + head->node = NULL; + return NULL; + } + + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + if ((level == 1) && (keycmp(geo, node, pos, key) != 0)) + return NULL; + ret = bval(geo, node, pos); + + /* remove and shift */ + for (i = pos; i < fill - 1; i++) { + setkey(geo, node, i, bkey(geo, node, i + 1)); + setval(geo, node, i, bval(geo, node, i + 1)); + } + clearpair(geo, node, fill - 1); + + if (fill - 1 < geo->no_pairs / 2) { + if (level < head->height) + rebalance(head, geo, key, level, node, fill - 1); + else if (fill - 1 == 1) + btree_shrink(head, geo); + } + + return ret; +} + +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long 
*key) +{ + if (head->height == 0) + return NULL; + + return btree_remove_level(head, geo, key, 1); +} +EXPORT_SYMBOL_GPL(btree_remove); + +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp) +{ + unsigned long key[geo->keylen]; + unsigned long dup[geo->keylen]; + void *val; + int err; + + BUG_ON(target == victim); + + if (!(target->node)) { + /* target is empty, just copy fields over */ + target->node = victim->node; + target->height = victim->height; + __btree_init(victim); + return 0; + } + + /* TODO: This needs some optimizations. Currently we do three tree + * walks to remove a single object from the victim. + */ + for (;;) { + if (!btree_last(victim, geo, key)) + break; + val = btree_lookup(victim, geo, key); + err = btree_insert(target, geo, key, val, gfp); + if (err) + return err; + /* We must make a copy of the key, as the original will get + * mangled inside btree_remove. */ + longcpy(dup, key, geo->keylen); + btree_remove(victim, geo, dup); + } + return 0; +} +EXPORT_SYMBOL_GPL(btree_merge); + +static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo, + unsigned long *node, unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2, int reap, int height, size_t count) +{ + int i; + unsigned long *child; + + for (i = 0; i < geo->no_pairs; i++) { + child = bval(geo, node, i); + if (!child) + break; + if (height > 1) + count = __btree_for_each(head, geo, child, opaque, + func, func2, reap, height - 1, count); + else + func(child, opaque, bkey(geo, node, i), count++, + func2); + } + if (reap) + mempool_free(node, head->mempool); + return count; +} + +static void empty(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *func2) +{ +} + +void visitorl(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func) +{ + visitorl_t func = __func; + + func(elem, opaque, 
*key, index); +} +EXPORT_SYMBOL_GPL(visitorl); + +void visitor32(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor32_t func = __func; + u32 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor32); + +void visitor64(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor64_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor64); + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor128_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, key[0], key[1], index); +} +EXPORT_SYMBOL_GPL(visitor128); + +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 0, head->height, 0); + return count; +} +EXPORT_SYMBOL_GPL(btree_visitor); + +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 1, head->height, 0); + __btree_init(head); + return count; +} +EXPORT_SYMBOL_GPL(btree_grim_visitor); + +static int __init btree_module_init(void) +{ + btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + return 0; +} + +static void __exit btree_module_exit(void) +{ + kmem_cache_destroy(btree_cachep); +} + +/* If core code starts using btree, initialization should happen even earlier */ 
+module_init(btree_module_init); +module_exit(btree_module_exit); + +MODULE_AUTHOR("Joern Engel "); +MODULE_AUTHOR("Johannes Berg "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 78c210efdefe07131f91ed512a3308b15bb14e2f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Aug 2009 15:41:34 -0400 Subject: Revert "knfsd: avoid overloading the CPU scheduler with enormous load averages" This reverts commit 59a252ff8c0f2fa32c896f69d56ae33e641ce7ad. This helps in an entirely cached workload but not necessarily in workloads that require waiting on disk. Conflicts: include/linux/sunrpc/svc.h net/sunrpc/svc_xprt.c Reported-by: Simon Kirby Tested-by: Jesper Krogh Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 3 --- net/sunrpc/svc_xprt.c | 31 +++++++++---------------------- 2 files changed, 9 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 52e8cb0a7569..d1567d627557 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -29,7 +29,6 @@ struct svc_pool_stats { unsigned long packets; unsigned long sockets_queued; unsigned long threads_woken; - unsigned long overloads_avoided; unsigned long threads_timedout; }; @@ -50,7 +49,6 @@ struct svc_pool { struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ - int sp_nwaking; /* number of threads woken but not yet active */ struct svc_pool_stats sp_stats; /* statistics on pool operation */ } ____cacheline_aligned_in_smp; @@ -284,7 +282,6 @@ struct svc_rqst { * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ struct task_struct *rq_task; /* service thread */ - int rq_waking; /* 1 if thread is being woken */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index df124f78ee48..2c58b75a236f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -16,8 +16,6 @@ 
#define RPCDBG_FACILITY RPCDBG_SVCXPRT -#define SVC_MAX_WAKING 5 - static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -306,7 +304,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; - int thread_avail; if (!(xprt->xpt_flags & ((1<sp_lock); + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) + printk(KERN_ERR + "svc_xprt_enqueue: " + "threads and transports both waiting??\n"); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead transports */ dprintk("svc: transport %p is dead, not enqueued\n", xprt); @@ -358,15 +361,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) } process: - /* Work out whether threads are available */ - thread_avail = !list_empty(&pool->sp_threads); /* threads are asleep */ - if (pool->sp_nwaking >= SVC_MAX_WAKING) { - /* too many threads are runnable and trying to wake up */ - thread_avail = 0; - pool->sp_stats.overloads_avoided++; - } - - if (thread_avail) { + if (!list_empty(&pool->sp_threads)) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); @@ -381,8 +376,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); - rqstp->rq_waking = 1; - pool->sp_nwaking++; pool->sp_stats.threads_woken++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); @@ -651,11 +644,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) return -EINTR; spin_lock_bh(&pool->sp_lock); - if (rqstp->rq_waking) { - rqstp->rq_waking = 0; - pool->sp_nwaking--; - BUG_ON(pool->sp_nwaking < 0); - } xprt = svc_xprt_dequeue(pool); if (xprt) { rqstp->rq_xprt = xprt; @@ -1204,16 +1192,15 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) struct svc_pool *pool = p; if (p == SEQ_START_TOKEN) { - 
seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken overloads-avoided threads-timedout\n"); + seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken threads-timedout\n"); return 0; } - seq_printf(m, "%u %lu %lu %lu %lu %lu\n", + seq_printf(m, "%u %lu %lu %lu %lu\n", pool->sp_id, pool->sp_stats.packets, pool->sp_stats.sockets_queued, pool->sp_stats.threads_woken, - pool->sp_stats.overloads_avoided, pool->sp_stats.threads_timedout); return 0; -- cgit v1.2.3 From e169cfbef46d62e042614ffafa8880eed1d894bb Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 14:53:09 -0700 Subject: of/flattree: merge find_flat_dt_string and initial_boot_params Merge common code between Microblaze and PowerPC. Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/Kconfig | 1 + arch/microblaze/kernel/prom.c | 8 -------- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/prom.c | 12 ------------ drivers/of/Kconfig | 4 ++++ drivers/of/Makefile | 1 + drivers/of/fdt.c | 21 +++++++++++++++++++++ include/linux/of_fdt.h | 4 ++++ 8 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 drivers/of/fdt.c (limited to 'include/linux') diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index bbd8327f1890..f39c9275a29b 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -111,6 +111,7 @@ config CMDLINE_FORCE config OF def_bool y + select OF_FLATTREE config PROC_DEVICETREE bool "Support for device tree in /proc" diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index b817df172aa9..06d620ab4168 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -47,17 +47,9 @@ static int __initdata dt_root_size_cells; typedef u32 cell_t; -static struct boot_param_header *initial_boot_params; - /* export that to outside world */ struct device_node *of_chosen; -static inline char *find_flat_dt_string(u32 offset) -{ - return ((char 
*)initial_boot_params) + - initial_boot_params->off_dt_strings + offset; -} - /** * This function is used to scan the flattened device-tree, it is * used to extract the memory informations at boot before we can diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2ba14e77296c..2a75c6ae2a8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -163,6 +163,7 @@ config PPC_OF config OF def_bool y + select OF_FLATTREE config PPC_UDBG_16550 bool diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4ec300862466..fccf7e49bb28 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -73,12 +73,6 @@ unsigned long tce_alloc_start, tce_alloc_end; typedef u32 cell_t; -#if 0 -static struct boot_param_header *initial_boot_params __initdata; -#else -struct boot_param_header *initial_boot_params; -#endif - extern struct device_node *allnodes; /* temporary while merging */ extern rwlock_t devtree_lock; /* temporary while merging */ @@ -86,12 +80,6 @@ extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; -static inline char *find_flat_dt_string(u32 offset) -{ - return ((char *)initial_boot_params) + - initial_boot_params->off_dt_strings + offset; -} - /** * This function is used to scan the flattened device-tree, it is * used to extract the memory informations at boot before we can diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index d2fa27c5c1b2..462825e03123 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -1,3 +1,7 @@ +config OF_FLATTREE + bool + depends on OF + config OF_DEVICE def_bool y depends on OF && (SPARC || PPC_OF || MICROBLAZE) diff --git a/drivers/of/Makefile b/drivers/of/Makefile index bdfb5f5d4b06..f232cc98ce00 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,4 +1,5 @@ obj-y = base.o +obj-$(CONFIG_OF_FLATTREE) += fdt.o obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o 
obj-$(CONFIG_OF_I2C) += of_i2c.o diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c new file mode 100644 index 000000000000..9faa9a5cbdf0 --- /dev/null +++ b/drivers/of/fdt.c @@ -0,0 +1,21 @@ +/* + * Functions for working with the Flattened Device Tree data format + * + * Copyright 2009 Benjamin Herrenschmidt, IBM Corp + * benh@kernel.crashing.org + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include + +struct boot_param_header *initial_boot_params; + +char *find_flat_dt_string(u32 offset) +{ + return ((char *)initial_boot_params) + + initial_boot_params->off_dt_strings + offset; +} diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 41d432b13553..d1a79f3da789 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -57,7 +57,11 @@ struct boot_param_header { u32 dt_struct_size; /* size of the DT structure block */ }; +/* TBD: Temporary export of fdt globals - remove when code fully merged */ +extern struct boot_param_header *initial_boot_params; + /* For scanning the flat device-tree at boot time */ +extern char *find_flat_dt_string(u32 offset); extern int __init of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, int depth, void *data), -- cgit v1.2.3 From 31a6a87dfc34fbf02aef9a160adf558ec56d3ccd Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 19:49:38 -0700 Subject: of/flattree: remove __init annotations from the header file __init annotation belongs in the .c file, not the header. 
Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- include/linux/of_fdt.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d1a79f3da789..81231e04e8f3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -62,15 +62,13 @@ extern struct boot_param_header *initial_boot_params; /* For scanning the flat device-tree at boot time */ extern char *find_flat_dt_string(u32 offset); -extern int __init of_scan_flat_dt(int (*it)(unsigned long node, - const char *uname, int depth, - void *data), - void *data); -extern void __init *of_get_flat_dt_prop(unsigned long node, const char *name, - unsigned long *size); -extern int __init of_flat_dt_is_compatible(unsigned long node, - const char *name); -extern unsigned long __init of_get_flat_dt_root(void); +extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, + int depth, void *data), + void *data); +extern void *of_get_flat_dt_prop(unsigned long node, const char *name, + unsigned long *size); +extern int of_flat_dt_is_compatible(unsigned long node, const char *name); +extern unsigned long of_get_flat_dt_root(void); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From bbd33931a08362f78266a4016211a35947b91041 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:07:00 -0700 Subject: of/flattree: Merge unflatten_dt_node Merge common code between PowerPC and MicroBlaze Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/kernel/prom.c | 195 ---------------------------------------- arch/powerpc/kernel/prom.c | 194 ---------------------------------------- drivers/of/fdt.c | 200 ++++++++++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 4 + 4 files changed, 204 insertions(+), 389 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c 
b/arch/microblaze/kernel/prom.c index eb27bd3a39b4..021770abfbd7 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -50,201 +50,6 @@ typedef u32 cell_t; /* export that to outside world */ struct device_node *of_chosen; -static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) -{ - void *res; - - *mem = _ALIGN(*mem, align); - res = (void *)*mem; - *mem += size; - - return res; -} - -static unsigned long __init unflatten_dt_node(unsigned long mem, - unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize) -{ - struct device_node *np; - struct property *pp, **prev_pp = NULL; - char *pathp; - u32 tag; - unsigned int l, allocl; - int has_name = 0; - int new_format = 0; - - tag = *((u32 *)(*p)); - if (tag != OF_DT_BEGIN_NODE) { - printk("Weird tag at start of node: %x\n", tag); - return mem; - } - *p += 4; - pathp = (char *)*p; - l = allocl = strlen(pathp) + 1; - *p = _ALIGN(*p + l, 4); - - /* version 0x10 has a more compact unit name here instead of the full - * path. we accumulate the full path size using "fpsize", we'll rebuild - * it later. We detect this because the first character of the name is - * not '/'. - */ - if ((*pathp) != '/') { - new_format = 1; - if (fpsize == 0) { - /* root node: special case. fpsize accounts for path - * plus terminating zero. 
root node only has '/', so - * fpsize should be 2, but we want to avoid the first - * level nodes to have two '/' so we use fpsize 1 here - */ - fpsize = 1; - allocl = 2; - } else { - /* account for '/' and path size minus terminal 0 - * already in 'l' - */ - fpsize += l; - allocl = fpsize; - } - } - - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, - __alignof__(struct device_node)); - if (allnextpp) { - memset(np, 0, sizeof(*np)); - np->full_name = ((char *)np) + sizeof(struct device_node); - if (new_format) { - char *p2 = np->full_name; - /* rebuild full path for new format */ - if (dad && dad->parent) { - strcpy(p2, dad->full_name); -#ifdef DEBUG - if ((strlen(p2) + l + 1) != allocl) { - pr_debug("%s: p: %d, l: %d, a: %d\n", - pathp, (int)strlen(p2), - l, allocl); - } -#endif - p2 += strlen(p2); - } - *(p2++) = '/'; - memcpy(p2, pathp, l); - } else - memcpy(np->full_name, pathp, l); - prev_pp = &np->properties; - **allnextpp = np; - *allnextpp = &np->allnext; - if (dad != NULL) { - np->parent = dad; - /* we temporarily use the next field as `last_child'*/ - if (dad->next == NULL) - dad->child = np; - else - dad->next->sibling = np; - dad->next = np; - } - kref_init(&np->kref); - } - while (1) { - u32 sz, noff; - char *pname; - - tag = *((u32 *)(*p)); - if (tag == OF_DT_NOP) { - *p += 4; - continue; - } - if (tag != OF_DT_PROP) - break; - *p += 4; - sz = *((u32 *)(*p)); - noff = *((u32 *)((*p) + 4)); - *p += 8; - if (initial_boot_params->version < 0x10) - *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); - - pname = find_flat_dt_string(noff); - if (pname == NULL) { - printk(KERN_INFO - "Can't find property name in list !\n"); - break; - } - if (strcmp(pname, "name") == 0) - has_name = 1; - l = strlen(pname) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property), - __alignof__(struct property)); - if (allnextpp) { - if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; - } - if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); - pp->name = pname; - pp->length = sz; - pp->value = (void *)*p; - *prev_pp = pp; - prev_pp = &pp->next; - } - *p = _ALIGN((*p) + sz, 4); - } - /* with version 0x10 we may not have the name property, recreate - * it here from the unit name if absent - */ - if (!has_name) { - char *p1 = pathp, *ps = pathp, *pa = NULL; - int sz; - - while (*p1) { - if ((*p1) == '@') - pa = p1; - if ((*p1) == '/') - ps = p1 + 1; - p1++; - } - if (pa < ps) - pa = p1; - sz = (pa - ps) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, - __alignof__(struct property)); - if (allnextpp) { - pp->name = "name"; - pp->length = sz; - pp->value = pp + 1; - *prev_pp = pp; - prev_pp = &pp->next; - memcpy(pp->value, ps, sz - 1); - ((char *)pp->value)[sz - 1] = 0; - pr_debug("fixed up name for %s -> %s\n", pathp, - (char *)pp->value); - } - } - if (allnextpp) { - *prev_pp = NULL; - np->name = of_get_property(np, "name", NULL); - np->type = of_get_property(np, "device_type", NULL); - - if (!np->name) - np->name = ""; - if (!np->type) - np->type = ""; - } - while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); - tag = *((u32 *)(*p)); - } - if (tag != OF_DT_END_NODE) { - printk(KERN_INFO "Weird tag at end of node: %x\n", tag); - return mem; - } - *p += 4; - return mem; -} - /** * unflattens the device-tree passed by the firmware, creating the * tree of struct device_node. 
It also fills the "name" and "type" diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 413e608863dd..a102a0a33ed1 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -80,200 +80,6 @@ extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ struct device_node *of_chosen; -static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) -{ - void *res; - - *mem = _ALIGN(*mem, align); - res = (void *)*mem; - *mem += size; - - return res; -} - -static unsigned long __init unflatten_dt_node(unsigned long mem, - unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize) -{ - struct device_node *np; - struct property *pp, **prev_pp = NULL; - char *pathp; - u32 tag; - unsigned int l, allocl; - int has_name = 0; - int new_format = 0; - - tag = *((u32 *)(*p)); - if (tag != OF_DT_BEGIN_NODE) { - printk("Weird tag at start of node: %x\n", tag); - return mem; - } - *p += 4; - pathp = (char *)*p; - l = allocl = strlen(pathp) + 1; - *p = _ALIGN(*p + l, 4); - - /* version 0x10 has a more compact unit name here instead of the full - * path. we accumulate the full path size using "fpsize", we'll rebuild - * it later. We detect this because the first character of the name is - * not '/'. - */ - if ((*pathp) != '/') { - new_format = 1; - if (fpsize == 0) { - /* root node: special case. fpsize accounts for path - * plus terminating zero. 
root node only has '/', so - * fpsize should be 2, but we want to avoid the first - * level nodes to have two '/' so we use fpsize 1 here - */ - fpsize = 1; - allocl = 2; - } else { - /* account for '/' and path size minus terminal 0 - * already in 'l' - */ - fpsize += l; - allocl = fpsize; - } - } - - - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, - __alignof__(struct device_node)); - if (allnextpp) { - memset(np, 0, sizeof(*np)); - np->full_name = ((char*)np) + sizeof(struct device_node); - if (new_format) { - char *p = np->full_name; - /* rebuild full path for new format */ - if (dad && dad->parent) { - strcpy(p, dad->full_name); -#ifdef DEBUG - if ((strlen(p) + l + 1) != allocl) { - DBG("%s: p: %d, l: %d, a: %d\n", - pathp, (int)strlen(p), l, allocl); - } -#endif - p += strlen(p); - } - *(p++) = '/'; - memcpy(p, pathp, l); - } else - memcpy(np->full_name, pathp, l); - prev_pp = &np->properties; - **allnextpp = np; - *allnextpp = &np->allnext; - if (dad != NULL) { - np->parent = dad; - /* we temporarily use the next field as `last_child'*/ - if (dad->next == 0) - dad->child = np; - else - dad->next->sibling = np; - dad->next = np; - } - kref_init(&np->kref); - } - while(1) { - u32 sz, noff; - char *pname; - - tag = *((u32 *)(*p)); - if (tag == OF_DT_NOP) { - *p += 4; - continue; - } - if (tag != OF_DT_PROP) - break; - *p += 4; - sz = *((u32 *)(*p)); - noff = *((u32 *)((*p) + 4)); - *p += 8; - if (initial_boot_params->version < 0x10) - *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); - - pname = find_flat_dt_string(noff); - if (pname == NULL) { - printk("Can't find property name in list !\n"); - break; - } - if (strcmp(pname, "name") == 0) - has_name = 1; - l = strlen(pname) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property), - __alignof__(struct property)); - if (allnextpp) { - if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; - } - if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); - pp->name = pname; - pp->length = sz; - pp->value = (void *)*p; - *prev_pp = pp; - prev_pp = &pp->next; - } - *p = _ALIGN((*p) + sz, 4); - } - /* with version 0x10 we may not have the name property, recreate - * it here from the unit name if absent - */ - if (!has_name) { - char *p = pathp, *ps = pathp, *pa = NULL; - int sz; - - while (*p) { - if ((*p) == '@') - pa = p; - if ((*p) == '/') - ps = p + 1; - p++; - } - if (pa < ps) - pa = p; - sz = (pa - ps) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, - __alignof__(struct property)); - if (allnextpp) { - pp->name = "name"; - pp->length = sz; - pp->value = pp + 1; - *prev_pp = pp; - prev_pp = &pp->next; - memcpy(pp->value, ps, sz - 1); - ((char *)pp->value)[sz - 1] = 0; - DBG("fixed up name for %s -> %s\n", pathp, - (char *)pp->value); - } - } - if (allnextpp) { - *prev_pp = NULL; - np->name = of_get_property(np, "name", NULL); - np->type = of_get_property(np, "device_type", NULL); - - if (!np->name) - np->name = ""; - if (!np->type) - np->type = ""; - } - while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); - tag = *((u32 *)(*p)); - } - if (tag != OF_DT_END_NODE) { - printk("Weird tag at end of node: %x\n", tag); - return mem; - } - *p += 4; - return mem; -} - static int __init early_parse_mem(char *p) { if (!p) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 5cdd958db9af..6852ecf6d1e1 100644 --- a/drivers/of/fdt.c +++ 
b/drivers/of/fdt.c @@ -166,3 +166,203 @@ int __init of_flat_dt_is_compatible(unsigned long node, const char *compat) return 0; } +static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, + unsigned long align) +{ + void *res; + + *mem = _ALIGN(*mem, align); + res = (void *)*mem; + *mem += size; + + return res; +} + +/** + * unflatten_dt_node - Alloc and populate a device_node from the flat tree + * @p: pointer to node in flat tree + * @dad: Parent struct device_node + * @allnextpp: pointer to ->allnext from last allocated device_node + * @fpsize: Size of the node path up at the current depth. + */ +unsigned long __init unflatten_dt_node(unsigned long mem, + unsigned long *p, + struct device_node *dad, + struct device_node ***allnextpp, + unsigned long fpsize) +{ + struct device_node *np; + struct property *pp, **prev_pp = NULL; + char *pathp; + u32 tag; + unsigned int l, allocl; + int has_name = 0; + int new_format = 0; + + tag = *((u32 *)(*p)); + if (tag != OF_DT_BEGIN_NODE) { + pr_err("Weird tag at start of node: %x\n", tag); + return mem; + } + *p += 4; + pathp = (char *)*p; + l = allocl = strlen(pathp) + 1; + *p = _ALIGN(*p + l, 4); + + /* version 0x10 has a more compact unit name here instead of the full + * path. we accumulate the full path size using "fpsize", we'll rebuild + * it later. We detect this because the first character of the name is + * not '/'. + */ + if ((*pathp) != '/') { + new_format = 1; + if (fpsize == 0) { + /* root node: special case. fpsize accounts for path + * plus terminating zero. 
root node only has '/', so + * fpsize should be 2, but we want to avoid the first + * level nodes to have two '/' so we use fpsize 1 here + */ + fpsize = 1; + allocl = 2; + } else { + /* account for '/' and path size minus terminal 0 + * already in 'l' + */ + fpsize += l; + allocl = fpsize; + } + } + + np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, + __alignof__(struct device_node)); + if (allnextpp) { + memset(np, 0, sizeof(*np)); + np->full_name = ((char *)np) + sizeof(struct device_node); + if (new_format) { + char *fn = np->full_name; + /* rebuild full path for new format */ + if (dad && dad->parent) { + strcpy(fn, dad->full_name); +#ifdef DEBUG + if ((strlen(fn) + l + 1) != allocl) { + pr_debug("%s: p: %d, l: %d, a: %d\n", + pathp, (int)strlen(fn), + l, allocl); + } +#endif + fn += strlen(fn); + } + *(fn++) = '/'; + memcpy(fn, pathp, l); + } else + memcpy(np->full_name, pathp, l); + prev_pp = &np->properties; + **allnextpp = np; + *allnextpp = &np->allnext; + if (dad != NULL) { + np->parent = dad; + /* we temporarily use the next field as `last_child'*/ + if (dad->next == NULL) + dad->child = np; + else + dad->next->sibling = np; + dad->next = np; + } + kref_init(&np->kref); + } + while (1) { + u32 sz, noff; + char *pname; + + tag = *((u32 *)(*p)); + if (tag == OF_DT_NOP) { + *p += 4; + continue; + } + if (tag != OF_DT_PROP) + break; + *p += 4; + sz = *((u32 *)(*p)); + noff = *((u32 *)((*p) + 4)); + *p += 8; + if (initial_boot_params->version < 0x10) + *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); + + pname = find_flat_dt_string(noff); + if (pname == NULL) { + pr_info("Can't find property name in list !\n"); + break; + } + if (strcmp(pname, "name") == 0) + has_name = 1; + l = strlen(pname) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property), + __alignof__(struct property)); + if (allnextpp) { + if (strcmp(pname, "linux,phandle") == 0) { + np->node = *((u32 *)*p); + if (np->linux_phandle == 0) + np->linux_phandle = np->node; + } + if (strcmp(pname, "ibm,phandle") == 0) + np->linux_phandle = *((u32 *)*p); + pp->name = pname; + pp->length = sz; + pp->value = (void *)*p; + *prev_pp = pp; + prev_pp = &pp->next; + } + *p = _ALIGN((*p) + sz, 4); + } + /* with version 0x10 we may not have the name property, recreate + * it here from the unit name if absent + */ + if (!has_name) { + char *p1 = pathp, *ps = pathp, *pa = NULL; + int sz; + + while (*p1) { + if ((*p1) == '@') + pa = p1; + if ((*p1) == '/') + ps = p1 + 1; + p1++; + } + if (pa < ps) + pa = p1; + sz = (pa - ps) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, + __alignof__(struct property)); + if (allnextpp) { + pp->name = "name"; + pp->length = sz; + pp->value = pp + 1; + *prev_pp = pp; + prev_pp = &pp->next; + memcpy(pp->value, ps, sz - 1); + ((char *)pp->value)[sz - 1] = 0; + pr_debug("fixed up name for %s -> %s\n", pathp, + (char *)pp->value); + } + } + if (allnextpp) { + *prev_pp = NULL; + np->name = of_get_property(np, "name", NULL); + np->type = of_get_property(np, "device_type", NULL); + + if (!np->name) + np->name = ""; + if (!np->type) + np->type = ""; + } + while (tag == OF_DT_BEGIN_NODE) { + mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); + tag = *((u32 *)(*p)); + } + if (tag != OF_DT_END_NODE) { + pr_err("Weird tag at end of node: %x\n", tag); + return mem; + } + *p += 4; + return mem; +} diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 81231e04e8f3..ace9068e07e8 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ 
-69,6 +69,10 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern unsigned long unflatten_dt_node(unsigned long mem, unsigned long *p, + struct device_node *dad, + struct device_node ***allnextpp, + unsigned long fpsize); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From 41f880091c15b039ffcc8b3d831656b81517a6d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:07:01 -0700 Subject: of/flattree: Merge unflatten_device_tree Merge common code between PowerPC and MicroBlaze Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- arch/microblaze/include/asm/prom.h | 1 - arch/microblaze/kernel/prom.c | 49 ----------------------------------- arch/powerpc/kernel/prom.c | 50 ------------------------------------ drivers/of/fdt.c | 52 ++++++++++++++++++++++++++++++++++++++ include/linux/of.h | 3 +++ include/linux/of_fdt.h | 4 --- 6 files changed, 55 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index ef3ec1d6ceb3..07d1063f9aae 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -37,7 +37,6 @@ extern struct device_node *of_chosen; #define HAVE_ARCH_DEVTREE_FIXUPS -extern struct device_node *allnodes; /* temporary while merging */ extern rwlock_t devtree_lock; /* temporary while merging */ /* For updating the device tree at runtime */ diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 021770abfbd7..901d538c15ef 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -50,55 +50,6 @@ typedef u32 cell_t; /* export that to outside world */ struct device_node *of_chosen; -/** - * unflattens the device-tree passed by the firmware, creating the - 
* tree of struct device_node. It also fills the "name" and "type" - * pointers of the nodes so the normal device-tree walking functions - * can be used (this used to be done by finish_device_tree) - */ -void __init unflatten_device_tree(void) -{ - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - - pr_debug(" -> unflatten_device_tree()\n"); - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL, 0); - size = (size | 3) + 1; - - pr_debug(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = lmb_alloc(size + 4, __alignof__(struct device_node)); - mem = (unsigned long) __va(mem); - - ((u32 *)mem)[size / 4] = 0xdeadbeef; - - pr_debug(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp, 0); - if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %08x\n", - *((u32 *)start)); - if (((u32 *)mem)[size / 4] != 0xdeadbeef) - printk(KERN_WARNING "End of tree marker overwritten: %08x\n", - ((u32 *)mem)[size / 4]); - *allnextp = NULL; - - /* Get pointer to OF "/chosen" node for use everywhere */ - of_chosen = of_find_node_by_path("/chosen"); - if (of_chosen == NULL) - of_chosen = of_find_node_by_path("/chosen@0"); - - pr_debug(" <- unflatten_device_tree()\n"); -} - #define early_init_dt_scan_drconf_memory(node) 0 static int __init early_init_dt_scan_cpus(unsigned long node, diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index a102a0a33ed1..1280f3484ad3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -73,8 +73,6 @@ unsigned long tce_alloc_start, tce_alloc_end; typedef u32 cell_t; -extern struct device_node *allnodes; /* temporary while merging */ - 
extern rwlock_t devtree_lock; /* temporary while merging */ /* export that to outside world */ @@ -119,54 +117,6 @@ static void __init move_device_tree(void) DBG("<- move_device_tree\n"); } -/** - * unflattens the device-tree passed by the firmware, creating the - * tree of struct device_node. It also fills the "name" and "type" - * pointers of the nodes so the normal device-tree walking functions - * can be used (this used to be done by finish_device_tree) - */ -void __init unflatten_device_tree(void) -{ - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - - DBG(" -> unflatten_device_tree()\n"); - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL, 0); - size = (size | 3) + 1; - - DBG(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = lmb_alloc(size + 4, __alignof__(struct device_node)); - mem = (unsigned long) __va(mem); - - ((u32 *)mem)[size / 4] = 0xdeadbeef; - - DBG(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp, 0); - if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); - if (((u32 *)mem)[size / 4] != 0xdeadbeef) - printk(KERN_WARNING "End of tree marker overwritten: %08x\n", - ((u32 *)mem)[size / 4] ); - *allnextp = NULL; - - /* Get pointer to OF "/chosen" node for use everywhere */ - of_chosen = of_find_node_by_path("/chosen"); - if (of_chosen == NULL) - of_chosen = of_find_node_by_path("/chosen@0"); - - DBG(" <- unflatten_device_tree()\n"); -} - /* * ibm,pa-features is a per-cpu property that contains a string of * attribute descriptors, each of which has a 2 byte header plus up diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 
6852ecf6d1e1..43d236cbc17b 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -9,6 +9,8 @@ * version 2 as published by the Free Software Foundation. */ +#include +#include #include #include @@ -366,3 +368,53 @@ unsigned long __init unflatten_dt_node(unsigned long mem, *p += 4; return mem; } + +/** + * unflatten_device_tree - create tree of device_nodes from flat blob + * + * unflattens the device-tree passed by the firmware, creating the + * tree of struct device_node. It also fills the "name" and "type" + * pointers of the nodes so the normal device-tree walking functions + * can be used. + */ +void __init unflatten_device_tree(void) +{ + unsigned long start, mem, size; + struct device_node **allnextp = &allnodes; + + pr_debug(" -> unflatten_device_tree()\n"); + + /* First pass, scan for size */ + start = ((unsigned long)initial_boot_params) + + initial_boot_params->off_dt_struct; + size = unflatten_dt_node(0, &start, NULL, NULL, 0); + size = (size | 3) + 1; + + pr_debug(" size is %lx, allocating...\n", size); + + /* Allocate memory for the expanded device tree */ + mem = lmb_alloc(size + 4, __alignof__(struct device_node)); + mem = (unsigned long) __va(mem); + + ((u32 *)mem)[size / 4] = 0xdeadbeef; + + pr_debug(" unflattening %lx...\n", mem); + + /* Second pass, do actual unflattening */ + start = ((unsigned long)initial_boot_params) + + initial_boot_params->off_dt_struct; + unflatten_dt_node(mem, &start, NULL, &allnextp, 0); + if (*((u32 *)start) != OF_DT_END) + pr_warning("Weird tag at end of tree: %08x\n", *((u32 *)start)); + if (((u32 *)mem)[size / 4] != 0xdeadbeef) + pr_warning("End of tree marker overwritten: %08x\n", + ((u32 *)mem)[size / 4]); + *allnextp = NULL; + + /* Get pointer to OF "/chosen" node for use everywhere */ + of_chosen = of_find_node_by_path("/chosen"); + if (of_chosen == NULL) + of_chosen = of_find_node_by_path("/chosen@0"); + + pr_debug(" <- unflatten_device_tree()\n"); +} diff --git a/include/linux/of.h b/include/linux/of.h index 
e7facd8fbce8..bec215792c4f 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -63,6 +63,9 @@ struct device_node { #endif }; +/* Pointer for first entry in chain of all nodes. */ +extern struct device_node *allnodes; + static inline int of_node_check_flag(struct device_node *n, unsigned long flag) { return test_bit(flag, &n->_flags); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index ace9068e07e8..81231e04e8f3 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -69,10 +69,6 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); -extern unsigned long unflatten_dt_node(unsigned long mem, unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From 2be09cb993826b52c9fc1d44747c20dd43a50038 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 23 Nov 2009 20:16:46 -0700 Subject: of: remove special case definition of of_read_ulong() Special case of of_read_ulong() was defined for PPC32 to toss away all but the last 32 bits when a large number value was read, and the 'normal' version for ppc64 just #defined of_read_ulong to of_read_number which causes compiler warnings on MicroBlaze and other 32 bit architectures because it returns a u64 instead of a ulong. This patch fixes the problem by defining a common implementation of of_read_ulong() that works everywhere. 
Signed-off-by: Grant Likely Reviewed-by: Wolfram Sang Tested-by: Michal Simek --- include/linux/of.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index bec215792c4f..d4c014a35ea5 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -113,14 +113,11 @@ static inline u64 of_read_number(const u32 *cell, int size) } /* Like of_read_number, but we want an unsigned long result */ -#ifdef CONFIG_PPC32 static inline unsigned long of_read_ulong(const u32 *cell, int size) { - return cell[size-1]; + /* toss away upper bits if unsigned long is smaller than u64 */ + return of_read_number(cell, size); } -#else -#define of_read_ulong(cell, size) of_read_number(cell, size) -#endif #include -- cgit v1.2.3 From 3cf602532c535ec655725e9833378e04c9fd7783 Mon Sep 17 00:00:00 2001 From: Amul Kumar Saha Date: Wed, 21 Oct 2009 17:00:05 +0530 Subject: mtd: OneNAND OTP support rework What is OTP in OneNAND? The device includes, 1. one block-sized OTP (One Time Programmable) area and 2. user-controlled 1st block OTP(Block 0) that can be used to increase system security or to provide identification capabilities. What is done? In OneNAND, one block of the NAND Array is set aside as an OTP memory area, and 1st Block (Block 0) can be used as OTP area. This area, available to the user, can be configured and locked with secured user information. The OTP block can be read, programmed and locked using the same operations as any other NAND Flash Array memory block. After issuing an OTP-Lock, OTP block cannot be erased. OTP block is fully-guaranteed to be a good block. Why it is done? Locking the 1st Block OTP has the effect of a 'Write-protect' to guard against accidental re-programming of data stored in the 1st block and OTP Block. Which problem it solves? OTP support is provided in the existing implementation of OneNAND/Flex-OneNAND driver, but it is not working with OneNAND devices. 
Have observed the following in the current OTP OneNAND implementation: 1. DataSheet specific sequence to lock the OTP Area is not followed. 2. Certain functions are too generic to cope with OTP specific activity. This patch re-implements OTP support for OneNAND device. How it is done? For all blocks, 8th word is available to the user. However, in case of OTP Block, 8th word of sector 0, page 0 is reserved as OTP Locking Bit area. Therefore, in case of OTP Block, user usage on this area is prohibited. Condition specific values are entered in the 8th word, sector 0, page 0 of the OTP block during the process of issuing an OTP-Lock. The possible conditions are: 1. Only 1st Block Lock 2. Only OTP Block Lock 3. Lock both the 1st Block and the OTP Block What Other feature additions have been done in this patch? This patch adds feature for: 1. Only 1st Block Lock 2. Lock both the 1st Block and the OTP Blocks Re-implemented OTP support for OneNAND Added following features to OneNAND 1. Lock only 1st Block in OneNAND 2. 
Lock BOTH 1st Block and OTP Block in OneNAND [comments were slightly tweaked by Artem] Signed-off-by: Amul Kumar Saha Reviewed-by: Adrian Hunter Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 292 +++++++++++++++++++++++++++++++++---- include/linux/mtd/onenand.h | 4 +- 2 files changed, 264 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 6e250f3a4a16..7bd6ad3ff30a 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1,17 +1,19 @@ /* * linux/drivers/mtd/onenand/onenand_base.c * - * Copyright (C) 2005-2007 Samsung Electronics + * Copyright © 2005-2009 Samsung Electronics + * Copyright © 2007 Nokia Corporation + * * Kyungmin Park * * Credits: * Adrian Hunter : * auto-placement support, read-while load support, various fixes - * Copyright (C) Nokia Corporation, 2007 * * Vishak G , Rohit Hagargundgi * Flex-OneNAND support - * Copyright (C) Samsung Electronics, 2008 + * Amul Kumar Saha + * OTP support * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -43,6 +45,18 @@ MODULE_PARM_DESC(flex_bdry, "SLC Boundary information for Flex-OneNAND" " : 0->Set boundary in unlocked status" " : 1->Set boundary in locked status"); +/* Default OneNAND/Flex-OneNAND OTP options*/ +static int otp; + +module_param(otp, int, 0400); +MODULE_PARM_DESC(otp, "Corresponding behaviour of OneNAND in OTP" + "Syntax : otp=LOCK_TYPE" + "LOCK_TYPE : Keys issued, for specific OTP Lock type" + " : 0 -> Default (No Blocks Locked)" + " : 1 -> OTP Block lock" + " : 2 -> 1st Block lock" + " : 3 -> BOTH OTP Block and 1st Block lock"); + /** * onenand_oob_128 - oob info for Flex-Onenand with 4KB page * For now, we expose only 64 out of 80 ecc bytes @@ -2591,6 +2605,208 @@ static void onenand_unlock_all(struct mtd_info *mtd) 
#ifdef CONFIG_MTD_ONENAND_OTP +/** + * onenand_otp_command - Send OTP specific command to OneNAND device + * @param mtd MTD device structure + * @param cmd the command to be sent + * @param addr offset to read from or write to + * @param len number of bytes to read or write + */ +static int onenand_otp_command(struct mtd_info *mtd, int cmd, loff_t addr, + size_t len) +{ + struct onenand_chip *this = mtd->priv; + int value, block, page; + + /* Address translation */ + switch (cmd) { + case ONENAND_CMD_OTP_ACCESS: + block = (int) (addr >> this->erase_shift); + page = -1; + break; + + default: + block = (int) (addr >> this->erase_shift); + page = (int) (addr >> this->page_shift); + + if (ONENAND_IS_2PLANE(this)) { + /* Make the even block number */ + block &= ~1; + /* Is it the odd plane? */ + if (addr & this->writesize) + block++; + page >>= 1; + } + page &= this->page_mask; + break; + } + + if (block != -1) { + /* Write 'DFS, FBA' of Flash */ + value = onenand_block_address(this, block); + this->write_word(value, this->base + + ONENAND_REG_START_ADDRESS1); + } + + if (page != -1) { + /* Now we use page size operation */ + int sectors = 4, count = 4; + int dataram; + + switch (cmd) { + default: + if (ONENAND_IS_2PLANE(this) && cmd == ONENAND_CMD_PROG) + cmd = ONENAND_CMD_2X_PROG; + dataram = ONENAND_CURRENT_BUFFERRAM(this); + break; + } + + /* Write 'FPA, FSA' of Flash */ + value = onenand_page_address(page, sectors); + this->write_word(value, this->base + + ONENAND_REG_START_ADDRESS8); + + /* Write 'BSA, BSC' of DataRAM */ + value = onenand_buffer_address(dataram, sectors, count); + this->write_word(value, this->base + ONENAND_REG_START_BUFFER); + } + + /* Interrupt clear */ + this->write_word(ONENAND_INT_CLEAR, this->base + ONENAND_REG_INTERRUPT); + + /* Write command */ + this->write_word(cmd, this->base + ONENAND_REG_COMMAND); + + return 0; +} + +/** + * onenand_otp_write_oob_nolock - [Internal] OneNAND write out-of-band, specific to OTP + * @param mtd MTD device 
structure + * @param to offset to write to + * @param len number of bytes to write + * @param retlen pointer to variable to store the number of written bytes + * @param buf the data to write + * + * OneNAND write out-of-band only for OTP + */ +static int onenand_otp_write_oob_nolock(struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops) +{ + struct onenand_chip *this = mtd->priv; + int column, ret = 0, oobsize; + int written = 0; + u_char *oobbuf; + size_t len = ops->ooblen; + const u_char *buf = ops->oobbuf; + int block, value, status; + + to += ops->ooboffs; + + /* Initialize retlen, in case of early exit */ + ops->oobretlen = 0; + + oobsize = mtd->oobsize; + + column = to & (mtd->oobsize - 1); + + oobbuf = this->oob_buf; + + /* Loop until all data write */ + while (written < len) { + int thislen = min_t(int, oobsize, len - written); + + cond_resched(); + + block = (int) (to >> this->erase_shift); + /* + * Write 'DFS, FBA' of Flash + * Add: F100h DQ=DFS, FBA + */ + + value = onenand_block_address(this, block); + this->write_word(value, this->base + + ONENAND_REG_START_ADDRESS1); + + /* + * Select DataRAM for DDP + * Add: F101h DQ=DBS + */ + + value = onenand_bufferram_address(this, block); + this->write_word(value, this->base + + ONENAND_REG_START_ADDRESS2); + ONENAND_SET_NEXT_BUFFERRAM(this); + + /* + * Enter OTP access mode + */ + this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0); + this->wait(mtd, FL_OTPING); + + /* We send data to spare ram with oobsize + * to prevent byte access */ + memcpy(oobbuf + column, buf, thislen); + + /* + * Write Data into DataRAM + * Add: 8th Word + * in sector0/spare/page0 + * DQ=XXFCh + */ + this->write_bufferram(mtd, ONENAND_SPARERAM, + oobbuf, 0, mtd->oobsize); + + onenand_otp_command(mtd, ONENAND_CMD_PROGOOB, to, mtd->oobsize); + onenand_update_bufferram(mtd, to, 0); + if (ONENAND_IS_2PLANE(this)) { + ONENAND_SET_BUFFERRAM1(this); + onenand_update_bufferram(mtd, to + this->writesize, 0); + } + + ret = this->wait(mtd, 
FL_WRITING); + if (ret) { + printk(KERN_ERR "%s: write failed %d\n", __func__, ret); + break; + } + + /* Exit OTP access mode */ + this->command(mtd, ONENAND_CMD_RESET, 0, 0); + this->wait(mtd, FL_RESETING); + + status = this->read_word(this->base + ONENAND_REG_CTRL_STATUS); + status &= 0x60; + + if (status == 0x60) { + printk(KERN_DEBUG "\nBLOCK\tSTATUS\n"); + printk(KERN_DEBUG "1st Block\tLOCKED\n"); + printk(KERN_DEBUG "OTP Block\tLOCKED\n"); + } else if (status == 0x20) { + printk(KERN_DEBUG "\nBLOCK\tSTATUS\n"); + printk(KERN_DEBUG "1st Block\tLOCKED\n"); + printk(KERN_DEBUG "OTP Block\tUN-LOCKED\n"); + } else if (status == 0x40) { + printk(KERN_DEBUG "\nBLOCK\tSTATUS\n"); + printk(KERN_DEBUG "1st Block\tUN-LOCKED\n"); + printk(KERN_DEBUG "OTP Block\tLOCKED\n"); + } else { + printk(KERN_DEBUG "Reboot to check\n"); + } + + written += thislen; + if (written == len) + break; + + to += mtd->writesize; + buf += thislen; + column = 0; + } + + ops->oobretlen = written; + + return ret; +} + /* Internal OTP operation */ typedef int (*otp_op_t)(struct mtd_info *mtd, loff_t form, size_t len, size_t *retlen, u_char *buf); @@ -2693,11 +2909,11 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, struct mtd_oob_ops ops; int ret; - /* Enter OTP access mode */ - this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0); - this->wait(mtd, FL_OTPING); - if (FLEXONENAND(this)) { + + /* Enter OTP access mode */ + this->command(mtd, ONENAND_CMD_OTP_ACCESS, 0, 0); + this->wait(mtd, FL_OTPING); /* * For Flex-OneNAND, we write lock mark to 1st word of sector 4 of * main area of page 49. 
@@ -2708,19 +2924,19 @@ static int do_otp_lock(struct mtd_info *mtd, loff_t from, size_t len, ops.oobbuf = NULL; ret = onenand_write_ops_nolock(mtd, mtd->writesize * 49, &ops); *retlen = ops.retlen; + + /* Exit OTP access mode */ + this->command(mtd, ONENAND_CMD_RESET, 0, 0); + this->wait(mtd, FL_RESETING); } else { ops.mode = MTD_OOB_PLACE; ops.ooblen = len; ops.oobbuf = buf; ops.ooboffs = 0; - ret = onenand_write_oob_nolock(mtd, from, &ops); + ret = onenand_otp_write_oob_nolock(mtd, from, &ops); *retlen = ops.oobretlen; } - /* Exit OTP access mode */ - this->command(mtd, ONENAND_CMD_RESET, 0, 0); - this->wait(mtd, FL_RESETING); - return ret; } @@ -2751,16 +2967,21 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len, if (density < ONENAND_DEVICE_DENSITY_512Mb) otp_pages = 20; else - otp_pages = 10; + otp_pages = 50; if (mode == MTD_OTP_FACTORY) { from += mtd->writesize * otp_pages; - otp_pages = 64 - otp_pages; + otp_pages = ONENAND_PAGES_PER_BLOCK - otp_pages; } /* Check User/Factory boundary */ - if (((mtd->writesize * otp_pages) - (from + len)) < 0) - return 0; + if (mode == MTD_OTP_USER) { + if (((mtd->writesize * otp_pages) - (from + len)) < 0) + return 0; + } else { + if (((mtd->writesize * otp_pages) - len) < 0) + return 0; + } onenand_get_device(mtd, FL_OTPING); while (len > 0 && otp_pages > 0) { @@ -2783,13 +3004,12 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len, *retlen += sizeof(struct otp_info); } else { size_t tmp_retlen; - int size = len; ret = action(mtd, from, len, &tmp_retlen, buf); - buf += size; - len -= size; - *retlen += size; + buf += tmp_retlen; + len -= tmp_retlen; + *retlen += tmp_retlen; if (ret) break; @@ -2902,20 +3122,10 @@ static int onenand_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, u_char *buf = FLEXONENAND(this) ? this->page_buf : this->oob_buf; size_t retlen; int ret; + unsigned int otp_lock_offset = ONENAND_OTP_LOCK_OFFSET; memset(buf, 0xff, FLEXONENAND(this) ? 
this->writesize : mtd->oobsize); - /* - * Note: OTP lock operation - * OTP block : 0xXXFC - * 1st block : 0xXXF3 (If chip support) - * Both : 0xXXF0 (If chip support) - */ - if (FLEXONENAND(this)) - buf[FLEXONENAND_OTP_LOCK_OFFSET] = 0xFC; - else - buf[ONENAND_OTP_LOCK_OFFSET] = 0xFC; - /* * Write lock mark to 8th word of sector0 of page0 of the spare0. * We write 16 bytes spare area instead of 2 bytes. @@ -2926,10 +3136,30 @@ static int onenand_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, from = 0; len = FLEXONENAND(this) ? mtd->writesize : 16; + /* + * Note: OTP lock operation + * OTP block : 0xXXFC XX 1111 1100 + * 1st block : 0xXXF3 (If chip support) XX 1111 0011 + * Both : 0xXXF0 (If chip support) XX 1111 0000 + */ + if (FLEXONENAND(this)) + otp_lock_offset = FLEXONENAND_OTP_LOCK_OFFSET; + + /* ONENAND_OTP_AREA | ONENAND_OTP_BLOCK0 | ONENAND_OTP_AREA_BLOCK0 */ + if (otp == 1) + buf[otp_lock_offset] = 0xFC; + else if (otp == 2) + buf[otp_lock_offset] = 0xF3; + else if (otp == 3) + buf[otp_lock_offset] = 0xF0; + else if (otp != 0) + printk(KERN_DEBUG "[OneNAND] Invalid option selected for OTP\n"); + ret = onenand_otp_walk(mtd, from, len, &retlen, buf, do_otp_lock, MTD_OTP_USER); return ret ? 
: retlen; } + #endif /* CONFIG_MTD_ONENAND_OTP */ /** diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index f57e29e17bb0..5509eb06b326 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -1,7 +1,7 @@ /* * linux/include/linux/mtd/onenand.h * - * Copyright (C) 2005-2007 Samsung Electronics + * Copyright © 2005-2009 Samsung Electronics * Kyungmin Park * * This program is free software; you can redistribute it and/or modify @@ -137,6 +137,8 @@ struct onenand_chip { /* * Helper macros */ +#define ONENAND_PAGES_PER_BLOCK (1<<6) + #define ONENAND_CURRENT_BUFFERRAM(this) (this->bufferram_index) #define ONENAND_NEXT_BUFFERRAM(this) (this->bufferram_index ^ 1) #define ONENAND_SET_NEXT_BUFFERRAM(this) (this->bufferram_index ^= 1) -- cgit v1.2.3 From 1c63aca32903efc219fb9df72bae5344f3e54ed5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 22 Oct 2009 16:53:32 +0900 Subject: mtd: Add __nand_calculate_ecc() to NAND ECC functions Add __nand_calculate_ecc() which does not take struct mtd_info. The built-in 256/512 software ECC calculation and correction tester will use it. 
Signed-off-by: Akinobu Mita Acked-by: Vimal Singh Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_ecc.c | 25 ++++++++++++++++++++----- include/linux/mtd/nand_ecc.h | 10 ++++++++-- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index db7ae9d6a296..809fb53304ae 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -150,20 +150,19 @@ static const char addressbits[256] = { }; /** - * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte + * __nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte * block - * @mtd: MTD block structure * @buf: input buffer with raw data + * @eccsize: data bytes per ecc step (256 or 512) * @code: output buffer with ECC */ -int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, +void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, unsigned char *code) { int i; const uint32_t *bp = (uint32_t *)buf; /* 256 or 512 bytes/ecc */ - const uint32_t eccsize_mult = - (((struct nand_chip *)mtd->priv)->ecc.size) >> 8; + const uint32_t eccsize_mult = eccsize >> 8; uint32_t cur; /* current value in buffer */ /* rp0..rp15..rp17 are the various accumulated parities (per byte) */ uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; @@ -412,6 +411,22 @@ int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, (invparity[par & 0x55] << 2) | (invparity[rp17] << 1) | (invparity[rp16] << 0); +} +EXPORT_SYMBOL(__nand_calculate_ecc); + +/** + * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte + * block + * @mtd: MTD block structure + * @buf: input buffer with raw data + * @code: output buffer with ECC + */ +int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, + unsigned char *code) +{ + __nand_calculate_ecc(buf, + ((struct nand_chip *)mtd->priv)->ecc.size, code); + 
return 0; } EXPORT_SYMBOL(nand_calculate_ecc); diff --git a/include/linux/mtd/nand_ecc.h b/include/linux/mtd/nand_ecc.h index 052ea8ca2434..41bc013571d0 100644 --- a/include/linux/mtd/nand_ecc.h +++ b/include/linux/mtd/nand_ecc.h @@ -16,7 +16,13 @@ struct mtd_info; /* - * Calculate 3 byte ECC code for 256 byte block + * Calculate 3 byte ECC code for eccsize byte block + */ +void __nand_calculate_ecc(const u_char *dat, unsigned int eccsize, + u_char *ecc_code); + +/* + * Calculate 3 byte ECC code for 256/512 byte block */ int nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat, u_char *ecc_code); @@ -27,7 +33,7 @@ int __nand_correct_data(u_char *dat, u_char *read_ecc, u_char *calc_ecc, unsigned int eccsize); /* - * Detect and correct a 1 bit error for 256 byte block + * Detect and correct a 1 bit error for 256/512 byte block */ int nand_correct_data(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, u_char *calc_ecc); -- cgit v1.2.3 From 72073027ee95d059eb5a064da4a978efab36d4ab Mon Sep 17 00:00:00 2001 From: Mika Korhonen Date: Fri, 23 Oct 2009 07:50:43 +0200 Subject: mtd: OneNAND: multiblock erase support Add support for multiblock erase command. OneNANDs (excluding Flex-OneNAND) are capable of simultaneous erase of up to 64 eraseblocks which is much faster. This changes the erase requests for regions covering multiple eraseblocks to be performed using multiblock erase. 
Signed-off-by: Mika Korhonen Reviewed-by: Adrian Hunter Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/onenand/omap2.c | 22 ++++- drivers/mtd/onenand/onenand_base.c | 173 ++++++++++++++++++++++++++++++++++++- include/linux/mtd/flashchip.h | 4 +- include/linux/mtd/onenand_regs.h | 2 + 4 files changed, 194 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index 0108ed42e877..2dafd0949be5 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -112,10 +112,24 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state) unsigned long timeout; u32 syscfg; - if (state == FL_RESETING) { - int i; + if (state == FL_RESETING || state == FL_PREPARING_ERASE || + state == FL_VERIFYING_ERASE) { + int i = 21; + unsigned int intr_flags = ONENAND_INT_MASTER; + + switch (state) { + case FL_RESETING: + intr_flags |= ONENAND_INT_RESET; + break; + case FL_PREPARING_ERASE: + intr_flags |= ONENAND_INT_ERASE; + break; + case FL_VERIFYING_ERASE: + i = 101; + break; + } - for (i = 0; i < 20; i++) { + while (--i) { udelay(1); intr = read_reg(c, ONENAND_REG_INTERRUPT); if (intr & ONENAND_INT_MASTER) @@ -126,7 +140,7 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state) wait_err("controller error", state, ctrl, intr); return -EIO; } - if (!(intr & ONENAND_INT_RESET)) { + if ((intr & intr_flags) != intr_flags) { wait_err("timeout", state, ctrl, intr); return -EIO; } diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 894ebadc3e69..266f471901e5 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -34,6 +34,13 @@ #include +/* + * Multiblock erase if number of blocks to erase is 2 or more. + * Maximum number of blocks for simultaneous erase is 64. 
+ */ +#define MB_ERASE_MIN_BLK_COUNT 2 +#define MB_ERASE_MAX_BLK_COUNT 64 + /* Default Flex-OneNAND boundary and lock respectively */ static int flex_bdry[MAX_DIES * 2] = { -1, 0, -1, 0 }; @@ -353,6 +360,8 @@ static int onenand_command(struct mtd_info *mtd, int cmd, loff_t addr, size_t le break; case ONENAND_CMD_ERASE: + case ONENAND_CMD_MULTIBLOCK_ERASE: + case ONENAND_CMD_ERASE_VERIFY: case ONENAND_CMD_BUFFERRAM: case ONENAND_CMD_OTP_ACCESS: block = onenand_block(this, addr); @@ -497,7 +506,7 @@ static int onenand_wait(struct mtd_info *mtd, int state) if (interrupt & flags) break; - if (state != FL_READING) + if (state != FL_READING && state != FL_PREPARING_ERASE) cond_resched(); } /* To get correct interrupt status in timeout case */ @@ -530,6 +539,18 @@ static int onenand_wait(struct mtd_info *mtd, int state) return -EIO; } + if (state == FL_PREPARING_ERASE && !(interrupt & ONENAND_INT_ERASE)) { + printk(KERN_ERR "%s: mb erase timeout! ctrl=0x%04x intr=0x%04x\n", + __func__, ctrl, interrupt); + return -EIO; + } + + if (!(interrupt & ONENAND_INT_MASTER)) { + printk(KERN_ERR "%s: timeout! 
ctrl=0x%04x intr=0x%04x\n", + __func__, ctrl, interrupt); + return -EIO; + } + /* If there's controller error, it's a real error */ if (ctrl & ONENAND_CTRL_ERROR) { printk(KERN_ERR "%s: controller error = 0x%04x\n", @@ -2182,6 +2203,148 @@ static int onenand_block_isbad_nolock(struct mtd_info *mtd, loff_t ofs, int allo return bbm->isbad_bbt(mtd, ofs, allowbbt); } + +static int onenand_multiblock_erase_verify(struct mtd_info *mtd, + struct erase_info *instr) +{ + struct onenand_chip *this = mtd->priv; + loff_t addr = instr->addr; + int len = instr->len; + unsigned int block_size = (1 << this->erase_shift); + int ret = 0; + + while (len) { + this->command(mtd, ONENAND_CMD_ERASE_VERIFY, addr, block_size); + ret = this->wait(mtd, FL_VERIFYING_ERASE); + if (ret) { + printk(KERN_ERR "%s: Failed verify, block %d\n", + __func__, onenand_block(this, addr)); + instr->state = MTD_ERASE_FAILED; + instr->fail_addr = addr; + return -1; + } + len -= block_size; + addr += block_size; + } + return 0; +} + +/** + * onenand_multiblock_erase - [Internal] erase block(s) using multiblock erase + * @param mtd MTD device structure + * @param instr erase instruction + * @param region erase region + * + * Erase one or more blocks up to 64 block at a time + */ +static int onenand_multiblock_erase(struct mtd_info *mtd, + struct erase_info *instr, + unsigned int block_size) +{ + struct onenand_chip *this = mtd->priv; + loff_t addr = instr->addr; + int len = instr->len; + int eb_count = 0; + int ret = 0; + int bdry_block = 0; + + instr->state = MTD_ERASING; + + if (ONENAND_IS_DDP(this)) { + loff_t bdry_addr = this->chipsize >> 1; + if (addr < bdry_addr && (addr + len) > bdry_addr) + bdry_block = bdry_addr >> this->erase_shift; + } + + /* Pre-check bbs */ + while (len) { + /* Check if we have a bad block, we do not erase bad blocks */ + if (onenand_block_isbad_nolock(mtd, addr, 0)) { + printk(KERN_WARNING "%s: attempt to erase a bad block " + "at addr 0x%012llx\n", + __func__, (unsigned long 
long) addr); + instr->state = MTD_ERASE_FAILED; + return -EIO; + } + len -= block_size; + addr += block_size; + } + + len = instr->len; + addr = instr->addr; + + /* loop over 64 eb batches */ + while (len) { + struct erase_info verify_instr = *instr; + int max_eb_count = MB_ERASE_MAX_BLK_COUNT; + + verify_instr.addr = addr; + verify_instr.len = 0; + + /* do not cross chip boundary */ + if (bdry_block) { + int this_block = (addr >> this->erase_shift); + + if (this_block < bdry_block) { + max_eb_count = min(max_eb_count, + (bdry_block - this_block)); + } + } + + eb_count = 0; + + while (len > block_size && eb_count < (max_eb_count - 1)) { + this->command(mtd, ONENAND_CMD_MULTIBLOCK_ERASE, + addr, block_size); + onenand_invalidate_bufferram(mtd, addr, block_size); + + ret = this->wait(mtd, FL_PREPARING_ERASE); + if (ret) { + printk(KERN_ERR "%s: Failed multiblock erase, " + "block %d\n", __func__, + onenand_block(this, addr)); + instr->state = MTD_ERASE_FAILED; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + return -EIO; + } + + len -= block_size; + addr += block_size; + eb_count++; + } + + /* last block of 64-eb series */ + cond_resched(); + this->command(mtd, ONENAND_CMD_ERASE, addr, block_size); + onenand_invalidate_bufferram(mtd, addr, block_size); + + ret = this->wait(mtd, FL_ERASING); + /* Check if it is write protected */ + if (ret) { + printk(KERN_ERR "%s: Failed erase, block %d\n", + __func__, onenand_block(this, addr)); + instr->state = MTD_ERASE_FAILED; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + return -EIO; + } + + len -= block_size; + addr += block_size; + eb_count++; + + /* verify */ + verify_instr.len = eb_count * block_size; + if (onenand_multiblock_erase_verify(mtd, &verify_instr)) { + instr->state = verify_instr.state; + instr->fail_addr = verify_instr.fail_addr; + return -EIO; + } + + } + return 0; +} + + /** * onenand_block_by_block_erase - [Internal] erase block(s) using regular erase * @param mtd MTD device structure @@ -2315,7 +2478,13 @@ 
static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) /* Grab the lock and see if the device is available */ onenand_get_device(mtd, FL_ERASING); - ret = onenand_block_by_block_erase(mtd, instr, region, block_size); + if (region || instr->len < MB_ERASE_MIN_BLK_COUNT * block_size) { + /* region is set for Flex-OneNAND (no mb erase) */ + ret = onenand_block_by_block_erase(mtd, instr, + region, block_size); + } else { + ret = onenand_multiblock_erase(mtd, instr, block_size); + } /* Deselect and wake up anyone waiting on the device */ onenand_release_device(mtd); diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h index f350a4879f75..d0bf422ae374 100644 --- a/include/linux/mtd/flashchip.h +++ b/include/linux/mtd/flashchip.h @@ -41,9 +41,11 @@ typedef enum { /* These 2 come from nand_state_t, which has been unified here */ FL_READING, FL_CACHEDPRG, - /* These 2 come from onenand_state_t, which has been unified here */ + /* These 4 come from onenand_state_t, which has been unified here */ FL_RESETING, FL_OTPING, + FL_PREPARING_ERASE, + FL_VERIFYING_ERASE, FL_UNKNOWN } flstate_t; diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h index acadbf53a69f..cd6f3b431195 100644 --- a/include/linux/mtd/onenand_regs.h +++ b/include/linux/mtd/onenand_regs.h @@ -131,6 +131,8 @@ #define ONENAND_CMD_LOCK_TIGHT (0x2C) #define ONENAND_CMD_UNLOCK_ALL (0x27) #define ONENAND_CMD_ERASE (0x94) +#define ONENAND_CMD_MULTIBLOCK_ERASE (0x95) +#define ONENAND_CMD_ERASE_VERIFY (0x71) #define ONENAND_CMD_RESET (0xF0) #define ONENAND_CMD_OTP_ACCESS (0x65) #define ONENAND_CMD_READID (0x90) -- cgit v1.2.3 From b1c6e6db5bb7acad82e1c64914c6a9404dae3ee1 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 2 Nov 2009 18:12:33 +0000 Subject: mtd: nand: add option to quieten off the no device found message Add NAND_SCAN_SILENT_NODEV to chip->options to silence the user-worrying message 'No NAND device found!!!'.
This message often worries users (was three exclamation marks really necessary?) and especially in systems such as the Simtec Osiris where there may be optional NAND devices which are not known until probe time. Revised version of the original NAND_PROBE_SPECULATIVE patch after comments by Artem Bityutskiy about adding a whole new call. Signed-off-by: Ben Dooks Signed-off-by: Simtec Linux Team Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 3 ++- include/linux/mtd/nand.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index ba06473326d1..724cb2c9ad3f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2756,7 +2756,8 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips) type = nand_get_flash_type(mtd, chip, busw, &nand_maf_id); if (IS_ERR(type)) { - printk(KERN_WARNING "No NAND device found!!!\n"); + if (!(chip->options & NAND_SCAN_SILENT_NODEV)) + printk(KERN_WARNING "No NAND device found.\n"); chip->select_chip(mtd, -1); return PTR_ERR(type); } diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2476078a032f..ccab9dfc5217 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -170,7 +170,6 @@ typedef enum { /* Chip does not allow subpage writes */ #define NAND_NO_SUBPAGE_WRITE 0x00000200 - /* Options valid for Samsung large page devices */ #define NAND_SAMSUNG_LP_OPTIONS \ (NAND_NO_PADDING | NAND_CACHEPRG | NAND_COPYBACK) @@ -196,6 +195,9 @@ typedef enum { /* This option is defined if the board driver allocates its own buffers (e.g. 
because it needs them DMA-coherent */ #define NAND_OWN_BUFFERS 0x00040000 +/* Chip may not exist, so silence any errors in scan */ +#define NAND_SCAN_SILENT_NODEV 0x00080000 + /* Options set by nand scan */ /* Nand scan has allocated controller struct */ #define NAND_CONTROLLER_ALLOC 0x80000000 -- cgit v1.2.3 From b2ef1a2bb2eb49cd7c75b22f1ea40ead0bdfdb8a Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 5 Nov 2009 15:53:43 +0100 Subject: mtd: move manufacturer to the common cfi.h header file This patch moves the MANUFACTURER_ST and MANUFACTURER_INTEL to the include/linux/mtd/cfi.h header file and renames them to CFI_MFR_ST and CFI_MFR_INTEL. CFI_MFR_ST was already present there. All references in drivers/mtd/chips/cfi_cmdset_0001.c are updated to reflect this. Signed-off-by: Hans-Christian Egtvedt Acked-by: Nicolas Pitre Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0001.c | 20 ++++++++++---------- include/linux/mtd/cfi.h | 9 +++++---- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index 94be67e880a7..5fbf29e1e64f 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -43,11 +43,11 @@ // debugging, turns off buffer write mode if set to 1 #define FORCE_WORD_WRITE 0 -#define MANUFACTURER_INTEL 0x0089 +/* Intel chips */ #define I82802AB 0x00ad #define I82802AC 0x00ac #define PF38F4476 0x881c -#define MANUFACTURER_ST 0x0020 +/* STMicroelectronics chips */ #define M50LPW080 0x002F #define M50FLW080A 0x0080 #define M50FLW080B 0x0081 @@ -308,16 +308,16 @@ static struct cfi_fixup cfi_fixup_table[] = { #endif { CFI_MFR_ST, 0x00ba, /* M28W320CT */ fixup_st_m28w320ct, NULL }, { CFI_MFR_ST, 0x00bb, /* M28W320CB */ fixup_st_m28w320cb, NULL }, - { MANUFACTURER_INTEL, CFI_ID_ANY, fixup_unlock_powerup_lock, NULL, }, + { CFI_MFR_INTEL, CFI_ID_ANY, 
fixup_unlock_powerup_lock, NULL, }, { 0, 0, NULL, NULL } }; static struct cfi_fixup jedec_fixup_table[] = { - { MANUFACTURER_INTEL, I82802AB, fixup_use_fwh_lock, NULL, }, - { MANUFACTURER_INTEL, I82802AC, fixup_use_fwh_lock, NULL, }, - { MANUFACTURER_ST, M50LPW080, fixup_use_fwh_lock, NULL, }, - { MANUFACTURER_ST, M50FLW080A, fixup_use_fwh_lock, NULL, }, - { MANUFACTURER_ST, M50FLW080B, fixup_use_fwh_lock, NULL, }, + { CFI_MFR_INTEL, I82802AB, fixup_use_fwh_lock, NULL, }, + { CFI_MFR_INTEL, I82802AC, fixup_use_fwh_lock, NULL, }, + { CFI_MFR_ST, M50LPW080, fixup_use_fwh_lock, NULL, }, + { CFI_MFR_ST, M50FLW080A, fixup_use_fwh_lock, NULL, }, + { CFI_MFR_ST, M50FLW080B, fixup_use_fwh_lock, NULL, }, { 0, 0, NULL, NULL } }; static struct cfi_fixup fixup_table[] = { @@ -333,7 +333,7 @@ static struct cfi_fixup fixup_table[] = { static void cfi_fixup_major_minor(struct cfi_private *cfi, struct cfi_pri_intelext *extp) { - if (cfi->mfr == MANUFACTURER_INTEL && + if (cfi->mfr == CFI_MFR_INTEL && cfi->id == PF38F4476 && extp->MinorVersion == '3') extp->MinorVersion = '1'; } @@ -2249,7 +2249,7 @@ static int cfi_intelext_otp_walk(struct mtd_info *mtd, loff_t from, size_t len, /* Some chips have OTP located in the _top_ partition only. 
For example: Intel 28F256L18T (T means top-parameter device) */ - if (cfi->mfr == MANUFACTURER_INTEL) { + if (cfi->mfr == CFI_MFR_INTEL) { switch (cfi->id) { case 0x880b: case 0x880c: diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 88d3d8fbf9f2..df89f4275232 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -518,10 +518,11 @@ struct cfi_fixup { #define CFI_MFR_ANY 0xffff #define CFI_ID_ANY 0xffff -#define CFI_MFR_AMD 0x0001 -#define CFI_MFR_ATMEL 0x001F -#define CFI_MFR_SAMSUNG 0x00EC -#define CFI_MFR_ST 0x0020 /* STMicroelectronics */ +#define CFI_MFR_AMD 0x0001 +#define CFI_MFR_INTEL 0x0089 +#define CFI_MFR_ATMEL 0x001F +#define CFI_MFR_SAMSUNG 0x00EC +#define CFI_MFR_ST 0x0020 /* STMicroelectronics */ void cfi_fixup(struct mtd_info *mtd, struct cfi_fixup* fixups); -- cgit v1.2.3 From 456b565cc52fbcdaa2e19ffdf40d9dd3b726d603 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Fri, 16 Oct 2009 14:09:18 +0200 Subject: core: Add kernel message dumper to call on oopses and panics The core functionality is implemented as per Linus' suggestion from http://lists.infradead.org/pipermail/linux-mtd/2009-October/027620.html (with the kmsg_dump implementation by Linus). A struct kmsg_dumper has been added which contains a callback to dump the kernel log buffers on crashes. The kmsg_dump function gets called from oops_exit() and panic() and invokes these callbacks with the crash reason.
[dwmw2: Fix log_end handling] Signed-off-by: Simon Kagstrom Reviewed-by: Anders Grafstrom Reviewed-by: Linus Torvalds Acked-by: Ingo Molnar Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/kmsg_dump.h | 44 +++++++++++++++++ kernel/panic.c | 3 ++ kernel/printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 include/linux/kmsg_dump.h (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h new file mode 100644 index 000000000000..7f089ec5ef32 --- /dev/null +++ b/include/linux/kmsg_dump.h @@ -0,0 +1,44 @@ +/* + * linux/include/kmsg_dump.h + * + * Copyright (C) 2009 Net Insight AB + * + * Author: Simon Kagstrom + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + */ +#ifndef _LINUX_KMSG_DUMP_H +#define _LINUX_KMSG_DUMP_H + +#include + +enum kmsg_dump_reason { + KMSG_DUMP_OOPS, + KMSG_DUMP_PANIC, +}; + +/** + * struct kmsg_dumper - kernel crash message dumper structure + * @dump: The callback which gets called on crashes. The buffer is passed + * as two sections, where s1 (length l1) contains the older + * messages and s2 (length l2) contains the newer. 
+ * @list: Entry in the dumper list (private) + * @registered: Flag that specifies if this is already registered + */ +struct kmsg_dumper { + void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2); + struct list_head list; + int registered; +}; + +void kmsg_dump(enum kmsg_dump_reason reason); + +int kmsg_dump_register(struct kmsg_dumper *dumper); + +int kmsg_dump_unregister(struct kmsg_dumper *dumper); + +#endif /* _LINUX_KMSG_DUMP_H */ diff --git a/kernel/panic.c b/kernel/panic.c index bcdef26e3332..8c43226a544d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -74,6 +75,7 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif + kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -338,6 +340,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + kmsg_dump(KMSG_DUMP_OOPS); } #ifdef WANT_WARN_ON_SLOWPATH diff --git a/kernel/printk.c b/kernel/printk.c index f38b07f78a4e..051d1f50648f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1405,3 +1406,121 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, } EXPORT_SYMBOL(printk_timed_ratelimit); #endif + +static DEFINE_SPINLOCK(dump_list_lock); +static LIST_HEAD(dump_list); + +/** + * kmsg_dump_register - register a kernel log dumper. + * @dump: pointer to the kmsg_dumper structure + * + * Adds a kernel log dumper to the system. The dump callback in the + * structure will be called when the kernel oopses or panics and must be + * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. 
+ */ +int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EBUSY; + + /* The dump callback needs to be set */ + if (!dumper->dump) + return -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + /* Don't allow registering multiple times */ + if (!dumper->registered) { + dumper->registered = 1; + list_add_tail(&dumper->list, &dump_list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_register); + +/** + * kmsg_dump_unregister - unregister a kmsg dumper. + * @dump: pointer to the kmsg_dumper structure + * + * Removes a dump device from the system. Returns zero on success and + * %-EINVAL otherwise. + */ +int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + if (dumper->registered) { + dumper->registered = 0; + list_del(&dumper->list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_unregister); + +static const char const *kmsg_reasons[] = { + [KMSG_DUMP_OOPS] = "oops", + [KMSG_DUMP_PANIC] = "panic", +}; + +static const char *kmsg_to_str(enum kmsg_dump_reason reason) +{ + if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) + return "unknown"; + + return kmsg_reasons[reason]; +} + +/** + * kmsg_dump - dump kernel log to kernel message dumpers. + * @reason: the reason (oops, panic etc) for dumping + * + * Iterate through each of the dump devices and call the oops/panic + * callbacks with the log buffer. + */ +void kmsg_dump(enum kmsg_dump_reason reason) +{ + unsigned long end; + unsigned chars; + struct kmsg_dumper *dumper; + const char *s1, *s2; + unsigned long l1, l2; + unsigned long flags; + + /* Theoretically, the log could move on after we do this, but + there's not a lot we can do about that. The new messages + will overwrite the start of what we dump. 
*/ + spin_lock_irqsave(&logbuf_lock, flags); + end = log_end & LOG_BUF_MASK; + chars = logged_chars; + spin_unlock_irqrestore(&logbuf_lock, flags); + + if (logged_chars > end) { + s1 = log_buf + log_buf_len - logged_chars + end; + l1 = logged_chars - end; + + s2 = log_buf; + l2 = end; + } else { + s1 = ""; + l1 = 0; + + s2 = log_buf + end - logged_chars; + l2 = logged_chars; + } + + if (!spin_trylock_irqsave(&dump_list_lock, flags)) { + printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", + kmsg_to_str(reason)); + return; + } + list_for_each_entry(dumper, &dump_list, list) + dumper->dump(dumper, reason, s1, l1, s2, l2); + spin_unlock_irqrestore(&dump_list_lock, flags); +} -- cgit v1.2.3 From 595dd3d8bf953254d8d2f30f99c54fe09c470040 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 1 Dec 2009 10:52:02 -0800 Subject: kmsg_dump: fix build for CONFIG_PRINTK=n kmsg_dump() fails to build when CONFIG_PRINTK=n; provide stubs for the kmsg_dump*() functions when CONFIG_PRINTK=n. 
kernel/printk.c: In function 'kmsg_dump': kernel/printk.c:1501: error: 'log_buf_len' undeclared (first use in this function) kernel/printk.c:1502: error: 'logged_chars' undeclared (first use in this function) kernel/printk.c:1506: error: 'log_buf' undeclared (first use in this function) Signed-off-by: Randy Dunlap Acked-by: Simon Kagstrom Signed-off-by: David Woodhouse --- include/linux/kmsg_dump.h | 16 ++++++++++++++++ kernel/printk.c | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 7f089ec5ef32..e32aa268efac 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -35,10 +35,26 @@ struct kmsg_dumper { int registered; }; +#ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); int kmsg_dump_register(struct kmsg_dumper *dumper); int kmsg_dump_unregister(struct kmsg_dumper *dumper); +#else +static inline void kmsg_dump(enum kmsg_dump_reason reason) +{ +} + +static inline int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + return -EINVAL; +} + +static inline int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + return -EINVAL; +} +#endif #endif /* _LINUX_KMSG_DUMP_H */ diff --git a/kernel/printk.c b/kernel/printk.c index 051d1f50648f..2a564570f822 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1405,7 +1405,6 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } EXPORT_SYMBOL(printk_timed_ratelimit); -#endif static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); @@ -1524,3 +1523,4 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->dump(dumper, reason, s1, l1, s2, l2); spin_unlock_irqrestore(&dump_list_lock, flags); } +#endif -- cgit v1.2.3 From d2eecb03936878ec574ade5532fa83df7d75dde7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 7 Dec 2009 10:36:20 -0500 Subject: ext4: Use slab allocator for sub-page sized allocations Now that the SLUB seems to be fixed so that it 
respects the requested alignment, use kmem_cache_alloc() to allocator if the block size of the buffer heads to be allocated is less than the page size. Previously, we were using 16k page on a Power system for each buffer, even when the file system was using 1k or 4k block size. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 11 +---- 2 files changed, 134 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ac0d027595d0..c03d4dce4d76 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (jbd2_journal_recover(journal)) @@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal) return JBD2_TAG_SIZE32; } +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. 
There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need a mutex to protect access to + * jbd2_slab[] when allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; +static DECLARE_MUTEX(jbd2_slab_create_sem); + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + down(&jbd2_slab_create_sem); + if (jbd2_slab[i]) { + up(&jbd2_slab_create_sem); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + up(&jbd2_slab_create_sem); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == 0); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr =
vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + /* * Journal_head storage management */ @@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_revoke_caches(); jbd2_journal_destroy_jbd2_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 638ce4554c76..8ada2a129d08 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug; #define jbd_debug(f, a...) /**/ #endif -static inline void *jbd2_alloc(size_t size, gfp_t flags) -{ - return (void *)__get_free_pages(flags, get_order(size)); -} - -static inline void jbd2_free(void *ptr, size_t size) -{ - free_pages((unsigned long)ptr, get_order(size)); -}; +extern void *jbd2_alloc(size_t size, gfp_t flags); +extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -- cgit v1.2.3 From 85438592f179c126ad4cb9a280046d4f0a501e6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Nov 2009 17:53:21 +0900 Subject: percpu: remove compile warnings caused by __verify_pcpu_ptr() If percpu pointer is const, __verify_pcpu_ptr() triggers warnings like the following. 
drivers/net/loopback.c: In function 'loopback_get_stats': drivers/net/loopback.c:109: warning: initialization discards qualifiers from pointer target type Fix it by adding const to the verification target pointer used in __verify_pcpu_ptr(). Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 1fa36eb54b6a..68567c0b3a5d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -24,7 +24,7 @@ * input parameter is a percpu pointer. */ #define __verify_pcpu_ptr(ptr) do { \ - void __percpu *__vpp_verify = (typeof(ptr))NULL; \ + const void __percpu *__vpp_verify = (typeof(ptr))NULL; \ (void)__vpp_verify; \ } while (0) -- cgit v1.2.3 From 876fba43cc810e3c37ce26995933f9547b83cb0e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Nov 2009 15:22:15 +0100 Subject: ACPI: add const to acpi_check_resource_conflict() acpi_check_resource_conflict() doesn't change the resource it operates on, so the res parameter can be marked const. 
Signed-off-by: Jean Delvare Signed-off-by: Len Brown --- drivers/acpi/osl.c | 2 +- include/linux/acpi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 7c1c59ea9ec6..02e8464e480f 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1118,7 +1118,7 @@ __setup("acpi_enforce_resources=", acpi_enforce_resources_setup); /* Check for resource conflicts between ACPI OperationRegions and native * drivers */ -int acpi_check_resource_conflict(struct resource *res) +int acpi_check_resource_conflict(const struct resource *res) { struct acpi_res_list *res_list_elem; int ioport; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dfcd920c3e54..c920d2def4d3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -240,7 +240,7 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) #define NID_INVAL (-1) -int acpi_check_resource_conflict(struct resource *res); +int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); -- cgit v1.2.3 From cc9b2e9f6603190c009e5d2629ce8e3f99571346 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 26 Nov 2009 09:50:20 -0600 Subject: [SCSI] enclosure: fix oops while iterating enclosure_status array Based on patch originally by Jeff Mahoney enclosure_status is expected to be a NULL terminated array of strings but isn't actually NULL terminated. When writing an invalid value to /sys/class/enclosure/.../.../status, it goes off the end of the array and Oopses. Fix by making the assumption true and adding NULL at the end. 
Reported-by: Artur Wojcik Signed-off-by: James Bottomley --- drivers/misc/enclosure.c | 1 + include/linux/enclosure.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index e9eae4a78402..1eac626e710a 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -391,6 +391,7 @@ static const char *const enclosure_status [] = { [ENCLOSURE_STATUS_NOT_INSTALLED] = "not installed", [ENCLOSURE_STATUS_UNKNOWN] = "unknown", [ENCLOSURE_STATUS_UNAVAILABLE] = "unavailable", + [ENCLOSURE_STATUS_MAX] = NULL, }; static const char *const enclosure_type [] = { diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 90d1c2184112..9a33c5f7e126 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -42,6 +42,8 @@ enum enclosure_status { ENCLOSURE_STATUS_NOT_INSTALLED, ENCLOSURE_STATUS_UNKNOWN, ENCLOSURE_STATUS_UNAVAILABLE, + /* last element for counting purposes */ + ENCLOSURE_STATUS_MAX }; /* SFF-8485 activity light settings */ -- cgit v1.2.3 From f7b3a8355ba6cad251297844a0bdd08898ea36e0 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:26:58 -0700 Subject: of/flattree: Merge early_init_dt_check_for_initrd() Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/prom.c | 32 -------------------------------- arch/powerpc/kernel/prom.c | 30 ------------------------------ drivers/of/fdt.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 1 + 4 files changed, 38 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index a38e3733a09c..7959495b1d00 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -113,38 +113,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -#ifdef 
CONFIG_BLK_DEV_INITRD -static void __init early_init_dt_check_for_initrd(unsigned long node) -{ - unsigned long l; - u32 *prop; - - pr_debug("Looking for initrd properties... "); - - prop = of_get_flat_dt_prop(node, "linux,initrd-start", &l); - if (prop) { - initrd_start = (unsigned long) - __va((u32)of_read_ulong(prop, l/4)); - - prop = of_get_flat_dt_prop(node, "linux,initrd-end", &l); - if (prop) { - initrd_end = (unsigned long) - __va((u32)of_read_ulong(prop, 1/4)); - initrd_below_start_ok = 1; - } else { - initrd_start = 0; - } - } - - pr_debug("initrd_start=0x%lx initrd_end=0x%lx\n", - initrd_start, initrd_end); -} -#else -static inline void early_init_dt_check_for_initrd(unsigned long node) -{ -} -#endif /* CONFIG_BLK_DEV_INITRD */ - static int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data) { diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7f8856655144..1ecd6c6ecabd 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -373,36 +373,6 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -#ifdef CONFIG_BLK_DEV_INITRD -static void __init early_init_dt_check_for_initrd(unsigned long node) -{ - unsigned long l; - u32 *prop; - - DBG("Looking for initrd properties... 
"); - - prop = of_get_flat_dt_prop(node, "linux,initrd-start", &l); - if (prop) { - initrd_start = (unsigned long)__va(of_read_ulong(prop, l/4)); - - prop = of_get_flat_dt_prop(node, "linux,initrd-end", &l); - if (prop) { - initrd_end = (unsigned long) - __va(of_read_ulong(prop, l/4)); - initrd_below_start_ok = 1; - } else { - initrd_start = 0; - } - } - - DBG("initrd_start=0x%lx initrd_end=0x%lx\n", initrd_start, initrd_end); -} -#else -static inline void early_init_dt_check_for_initrd(unsigned long node) -{ -} -#endif /* CONFIG_BLK_DEV_INITRD */ - static int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data) { diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 43d236cbc17b..6ad98e85dc93 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -369,6 +370,42 @@ unsigned long __init unflatten_dt_node(unsigned long mem, return mem; } +#ifdef CONFIG_BLK_DEV_INITRD +/** + * early_init_dt_check_for_initrd - Decode initrd location from flat tree + * @node: reference to node containing initrd location ('chosen') + */ +void __init early_init_dt_check_for_initrd(unsigned long node) +{ + unsigned long len; + u32 *prop; + + pr_debug("Looking for initrd properties... 
"); + + prop = of_get_flat_dt_prop(node, "linux,initrd-start", &len); + if (prop) { + initrd_start = (unsigned long) + __va(of_read_ulong(prop, len/4)); + + prop = of_get_flat_dt_prop(node, "linux,initrd-end", &len); + if (prop) { + initrd_end = (unsigned long) + __va(of_read_ulong(prop, len/4)); + initrd_below_start_ok = 1; + } else { + initrd_start = 0; + } + } + + pr_debug("initrd_start=0x%lx initrd_end=0x%lx\n", + initrd_start, initrd_end); +} +#else +inline void early_init_dt_check_for_initrd(unsigned long node) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 81231e04e8f3..ec2db8278c3f 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -69,6 +69,7 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern void early_init_dt_check_for_initrd(unsigned long node); /* Other Prototypes */ extern void finish_device_tree(void); -- cgit v1.2.3 From f00abd94918c9780f9d2d961fc0e419c11457922 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:27:10 -0700 Subject: of/flattree: Merge earlyinit_dt_scan_root() Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Tested-by: Wolfram Sang --- arch/microblaze/kernel/prom.c | 23 ----------------------- arch/powerpc/kernel/prom.c | 24 ------------------------ drivers/of/fdt.c | 26 ++++++++++++++++++++++++++ include/linux/of_fdt.h | 6 ++++++ 4 files changed, 32 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 7959495b1d00..189179a9b554 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -42,9 +42,6 @@ #include #include 
-static int __initdata dt_root_addr_cells; -static int __initdata dt_root_size_cells; - typedef u32 cell_t; /* export that to outside world */ @@ -158,26 +155,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static int __init early_init_dt_scan_root(unsigned long node, - const char *uname, int depth, void *data) -{ - u32 *prop; - - if (depth != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "#size-cells", NULL); - dt_root_size_cells = (prop == NULL) ? 1 : *prop; - pr_debug("dt_root_size_cells = %x\n", dt_root_size_cells); - - prop = of_get_flat_dt_prop(node, "#address-cells", NULL); - dt_root_addr_cells = (prop == NULL) ? 2 : *prop; - pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells); - - /* break now */ - return 1; -} - static u64 __init dt_mem_next_cell(int s, cell_t **cellp) { cell_t *p = *cellp; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1ecd6c6ecabd..78f65a4d8b03 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -61,10 +61,6 @@ #define DBG(fmt...) #endif - -static int __initdata dt_root_addr_cells; -static int __initdata dt_root_size_cells; - #ifdef CONFIG_PPC64 int __initdata iommu_is_off; int __initdata iommu_force_on; @@ -436,26 +432,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static int __init early_init_dt_scan_root(unsigned long node, - const char *uname, int depth, void *data) -{ - u32 *prop; - - if (depth != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "#size-cells", NULL); - dt_root_size_cells = (prop == NULL) ? 1 : *prop; - DBG("dt_root_size_cells = %x\n", dt_root_size_cells); - - prop = of_get_flat_dt_prop(node, "#address-cells", NULL); - dt_root_addr_cells = (prop == NULL) ? 
2 : *prop; - DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); - - /* break now */ - return 1; -} - static u64 __init dt_mem_next_cell(int s, cell_t **cellp) { cell_t *p = *cellp; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 6ad98e85dc93..be200be47269 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -15,6 +15,9 @@ #include #include +int __initdata dt_root_addr_cells; +int __initdata dt_root_size_cells; + struct boot_param_header *initial_boot_params; char *find_flat_dt_string(u32 offset) @@ -406,6 +409,29 @@ inline void early_init_dt_check_for_initrd(unsigned long node) } #endif /* CONFIG_BLK_DEV_INITRD */ +/** + * early_init_dt_scan_root - fetch the top level address and size cells + */ +int __init early_init_dt_scan_root(unsigned long node, const char *uname, + int depth, void *data) +{ + u32 *prop; + + if (depth != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "#size-cells", NULL); + dt_root_size_cells = (prop == NULL) ? 1 : *prop; + pr_debug("dt_root_size_cells = %x\n", dt_root_size_cells); + + prop = of_get_flat_dt_prop(node, "#address-cells", NULL); + dt_root_addr_cells = (prop == NULL) ? 
2 : *prop; + pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells); + + /* break now */ + return 1; +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index ec2db8278c3f..828c3cdaea78 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,6 +58,8 @@ struct boot_param_header { }; /* TBD: Temporary export of fdt globals - remove when code fully merged */ +extern int __initdata dt_root_addr_cells; +extern int __initdata dt_root_size_cells; extern struct boot_param_header *initial_boot_params; /* For scanning the flat device-tree at boot time */ @@ -71,6 +73,10 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern void early_init_dt_check_for_initrd(unsigned long node); +/* Early flat tree scan hooks */ +extern int early_init_dt_scan_root(unsigned long node, const char *uname, + int depth, void *data); + /* Other Prototypes */ extern void finish_device_tree(void); extern void unflatten_device_tree(void); -- cgit v1.2.3 From 83f7a06eb479e2aeb83536e77a2cb14cc2285e32 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 24 Nov 2009 03:37:56 -0700 Subject: of/flattree: merge dt_mem_next_cell Merge common code between PowerPC and Microblaze Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Tested-by: Wolfram Sang --- arch/microblaze/kernel/prom.c | 8 -------- arch/powerpc/kernel/prom.c | 8 -------- drivers/of/fdt.c | 8 ++++++++ include/linux/of_fdt.h | 1 + 4 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 189179a9b554..e0f4c34ed0f2 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -155,14 +155,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static u64 __init dt_mem_next_cell(int s, cell_t **cellp) -{ - 
cell_t *p = *cellp; - - *cellp = p + s; - return of_read_number(p, s); -} - static int __init early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data) { diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 78f65a4d8b03..048e3a3e9876 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -432,14 +432,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 1; } -static u64 __init dt_mem_next_cell(int s, cell_t **cellp) -{ - cell_t *p = *cellp; - - *cellp = p + s; - return of_read_number(p, s); -} - #ifdef CONFIG_PPC_PSERIES /* * Interpret the ibm,dynamic-memory property in the diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index be200be47269..ebce509b0886 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -432,6 +432,14 @@ int __init early_init_dt_scan_root(unsigned long node, const char *uname, return 1; } +u64 __init dt_mem_next_cell(int s, u32 **cellp) +{ + u32 *p = *cellp; + + *cellp = p + s; + return of_read_number(p, s); +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 828c3cdaea78..d1a37e56031e 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -72,6 +72,7 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern void early_init_dt_check_for_initrd(unsigned long node); +extern u64 dt_mem_next_cell(int s, u32 **cellp); /* Early flat tree scan hooks */ extern int early_init_dt_scan_root(unsigned long node, const char *uname, -- cgit v1.2.3 From 86e032213424958b45564d0cc96b3316641a49d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 10 Dec 2009 23:42:21 -0700 Subject: of/flattree: merge early_init_dt_scan_chosen() Merge common code between PowerPC and Microblaze. 
This patch splits the arch-specific stuff out into a new function, early_init_dt_scan_chosen_arch(). Signed-off-by: Grant Likely Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/prom.c | 44 ++--------------------------------------- arch/powerpc/kernel/prom.c | 46 ++++++++++--------------------------------- drivers/of/fdt.c | 38 +++++++++++++++++++++++++++++++++++ include/linux/of_fdt.h | 3 +++ 4 files changed, 53 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 50d8b09d5e3f..5505bcffd7dd 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -108,49 +108,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -static int __init early_init_dt_scan_chosen(unsigned long node, - const char *uname, int depth, void *data) +void __init early_init_dt_scan_chosen_arch(unsigned long node) { - unsigned long l; - char *p; - - pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - - if (depth != 1 || - (strcmp(uname, "chosen") != 0 && - strcmp(uname, "chosen@0") != 0)) - return 0; - -#ifdef CONFIG_KEXEC - lprop = (u64 *)of_get_flat_dt_prop(node, - "linux,crashkernel-base", NULL); - if (lprop) - crashk_res.start = *lprop; - - lprop = (u64 *)of_get_flat_dt_prop(node, - "linux,crashkernel-size", NULL); - if (lprop) - crashk_res.end = crashk_res.start + *lprop - 1; -#endif - - early_init_dt_check_for_initrd(node); - - /* Retreive command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); - -#ifdef CONFIG_CMDLINE -#ifndef CONFIG_CMDLINE_FORCE - if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) -#endif - strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif /* CONFIG_CMDLINE */ - - pr_debug("Command line is: %s\n", cmd_line); - - /* break now */ - return 1; + /* No Microblaze specific code here */ } 
static int __init early_init_dt_scan_memory(unsigned long node, diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 4e3181cded44..877fad9b3745 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -367,18 +367,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, return 0; } -static int __init early_init_dt_scan_chosen(unsigned long node, - const char *uname, int depth, void *data) +void __init early_init_dt_scan_chosen_arch(unsigned long node) { unsigned long *lprop; - unsigned long l; - char *p; - - DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - - if (depth != 1 || - (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) - return 0; #ifdef CONFIG_PPC64 /* check if iommu is forced on or off */ @@ -389,17 +380,17 @@ static int __init early_init_dt_scan_chosen(unsigned long node, #endif /* mem=x on the command line is the preferred mechanism */ - lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); - if (lprop) - memory_limit = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); + if (lprop) + memory_limit = *lprop; #ifdef CONFIG_PPC64 - lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); - if (lprop) - tce_alloc_start = *lprop; - lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); - if (lprop) - tce_alloc_end = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); + if (lprop) + tce_alloc_start = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); + if (lprop) + tce_alloc_end = *lprop; #endif #ifdef CONFIG_KEXEC @@ -411,23 +402,6 @@ static int __init early_init_dt_scan_chosen(unsigned long node, if (lprop) crashk_res.end = crashk_res.start + *lprop - 1; #endif - - early_init_dt_check_for_initrd(node); - - /* Retreive command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); - -#ifdef 
CONFIG_CMDLINE - if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) - strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif /* CONFIG_CMDLINE */ - - DBG("Command line is: %s\n", cmd_line); - - /* break now */ - return 1; } #ifdef CONFIG_PPC_PSERIES diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index ebce509b0886..616a4767a950 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -15,6 +15,10 @@ #include #include +#ifdef CONFIG_PPC +#include +#endif /* CONFIG_PPC */ + int __initdata dt_root_addr_cells; int __initdata dt_root_size_cells; @@ -440,6 +444,40 @@ u64 __init dt_mem_next_cell(int s, u32 **cellp) return of_read_number(p, s); } +int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, + int depth, void *data) +{ + unsigned long l; + char *p; + + pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname); + + if (depth != 1 || + (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) + return 0; + + early_init_dt_check_for_initrd(node); + + /* Retreive command line */ + p = of_get_flat_dt_prop(node, "bootargs", &l); + if (p != NULL && l > 0) + strlcpy(cmd_line, p, min((int)l, COMMAND_LINE_SIZE)); + +#ifdef CONFIG_CMDLINE +#ifndef CONFIG_CMDLINE_FORCE + if (p == NULL || l == 0 || (l == 1 && (*p) == 0)) +#endif + strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); +#endif /* CONFIG_CMDLINE */ + + early_init_dt_scan_chosen_arch(node); + + pr_debug("Command line is: %s\n", cmd_line); + + /* break now */ + return 1; +} + /** * unflatten_device_tree - create tree of device_nodes from flat blob * diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d1a37e56031e..8118d4559dd5 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -71,6 +71,9 @@ extern void *of_get_flat_dt_prop(unsigned long node, const char *name, unsigned long *size); extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); +extern void 
early_init_dt_scan_chosen_arch(unsigned long node); +extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, + int depth, void *data); extern void early_init_dt_check_for_initrd(unsigned long node); extern u64 dt_mem_next_cell(int s, u32 **cellp); -- cgit v1.2.3 From a88f6667078412e5eff37ead68a043ee0ec9f1da Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Thu, 10 Dec 2009 18:35:15 +0100 Subject: dmaengine: clarify the meaning of the DMA_CTRL_ACK flag DMA_CTRL_ACK's description applies to its clear state, not to its set state. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2b9f2ac7ed60..78784982b33e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -74,7 +74,7 @@ enum dma_transaction_type { * control completion, and communicate status. * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of * this transaction - * @DMA_CTRL_ACK - the descriptor cannot be reused until the client + * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client * acknowledges receipt, i.e. has has a chance to establish any dependency * chains * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) -- cgit v1.2.3 From 03889384cee7a198a79447c1ea6aca2c8e54d155 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Dec 2009 09:48:22 -0500 Subject: tracing: Add trace_dump_stack() I've been asked a few times about how to find out what is calling some location in the kernel. One way is to use dynamic function tracing and implement the func_stack_trace. But this only finds out who is calling a particular function. It does not tell you who is calling that function and entering a specific if conditional. 
I have myself implemented a quick version of trace_dump_stack() for this purpose a few times, and just needed it now. This is when I realized that this would be a good tool to have in the kernel like trace_printk(). Using trace_dump_stack() is similar to dump_stack() except that it writes to the trace buffer instead and can be used in critical locations. For example: @@ -5485,8 +5485,12 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - else + else { deactivate_task(rq, prev, 1); + trace_printk("Deactivating task %s:%d\n", + prev->comm, prev->pid); + trace_dump_stack(); + } switch_count = &prev->nvcsw; } Produces: <...>-3249 [001] 296.105269: schedule: Deactivating task ntpd:3249 <...>-3249 [001] 296.105270: => schedule => schedule_hrtimeout_range => poll_schedule_timeout => do_select => core_sys_select => sys_select => system_call_fastpath Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 2 ++ kernel/trace/trace.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3fa4c590cf12..5ad4199fb073 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -492,6 +492,8 @@ extern int __trace_printk(unsigned long ip, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); +extern void trace_dump_stack(void); + /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9ed..f531301b7a3b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1151,6 +1151,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, __ftrace_trace_stack(tr->buffer, flags, skip, pc); } +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { -- cgit v1.2.3 From 967c9ef9b8c3bdec1bd3a380edac19e0b9fbeadc Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 11 Dec 2009 22:00:57 -0800 Subject: Input: i8042 - allow installing platform filters for incoming data Some hardware (such as Dell laptops) signal a variety of events through the i8042 controller, even if these don't map to keyboard events. Add support for drivers to filter the i8042 event stream in order to respond to these events and (if appropriate) block them from entering the input stream. 
Signed-off-by: Matthew Garrett Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042.c | 58 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/i8042.h | 18 +++++++++++++- 2 files changed, 72 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 634da68f7f35..d84a36e545f6 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -126,6 +126,8 @@ static unsigned char i8042_suppress_kbd_ack; static struct platform_device *i8042_platform_device; static irqreturn_t i8042_interrupt(int irq, void *dev_id); +static bool (*i8042_platform_filter)(unsigned char data, unsigned char str, + struct serio *serio); void i8042_lock_chip(void) { @@ -139,6 +141,48 @@ void i8042_unlock_chip(void) } EXPORT_SYMBOL(i8042_unlock_chip); +int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *serio)) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&i8042_lock, flags); + + if (i8042_platform_filter) { + ret = -EBUSY; + goto out; + } + + i8042_platform_filter = filter; + +out: + spin_unlock_irqrestore(&i8042_lock, flags); + return ret; +} +EXPORT_SYMBOL(i8042_install_filter); + +int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *port)) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&i8042_lock, flags); + + if (i8042_platform_filter != filter) { + ret = -EINVAL; + goto out; + } + + i8042_platform_filter = NULL; + +out: + spin_unlock_irqrestore(&i8042_lock, flags); + return ret; +} +EXPORT_SYMBOL(i8042_remove_filter); + /* * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to * be ready for reading values from it / writing values to it. @@ -373,7 +417,8 @@ static void i8042_stop(struct serio *serio) * It is called from i8042_interrupt and thus is running with interrupts * off and i8042_lock held. 
*/ -static bool i8042_filter(unsigned char data, unsigned char str) +static bool i8042_filter(unsigned char data, unsigned char str, + struct serio *serio) { if (unlikely(i8042_suppress_kbd_ack)) { if ((~str & I8042_STR_AUXDATA) && @@ -384,6 +429,11 @@ static bool i8042_filter(unsigned char data, unsigned char str) } } + if (i8042_platform_filter && i8042_platform_filter(data, str, serio)) { + dbg("Filtered out by platfrom filter\n"); + return true; + } + return false; } @@ -396,6 +446,7 @@ static bool i8042_filter(unsigned char data, unsigned char str) static irqreturn_t i8042_interrupt(int irq, void *dev_id) { struct i8042_port *port; + struct serio *serio; unsigned long flags; unsigned char str, data; unsigned int dfl; @@ -462,18 +513,19 @@ static irqreturn_t i8042_interrupt(int irq, void *dev_id) } port = &i8042_ports[port_no]; + serio = port->exists ? port->serio : NULL; dbg("%02x <- i8042 (interrupt, %d, %d%s%s)", data, port_no, irq, dfl & SERIO_PARITY ? ", bad parity" : "", dfl & SERIO_TIMEOUT ? 
", timeout" : ""); - filtered = i8042_filter(data, str); + filtered = i8042_filter(data, str, serio); spin_unlock_irqrestore(&i8042_lock, flags); if (likely(port->exists && !filtered)) - serio_interrupt(port->serio, data, dfl); + serio_interrupt(serio, data, dfl); out: return IRQ_RETVAL(ret); diff --git a/include/linux/i8042.h b/include/linux/i8042.h index 60c3360ef6ad..9bf6870ee5f4 100644 --- a/include/linux/i8042.h +++ b/include/linux/i8042.h @@ -39,6 +39,10 @@ void i8042_lock_chip(void); void i8042_unlock_chip(void); int i8042_command(unsigned char *param, int command); bool i8042_check_port_owner(const struct serio *); +int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *serio)); +int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *serio)); #else @@ -52,7 +56,7 @@ void i8042_unlock_chip(void) int i8042_command(unsigned char *param, int command) { - return -ENOSYS; + return -ENODEV; } bool i8042_check_port_owner(const struct serio *serio) @@ -60,6 +64,18 @@ bool i8042_check_port_owner(const struct serio *serio) return false; } +int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *serio)) +{ + return -ENODEV; +} + +int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, + struct serio *serio)) +{ + return -ENODEV; +} + #endif #endif -- cgit v1.2.3 From 01fc0ac198eabcbf460e1ed058860a935b6c2c9a Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 19 Apr 2009 21:57:19 +0200 Subject: kbuild: move bounds.h to include/generated Signed-off-by: Sam Ravnborg Cc: Al Viro Signed-off-by: Michal Marek --- .gitignore | 1 - Kbuild | 2 +- Makefile | 2 +- include/linux/mmzone.h | 2 +- include/linux/page-flags.h | 2 +- kernel/bounds.c | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/.gitignore b/.gitignore index 946c7ec5c922..36d9cd6d4281 100644 --- a/.gitignore +++ b/.gitignore @@ 
-52,7 +52,6 @@ include/linux/autoconf.h include/linux/compile.h include/linux/version.h include/linux/utsrelease.h -include/linux/bounds.h include/generated # stgit generated dirs diff --git a/Kbuild b/Kbuild index f056b4feee51..1165d7a5ca4a 100644 --- a/Kbuild +++ b/Kbuild @@ -8,7 +8,7 @@ ##### # 1) Generate bounds.h -bounds-file := include/linux/bounds.h +bounds-file := include/generated/bounds.h always := $(bounds-file) targets := $(bounds-file) kernel/bounds.s diff --git a/Makefile b/Makefile index 07711786dc95..b58e9312ce30 100644 --- a/Makefile +++ b/Makefile @@ -1197,7 +1197,7 @@ MRPROPER_DIRS += include/config include2 usr/include include/generated MRPROPER_FILES += .config .config.old include/asm .version .old_version \ include/linux/autoconf.h include/linux/version.h \ include/linux/utsrelease.h \ - include/linux/bounds.h include/asm*/asm-offsets.h \ + include/asm*/asm-offsets.h \ Module.symvers Module.markers tags TAGS cscope* # clean - Delete most, but leave enough to build external modules diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f7561730d88..30fe668c2542 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b202b173955..ef36725aa515 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -8,7 +8,7 @@ #include #ifndef __GENERATING_BOUNDS_H #include -#include +#include #endif /* !__GENERATING_BOUNDS_H */ /* diff --git a/kernel/bounds.c b/kernel/bounds.c index 3c5301381837..98a51f26c136 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ void foo(void) { - /* The enum constants to put into include/linux/bounds.h */ + /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ -- cgit v1.2.3 From 
98b8788ae91694499d1995035625bea16a4db0c4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:39:40 +0200 Subject: drop explicit include of autoconf.h kbuild.h forces include of autoconf.h on the commandline using -include - so we do not need to include the file explicitly. Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- arch/cris/arch-v32/kernel/head.S | 1 - arch/cris/kernel/asm-offsets.c | 1 - arch/cris/kernel/vmlinux.lds.S | 1 - arch/ia64/kvm/asm-offsets.c | 1 - drivers/accessibility/braille/braille_console.c | 1 - drivers/hid/hid-lg.h | 2 -- drivers/platform/x86/compal-laptop.c | 1 - drivers/staging/iio/ring_sw.h | 1 - include/linux/mmdebug.h | 2 -- 9 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/arch/cris/arch-v32/kernel/head.S b/arch/cris/arch-v32/kernel/head.S index 3db478eb5155..76266f80a5f1 100644 --- a/arch/cris/arch-v32/kernel/head.S +++ b/arch/cris/arch-v32/kernel/head.S @@ -10,7 +10,6 @@ * The macros found in mmu_defs_asm.h uses the ## concatenation operator, so * -traditional must not be used when assembling this file. */ -#include #include #include #include diff --git a/arch/cris/kernel/asm-offsets.c b/arch/cris/kernel/asm-offsets.c index ddd6fbbe75de..dd7b8e983221 100644 --- a/arch/cris/kernel/asm-offsets.c +++ b/arch/cris/kernel/asm-offsets.c @@ -1,6 +1,5 @@ #include #include -#include /* * Generate definitions needed by assembly language modules. diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index bbfda67d2907..d49d17d2a14f 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -8,7 +8,6 @@ * the kernel has booted.
*/ -#include #include #include diff --git a/arch/ia64/kvm/asm-offsets.c b/arch/ia64/kvm/asm-offsets.c index 0c3564a7a033..9324c875caf5 100644 --- a/arch/ia64/kvm/asm-offsets.c +++ b/arch/ia64/kvm/asm-offsets.c @@ -22,7 +22,6 @@ * */ -#include #include #include diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c index d672cfe7ca59..cb423f5aef24 100644 --- a/drivers/accessibility/braille/braille_console.c +++ b/drivers/accessibility/braille/braille_console.c @@ -21,7 +21,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include #include diff --git a/drivers/hid/hid-lg.h b/drivers/hid/hid-lg.h index 27ae750ca878..bf31592eaf79 100644 --- a/drivers/hid/hid-lg.h +++ b/drivers/hid/hid-lg.h @@ -1,8 +1,6 @@ #ifndef __HID_LG_H #define __HID_LG_H -#include - #ifdef CONFIG_LOGITECH_FF int lgff_init(struct hid_device *hdev); #else diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c index 11003bba10d3..1a387e79f719 100644 --- a/drivers/platform/x86/compal-laptop.c +++ b/drivers/platform/x86/compal-laptop.c @@ -51,7 +51,6 @@ #include #include #include -#include #define COMPAL_DRIVER_VERSION "0.2.6" diff --git a/drivers/staging/iio/ring_sw.h b/drivers/staging/iio/ring_sw.h index f0b86f02cd80..fd677f008365 100644 --- a/drivers/staging/iio/ring_sw.h +++ b/drivers/staging/iio/ring_sw.h @@ -29,7 +29,6 @@ * driver requests - some may support multiple options */ -#include #include "iio.h" #include "ring_generic.h" diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 8a5509877192..ee24ef8ab616 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -1,8 +1,6 @@ #ifndef LINUX_MM_DEBUG_H #define LINUX_MM_DEBUG_H 1 -#include - #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) #else -- cgit v1.2.3 From 273b281fa22c293963ee3e6eec418f5dda2dbc83 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:52:28 
+0200 Subject: kbuild: move utsrelease.h to include/generated Fix up all users of utsrelease.h Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- .gitignore | 1 - Makefile | 5 ++--- arch/alpha/boot/bootp.c | 2 +- arch/alpha/boot/bootpz.c | 2 +- arch/alpha/boot/main.c | 2 +- arch/frv/kernel/setup.c | 2 +- arch/powerpc/platforms/52xx/efika.c | 2 +- arch/powerpc/platforms/amigaone/setup.c | 2 +- arch/powerpc/platforms/chrp/setup.c | 2 +- arch/powerpc/platforms/powermac/bootx_init.c | 2 +- arch/x86/boot/header.S | 2 +- arch/x86/boot/version.c | 2 +- drivers/staging/panel/panel.c | 2 +- include/linux/vermagic.h | 2 +- init/version.c | 2 +- kernel/kexec.c | 2 +- kernel/trace/trace.c | 2 +- 17 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/.gitignore b/.gitignore index c6c19ea6ea96..002d5304968b 100644 --- a/.gitignore +++ b/.gitignore @@ -47,7 +47,6 @@ Module.symvers # include/config include/linux/version.h -include/linux/utsrelease.h include/generated # stgit generated dirs diff --git a/Makefile b/Makefile index 3bdd932e3d88..860224d7cbcf 100644 --- a/Makefile +++ b/Makefile @@ -968,7 +968,7 @@ endif # prepare2 creates a makefile if using a separate output directory prepare2: prepare3 outputmakefile -prepare1: prepare2 include/linux/version.h include/linux/utsrelease.h \ +prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \ include/config/auto.conf $(cmd_crmodverdir) @@ -1005,7 +1005,7 @@ endef include/linux/version.h: $(srctree)/Makefile FORCE $(call filechk,version.h) -include/linux/utsrelease.h: include/config/kernel.release FORCE +include/generated/utsrelease.h: include/config/kernel.release FORCE $(call filechk,utsrelease.h) PHONY += headerdep @@ -1151,7 +1151,6 @@ CLEAN_FILES += vmlinux System.map \ MRPROPER_DIRS += include/config usr/include include/generated MRPROPER_FILES += .config .config.old .version .old_version \ include/linux/version.h \ - include/linux/utsrelease.h \ 
Module.symvers Module.markers tags TAGS cscope* # clean - Delete most, but leave enough to build external modules diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 3af21c789339..3c8d1b25c661 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 1036b515e20c..ade3f129dc27 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -11,7 +11,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 89f3be071ae5..644b7db55438 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index 55e4fab7c0bc..75cf7f4b2fa8 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c @@ -10,7 +10,7 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index bcc69e1f77c1..45c0cb9b67e6 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 9290a7a442d0..fb4eb0df054c 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index cd4ad9aea760..0a6f5ab8aab3 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c index cf660916ae0b..9dd789a7370d 100644 --- a/arch/powerpc/platforms/powermac/bootx_init.c +++ b/arch/powerpc/platforms/powermac/bootx_init.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b31cc54b4641..93e689f4bd86 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -16,7 +16,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index 4d88763e39cb..2b15aa488ffb 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -13,7 +13,7 @@ */ #include "boot.h" -#include +#include #include const char kernel_version[] = diff --git a/drivers/staging/panel/panel.c b/drivers/staging/panel/panel.c index 4ce399b6d237..f98a52448eae 100644 --- a/drivers/staging/panel/panel.c +++ b/drivers/staging/panel/panel.c @@ -55,7 +55,7 @@ #include 
#include #include -#include +#include #include #include diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 79b9837d9ca0..cf97b5b9d1fe 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -1,4 +1,4 @@ -#include +#include #include /* Simply sanity version stamp for modules. */ diff --git a/init/version.c b/init/version.c index 82328aaca1ef..adff586401a5 100644 --- a/init/version.c +++ b/init/version.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #ifndef CONFIG_KALLSYMS diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f98..83f54e2a6eed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9ed..bfb1b64bfa9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include -#include +#include #include #include #include -- cgit v1.2.3 From 7a77080dbec28ab2bceb422398601dcc53c142ad Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Thu, 12 Nov 2009 17:59:56 +0800 Subject: net: add net_tstamp.h to headers_install include/linux/net_tstamp.h is userspace API for hardware time stamping of network packets. It should be exported to userspace. 
Signed-off-by: Jie Zhang Signed-off-by: Barry Song Signed-off-by: Patrick Ohly Signed-off-by: Michal Marek --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index f72914db2a11..756f831cbdd5 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -118,6 +118,7 @@ header-y += mtio.h header-y += ncp_no.h header-y += neighbour.h header-y += net_dropmon.h +header-y += net_tstamp.h header-y += netfilter_arp.h header-y += netrom.h header-y += nfs2.h -- cgit v1.2.3 From c5df7f775148723de39274537a886e9502eef336 Mon Sep 17 00:00:00 2001 From: Albert Herranz Date: Sat, 12 Dec 2009 06:31:54 +0000 Subject: powerpc: allow ioremap within reserved memory regions Add a flag to let a platform ioremap memory regions marked as reserved. This flag will be used later by the Nintendo Wii support code to allow ioremapping the I/O region sitting between MEM1 and MEM2 and marked as reserved RAM in the patch "wii: use both mem1 and mem2 as ram". This will no longer be needed when proper discontig memory support for 32-bit PowerPC is added to the kernel. Signed-off-by: Albert Herranz Acked-by: Benjamin Herrenschmidt Signed-off-by: Grant Likely --- arch/powerpc/mm/init_32.c | 5 +++++ arch/powerpc/mm/mmu_decl.h | 1 + arch/powerpc/mm/pgtable_32.c | 4 +++- include/linux/lmb.h | 1 + lib/lmb.c | 7 ++++++- 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 703c7c2e0d9f..4ec900af332f 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -82,6 +82,11 @@ extern struct task_struct *current_set[NR_CPUS]; int __map_without_bats; int __map_without_ltlbs; +/* + * This tells the system to allow ioremapping memory marked as reserved. 
+ */ +int __allow_ioremap_reserved; + /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 9aa39fe74f8a..34dacc32250d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -115,6 +115,7 @@ extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, extern void invalidate_tlbcam_entry(int index); extern int __map_without_bats; +extern int __allow_ioremap_reserved; extern unsigned long ioremap_base; extern unsigned int rtas_data, rtas_size; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index b55bbe87acb8..177e4038b43c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -191,7 +192,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, * Don't allow anybody to remap normal RAM that we're using. * mem_init() sets high_memory so only do the check after that. 
*/ - if (mem_init_done && (p < virt_to_phys(high_memory))) { + if (mem_init_done && (p < virt_to_phys(high_memory)) && + !(__allow_ioremap_reserved && lmb_is_region_reserved(p, size))) { printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n", (unsigned long long)p, __builtin_return_address(0)); return NULL; diff --git a/include/linux/lmb.h b/include/linux/lmb.h index 2442e3f3d033..ef82b8fcbddb 100644 --- a/include/linux/lmb.h +++ b/include/linux/lmb.h @@ -54,6 +54,7 @@ extern u64 __init lmb_phys_mem_size(void); extern u64 lmb_end_of_DRAM(void); extern void __init lmb_enforce_memory_limit(u64 memory_limit); extern int __init lmb_is_reserved(u64 addr); +extern int lmb_is_region_reserved(u64 base, u64 size); extern int lmb_find(struct lmb_property *res); extern void lmb_dump_all(void); diff --git a/lib/lmb.c b/lib/lmb.c index 0343c05609f0..9cee17142b2c 100644 --- a/lib/lmb.c +++ b/lib/lmb.c @@ -263,7 +263,7 @@ long __init lmb_reserve(u64 base, u64 size) return lmb_add_region(_rgn, base, size); } -long __init lmb_overlaps_region(struct lmb_region *rgn, u64 base, u64 size) +long lmb_overlaps_region(struct lmb_region *rgn, u64 base, u64 size) { unsigned long i; @@ -493,6 +493,11 @@ int __init lmb_is_reserved(u64 addr) return 0; } +int lmb_is_region_reserved(u64 base, u64 size) +{ + return lmb_overlaps_region(&lmb.reserved, base, size); +} + /* * Given a , find which memory regions belong to this range. * Adjust the request and return a contiguous chunk. -- cgit v1.2.3 From 87d9b4e1c52867a45331a9a5495f6448e0c68b23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:14:20 +0800 Subject: tracing: Extract duplicate ftrace_raw_init_event_foo() Use a generic trace_event_raw_init() function for all event's raw_init callbacks (but kprobes) instead of defining the same version for each of these. 
This shrinks the kernel code: text data bss dec hex filename 5355293 1961928 7103260 14420481 dc0a01 vmlinux.o.old 5346802 1961864 7103260 14411926 dbe896 vmlinux.o raw_init can't be removed, because ftrace events and kprobe events use different raw_init callbacks. Though it's possible to totally remove raw_init, I choose to leave it as it is for now. Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Jason Baron Cc: Ingo Molnar LKML-Reference: <4B1DC48C.7080603@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 1 + include/linux/syscalls.h | 4 ++-- include/trace/ftrace.h | 35 ++++------------------------------- kernel/trace/trace_events.c | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 38f8d6553831..ea44b8911094 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -158,6 +158,7 @@ enum { FILTER_PTR_STRING, }; +extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_common_fields(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bc70c5810fec..94ac28437bef 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -145,7 +145,7 @@ struct perf_event_attr; .name = "sys_enter"#sname, \ .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ - .raw_init = init_syscall_trace, \ + .raw_init = trace_event_raw_init, \ .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ @@ -167,7 +167,7 @@ struct perf_event_attr; .name = "sys_exit"#sname, \ .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ - .raw_init = init_syscall_trace, \ + .raw_init = trace_event_raw_init, \ 
.show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c4eca380204e..6055b0604c86 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -623,23 +623,12 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * .trace = ftrace_raw_output_, <-- stage 2 * }; * - * static int ftrace_raw_init_event_(struct ftrace_event_call *unused) - * { - * int id; - * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; - * } - * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { * .name = "", * .system = "", - * .raw_init = ftrace_raw_init_event_, + * .raw_init = trace_event_raw_init, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, * .show_format = ftrace_format_, @@ -647,9 +636,6 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * */ -#undef TP_FMT -#define TP_FMT(fmt, args...) 
fmt "\n", ##args - #ifdef CONFIG_EVENT_PROFILE #define _TRACE_PROFILE_INIT(call) \ @@ -744,19 +730,7 @@ static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\ \ static struct trace_event ftrace_event_type_##call = { \ .trace = ftrace_raw_output_##call, \ -}; \ - \ -static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\ -{ \ - int id; \ - \ - id = register_ftrace_event(&ftrace_event_type_##call); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} +}; #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ @@ -776,7 +750,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .event = &ftrace_event_type_##call, \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##template, \ @@ -793,7 +767,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .event = &ftrace_event_type_##call, \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ @@ -953,7 +927,6 @@ end: \ perf_swevent_put_recursion_context(rctx); \ end_recursion: \ local_irq_restore(irq_flags); \ - \ } #undef DEFINE_EVENT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d18315dc836..8ed66e0d476b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -105,6 +105,20 @@ void trace_destroy_fields(struct ftrace_event_call *call) } } +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = 
id; + INIT_LIST_HEAD(&call->fields); + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { -- cgit v1.2.3 From 614a71a26ba3d97e9fa85649db69a682b78e407d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:14:36 +0800 Subject: tracing: Pull up calls to trace_define_common_fields() Call trace_define_common_fields() in event_create_dir() only. This avoids trace events to handle it from their define_fields callbacks and shrinks the kernel code size: text data bss dec hex filename 5346802 1961864 7103260 14411926 dbe896 vmlinux.o.old 5345151 1961864 7103260 14410275 dbe223 vmlinux.o Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Jason Baron Cc: Masami Hiramatsu LKML-Reference: <4B1DC49C.8000107@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 1 - include/trace/ftrace.h | 4 ---- kernel/trace/trace_events.c | 7 ++++--- kernel/trace/trace_export.c | 4 ---- kernel/trace/trace_kprobe.c | 8 -------- kernel/trace/trace_syscalls.c | 8 -------- 6 files changed, 4 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index ea44b8911094..db97c64ce0e9 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -159,7 +159,6 @@ enum { }; extern int trace_event_raw_init(struct ftrace_event_call *call); -extern int trace_define_common_fields(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 6055b0604c86..2af2f7a2c1bd 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -436,10 +436,6 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ struct ftrace_raw_##call field; \ int ret; \ \ - ret = 
trace_define_common_fields(event_call); \ - if (ret) \ - return ret; \ - \ tstruct; \ \ return ret; \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8ed66e0d476b..97b0b3aa166d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; -int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(struct ftrace_event_call *call) { int ret; struct trace_entry ent; @@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } -EXPORT_SYMBOL_GPL(trace_define_common_fields); void trace_destroy_fields(struct ftrace_event_call *call) { @@ -927,7 +926,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(call); + ret = trace_define_common_fields(call); + if (!ret) + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index dff8c84ddf17..458e5bfe26d0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -184,10 +184,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ struct struct_name field; \ int ret; \ \ - ret = trace_define_common_fields(event_call); \ - if (ret) \ - return ret; \ - \ tstruct; \ \ return ret; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b8..e3c80e925896 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1113,10 +1113,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct kprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (!ret) - return ret; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 
DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ @@ -1131,10 +1127,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct kretprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (!ret) - return ret; - DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 57501d90096a..b957edd0ca3b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; @@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; -- cgit v1.2.3 From e00bf2ec60605eb95687b7a0c3b83c87c48541dc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 8 Dec 2009 11:17:29 +0800 Subject: tracing: Change event->profile_count to be int type Like total_profile_count, struct ftrace_event_call::profile_count is protected by event_mutex, so it doesn't need to be atomic_t. 
Signed-off-by: Li Zefan Acked-by: Steven Rostedt Cc: Jason Baron Cc: Masami Hiramatsu Cc: Peter Zijlstra LKML-Reference: <4B1DC549.5010705@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 2 +- include/linux/syscalls.h | 2 -- include/trace/ftrace.h | 1 - kernel/trace/trace_event_profile.c | 6 +++--- kernel/trace/trace_kprobe.c | 1 - 5 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index db97c64ce0e9..2233c98d80df 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -131,7 +131,7 @@ struct ftrace_event_call { void *mod; void *data; - atomic_t profile_count; + int profile_count; int (*profile_enable)(struct ftrace_event_call *); void (*profile_disable)(struct ftrace_event_call *); }; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 94ac28437bef..72d69860d901 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -102,12 +102,10 @@ struct perf_event_attr; #ifdef CONFIG_EVENT_PROFILE #define TRACE_SYS_ENTER_PROFILE_INIT(sname) \ - .profile_count = ATOMIC_INIT(-1), \ .profile_enable = prof_sysenter_enable, \ .profile_disable = prof_sysenter_disable, #define TRACE_SYS_EXIT_PROFILE_INIT(sname) \ - .profile_count = ATOMIC_INIT(-1), \ .profile_enable = prof_sysexit_enable, \ .profile_disable = prof_sysexit_disable, #else diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0c21af85211c..73523151a731 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -629,7 +629,6 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #ifdef CONFIG_EVENT_PROFILE #define _TRACE_PROFILE_INIT(call) \ - .profile_count = ATOMIC_INIT(-1), \ .profile_enable = ftrace_profile_enable_##call, \ .profile_disable = ftrace_profile_disable_##call, diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 
d9c60f80aa0d..9e25573242cf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) char *buf; int ret = -ENOMEM; - if (atomic_inc_return(&event->profile_count)) + if (event->profile_count++ > 0) return 0; if (!total_profile_count) { @@ -56,7 +56,7 @@ fail_buf_nmi: perf_trace_buf = NULL; } fail_buf: - atomic_dec(&event->profile_count); + event->profile_count--; return ret; } @@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (!atomic_add_negative(-1, &event->profile_count)) + if (--event->profile_count > 0) return; event->profile_disable(event); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e3c80e925896..6ed223447a3f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1426,7 +1426,6 @@ static int register_probe_event(struct trace_probe *tp) call->unregfunc = probe_event_disable; #ifdef CONFIG_EVENT_PROFILE - atomic_set(&call->profile_count, -1); call->profile_enable = probe_profile_enable; call->profile_disable = probe_profile_disable; #endif -- cgit v1.2.3 From c7cd606f60e7679c7f9eee7010f02a6f000209c1 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 12 Dec 2009 04:13:21 +0000 Subject: can: Fix data length code handling in rx path A valid CAN dataframe can have a data length code (DLC) of 0 .. 8 data bytes. When reading the CAN controllers register the 4-bit value may contain values from 0 .. 15 which may exceed the reserved space in the socket buffer! The ISO 11898-1 Chapter 8.4.2.3 (DLC field) says that register values > 8 should be reduced to 8 without any error reporting or frame drop. This patch introduces a new helper macro to cast a given 4-bit data length code (dlc) to __u8 and ensure the DLC value to be max. 8 bytes. The different handlings in the rx path of the CAN netdevice drivers are fixed. 
Signed-off-by: Oliver Hartkopp Signed-off-by: Wolfgang Grandegger Signed-off-by: David S. Miller --- drivers/net/can/at91_can.c | 2 +- drivers/net/can/bfin_can.c | 2 +- drivers/net/can/mcp251x.c | 13 +++---------- drivers/net/can/mscan/mscan.c | 3 ++- drivers/net/can/sja1000/sja1000.c | 18 ++++++++---------- drivers/net/can/ti_hecc.c | 2 +- drivers/net/can/usb/ems_usb.c | 2 +- include/linux/can/dev.h | 9 +++++++++ 8 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index cbe3fce53e3b..d0ec17878ffc 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -474,7 +474,7 @@ static void at91_read_mb(struct net_device *dev, unsigned int mb, reg_msr = at91_read(priv, AT91_MSR(mb)); if (reg_msr & AT91_MSR_MRTR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = min_t(__u8, (reg_msr >> 16) & 0xf, 8); + cf->can_dlc = get_can_dlc((reg_msr >> 16) & 0xf); *(u32 *)(cf->data + 0) = at91_read(priv, AT91_MDL(mb)); *(u32 *)(cf->data + 4) = at91_read(priv, AT91_MDH(mb)); diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c index c7fc1de28173..0ec1524523cc 100644 --- a/drivers/net/can/bfin_can.c +++ b/drivers/net/can/bfin_can.c @@ -392,7 +392,7 @@ static void bfin_can_rx(struct net_device *dev, u16 isrc) cf->can_id |= CAN_RTR_FLAG; /* get data length code */ - cf->can_dlc = bfin_read16(&reg->chl[obj].dlc); + cf->can_dlc = get_can_dlc(bfin_read16(&reg->chl[obj].dlc) & 0xF); /* get payload */ for (i = 0; i < 8; i += 2) { diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 78b1b69b2921..9c5a1537939c 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -403,9 +403,8 @@ static void mcp251x_hw_rx_frame(struct spi_device *spi, u8 *buf, for (i = 1; i < RXBDAT_OFF; i++) buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i); - len = buf[RXBDLC_OFF] & RXBDLC_LEN_MASK; - if (len > 8) - len = 8; + + len = get_can_dlc(buf[RXBDLC_OFF] & 
RXBDLC_LEN_MASK); for (; i < (RXBDAT_OFF + len); i++) buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i); } else { @@ -455,13 +454,7 @@ static void mcp251x_hw_rx(struct spi_device *spi, int buf_idx) (buf[RXBSIDL_OFF] >> RXBSIDL_SHIFT); } /* Data length */ - frame->can_dlc = buf[RXBDLC_OFF] & RXBDLC_LEN_MASK; - if (frame->can_dlc > 8) { - dev_warn(&spi->dev, "invalid frame recevied\n"); - priv->net->stats.rx_errors++; - dev_kfree_skb(skb); - return; - } + frame->can_dlc = get_can_dlc(buf[RXBDLC_OFF] & RXBDLC_LEN_MASK); memcpy(frame->data, buf + RXBDAT_OFF, frame->can_dlc); priv->net->stats.rx_packets++; diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index bb06dfb58f25..07346f880ca6 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -297,7 +297,8 @@ static void mscan_get_rx_frame(struct net_device *dev, struct can_frame *frame) frame->can_id |= can_id >> 1; if (can_id & 1) frame->can_id |= CAN_RTR_FLAG; - frame->can_dlc = in_8(&regs->rx.dlr) & 0xf; + + frame->can_dlc = get_can_dlc(in_8(&regs->rx.dlr) & 0xf); if (!(frame->can_id & CAN_RTR_FLAG)) { void __iomem *data = &regs->rx.dsr1_0; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index b4ba88a31075..542a4f7255b4 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -293,15 +293,14 @@ static void sja1000_rx(struct net_device *dev) uint8_t fi; uint8_t dreg; canid_t id; - uint8_t dlc; int i; + /* create zero'ed CAN frame buffer */ skb = alloc_can_skb(dev, &cf); if (skb == NULL) return; fi = priv->read_reg(priv, REG_FI); - dlc = fi & 0x0F; if (fi & FI_FF) { /* extended frame format (EFF) */ @@ -318,16 +317,15 @@ static void sja1000_rx(struct net_device *dev) | (priv->read_reg(priv, REG_ID2) >> 5); } - if (fi & FI_RTR) + if (fi & FI_RTR) { id |= CAN_RTR_FLAG; + } else { + cf->can_dlc = get_can_dlc(fi & 0x0F); + for (i = 0; i < cf->can_dlc; i++) + cf->data[i] = priv->read_reg(priv, dreg++); + } 
cf->can_id = id; - cf->can_dlc = dlc; - for (i = 0; i < dlc; i++) - cf->data[i] = priv->read_reg(priv, dreg++); - - while (i < 8) - cf->data[i++] = 0; /* release receive buffer */ priv->write_reg(priv, REG_CMR, CMD_RRB); @@ -335,7 +333,7 @@ static void sja1000_rx(struct net_device *dev) netif_rx(skb); stats->rx_packets++; - stats->rx_bytes += dlc; + stats->rx_bytes += cf->can_dlc; } static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status) diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 07e8016b17ec..5c993c2da528 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -552,7 +552,7 @@ static int ti_hecc_rx_pkt(struct ti_hecc_priv *priv, int mbxno) data = hecc_read_mbx(priv, mbxno, HECC_CANMCF); if (data & HECC_CANMCF_RTR) cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = data & 0xF; + cf->can_dlc = get_can_dlc(data & 0xF); data = hecc_read_mbx(priv, mbxno, HECC_CANMDL); *(u32 *)(cf->data) = cpu_to_be32(data); if (cf->can_dlc > 4) { diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 591eb0eb1c2b..efbb05c71bf4 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -316,7 +316,7 @@ static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg) return; cf->can_id = le32_to_cpu(msg->msg.can_msg.id); - cf->can_dlc = min_t(u8, msg->msg.can_msg.length, 8); + cf->can_dlc = get_can_dlc(msg->msg.can_msg.length & 0xF); if (msg->type == CPC_MSG_TYPE_EXT_CAN_FRAME || msg->type == CPC_MSG_TYPE_EXT_RTR_FRAME) diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 1ed2a5cc03f5..3db7767d2a17 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -51,6 +51,15 @@ struct can_priv { struct sk_buff **echo_skb; }; +/* + * get_can_dlc(value) - helper macro to cast a given data length code (dlc) + * to __u8 and ensure the dlc value to be max. 8 bytes. 
+ * + * To be used in the CAN netdriver receive path to ensure conformance with + * ISO 11898-1 Chapter 8.4.2.3 (DLC field) + */ +#define get_can_dlc(i) (min_t(__u8, (i), 8)) + struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max); void free_candev(struct net_device *dev); -- cgit v1.2.3 From 4056c9a344d60ee96471a5f3b0a3c8a90371c8fd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Dec 2009 20:28:04 +0200 Subject: nfsd: Remove unused dprintk This doesn't appear to be useful. Signed-off-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- include/linux/nfsd/nfsfh.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index 8f641c908450..2973e1135343 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -20,7 +20,6 @@ # include #endif #include -#include /* * This is the old "dentry style" Linux NFSv2 file handle. @@ -329,9 +328,6 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) struct dentry *dentry = fhp->fh_dentry; struct inode *inode; - dfprintk(FILEOP, "nfsd: fh_lock(%s) locked = %d\n", - SVCFH_fmt(fhp), fhp->fh_locked); - BUG_ON(!dentry); if (fhp->fh_locked) { -- cgit v1.2.3 From a600ffcbb3743cf1296bee2a41d4824c719d7181 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Dec 2009 20:28:35 +0200 Subject: sunrpc: Clean never used include files Remove include of two headers never used by this file. Doing so exposed a missing #include in include/linux/sunrpc/rpc_rdma.h. I did not see any other users dependency but if exist they should be fixed since these headers are totally irrelevant to here. Signed-off-by: Boaz Harrosh Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/debug.h | 3 --- include/linux/sunrpc/rpc_rdma.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 10709cbe96fd..c2786f20016f 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -28,9 +28,6 @@ #ifdef __KERNEL__ -#include -#include - /* * Enable RPC debugging/profiling. */ diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 87b895d5c786..b78f16b1dea3 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -40,6 +40,8 @@ #ifndef _LINUX_SUNRPC_RPC_RDMA_H #define _LINUX_SUNRPC_RPC_RDMA_H +#include + struct rpcrdma_segment { __be32 rs_handle; /* Registered memory handle */ __be32 rs_length; /* Length of the chunk in bytes */ -- cgit v1.2.3 From d703158229329af7152d159753f849aa7bd55ee6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Dec 2009 20:28:47 +0200 Subject: nfsd: Fix independence of a few nfsd related headers A header should be compilation independent, i.e. pull in any header whose declarations are directly used by this header, and not let users re-include all its dependencies all over again. [At the end of the day what's the use of a header if it does not have more than one user?] Signed-off-by: Boaz Harrosh Signed-off-by: J. 
Bruce Fields --- include/linux/nfs_xdr.h | 1 + include/linux/nfsacl.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 62f63fb0c4c8..00a0c8170816 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -2,6 +2,7 @@ #define _LINUX_NFS_XDR_H #include +#include /* * To change the maximum rsize and wsize supported by the NFS client, adjust diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index 43011b69297c..f321b578edeb 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -29,6 +29,7 @@ #ifdef __KERNEL__ #include +#include /* Maximum number of ACL entries over NFS */ #define NFS_ACL_MAX_ENTRIES 1024 -- cgit v1.2.3 From 72579ac9cd68081108277c31781b127d0f420d61 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Dec 2009 20:28:59 +0200 Subject: nfsd: Headers Independence and include cleanups * Add includes that are directly used by headers * Remove includes that are not needed These are the changes made: [xdr.h] struct nfsd_readdirres has an embedded struct readdir_cd from nfsd.h fixing that we can drop other includes [xdr4.h] embedded types defined both at state.h and nfsd.h [syscall.h] After export.h fix none of these stuff is needed. fix extra space in # include <> statement [stats.h] does not need but was export to user-mode so I don't touch it [state.h] embedded types from nfsfh.h like struct knfsd_fh. bringing that eliminates the need for all other includes [nfsfh.h] directly manipulating types from sunrpc/svc.h. Removed Other unused headers. [nfsd.h] removed unused headers include [export.h] lots of sunrpc/svc.h types and a single prototype declaration with pointer from nfsfh.h, but all users of export.h do need nfsfh.h any way. remove now un-needed include. 
[const.h] Unfixed (not independent) [cache.h] could do with a forward declaration of "struct svc_rqst;" from sunrpc/svc.h but all users absolutely will need sunrpc/svc.h it is easier overall this way. Signed-off-by: Boaz Harrosh Signed-off-by: J. Bruce Fields --- include/linux/nfsd/cache.h | 3 +-- include/linux/nfsd/export.h | 2 +- include/linux/nfsd/nfsd.h | 4 ---- include/linux/nfsd/nfsfh.h | 3 +-- include/linux/nfsd/state.h | 4 +--- include/linux/nfsd/syscall.h | 8 +------- include/linux/nfsd/xdr.h | 3 +-- include/linux/nfsd/xdr4.h | 3 ++- 8 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index 3a3f58934f5e..a165425dea41 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -10,8 +10,7 @@ #ifndef NFSCACHE_H #define NFSCACHE_H -#include -#include +#include /* * Representation of a reply cache entry. diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index a6d9ef2bb34a..ef3d416fcf67 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -12,7 +12,7 @@ # include #ifdef __KERNEL__ -# include +# include #endif /* diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index e4518d090a8c..74f67c2aca34 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -11,13 +11,9 @@ #define LINUX_NFSD_NFSD_H #include -#include -#include -#include #include #include -#include #include #include /* diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index 2973e1135343..49523edbc510 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -16,8 +16,7 @@ # include #ifdef __KERNEL__ -# include -# include +# include #endif #include diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 5aadf8aa3a97..2af75686e0d3 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -37,9 +37,7 @@ #ifndef _NFSD4_STATE_H #define 
_NFSD4_STATE_H -#include -#include -#include +#include typedef struct { u32 cl_boot; diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index 7a3b565b898f..812bc1e160dc 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -9,14 +9,8 @@ #ifndef NFSD_SYSCALL_H #define NFSD_SYSCALL_H -# include -#ifdef __KERNEL__ -# include -#endif -#include -#include +#include #include -#include /* * Version of the syscall interface diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index a0132ef58f21..58f824d854c2 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -7,9 +7,8 @@ #ifndef LINUX_NFSD_H #define LINUX_NFSD_H -#include #include -#include +#include struct nfsd_fhandle { struct svc_fh fh; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 73164c2b3d29..1bf266239c7e 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -39,7 +39,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H -#include +#include +#include #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) -- cgit v1.2.3 From 9a74af21330c8d46efa977d088a62cc1bfa954e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 3 Dec 2009 20:30:56 +0200 Subject: nfsd: Move private headers to source directory Lots of include/linux/nfsd/* headers are only used by nfsd module. Move them to the source directory Signed-off-by: Boaz Harrosh Signed-off-by: J. 
Bruce Fields --- fs/nfsd/auth.c | 2 +- fs/nfsd/cache.h | 85 +++++++ fs/nfsd/export.c | 3 +- fs/nfsd/lockd.c | 2 +- fs/nfsd/nfs2acl.c | 7 +- fs/nfsd/nfs3acl.c | 7 +- fs/nfsd/nfs3proc.c | 4 +- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfs4callback.c | 4 +- fs/nfsd/nfs4proc.c | 4 +- fs/nfsd/nfs4recover.c | 5 +- fs/nfsd/nfs4state.c | 2 +- fs/nfsd/nfs4xdr.c | 3 +- fs/nfsd/nfscache.c | 4 +- fs/nfsd/nfsctl.c | 5 +- fs/nfsd/nfsd.h | 335 +++++++++++++++++++++++++++ fs/nfsd/nfsfh.c | 2 +- fs/nfsd/nfsproc.c | 4 +- fs/nfsd/nfssvc.c | 4 +- fs/nfsd/nfsxdr.c | 2 +- fs/nfsd/state.h | 409 ++++++++++++++++++++++++++++++++ fs/nfsd/stats.c | 4 +- fs/nfsd/vfs.c | 18 +- fs/nfsd/xdr.h | 176 ++++++++++++++ fs/nfsd/xdr3.h | 346 +++++++++++++++++++++++++++ fs/nfsd/xdr4.h | 564 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfsd/cache.h | 85 ------- include/linux/nfsd/nfsd.h | 335 --------------------------- include/linux/nfsd/state.h | 409 -------------------------------- include/linux/nfsd/xdr.h | 176 -------------- include/linux/nfsd/xdr3.h | 346 --------------------------- include/linux/nfsd/xdr4.h | 564 --------------------------------------------- 32 files changed, 1963 insertions(+), 1955 deletions(-) create mode 100644 fs/nfsd/cache.h create mode 100644 fs/nfsd/nfsd.h create mode 100644 fs/nfsd/state.h create mode 100644 fs/nfsd/xdr.h create mode 100644 fs/nfsd/xdr3.h create mode 100644 fs/nfsd/xdr4.h delete mode 100644 include/linux/nfsd/cache.h delete mode 100644 include/linux/nfsd/nfsd.h delete mode 100644 include/linux/nfsd/state.h delete mode 100644 include/linux/nfsd/xdr.h delete mode 100644 include/linux/nfsd/xdr3.h delete mode 100644 include/linux/nfsd/xdr4.h (limited to 'include/linux') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index ad354d284cf8..71209d4993d0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -5,7 +5,7 @@ */ #include -#include +#include "nfsd.h" #include "auth.h" int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) diff --git 
a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 000000000000..a165425dea41 --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,85 @@ +/* + * include/linux/nfsd/cache.h + * + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include + +/* + * Representation of a reply cache entry. + */ +struct svc_cacherep { + struct hlist_node c_hash; + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT, + RC_INTR +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. 
+ */ +#define RC_DELAY (HZ/5) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *, int); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 68e63f441444..cb3dae2fcd86 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -18,10 +18,11 @@ #include #include -#include #include #include +#include "nfsd.h" + #define NFSDDBG_FACILITY NFSDDBG_EXPORT typedef struct auth_domain svc_client; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 801ef7104ff4..6f12777ed227 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -9,8 +9,8 @@ */ #include -#include #include +#include "nfsd.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_LOCKD diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index a54628de7715..874e2a94bf4f 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -6,10 +6,11 @@ * Copyright (C) 2002-2003 Andreas Gruenbacher */ -#include -#include -#include +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include +#include "cache.h" +#include "xdr3.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2f5c61bea908..c6011ddbadc0 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -6,10 +6,11 @@ * Copyright (C) 2002-2003 Andreas Gruenbacher */ -#include -#include -#include +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include +#include "cache.h" +#include "xdr3.h" #include "vfs.h" #define RETURN_STATUS(st) { resp->status = (st); return (st); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b694b4304544..90b19ca75b34 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -10,8 
+10,8 @@ #include #include -#include -#include +#include "cache.h" +#include "xdr3.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 623e13aa6259..c523bb88c10b 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -9,7 +9,7 @@ */ #include -#include +#include "xdr3.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4fe396071b61..f7a315827638 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -34,8 +34,8 @@ */ #include -#include -#include +#include "nfsd.h" +#include "state.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 61f682c77e7f..e2b5666f25d1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -36,8 +36,8 @@ */ #include -#include -#include +#include "cache.h" +#include "xdr4.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 48742f243c25..6744e7f2da0e 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -33,12 +33,13 @@ * */ -#include -#include #include #include #include #include + +#include "nfsd.h" +#include "state.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1fe6e29fd500..2923e6c1da18 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -36,11 +36,11 @@ #include #include -#include #include #include #include #include +#include "xdr4.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2fa96821f5b5..cab978031100 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -43,10 +43,11 @@ #include #include #include -#include #include #include #include + +#include "xdr4.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 96694b8345ef..18aa9729a380 100644 --- 
a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -10,8 +10,8 @@ * Copyright (C) 1995, 1996 Olaf Kirch */ -#include -#include +#include "nfsd.h" +#include "cache.h" /* Size of reply cache. Common values are: * 4.3BSD: 128 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e4f49fd6af44..0415680d3f58 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -11,12 +11,13 @@ #include #include -#include -#include #include #include #include +#include "nfsd.h" +#include "cache.h" + /* * We have a single directory with 9 nodes in it. */ diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 000000000000..74f67c2aca34 --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,335 @@ +/* + * linux/include/linux/nfsd/nfsd.h + * + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include +#include + +#include +#include +#include +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 1 + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern u32 nfsd_supported_minorversion; +extern struct mutex nfsd_mutex; +extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. 
+ */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(void); +int nfsd_nrpools(void); +int nfsd_get_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + +extern int nfsd_max_blksize; + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned int max_delegations; +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +void nfs4_state_shutdown(void); +time_t nfs4_lease_time(void); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +#else +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline time_t nfs4_lease_time(void) { return 0; } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. 
+ */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define 
nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater 
cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it 
gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. 
+ */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LEASE_TIME (nfs4_lease_time()) +#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | 
FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ +(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 739948165034..0eb1c59f5ab8 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -12,7 +12,7 @@ #include #include -#include +#include "nfsd.h" #include "vfs.h" #include "auth.h" diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b6bd9e0d7cd0..21a5f793c3d1 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -9,8 +9,8 @@ #include -#include -#include +#include "cache.h" +#include "xdr.h" #include "vfs.h" typedef struct svc_rqst svc_rqst; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b2d7ffac0357..b520ce10bd15 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -15,11 +15,11 @@ #include #include -#include -#include #include #include #include +#include "nfsd.h" +#include "cache.h" #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_SVC diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5e0603da39e7..3bec831704af 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -6,7 +6,7 @@ * Copyright (C) 1995, 1996 Olaf Kirch */ -#include +#include "xdr.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 000000000000..2af75686e0d3 --- 
/dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,409 @@ +/* + * linux/include/nfsd/state.h + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + u32 so_boot; + u32 so_stateownerid; + u32 so_fileid; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; +#define si_boot si_opaque.so_boot +#define si_stateownerid si_opaque.so_stateownerid +#define si_fileid si_opaque.so_fileid + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + +struct nfsd4_cb_sequence { + /* args/res */ + u32 cbs_minorversion; + struct nfs4_client *cbs_clp; +}; + +struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + atomic_t dl_count; /* ref count */ + struct nfs4_client *dl_client; + struct nfs4_file *dl_file; + struct file_lock *dl_flock; + struct file *dl_vfs_file; + u32 dl_type; + time_t dl_time; +/* For recall: */ + u32 dl_ident; + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; +}; + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + size_t cb_addrlen; + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ + /* RPC client info */ + atomic_t cb_set; /* successful CB_NULL call */ + struct rpc_clnt * cb_client; +}; + +/* Maximum number of slots per session. 
160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; /* for expire_client */ + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 
33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * o Each nfs4_client is hashed by clientid. + * + * o Each nfs4_client is also hashed by name + * (the opaque quantity initially sent by the client to identify itself). + * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_openowners; + struct list_head cl_delegations; + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + char cl_recdir[HEXDIR_LEN]; /* recovery dir */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ + char *cl_principal; /* setclientid principal name */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + struct nfs4_cb_conn cl_cb_conn; /* callback info */ + atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; + struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ +}; + +/* struct nfs4_client_reclaim + * one per old client. Populates reset_str_hashtbl.
Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volatile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. + */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + unsigned intrp_allocated; + struct knfsd_fh rp_openfh; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +/* +* nfs4_stateowner can either be an open_owner, or a lock_owner +* +* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] +* for lock_owner +* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] +* for lock_owner +* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client +* struct is reaped. +* so_perfilestate: heads the list of nfs4_stateid (either open or lock) +* and is used to ensure no dangling nfs4_stateid references when we +* release a stateowner. +* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* close is called to reap associated byte-range locks +* so_close_lru: (open) stateowner is placed on this list instead of being +* reaped (when so_perfilestate is empty) to hold the last close replay. +* reaped by laundromat thread after lease period.
+*/ +struct nfs4_stateowner { + struct kref so_ref; + struct list_head so_idhash; /* hash by so_id */ + struct list_head so_strhash; /* hash by op_name */ + struct list_head so_perclient; + struct list_head so_stateids; + struct list_head so_perstateid; /* for lockowners only */ + struct list_head so_close_lru; /* tail queue */ + time_t so_time; /* time of placement on so_close_lru */ + int so_is_open_owner; /* 1=openowner,0=lockowner */ + u32 so_id; + struct nfs4_client * so_client; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + int so_confirmed; /* successful OPEN_CONFIRM? */ + struct nfs4_replay so_replay; +}; + +/* +* nfs4_file: a file opened by some number of (open) nfs4_stateowners. +* o fi_perfile list is used to search for conflicting +* share_acces, share_deny on the file. +*/ +struct nfs4_file { + atomic_t fi_ref; + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; +}; + +/* +* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* +* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* +* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry +* st_perfile: file_hashtbl[] entry. +* st_perfile_state: nfs4_stateowner->so_perfilestate +* st_perlockowner: (open stateid) list of lock nfs4_stateowners +* st_access_bmap: used only for open stateid +* st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. 
+*/ + +struct nfs4_stateid { + struct list_head st_hash; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; + struct file * st_vfs_file; + unsigned long st_access_bmap; + unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; +}; + +/* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 +#define CONFIRM 0x00000002 +#define OPEN_STATE 0x00000004 +#define LOCK_STATE 0x00000008 +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 +#define CLOSE_STATE 0x00000040 + +#define seqid_mutating_err(err) \ + (((err) != nfserr_stale_clientid) && \ + ((err) != nfserr_bad_seqid) && \ + ((err) != nfserr_stale_stateid) && \ + ((err) != nfserr_bad_stateid)) + +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +extern void nfs4_lock_state(void); +extern void nfs4_unlock_state(void); +extern int nfs4_in_grace(void); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +extern void put_nfs4_client(struct nfs4_client *clp); +extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); +extern void nfsd4_init_recdir(char *recdir_name); +extern int nfsd4_recdir_load(void); +extern void nfsd4_shutdown_recdir(void); +extern int nfs4_client_to_reclaim(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern void nfsd4_recdir_purge_old(void); +extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + +static 
inline void +nfs4_put_stateowner(struct nfs4_stateowner *so) +{ + kref_put(&so->so_ref, nfs4_free_stateowner); +} + +static inline void +nfs4_get_stateowner(struct nfs4_stateowner *so) +{ + kref_get(&so->so_ref); +} + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index e3e411e9fe4a..3fc69dfd3091 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -25,11 +25,11 @@ #include #include - #include -#include #include +#include "nfsd.h" + struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 81ce108c114e..04bdba12d21b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -22,23 +22,25 @@ #include #include #include -#include -#ifdef CONFIG_NFSD_V3 -#include -#endif /* CONFIG_NFSD_V3 */ #include #include #include #include +#include +#include +#include + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + #ifdef CONFIG_NFSD_V4 #include #include #endif /* CONFIG_NFSD_V4 */ -#include -#include -#include "vfs.h" -#include +#include "nfsd.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 000000000000..235ee5c3be54 --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,176 @@ +/* + * linux/include/linux/nfsd/xdr.h + * + * XDR types for nfsd. This is mainly a typing exercise. 
+ */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include +#include "nfsd.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. 
+ */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const 
char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 000000000000..b330756973cf --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,346 @@ +/* + * linux/include/linux/nfsd/xdr3.h + * + * XDR types for NFSv3 in nfsd. + * + * Copyright (C) 1996-1998, Olaf Kirch + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct 
nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct 
svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + 
struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst 
*, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 000000000000..83202a1cf07b --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,564 @@ +/* + * include/linux/nfsd/xdr4.h + * + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + __be32 *datap; + size_t iovlen; + u32 minorversion; + u32 status; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ + struct nfs4_stateowner * cl_stateowner; /* response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 namelen; + char *name; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* 
NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; +}; +#define cr_linklen u.link.namelen +#define cr_linkname u.link.name +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct nfs4_stateowner *ld_sop; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; + /* The lk_replay_owner is the open owner in the open_to_lock_owner + * case and the lock owner otherwise: */ + struct nfs4_stateowner *lk_replay_owner; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_rflags u.ok.rflags +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfs4_stateowner * lt_stateowner; + struct nfsd4_lock_denied 
lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + u64 lu_offset; + u64 lu_length; + struct nfs4_stateowner *lu_stateowner; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + stateid_t op_stateid; /* response */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_acl *op_acl; +}; +#define op_iattr iattr +#define op_verf verf + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; + struct nfs4_stateowner * oc_stateowner; /* response */ +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; + u32 od_share_deny; + struct nfs4_stateowner *od_stateowner; +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 
rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + u32 se_namelen; /* request */ + char * se_name; /* request */ + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct 
nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + int wr_vlen; + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ + u32 status_flags; /* response */ +#endif /* not yet */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_create_session create_session; + struct 
nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + } u; + struct nfs4_replay * replay; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; + void (*release)(const void *); + void *buf; + } *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + __be32 * p; + __be32 * end; + struct xdr_buf * xbuf; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); +} + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); + cinfo->atomic = 1; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + if (cinfo->change_supported) { + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + } else { + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct 
nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, +struct nfsd4_exchange_id *); + extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct 
svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h deleted file mode 100644 index a165425dea41..000000000000 --- a/include/linux/nfsd/cache.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * include/linux/nfsd/cache.h - * - * Request reply cache. This was heavily inspired by the - * implementation in 4.3BSD/4.4BSD. - * - * Copyright (C) 1995, 1996 Olaf Kirch - */ - -#ifndef NFSCACHE_H -#define NFSCACHE_H - -#include - -/* - * Representation of a reply cache entry. - */ -struct svc_cacherep { - struct hlist_node c_hash; - struct list_head c_lru; - - unsigned char c_state, /* unused, inprog, done */ - c_type, /* status, buffer */ - c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in c_addr; - __be32 c_xid; - u32 c_prot; - u32 c_proc; - u32 c_vers; - unsigned long c_timestamp; - union { - struct kvec u_vec; - __be32 u_status; - } c_u; -}; - -#define c_replvec c_u.u_vec -#define c_replstat c_u.u_status - -/* cache entry states */ -enum { - RC_UNUSED, - RC_INPROG, - RC_DONE -}; - -/* return values */ -enum { - RC_DROPIT, - RC_REPLY, - RC_DOIT, - RC_INTR -}; - -/* - * Cache types. - * We may want to add more types one day, e.g. 
for diropres and - * attrstat replies. Using cache entries with fixed length instead - * of buffer pointers may be more efficient. - */ -enum { - RC_NOCACHE, - RC_REPLSTAT, - RC_REPLBUFF, -}; - -/* - * If requests are retransmitted within this interval, they're dropped. - */ -#define RC_DELAY (HZ/5) - -int nfsd_reply_cache_init(void); -void nfsd_reply_cache_shutdown(void); -int nfsd_cache_lookup(struct svc_rqst *, int); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); - -#ifdef CONFIG_NFSD_V4 -void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); -#else /* CONFIG_NFSD_V4 */ -static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ -} -#endif /* CONFIG_NFSD_V4 */ - -#endif /* NFSCACHE_H */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h deleted file mode 100644 index 74f67c2aca34..000000000000 --- a/include/linux/nfsd/nfsd.h +++ /dev/null @@ -1,335 +0,0 @@ -/* - * linux/include/linux/nfsd/nfsd.h - * - * Hodge-podge collection of knfsd-related stuff. - * I will sort this out later. - * - * Copyright (C) 1995-1997 Olaf Kirch - */ - -#ifndef LINUX_NFSD_NFSD_H -#define LINUX_NFSD_NFSD_H - -#include -#include - -#include -#include -#include -/* - * nfsd version - */ -#define NFSD_SUPPORTED_MINOR_VERSION 1 - -struct readdir_cd { - __be32 err; /* 0, nfserr, or nfserr_eof */ -}; - - -extern struct svc_program nfsd_program; -extern struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; -extern u32 nfsd_supported_minorversion; -extern struct mutex nfsd_mutex; -extern struct svc_serv *nfsd_serv; -extern spinlock_t nfsd_drc_lock; -extern unsigned int nfsd_drc_max_mem; -extern unsigned int nfsd_drc_mem_used; - -extern const struct seq_operations nfs_exports_op; - -/* - * Function prototypes. 
- */ -int nfsd_svc(unsigned short port, int nrservs); -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); - -int nfsd_nrthreads(void); -int nfsd_nrpools(void); -int nfsd_get_nrthreads(int n, int *); -int nfsd_set_nrthreads(int n, int *); - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -#ifdef CONFIG_NFSD_V2_ACL -extern struct svc_version nfsd_acl_version2; -#else -#define nfsd_acl_version2 NULL -#endif -#ifdef CONFIG_NFSD_V3_ACL -extern struct svc_version nfsd_acl_version3; -#else -#define nfsd_acl_version3 NULL -#endif -#endif - -enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; -int nfsd_vers(int vers, enum vers_op change); -int nfsd_minorversion(u32 minorversion, enum vers_op change); -void nfsd_reset_versions(void); -int nfsd_create_serv(void); - -extern int nfsd_max_blksize; - -/* - * NFSv4 State - */ -#ifdef CONFIG_NFSD_V4 -extern unsigned int max_delegations; -int nfs4_state_init(void); -void nfsd4_free_slabs(void); -int nfs4_state_start(void); -void nfs4_state_shutdown(void); -time_t nfs4_lease_time(void); -void nfs4_reset_lease(time_t leasetime); -int nfs4_reset_recoverydir(char *recdir); -#else -static inline int nfs4_state_init(void) { return 0; } -static inline void nfsd4_free_slabs(void) { } -static inline int nfs4_state_start(void) { return 0; } -static inline void nfs4_state_shutdown(void) { } -static inline time_t nfs4_lease_time(void) { return 0; } -static inline void nfs4_reset_lease(time_t leasetime) { } -static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } -#endif - -/* - * lockd binding - */ -void nfsd_lockd_init(void); -void nfsd_lockd_shutdown(void); - - -/* - * These macros provide pre-xdr'ed values for faster operation. 
- */ -#define nfs_ok cpu_to_be32(NFS_OK) -#define nfserr_perm cpu_to_be32(NFSERR_PERM) -#define nfserr_noent cpu_to_be32(NFSERR_NOENT) -#define nfserr_io cpu_to_be32(NFSERR_IO) -#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) -#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) -#define nfserr_acces cpu_to_be32(NFSERR_ACCES) -#define nfserr_exist cpu_to_be32(NFSERR_EXIST) -#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) -#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) -#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) -#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) -#define nfserr_inval cpu_to_be32(NFSERR_INVAL) -#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) -#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) -#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) -#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) -#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) -#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) -#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) -#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) -#define nfserr_stale cpu_to_be32(NFSERR_STALE) -#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) -#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) -#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) -#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) -#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) -#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) -#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) -#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) -#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) -#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) -#define nfserr_denied cpu_to_be32(NFSERR_DENIED) -#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) -#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) -#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) -#define nfserr_same cpu_to_be32(NFSERR_SAME) -#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) -#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) -#define 
nfserr_resource cpu_to_be32(NFSERR_RESOURCE) -#define nfserr_moved cpu_to_be32(NFSERR_MOVED) -#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) -#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) -#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) -#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) -#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) -#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) -#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) -#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) -#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) -#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) -#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) -#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) -#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) -#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) -#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) -#define nfserr_grace cpu_to_be32(NFSERR_GRACE) -#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) -#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) -#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) -#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) -#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) -#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) -#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) -#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) -#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) -#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) -#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) -#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) -#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) -#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) -#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) -#define nfserr_layouttrylater 
cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) -#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) -#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) -#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) -#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) -#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) -#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) -#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) -#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) -#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) -#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) -#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) -#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) -#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) -#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) -#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) -#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) -#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) -#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) -#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) -#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) -#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) -#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) -#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) -#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) -#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) -#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) -#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) -#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) - -/* error codes for internal use */ -/* if a request fails due to kmalloc failure, it 
gets dropped. - * Client should resend eventually - */ -#define nfserr_dropit cpu_to_be32(30000) -/* end-of-file indicator in readdir */ -#define nfserr_eof cpu_to_be32(30001) -/* replay detected */ -#define nfserr_replay_me cpu_to_be32(11001) -/* nfs41 replay detected */ -#define nfserr_replay_cache cpu_to_be32(11002) - -/* Check for dir entries '.' and '..' */ -#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) - -/* - * Time of server startup - */ -extern struct timeval nfssvc_boot; - -#ifdef CONFIG_NFSD_V4 - -/* before processing a COMPOUND operation, we have to check that there - * is enough space in the buffer for XDR encode to succeed. otherwise, - * we might process an operation with side effects, and be unable to - * tell the client that the operation succeeded. - * - * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space - * needed to encode an "ordinary" _successful_ operation. (GETATTR, - * READ, READDIR, and READLINK have their own buffer checks.) if we - * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. - * - * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space - * needed to encode an operation which has failed with NFS4ERR_RESOURCE. - * care is taken to ensure that we never fall below this level for any - * reason. 
- */ -#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ -#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ - -#define NFSD_LEASE_TIME (nfs4_lease_time()) -#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ - -/* - * The following attributes are currently not supported by the NFSv4 server: - * ARCHIVE (deprecated anyway) - * HIDDEN (unlikely to be supported any time soon) - * MIMETYPE (unlikely to be supported any time soon) - * QUOTA_* (will be supported in a forthcoming patch) - * SYSTEM (unlikely to be supported any time soon) - * TIME_BACKUP (unlikely to be supported any time soon) - * TIME_CREATE (unlikely to be supported any time soon) - */ -#define NFSD4_SUPPORTED_ATTRS_WORD0 \ -(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ - | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ - | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ - | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ - | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ - | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ - | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ - | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ - | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ - | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) - -#define NFSD4_SUPPORTED_ATTRS_WORD1 \ -(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ - | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ - | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ - | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ - | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ - | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | 
FATTR4_WORD1_MOUNTED_ON_FILEID) - -#define NFSD4_SUPPORTED_ATTRS_WORD2 0 - -#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ - NFSD4_SUPPORTED_ATTRS_WORD0 - -#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 - -#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) - -static inline u32 nfsd_suppattrs0(u32 minorversion) -{ - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 - : NFSD4_SUPPORTED_ATTRS_WORD0; -} - -static inline u32 nfsd_suppattrs1(u32 minorversion) -{ - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 - : NFSD4_SUPPORTED_ATTRS_WORD1; -} - -static inline u32 nfsd_suppattrs2(u32 minorversion) -{ - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 - : NFSD4_SUPPORTED_ATTRS_WORD2; -} - -/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ -#define NFSD_WRITEONLY_ATTRS_WORD1 \ -(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) - -/* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ -#define NFSD_WRITEABLE_ATTRS_WORD0 \ -(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) -#define NFSD_WRITEABLE_ATTRS_WORD1 \ -(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) -#define NFSD_WRITEABLE_ATTRS_WORD2 0 - -#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ - NFSD_WRITEABLE_ATTRS_WORD0 -/* - * we currently store the exclusive create verifier in the v_{a,m}time - * attributes so the client can't set these at create time using EXCLUSIVE4_1 - */ -#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ - (NFSD_WRITEABLE_ATTRS_WORD1 & \ - ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) -#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ - NFSD_WRITEABLE_ATTRS_WORD2 - -#endif /* CONFIG_NFSD_V4 */ - -#endif /* LINUX_NFSD_NFSD_H */ diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h deleted file mode 100644 index 2af75686e0d3..000000000000 --- a/include/linux/nfsd/state.h +++ /dev/null @@ -1,409 +0,0 @@ -/* - * linux/include/nfsd/state.h - * - * Copyright (c) 2001 The Regents of the University of Michigan. - * All rights reserved. - * - * Kendrick Smith - * Andy Adamson - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef _NFSD4_STATE_H -#define _NFSD4_STATE_H - -#include - -typedef struct { - u32 cl_boot; - u32 cl_id; -} clientid_t; - -typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; -} stateid_opaque_t; - -typedef struct { - u32 si_generation; - stateid_opaque_t si_opaque; -} stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid - -#define STATEID_FMT "(%08x/%08x/%08x/%08x)" -#define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ - (s)->si_generation - -struct nfsd4_cb_sequence { - /* args/res */ - u32 cbs_minorversion; - struct nfs4_client *cbs_clp; -}; - -struct nfs4_delegation { - struct list_head dl_perfile; - struct list_head dl_perclnt; - struct list_head dl_recall_lru; /* delegation recalled */ - atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; - struct nfs4_file *dl_file; - struct file_lock *dl_flock; - struct file *dl_vfs_file; - u32 dl_type; - time_t dl_time; -/* For recall: */ - u32 dl_ident; - stateid_t dl_stateid; - struct knfsd_fh dl_fh; - int dl_retries; -}; - -/* client delegation callback info */ -struct nfs4_cb_conn { - /* SETCLIENTID info */ - 
struct sockaddr_storage cb_addr; - size_t cb_addrlen; - u32 cb_prog; - u32 cb_minorversion; - u32 cb_ident; /* minorversion 0 only */ - /* RPC client info */ - atomic_t cb_set; /* successful CB_NULL call */ - struct rpc_clnt * cb_client; -}; - -/* Maximum number of slots per session. 160 is useful for long haul TCP */ -#define NFSD_MAX_SLOTS_PER_SESSION 160 -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 16 -/* Maximum session per slot cache size */ -#define NFSD_SLOT_CACHE_SIZE 1024 -/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ -#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 -#define NFSD_MAX_MEM_PER_SESSION \ - (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) - -struct nfsd4_slot { - bool sl_inuse; - bool sl_cachethis; - u16 sl_opcnt; - u32 sl_seqid; - __be32 sl_status; - u32 sl_datalen; - char sl_data[]; -}; - -struct nfsd4_channel_attrs { - u32 headerpadsz; - u32 maxreq_sz; - u32 maxresp_sz; - u32 maxresp_cached; - u32 maxops; - u32 maxreqs; - u32 nr_rdma_attrs; - u32 rdma_attrs; -}; - -struct nfsd4_create_session { - clientid_t clientid; - struct nfs4_sessionid sessionid; - u32 seqid; - u32 flags; - struct nfsd4_channel_attrs fore_channel; - struct nfsd4_channel_attrs back_channel; - u32 callback_prog; - u32 uid; - u32 gid; -}; - -/* The single slot clientid cache structure */ -struct nfsd4_clid_slot { - u32 sl_seqid; - __be32 sl_status; - struct nfsd4_create_session sl_cr_ses; -}; - -struct nfsd4_session { - struct kref se_ref; - struct list_head se_hash; /* hash by sessionid */ - struct list_head se_perclnt; - u32 se_flags; - struct nfs4_client *se_client; /* for expire_client */ - struct nfs4_sessionid se_sessionid; - struct nfsd4_channel_attrs se_fchannel; - struct nfsd4_channel_attrs se_bchannel; - struct nfsd4_slot *se_slots[]; /* forward channel slots */ -}; - -static inline void -nfsd4_put_session(struct nfsd4_session *ses) -{ - extern void free_session(struct kref *kref); - 
kref_put(&ses->se_ref, free_session); -} - -static inline void -nfsd4_get_session(struct nfsd4_session *ses) -{ - kref_get(&ses->se_ref); -} - -/* formatted contents of nfs4_sessionid */ -struct nfsd4_sessionid { - clientid_t clientid; - u32 sequence; - u32 reserved; -}; - -#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ - -/* - * struct nfs4_client - one per client. Clientids live here. - * o Each nfs4_client is hashed by clientid. - * - * o Each nfs4_clients is also hashed by name - * (the opaque quantity initially sent by the client to identify itself). - * - * o cl_perclient list is used to ensure no dangling stateowner references - * when we expire the nfs4_client - */ -struct nfs4_client { - struct list_head cl_idhash; /* hash by cl_clientid.id */ - struct list_head cl_strhash; /* hash by cl_name */ - struct list_head cl_openowners; - struct list_head cl_delegations; - struct list_head cl_lru; /* tail queue */ - struct xdr_netobj cl_name; /* id generated by client */ - char cl_recdir[HEXDIR_LEN]; /* recovery dir */ - nfs4_verifier cl_verifier; /* generated by client */ - time_t cl_time; /* time of last lease renewal */ - struct sockaddr_storage cl_addr; /* client ipaddress */ - u32 cl_flavor; /* setclientid pseudoflavor */ - char *cl_principal; /* setclientid principal name */ - struct svc_cred cl_cred; /* setclientid principal */ - clientid_t cl_clientid; /* generated by server */ - nfs4_verifier cl_confirm; /* generated by server */ - struct nfs4_cb_conn cl_cb_conn; /* callback info */ - atomic_t cl_count; /* ref count */ - u32 cl_firststate; /* recovery dir creation */ - - /* for nfs41 */ - struct list_head cl_sessions; - struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ - u32 cl_exchange_flags; - struct nfs4_sessionid cl_sessionid; - - /* for nfs41 callbacks */ - /* We currently support a single back channel with a single slot */ - unsigned long cl_cb_slot_busy; - u32 cl_cb_seq_nr; - struct svc_xprt *cl_cb_xprt; /* 
4.1 callback transport */ - struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ - /* wait here for slots */ -}; - -/* struct nfs4_client_reset - * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl - * upon lease reset, or from upcall to state_daemon (to read in state - * from non-volitile storage) upon reboot. - */ -struct nfs4_client_reclaim { - struct list_head cr_strhash; /* hash by cr_name */ - char cr_recdir[HEXDIR_LEN]; /* recover dir */ -}; - -static inline void -update_stateid(stateid_t *stateid) -{ - stateid->si_generation++; -} - -/* A reasonable value for REPLAY_ISIZE was estimated as follows: - * The OPEN response, typically the largest, requires - * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + - * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + - * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes - */ - -#define NFSD4_REPLAY_ISIZE 112 - -/* - * Replay buffer, where the result of the last seqid-mutating operation - * is cached. - */ -struct nfs4_replay { - __be32 rp_status; - unsigned int rp_buflen; - char *rp_buf; - unsigned intrp_allocated; - struct knfsd_fh rp_openfh; - char rp_ibuf[NFSD4_REPLAY_ISIZE]; -}; - -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we -* release a stateowner. 
-* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ -struct nfs4_stateowner { - struct kref so_ref; - struct list_head so_idhash; /* hash by so_id */ - struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; - struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ - int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; - struct nfs4_client * so_client; - /* after increment in ENCODE_SEQID_OP_TAIL, represents the next - * sequence id expected from the client: */ - u32 so_seqid; - struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ - struct nfs4_replay so_replay; -}; - -/* -* nfs4_file: a file opened by some number of (open) nfs4_stateowners. -* o fi_perfile list is used to search for conflicting -* share_acces, share_deny on the file. -*/ -struct nfs4_file { - atomic_t fi_ref; - struct list_head fi_hash; /* hash by "struct inode *" */ - struct list_head fi_stateids; - struct list_head fi_delegations; - struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ - bool fi_had_conflict; -}; - -/* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. 
-* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - -struct nfs4_stateid { - struct list_head st_hash; - struct list_head st_perfile; - struct list_head st_perstateowner; - struct list_head st_lockowners; - struct nfs4_stateowner * st_stateowner; - struct nfs4_file * st_file; - stateid_t st_stateid; - struct file * st_vfs_file; - unsigned long st_access_bmap; - unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; -}; - -/* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 -#define CONFIRM 0x00000002 -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 -#define RD_STATE 0x00000010 -#define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 - -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) - -struct nfsd4_compound_state; - -extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, - stateid_t *stateid, int flags, struct file **filp); -extern void nfs4_lock_state(void); -extern void nfs4_unlock_state(void); -extern int nfs4_in_grace(void); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void put_nfs4_client(struct nfs4_client *clp); -extern void nfs4_free_stateowner(struct kref *kref); -extern int set_callback_cred(void); -extern void nfsd4_probe_callback(struct nfs4_client *clp); -extern void nfsd4_cb_recall(struct nfs4_delegation *dp); -extern void nfs4_put_delegation(struct nfs4_delegation *dp); -extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void 
nfsd4_init_recdir(char *recdir_name); -extern int nfsd4_recdir_load(void); -extern void nfsd4_shutdown_recdir(void); -extern int nfs4_client_to_reclaim(const char *name); -extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); -extern void nfsd4_recdir_purge_old(void); -extern int nfsd4_create_clid_dir(struct nfs4_client *clp); -extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); - -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} - -#endif /* NFSD4_STATE_H */ diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h deleted file mode 100644 index 58f824d854c2..000000000000 --- a/include/linux/nfsd/xdr.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * linux/include/linux/nfsd/xdr.h - * - * XDR types for nfsd. This is mainly a typing exercise. - */ - -#ifndef LINUX_NFSD_H -#define LINUX_NFSD_H - -#include -#include - -struct nfsd_fhandle { - struct svc_fh fh; -}; - -struct nfsd_sattrargs { - struct svc_fh fh; - struct iattr attrs; -}; - -struct nfsd_diropargs { - struct svc_fh fh; - char * name; - unsigned int len; -}; - -struct nfsd_readargs { - struct svc_fh fh; - __u32 offset; - __u32 count; - int vlen; -}; - -struct nfsd_writeargs { - svc_fh fh; - __u32 offset; - int len; - int vlen; -}; - -struct nfsd_createargs { - struct svc_fh fh; - char * name; - unsigned int len; - struct iattr attrs; -}; - -struct nfsd_renameargs { - struct svc_fh ffh; - char * fname; - unsigned int flen; - struct svc_fh tfh; - char * tname; - unsigned int tlen; -}; - -struct nfsd_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - -struct nfsd_linkargs { - struct svc_fh ffh; - struct svc_fh tfh; - char * tname; - unsigned int tlen; -}; - -struct nfsd_symlinkargs { - struct svc_fh ffh; - char * fname; - unsigned int flen; - char * tname; - unsigned int tlen; - struct iattr attrs; -}; 
- -struct nfsd_readdirargs { - struct svc_fh fh; - __u32 cookie; - __u32 count; - __be32 * buffer; -}; - -struct nfsd_attrstat { - struct svc_fh fh; - struct kstat stat; -}; - -struct nfsd_diropres { - struct svc_fh fh; - struct kstat stat; -}; - -struct nfsd_readlinkres { - int len; -}; - -struct nfsd_readres { - struct svc_fh fh; - unsigned long count; - struct kstat stat; -}; - -struct nfsd_readdirres { - int count; - - struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; -}; - -struct nfsd_statfsres { - struct kstatfs stats; -}; - -/* - * Storage requirements for XDR arguments and results. - */ -union nfsd_xdrstore { - struct nfsd_sattrargs sattr; - struct nfsd_diropargs dirop; - struct nfsd_readargs read; - struct nfsd_writeargs write; - struct nfsd_createargs create; - struct nfsd_renameargs rename; - struct nfsd_linkargs link; - struct nfsd_symlinkargs symlink; - struct nfsd_readdirargs readdir; -}; - -#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) - - -int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); -int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); -int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, - struct nfsd_sattrargs *); -int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, - struct nfsd_diropargs *); -int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, - struct nfsd_readargs *); -int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, - struct nfsd_writeargs *); -int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, - struct nfsd_createargs *); -int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, - struct nfsd_renameargs *); -int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, - struct nfsd_readlinkargs *); -int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, - struct nfsd_linkargs *); -int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, - struct nfsd_symlinkargs *); -int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, - struct 
nfsd_readdirargs *); -int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); -int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); -int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); -int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); -int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); -int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); -int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); - -int nfssvc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int); - -int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); - -/* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); - -#endif /* LINUX_NFSD_H */ diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h deleted file mode 100644 index 421eddd65a25..000000000000 --- a/include/linux/nfsd/xdr3.h +++ /dev/null @@ -1,346 +0,0 @@ -/* - * linux/include/linux/nfsd/xdr3.h - * - * XDR types for NFSv3 in nfsd. 
- * - * Copyright (C) 1996-1998, Olaf Kirch - */ - -#ifndef _LINUX_NFSD_XDR3_H -#define _LINUX_NFSD_XDR3_H - -#include - -struct nfsd3_sattrargs { - struct svc_fh fh; - struct iattr attrs; - int check_guard; - time_t guardtime; -}; - -struct nfsd3_diropargs { - struct svc_fh fh; - char * name; - unsigned int len; -}; - -struct nfsd3_accessargs { - struct svc_fh fh; - unsigned int access; -}; - -struct nfsd3_readargs { - struct svc_fh fh; - __u64 offset; - __u32 count; - int vlen; -}; - -struct nfsd3_writeargs { - svc_fh fh; - __u64 offset; - __u32 count; - int stable; - __u32 len; - int vlen; -}; - -struct nfsd3_createargs { - struct svc_fh fh; - char * name; - unsigned int len; - int createmode; - struct iattr attrs; - __be32 * verf; -}; - -struct nfsd3_mknodargs { - struct svc_fh fh; - char * name; - unsigned int len; - __u32 ftype; - __u32 major, minor; - struct iattr attrs; -}; - -struct nfsd3_renameargs { - struct svc_fh ffh; - char * fname; - unsigned int flen; - struct svc_fh tfh; - char * tname; - unsigned int tlen; -}; - -struct nfsd3_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - -struct nfsd3_linkargs { - struct svc_fh ffh; - struct svc_fh tfh; - char * tname; - unsigned int tlen; -}; - -struct nfsd3_symlinkargs { - struct svc_fh ffh; - char * fname; - unsigned int flen; - char * tname; - unsigned int tlen; - struct iattr attrs; -}; - -struct nfsd3_readdirargs { - struct svc_fh fh; - __u64 cookie; - __u32 dircount; - __u32 count; - __be32 * verf; - __be32 * buffer; -}; - -struct nfsd3_commitargs { - struct svc_fh fh; - __u64 offset; - __u32 count; -}; - -struct nfsd3_getaclargs { - struct svc_fh fh; - int mask; -}; - -struct posix_acl; -struct nfsd3_setaclargs { - struct svc_fh fh; - int mask; - struct posix_acl *acl_access; - struct posix_acl *acl_default; -}; - -struct nfsd3_attrstat { - __be32 status; - struct svc_fh fh; - struct kstat stat; -}; - -/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ -struct nfsd3_diropres { - __be32 status; - 
struct svc_fh dirfh; - struct svc_fh fh; -}; - -struct nfsd3_accessres { - __be32 status; - struct svc_fh fh; - __u32 access; -}; - -struct nfsd3_readlinkres { - __be32 status; - struct svc_fh fh; - __u32 len; -}; - -struct nfsd3_readres { - __be32 status; - struct svc_fh fh; - unsigned long count; - int eof; -}; - -struct nfsd3_writeres { - __be32 status; - struct svc_fh fh; - unsigned long count; - int committed; -}; - -struct nfsd3_renameres { - __be32 status; - struct svc_fh ffh; - struct svc_fh tfh; -}; - -struct nfsd3_linkres { - __be32 status; - struct svc_fh tfh; - struct svc_fh fh; -}; - -struct nfsd3_readdirres { - __be32 status; - struct svc_fh fh; - int count; - __be32 verf[2]; - - struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; - __be32 * offset1; - struct svc_rqst * rqstp; - -}; - -struct nfsd3_fsstatres { - __be32 status; - struct kstatfs stats; - __u32 invarsec; -}; - -struct nfsd3_fsinfores { - __be32 status; - __u32 f_rtmax; - __u32 f_rtpref; - __u32 f_rtmult; - __u32 f_wtmax; - __u32 f_wtpref; - __u32 f_wtmult; - __u32 f_dtpref; - __u64 f_maxfilesize; - __u32 f_properties; -}; - -struct nfsd3_pathconfres { - __be32 status; - __u32 p_link_max; - __u32 p_name_max; - __u32 p_no_trunc; - __u32 p_chown_restricted; - __u32 p_case_insensitive; - __u32 p_case_preserving; -}; - -struct nfsd3_commitres { - __be32 status; - struct svc_fh fh; -}; - -struct nfsd3_getaclres { - __be32 status; - struct svc_fh fh; - int mask; - struct posix_acl *acl_access; - struct posix_acl *acl_default; -}; - -/* dummy type for release */ -struct nfsd3_fhandle_pair { - __u32 dummy; - struct svc_fh fh1; - struct svc_fh fh2; -}; - -/* - * Storage requirements for XDR arguments and results. 
- */ -union nfsd3_xdrstore { - struct nfsd3_sattrargs sattrargs; - struct nfsd3_diropargs diropargs; - struct nfsd3_readargs readargs; - struct nfsd3_writeargs writeargs; - struct nfsd3_createargs createargs; - struct nfsd3_renameargs renameargs; - struct nfsd3_linkargs linkargs; - struct nfsd3_symlinkargs symlinkargs; - struct nfsd3_readdirargs readdirargs; - struct nfsd3_diropres diropres; - struct nfsd3_accessres accessres; - struct nfsd3_readlinkres readlinkres; - struct nfsd3_readres readres; - struct nfsd3_writeres writeres; - struct nfsd3_renameres renameres; - struct nfsd3_linkres linkres; - struct nfsd3_readdirres readdirres; - struct nfsd3_fsstatres fsstatres; - struct nfsd3_fsinfores fsinfores; - struct nfsd3_pathconfres pathconfres; - struct nfsd3_commitres commitres; - struct nfsd3_getaclres getaclres; -}; - -#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) - -int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); -int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, - struct nfsd3_sattrargs *); -int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, - struct nfsd3_diropargs *); -int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, - struct nfsd3_accessargs *); -int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, - struct nfsd3_readargs *); -int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, - struct nfsd3_writeargs *); -int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, - struct nfsd3_createargs *); -int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, - struct nfsd3_createargs *); -int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, - struct nfsd3_mknodargs *); -int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, - struct nfsd3_renameargs *); -int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, - struct nfsd3_readlinkargs *); -int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, - struct nfsd3_linkargs *); -int nfs3svc_decode_symlinkargs(struct svc_rqst *, 
__be32 *, - struct nfsd3_symlinkargs *); -int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, - struct nfsd3_readdirargs *); -int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, - struct nfsd3_readdirargs *); -int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, - struct nfsd3_commitargs *); -int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); -int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, - struct nfsd3_attrstat *); -int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, - struct nfsd3_attrstat *); -int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, - struct nfsd3_diropres *); -int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, - struct nfsd3_accessres *); -int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, - struct nfsd3_readlinkres *); -int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); -int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); -int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, - struct nfsd3_diropres *); -int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, - struct nfsd3_renameres *); -int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, - struct nfsd3_linkres *); -int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, - struct nfsd3_readdirres *); -int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, - struct nfsd3_fsstatres *); -int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, - struct nfsd3_fsinfores *); -int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, - struct nfsd3_pathconfres *); -int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, - struct nfsd3_commitres *); - -int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, - struct nfsd3_attrstat *); -int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, - struct nfsd3_fhandle_pair *); -int nfs3svc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -int 
nfs3svc_encode_entry_plus(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -/* Helper functions for NFSv3 ACL code */ -__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, - struct svc_fh *fhp); -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); - - -#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h deleted file mode 100644 index 1bf266239c7e..000000000000 --- a/include/linux/nfsd/xdr4.h +++ /dev/null @@ -1,564 +0,0 @@ -/* - * include/linux/nfsd/xdr4.h - * - * Server-side types for NFSv4. - * - * Copyright (c) 2002 The Regents of the University of Michigan. - * All rights reserved. - * - * Kendrick Smith - * Andy Adamson - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef _LINUX_NFSD_XDR4_H -#define _LINUX_NFSD_XDR4_H - -#include -#include - -#define NFSD4_MAX_TAGLEN 128 -#define XDR_LEN(n) (((n) + 3) & ~3) - -struct nfsd4_compound_state { - struct svc_fh current_fh; - struct svc_fh save_fh; - struct nfs4_stateowner *replay_owner; - /* For sessions DRC */ - struct nfsd4_session *session; - struct nfsd4_slot *slot; - __be32 *datap; - size_t iovlen; - u32 minorversion; - u32 status; -}; - -static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) -{ - return cs->slot != NULL; -} - -struct nfsd4_change_info { - u32 atomic; - bool change_supported; - u32 before_ctime_sec; - u32 before_ctime_nsec; - u64 before_change; - u32 after_ctime_sec; - u32 after_ctime_nsec; - u64 after_change; -}; - -struct nfsd4_access { - u32 ac_req_access; /* request */ - u32 ac_supported; /* response */ - u32 ac_resp_access; /* response */ -}; - -struct nfsd4_close { - u32 cl_seqid; /* request */ - stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ -}; - -struct nfsd4_commit { - u64 co_offset; /* request */ - u32 co_count; /* request */ - nfs4_verifier co_verf; /* response */ -}; - -struct nfsd4_create { - u32 cr_namelen; /* request */ - char * cr_name; /* request */ - u32 cr_type; /* request */ - union { /* request */ - struct { - u32 namelen; - char *name; - } link; /* NF4LNK */ - struct { - u32 specdata1; - u32 specdata2; - } dev; /* NF4BLK, NF4CHR */ 
- } u; - u32 cr_bmval[3]; /* request */ - struct iattr cr_iattr; /* request */ - struct nfsd4_change_info cr_cinfo; /* response */ - struct nfs4_acl *cr_acl; -}; -#define cr_linklen u.link.namelen -#define cr_linkname u.link.name -#define cr_specdata1 u.dev.specdata1 -#define cr_specdata2 u.dev.specdata2 - -struct nfsd4_delegreturn { - stateid_t dr_stateid; -}; - -struct nfsd4_getattr { - u32 ga_bmval[3]; /* request */ - struct svc_fh *ga_fhp; /* response */ -}; - -struct nfsd4_link { - u32 li_namelen; /* request */ - char * li_name; /* request */ - struct nfsd4_change_info li_cinfo; /* response */ -}; - -struct nfsd4_lock_denied { - clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; - u64 ld_start; - u64 ld_length; - u32 ld_type; -}; - -struct nfsd4_lock { - /* request */ - u32 lk_type; - u32 lk_reclaim; /* boolean */ - u64 lk_offset; - u64 lk_length; - u32 lk_is_new; - union { - struct { - u32 open_seqid; - stateid_t open_stateid; - u32 lock_seqid; - clientid_t clientid; - struct xdr_netobj owner; - } new; - struct { - stateid_t lock_stateid; - u32 lock_seqid; - } old; - } v; - - /* response */ - union { - struct { - stateid_t stateid; - } ok; - struct nfsd4_lock_denied denied; - } u; - /* The lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; -}; -#define lk_new_open_seqid v.new.open_seqid -#define lk_new_open_stateid v.new.open_stateid -#define lk_new_lock_seqid v.new.lock_seqid -#define lk_new_clientid v.new.clientid -#define lk_new_owner v.new.owner -#define lk_old_lock_stateid v.old.lock_stateid -#define lk_old_lock_seqid v.old.lock_seqid - -#define lk_rflags u.ok.rflags -#define lk_resp_stateid u.ok.stateid -#define lk_denied u.denied - - -struct nfsd4_lockt { - u32 lt_type; - clientid_t lt_clientid; - struct xdr_netobj lt_owner; - u64 lt_offset; - u64 lt_length; - struct nfs4_stateowner * lt_stateowner; - struct nfsd4_lock_denied lt_denied; -}; - - -struct 
nfsd4_locku { - u32 lu_type; - u32 lu_seqid; - stateid_t lu_stateid; - u64 lu_offset; - u64 lu_length; - struct nfs4_stateowner *lu_stateowner; -}; - - -struct nfsd4_lookup { - u32 lo_len; /* request */ - char * lo_name; /* request */ -}; - -struct nfsd4_putfh { - u32 pf_fhlen; /* request */ - char *pf_fhval; /* request */ -}; - -struct nfsd4_open { - u32 op_claim_type; /* request */ - struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ - u32 op_delegate_type; /* request - CLAIM_PREV only */ - stateid_t op_delegate_stateid; /* request - response */ - u32 op_create; /* request */ - u32 op_createmode; /* request */ - u32 op_bmval[3]; /* request */ - struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ - nfs4_verifier verf; /* EXCLUSIVE4 */ - clientid_t op_clientid; /* request */ - struct xdr_netobj op_owner; /* request */ - u32 op_seqid; /* request */ - u32 op_share_access; /* request */ - u32 op_share_deny; /* request */ - stateid_t op_stateid; /* response */ - u32 op_recall; /* recall */ - struct nfsd4_change_info op_cinfo; /* response */ - u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ - struct nfs4_acl *op_acl; -}; -#define op_iattr iattr -#define op_verf verf - -struct nfsd4_open_confirm { - stateid_t oc_req_stateid /* request */; - u32 oc_seqid /* request */; - stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ -}; - -struct nfsd4_open_downgrade { - stateid_t od_stateid; - u32 od_seqid; - u32 od_share_access; - u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; -}; - - -struct nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ -}; - -struct nfsd4_readdir { - u64 rd_cookie; /* request */ - 
nfs4_verifier rd_verf; /* request */ - u32 rd_dircount; /* request */ - u32 rd_maxcount; /* request */ - u32 rd_bmval[3]; /* request */ - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ - - struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; -}; - -struct nfsd4_release_lockowner { - clientid_t rl_clientid; - struct xdr_netobj rl_owner; -}; -struct nfsd4_readlink { - struct svc_rqst *rl_rqstp; /* request */ - struct svc_fh * rl_fhp; /* request */ -}; - -struct nfsd4_remove { - u32 rm_namelen; /* request */ - char * rm_name; /* request */ - struct nfsd4_change_info rm_cinfo; /* response */ -}; - -struct nfsd4_rename { - u32 rn_snamelen; /* request */ - char * rn_sname; /* request */ - u32 rn_tnamelen; /* request */ - char * rn_tname; /* request */ - struct nfsd4_change_info rn_sinfo; /* response */ - struct nfsd4_change_info rn_tinfo; /* response */ -}; - -struct nfsd4_secinfo { - u32 si_namelen; /* request */ - char *si_name; /* request */ - struct svc_export *si_exp; /* response */ -}; - -struct nfsd4_setattr { - stateid_t sa_stateid; /* request */ - u32 sa_bmval[3]; /* request */ - struct iattr sa_iattr; /* request */ - struct nfs4_acl *sa_acl; -}; - -struct nfsd4_setclientid { - nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ - u32 se_callback_prog; /* request */ - u32 se_callback_netid_len; /* request */ - char * se_callback_netid_val; /* request */ - u32 se_callback_addr_len; /* request */ - char * se_callback_addr_val; /* request */ - u32 se_callback_ident; /* request */ - clientid_t se_clientid; /* response */ - nfs4_verifier se_confirm; /* response */ -}; - -struct nfsd4_setclientid_confirm { - clientid_t sc_clientid; - nfs4_verifier sc_confirm; -}; - -/* also used for NVERIFY */ -struct nfsd4_verify { - u32 ve_bmval[3]; /* request */ - u32 ve_attrlen; /* request */ - char * ve_attrval; /* request */ -}; - -struct nfsd4_write { - stateid_t 
wr_stateid; /* request */ - u64 wr_offset; /* request */ - u32 wr_stable_how; /* request */ - u32 wr_buflen; /* request */ - int wr_vlen; - - u32 wr_bytes_written; /* response */ - u32 wr_how_written; /* response */ - nfs4_verifier wr_verifier; /* response */ -}; - -struct nfsd4_exchange_id { - nfs4_verifier verifier; - struct xdr_netobj clname; - u32 flags; - clientid_t clientid; - u32 seqid; - int spa_how; -}; - -struct nfsd4_sequence { - struct nfs4_sessionid sessionid; /* request/response */ - u32 seqid; /* request/response */ - u32 slotid; /* request/response */ - u32 maxslots; /* request/response */ - u32 cachethis; /* request */ -#if 0 - u32 target_maxslots; /* response */ - u32 status_flags; /* response */ -#endif /* not yet */ -}; - -struct nfsd4_destroy_session { - struct nfs4_sessionid sessionid; -}; - -struct nfsd4_op { - int opnum; - __be32 status; - union { - struct nfsd4_access access; - struct nfsd4_close close; - struct nfsd4_commit commit; - struct nfsd4_create create; - struct nfsd4_delegreturn delegreturn; - struct nfsd4_getattr getattr; - struct svc_fh * getfh; - struct nfsd4_link link; - struct nfsd4_lock lock; - struct nfsd4_lockt lockt; - struct nfsd4_locku locku; - struct nfsd4_lookup lookup; - struct nfsd4_verify nverify; - struct nfsd4_open open; - struct nfsd4_open_confirm open_confirm; - struct nfsd4_open_downgrade open_downgrade; - struct nfsd4_putfh putfh; - struct nfsd4_read read; - struct nfsd4_readdir readdir; - struct nfsd4_readlink readlink; - struct nfsd4_remove remove; - struct nfsd4_rename rename; - clientid_t renew; - struct nfsd4_secinfo secinfo; - struct nfsd4_setattr setattr; - struct nfsd4_setclientid setclientid; - struct nfsd4_setclientid_confirm setclientid_confirm; - struct nfsd4_verify verify; - struct nfsd4_write write; - struct nfsd4_release_lockowner release_lockowner; - - /* NFSv4.1 */ - struct nfsd4_exchange_id exchange_id; - struct nfsd4_create_session create_session; - struct nfsd4_destroy_session 
destroy_session; - struct nfsd4_sequence sequence; - } u; - struct nfs4_replay * replay; -}; - -struct nfsd4_compoundargs { - /* scratch variables for XDR decode */ - __be32 * p; - __be32 * end; - struct page ** pagelist; - int pagelen; - __be32 tmp[8]; - __be32 * tmpp; - struct tmpbuf { - struct tmpbuf *next; - void (*release)(const void *); - void *buf; - } *to_free; - - struct svc_rqst *rqstp; - - u32 taglen; - char * tag; - u32 minorversion; - u32 opcnt; - struct nfsd4_op *ops; - struct nfsd4_op iops[8]; -}; - -struct nfsd4_compoundres { - /* scratch variables for XDR encode */ - __be32 * p; - __be32 * end; - struct xdr_buf * xbuf; - struct svc_rqst * rqstp; - - u32 taglen; - char * tag; - u32 opcnt; - __be32 * tagp; /* tag, opcount encode location */ - struct nfsd4_compound_state cstate; -}; - -static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) -{ - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; - return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; -} - -static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) -{ - return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); -} - -#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) - -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; - cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } -} - -int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); -int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, - struct nfsd4_compoundargs *); 
-int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, - struct nfsd4_compoundres *); -void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); -void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); -__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 *buffer, int *countp, - u32 *bmval, struct svc_rqst *, int ignore_crossmnt); -extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_setclientid *setclid); -extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_setclientid_confirm *setclientid_confirm); -extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); -extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, - struct nfsd4_sequence *seq); -extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, -struct nfsd4_exchange_id *); - extern __be32 nfsd4_create_session(struct svc_rqst *, - struct nfsd4_compound_state *, - struct nfsd4_create_session *); -extern __be32 nfsd4_sequence(struct svc_rqst *, - struct nfsd4_compound_state *, - struct nfsd4_sequence *); -extern __be32 nfsd4_destroy_session(struct svc_rqst *, - struct nfsd4_compound_state *, - struct nfsd4_destroy_session *); -extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, - struct nfsd4_open *open); -extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, - struct svc_fh *current_fh, struct nfsd4_open *open); -extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); -extern __be32 nfsd4_close(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_close *close); -extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_open_downgrade *od); -extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *, - struct nfsd4_lock *lock); -extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_lockt *lockt); -extern __be32 nfsd4_locku(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_locku *locku); -extern __be32 -nfsd4_release_lockowner(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, - struct nfsd4_release_lockowner *rlockowner); -extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); -extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); -extern __be32 nfsd4_renew(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, clientid_t *clid); -#endif - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ -- cgit v1.2.3 From e8e8753f7a32ce4f636771126fc8eba0dc4ad817 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Dec 2009 12:53:32 -0500 Subject: nfsd: new interface to advertise export features Soon we will add the new V4ROOT flag, and allow the INSECURE flag to vary by pseudoflavor. It would be useful for nfs-utils (for example, for improved exportfs error reporting) to be able to know when this happens. Use this new interface for that purpose. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 21 +++++++++++++++++++++ include/linux/nfsd/export.h | 3 ++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 0415680d3f58..e7051ac4dc73 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -31,6 +31,7 @@ enum { NFSD_Getfd, NFSD_Getfs, NFSD_List, + NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, NFSD_FO_UnlockFS, @@ -149,6 +150,24 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1306,6 +1325,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index ef3d416fcf67..4f1df1d7312c 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -39,7 +39,8 @@ #define NFSEXP_FSID 0x2000 #define NFSEXP_CROSSMOUNT 0x4000 #define NFSEXP_NOACL 0x8000 /* reserved for possible ACL related use */ -#define NFSEXP_ALLFLAGS 0xFE3F 
+/* All flags that we claim to support. (Note we don't support NOACL.) */ +#define NFSEXP_ALLFLAGS 0x7E3F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From 12045a6ee9908b38b6d286530c7d816e39071346 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 8 Dec 2009 18:15:52 -0500 Subject: nfsd: let "insecure" flag vary by pseudoflavor This was an oversight; it should be among the export flags that can be allowed to vary by pseudoflavor. This allows an administrator to (for example) allow auth_sys mounts only from low ports, but allow auth_krb5 mounts to use any port. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 4 +++- include/linux/nfsd/export.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 0eb1c59f5ab8..951938d6c495 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -88,8 +88,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, struct svc_export *exp) { + int flags = nfsexp_flags(rqstp, exp); + /* Check if the request originated from a secure port. 
*/ - if (!rqstp->rq_secure && EX_SECURE(exp)) { + if (!rqstp->rq_secure && (flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk(KERN_WARNING "nfsd: request from insecure port %s!\n", diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 4f1df1d7312c..4cafbe1255f0 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -44,7 +44,8 @@ /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ - | NFSEXP_ALLSQUASH) + | NFSEXP_ALLSQUASH \ + | NFSEXP_INSECURE_PORT) #ifdef __KERNEL__ @@ -109,7 +110,6 @@ struct svc_expkey { struct path ek_path; }; -#define EX_SECURE(exp) (!((exp)->ex_flags & NFSEXP_INSECURE_PORT)) #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) #define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) #define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) -- cgit v1.2.3 From 9e1b9b80721661bd63b3662453767b22cd614fe7 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Sat, 7 Nov 2009 21:03:54 +0000 Subject: module: make MODULE_SYMBOL_PREFIX into a CONFIG option The next commit will require the use of MODULE_SYMBOL_PREFIX in .tmp_exports-asm.S. Currently it is mixed in with C structure definitions in "asm/module.h". Move the definition of this arch option into Kconfig, so it can be easily accessed by any code. This also lets modpost.c use the same definition. Previously modpost relied on a hardcoded list of architectures in mk_elfconfig.c. A build test for blackfin, one of the two MODULE_SYMBOL_PREFIX archs, showed the generated code was unchanged. vmlinux was identical (save for build ids, and an apparently randomized suffix on a single "__key" symbol in the kallsyms data).
Signed-off-by: Alan Jenkins Acked-by: Mike Frysinger (blackfin) CC: Sam Ravnborg Signed-off-by: Rusty Russell --- arch/blackfin/Kconfig | 4 ++++ arch/blackfin/include/asm/module.h | 2 -- arch/blackfin/kernel/vmlinux.lds.S | 2 -- arch/h8300/Kconfig | 4 ++++ arch/h8300/include/asm/module.h | 2 -- arch/h8300/kernel/vmlinux.lds.S | 1 - include/asm-generic/vmlinux.lds.h | 8 ++++++-- include/linux/module.h | 6 ++++-- scripts/Makefile.lib | 5 +++++ scripts/mod/Makefile | 2 +- scripts/mod/mk_elfconfig.c | 9 --------- scripts/mod/modpost.c | 9 +++++++++ 12 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index ae6a60f10120..2180433213b7 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -5,6 +5,10 @@ mainmenu "Blackfin Kernel Configuration" +config SYMBOL_PREFIX + string + default "_" + config MMU def_bool n diff --git a/arch/blackfin/include/asm/module.h b/arch/blackfin/include/asm/module.h index 9c1cfffddd9b..4282b169ead9 100644 --- a/arch/blackfin/include/asm/module.h +++ b/arch/blackfin/include/asm/module.h @@ -7,8 +7,6 @@ #ifndef _ASM_BFIN_MODULE_H #define _ASM_BFIN_MODULE_H -#define MODULE_SYMBOL_PREFIX "_" - #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 10e12539000e..f39707c6590d 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -4,8 +4,6 @@ * Licensed under the GPL-2 or later */ -#define VMLINUX_SYMBOL(_sym_) _##_sym_ - #include #include #include diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 9420648352b8..53cc669e6d59 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -10,6 +10,10 @@ config H8300 default y select HAVE_IDE +config SYMBOL_PREFIX + string + default "_" + config MMU bool default n diff --git a/arch/h8300/include/asm/module.h b/arch/h8300/include/asm/module.h index 
de23231f3196..8e46724b7c09 100644 --- a/arch/h8300/include/asm/module.h +++ b/arch/h8300/include/asm/module.h @@ -8,6 +8,4 @@ struct mod_arch_specific { }; #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr -#define MODULE_SYMBOL_PREFIX "_" - #endif /* _ASM_H8/300_MODULE_H */ diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index b9e24907e6ea..03d356d96e5d 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -1,4 +1,3 @@ -#define VMLINUX_SYMBOL(_sym_) _##_sym_ #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b6e818f4b247..67e652068e0e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -52,8 +52,12 @@ #define LOAD_OFFSET 0 #endif -#ifndef VMLINUX_SYMBOL -#define VMLINUX_SYMBOL(_sym_) _sym_ +#ifndef SYMBOL_PREFIX +#define VMLINUX_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define VMLINUX_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) #endif /* Align . to a 8 byte boundary equals to maximum function alignment. */ diff --git a/include/linux/module.h b/include/linux/module.h index 482efc865acf..6cb1a3cab5d3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -25,8 +25,10 @@ /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) -/* some toolchains uses a `_' prefix for all user symbols */ -#ifndef MODULE_SYMBOL_PREFIX +/* Some toolchains use a `_' prefix for all user symbols. 
*/ +#ifdef CONFIG_SYMBOL_PREFIX +#define MODULE_SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX +#else #define MODULE_SYMBOL_PREFIX "" #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ffdafb26f539..224d85e72ef1 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -127,6 +127,11 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_GCOV)) endif +ifdef CONFIG_SYMBOL_PREFIX +_cpp_flags += -DSYMBOL_PREFIX=$(patsubst "%",%,$(CONFIG_SYMBOL_PREFIX)) +endif + + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 11d69c35e5b4..ff954f8168c1 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -8,7 +8,7 @@ modpost-objs := modpost.o file2alias.o sumversion.o $(obj)/modpost.o $(obj)/file2alias.o $(obj)/sumversion.o: $(obj)/elfconfig.h quiet_cmd_elfconfig = MKELF $@ - cmd_elfconfig = $(obj)/mk_elfconfig $(ARCH) < $< > $@ + cmd_elfconfig = $(obj)/mk_elfconfig < $< > $@ $(obj)/elfconfig.h: $(obj)/empty.o $(obj)/mk_elfconfig FORCE $(call if_changed,elfconfig) diff --git a/scripts/mod/mk_elfconfig.c b/scripts/mod/mk_elfconfig.c index 6a96d47bd1e6..639bca7ba559 100644 --- a/scripts/mod/mk_elfconfig.c +++ b/scripts/mod/mk_elfconfig.c @@ -9,9 +9,6 @@ main(int argc, char **argv) unsigned char ei[EI_NIDENT]; union { short s; char c[2]; } endian_test; - if (argc != 2) { - fprintf(stderr, "Error: no arch\n"); - } if (fread(ei, 1, EI_NIDENT, stdin) != EI_NIDENT) { fprintf(stderr, "Error: input truncated\n"); return 1; @@ -55,12 +52,6 @@ main(int argc, char **argv) else exit(1); - if ((strcmp(argv[1], "h8300") == 0) - || (strcmp(argv[1], "blackfin") == 0)) - printf("#define MODULE_SYMBOL_PREFIX \"_\"\n"); - else - printf("#define MODULE_SYMBOL_PREFIX \"\"\n"); - return 0; } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 801a16a17545..fb0f9b711af3 100644 --- a/scripts/mod/modpost.c +++ 
b/scripts/mod/modpost.c @@ -15,8 +15,17 @@ #include #include #include "modpost.h" +#include "../../include/linux/autoconf.h" #include "../../include/linux/license.h" +/* Some toolchains use a `_' prefix for all user symbols. */ +#ifdef CONFIG_SYMBOL_PREFIX +#define MODULE_SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX +#else +#define MODULE_SYMBOL_PREFIX "" +#endif + + /* Are we using CONFIG_MODVERSIONS? */ int modversions = 0; /* Warn about undefined symbols? (do so if we have vmlinux) */ -- cgit v1.2.3 From e36c54582c6f14adc9e10473e2aec2cc4f0acc03 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 14 Dec 2009 15:58:33 -0500 Subject: tracing: Fix return of trace_dump_stack() The trace_dump_stack() returned a value for a void function. Also, added the missing stub for trace_dump_stack() when tracing is not configured. Reported-by: Ingo Molnar LKML-Reference: <20091214162713.GA31060@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + kernel/trace/trace.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5ad4199fb073..f1dc752da0d2 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -527,6 +527,7 @@ trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } +static inline void trace_dump_stack(void) { } static inline int trace_printk(const char *fmt, ...) 
{ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bd7b969a729a..ee61915935d5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1158,7 +1158,7 @@ void trace_dump_stack(void) unsigned long flags; if (tracing_disabled || tracing_selftest_running) - return 0; + return; local_save_flags(flags); -- cgit v1.2.3 From 48f186124220794fce85ed1439fc32f16f69d3e2 Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis Date: Mon, 14 Dec 2009 21:27:53 -0800 Subject: rpc: add rpc_queue_empty function Signed-off-by: Alexandros Batsakis Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 + net/sunrpc/sched.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 1906782ec86b..9157405f9320 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -229,6 +229,7 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *, void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); void rpc_wake_up_status(struct rpc_wait_queue *, int); +int rpc_queue_empty(struct rpc_wait_queue *); void rpc_delay(struct rpc_task *, unsigned long); void * rpc_malloc(struct rpc_task *, size_t); void rpc_free(void *); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef74ba0666c..89ea8e69ec78 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -384,6 +384,20 @@ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct r __rpc_do_wake_up_task(queue, task); } +/* + * Tests whether rpc queue is empty + */ +int rpc_queue_empty(struct rpc_wait_queue *queue) +{ + int res; + + spin_lock_bh(&queue->lock); + res = queue->qlen; + spin_unlock_bh(&queue->lock); + return (res == 0); +} +EXPORT_SYMBOL_GPL(rpc_queue_empty); + /* * Wake up a task on a specific queue */ -- cgit v1.2.3 From cf3b01b54880debb01ea7d471123da5887a7c2cb Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis 
Date: Mon, 14 Dec 2009 21:27:55 -0800 Subject: rpc: add a new priority in RPC task Signed-off-by: Alexandros Batsakis Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 9157405f9320..7bc7fd5291ce 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -173,7 +173,8 @@ struct rpc_task_setup { #define RPC_PRIORITY_LOW (-1) #define RPC_PRIORITY_NORMAL (0) #define RPC_PRIORITY_HIGH (1) -#define RPC_NR_PRIORITY (1 + RPC_PRIORITY_HIGH - RPC_PRIORITY_LOW) +#define RPC_PRIORITY_PRIVILEGED (2) +#define RPC_NR_PRIORITY (1 + RPC_PRIORITY_PRIVILEGED - RPC_PRIORITY_LOW) struct rpc_timer { struct timer_list timer; @@ -255,6 +256,16 @@ static inline int rpc_wait_for_completion_task(struct rpc_task *task) return __rpc_wait_for_completion_task(task, NULL); } +static inline void rpc_task_set_priority(struct rpc_task *task, unsigned char prio) +{ + task->tk_priority = prio - RPC_PRIORITY_LOW; +} + +static inline int rpc_task_has_priority(struct rpc_task *task, unsigned char prio) +{ + return (task->tk_priority + RPC_PRIORITY_LOW == prio); +} + #ifdef RPC_DEBUG static inline const char * rpc_qname(struct rpc_wait_queue *q) { -- cgit v1.2.3 From eb4c86c6a5adec423c9e615d4937fdddd06a16c5 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Wed, 9 Sep 2009 14:58:22 -0400 Subject: nfsd: introduce export flag for v4 pseudoroot NFSv4 differs from v2 and v3 in that it presents a single unified filesystem tree, whereas v2 and v3 exported multiple filesystem (whose roots could be found using a separate mount protocol). Our original NFSv4 server implementation asked the administrator to designate a single filesystem as the NFSv4 root, then to mount filesystems they wished to export underneath. (Often using bind mounts of already-existing filesystems.) 
This was conceptually simple, and allowed easy implementation, but created a serious obstacle to upgrading between v2/v3: since the paths to v4 filesystems were different, administrators would have to adjust all the paths in client-side mount commands when switching to v4. Various workarounds are possible. For example, the administrator could export "/" and designate it as the v4 root. However, the security risks of that approach are obvious, and in any case we shouldn't be requiring the administrator to take extra steps to fix this problem; instead, the server should present consistent paths across different versions by default. These patches take a modified version of that approach: we provide a new export option which exports only a subset of a filesystem. With this flag, it becomes safe for mountd to export "/" by default, with no need for additional configuration. We begin just by defining the new flag. Signed-off-by: Steve Dickson Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 1 + include/linux/nfsd/export.h | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cb3dae2fcd86..c64d55f319bd 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1425,6 +1425,7 @@ static struct flags { { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, #ifdef MSNFS { NFSEXP_MSNFS, {"msnfs", ""}}, #endif diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 4cafbe1255f0..41f0d4e25374 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -39,6 +39,16 @@ #define NFSEXP_FSID 0x2000 #define NFSEXP_CROSSMOUNT 0x4000 #define NFSEXP_NOACL 0x8000 /* reserved for possible ACL related use */ +/* + * The NFSEXP_V4ROOT flag causes the kernel to give access only to NFSv4 + * clients, and only to the single directory that is the root of the + 
* export; further lookup and readdir operations are treated as if every + * subdirectory was a mountpoint, and ignored if they are not themselves + * exported. This is used by nfsd and mountd to construct the NFSv4 + * pseudofilesystem, which provides access only to paths leading to each + * exported filesystem. + */ +#define NFSEXP_V4ROOT 0x10000 /* All flags that we claim to support. (Note we don't support NOACL.) */ #define NFSEXP_ALLFLAGS 0x7E3F -- cgit v1.2.3 From f13c12c634e124d5d31f912b969d542a016d6105 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Dec 2009 19:43:11 +0100 Subject: perf_events: Fix perf_event_attr layout The misalignment of bp_addr created a 32bit hole, causing different structure packings on 32 and 64 bit machines. Fix that by moving __reserved_2 into that hole. Further, remove the useless struct and redundant __bp_reserve muck. Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <1260902591.8023.781.camel@laptop> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 12 +++--------- kernel/perf_event.c | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 64a53f74c9a9..5fcbf7d2712a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -211,17 +211,11 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - struct { /* Hardware breakpoint info */ - __u64 bp_addr; - __u32 bp_type; - __u32 bp_len; - __u64 __bp_reserved_1; - __u64 __bp_reserved_2; - }; - __u32 __reserved_2; - __u64 __reserved_3; + __u64 bp_addr; + __u32 bp_type; + __u32 bp_len; }; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8823b0885183..0dd8e5d02c66 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4564,7 +4564,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL;
- if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit v1.2.3 From f2511774863487e61b56a97da07ebf8dd61d7836 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 13 Dec 2009 20:29:01 +0100 Subject: PM: Add initcall_debug style timing for suspend/resume In order to diagnose overall suspend/resume times, we need basic instrumentation to break down the total time into per device timing, similar to initcall_debug. This patch adds the basic timing instrumentation, needed for a scripts/bootgraph.pl equivalent or humans. The bootgraph.pl program is still a work in progress, but is far enough along to know that this patch is sufficient. Signed-off-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 31 +++++++++++++++++++++++++++++++ include/linux/init.h | 2 ++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 8aa2443182d5..30f0ceebd36c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../base.h" #include "power.h" @@ -172,6 +173,13 @@ static int pm_op(struct device *dev, pm_message_t state) { int error = 0; + ktime_t calltime, delta, rettime; + + if (initcall_debug) { + pr_info("calling %s+ @ %i\n", + dev_name(dev), task_pid_nr(current)); + calltime = ktime_get(); + } switch (state.event) { #ifdef CONFIG_SUSPEND @@ -219,6 +227,14 @@ static int pm_op(struct device *dev, default: error = -EINVAL; } + + if (initcall_debug) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + pr_info("call %s+ returned %d after %Ld usecs\n", dev_name(dev), + error, (unsigned long long)ktime_to_ns(delta) >> 10); + } + return error; } @@ -236,6 +252,13 @@ static int pm_noirq_op(struct device *dev, pm_message_t state) { int error = 0; +
ktime_t calltime, delta, rettime; + + if (initcall_debug) { + pr_info("calling %s_i+ @ %i\n", + dev_name(dev), task_pid_nr(current)); + calltime = ktime_get(); + } switch (state.event) { #ifdef CONFIG_SUSPEND @@ -283,6 +306,14 @@ static int pm_noirq_op(struct device *dev, default: error = -EINVAL; } + + if (initcall_debug) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + printk("initcall %s_i+ returned %d after %Ld usecs\n", dev_name(dev), + error, (unsigned long long)ktime_to_ns(delta) >> 10); + } + return error; } diff --git a/include/linux/init.h b/include/linux/init.h index ff8bde520d03..ab1d31f9352b 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -149,6 +149,8 @@ void prepare_namespace(void); extern void (*late_time_init)(void); +extern int initcall_debug; + #endif #ifndef MODULE -- cgit v1.2.3 From 3d8986c7585457c45fd349b2c542c7c1ecd20843 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 15 Dec 2009 14:09:03 -0500 Subject: nfsd: enable V4ROOT exports With the v4root option now enforced everywhere it should be, it is safe to advertise support for it to mountd. Signed-off-by: J. Bruce Fields --- include/linux/nfsd/export.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 41f0d4e25374..8ae78a61eea4 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -50,7 +50,7 @@ */ #define NFSEXP_V4ROOT 0x10000 /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x7E3F +#define NFSEXP_ALLFLAGS 0x17E3F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From c7af6b0895229bd080b86afc91302b66f6df0378 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 4 Dec 2009 18:29:33 -0500 Subject: nfsd: remove unused field rq_reffh This field is never referenced anywhere else. I don't know what it was intended for. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d1567d627557..5a3085b9b394 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -273,10 +273,6 @@ struct svc_rqst { struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ struct svc_cacherep * rq_cacherep; /* cache info */ - struct knfsd_fh * rq_reffh; /* Referrence filehandle, used to - * determine what device number - * to report (real or virtual) - */ int rq_splice_ok; /* turned off in gss privacy * to prevent encrypting page * cache pages */ -- cgit v1.2.3 From 1557aca7904ed6fadd22cdc3364754070bb3d3c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 4 Dec 2009 19:36:06 -0500 Subject: nfsd: move most of nfsfh.h to fs/nfsd Most of this can be trivially moved to a private header as well. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/export.c | 1 + fs/nfsd/nfsfh.h | 208 +++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 1 + fs/nfsd/vfs.h | 2 + fs/nfsd/xdr.h | 1 + include/linux/nfsd/nfsfh.h | 199 ------------------------------------------- 6 files changed, 213 insertions(+), 199 deletions(-) create mode 100644 fs/nfsd/nfsfh.h (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 7d5ba1b0ffcf..b26a3644fbb9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,7 @@ #include #include "nfsd.h" +#include "nfsfh.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 000000000000..cdfb8c6a4206 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,208 @@ +/* Copyright (C) 1995, 1996, 1997 Olaf Kirch */ + +#ifndef _LINUX_NFSD_FH_INT_H +#define _LINUX_NFSD_FH_INT_H + +#include + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so moste of the function disappears. 
+ */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = htonl(MAJOR(dev)); + fsidv[1] = htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh 
*src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = fhp->fh_dentry->d_inode; + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. 
+ */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + return; + } + + inode = dentry->d_inode; + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_dentry); + + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_FH_INT_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2af75686e0d3..775b8d281d6a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -38,6 +38,7 @@ #define _NFSD4_STATE_H #include +#include "nfsfh.h" typedef struct { u32 cl_boot; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index f4fa6d351bbd..4b1de0a9ea75 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -5,6 +5,8 @@ #ifndef LINUX_NFSD_VFS_H #define LINUX_NFSD_VFS_H +#include "nfsfh.h" + /* * Flags for nfsd_permission */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 235ee5c3be54..87fe6f64b8f7 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -9,6 +9,7 @@ #include #include "nfsd.h" +#include "nfsfh.h" struct nfsd_fhandle { struct svc_fh fh; diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index 49523edbc510..65e333afaee4 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -162,205 +162,6 @@ typedef struct svc_fh { } svc_fh; -enum nfsd_fsid { - FSID_DEV = 0, - FSID_NUM, - FSID_MAJOR_MINOR, - FSID_ENCODE_DEV, - FSID_UUID4_INUM, - FSID_UUID8, - FSID_UUID16, - FSID_UUID16_INUM, -}; - -enum fsid_source { - FSIDSOURCE_DEV, - FSIDSOURCE_FSID, - 
FSIDSOURCE_UUID, -}; -extern enum fsid_source fsid_source(struct svc_fh *fhp); - - -/* This might look a little large to "inline" but in all calls except - * one, 'vers' is constant so moste of the function disappears. - */ -static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, - u32 fsid, unsigned char *uuid) -{ - u32 *up; - switch(vers) { - case FSID_DEV: - fsidv[0] = htonl((MAJOR(dev)<<16) | - MINOR(dev)); - fsidv[1] = ino_t_to_u32(ino); - break; - case FSID_NUM: - fsidv[0] = fsid; - break; - case FSID_MAJOR_MINOR: - fsidv[0] = htonl(MAJOR(dev)); - fsidv[1] = htonl(MINOR(dev)); - fsidv[2] = ino_t_to_u32(ino); - break; - - case FSID_ENCODE_DEV: - fsidv[0] = new_encode_dev(dev); - fsidv[1] = ino_t_to_u32(ino); - break; - - case FSID_UUID4_INUM: - /* 4 byte fsid and inode number */ - up = (u32*)uuid; - fsidv[0] = ino_t_to_u32(ino); - fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; - break; - - case FSID_UUID8: - /* 8 byte fsid */ - up = (u32*)uuid; - fsidv[0] = up[0] ^ up[2]; - fsidv[1] = up[1] ^ up[3]; - break; - - case FSID_UUID16: - /* 16 byte fsid - NFSv3+ only */ - memcpy(fsidv, uuid, 16); - break; - - case FSID_UUID16_INUM: - /* 8 byte inode and 16 byte fsid */ - *(u64*)fsidv = (u64)ino; - memcpy(fsidv+2, uuid, 16); - break; - default: BUG(); - } -} - -static inline int key_len(int type) -{ - switch(type) { - case FSID_DEV: return 8; - case FSID_NUM: return 4; - case FSID_MAJOR_MINOR: return 12; - case FSID_ENCODE_DEV: return 8; - case FSID_UUID4_INUM: return 8; - case FSID_UUID8: return 8; - case FSID_UUID16: return 16; - case FSID_UUID16_INUM: return 24; - default: return 0; - } -} - -/* - * Shorthand for dprintk()'s - */ -extern char * SVCFH_fmt(struct svc_fh *fhp); - -/* - * Function prototypes - */ -__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); -__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); -__be32 fh_update(struct svc_fh *); -void fh_put(struct svc_fh *); - -static __inline__ 
struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) -{ - WARN_ON(src->fh_dentry || src->fh_locked); - - *dst = *src; - return dst; -} - -static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) -{ - dst->fh_size = src->fh_size; - memcpy(&dst->fh_base, &src->fh_base, src->fh_size); -} - -static __inline__ struct svc_fh * -fh_init(struct svc_fh *fhp, int maxsize) -{ - memset(fhp, 0, sizeof(*fhp)); - fhp->fh_maxsize = maxsize; - return fhp; -} - -#ifdef CONFIG_NFSD_V3 -/* - * Fill in the pre_op attr for the wcc data - */ -static inline void -fill_pre_wcc(struct svc_fh *fhp) -{ - struct inode *inode; - - inode = fhp->fh_dentry->d_inode; - if (!fhp->fh_pre_saved) { - fhp->fh_pre_mtime = inode->i_mtime; - fhp->fh_pre_ctime = inode->i_ctime; - fhp->fh_pre_size = inode->i_size; - fhp->fh_pre_change = inode->i_version; - fhp->fh_pre_saved = 1; - } -} - -extern void fill_post_wcc(struct svc_fh *); -#else -#define fill_pre_wcc(ignored) -#define fill_post_wcc(notused) -#endif /* CONFIG_NFSD_V3 */ - - -/* - * Lock a file handle/inode - * NOTE: both fh_lock and fh_unlock are done "by hand" in - * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once - * so, any changes here should be reflected there. 
- */ - -static inline void -fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) -{ - struct dentry *dentry = fhp->fh_dentry; - struct inode *inode; - - BUG_ON(!dentry); - - if (fhp->fh_locked) { - printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", - dentry->d_parent->d_name.name, dentry->d_name.name); - return; - } - - inode = dentry->d_inode; - mutex_lock_nested(&inode->i_mutex, subclass); - fill_pre_wcc(fhp); - fhp->fh_locked = 1; -} - -static inline void -fh_lock(struct svc_fh *fhp) -{ - fh_lock_nested(fhp, I_MUTEX_NORMAL); -} - -/* - * Unlock a file handle/inode - */ -static inline void -fh_unlock(struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_dentry); - - if (fhp->fh_locked) { - fill_post_wcc(fhp); - mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); - fhp->fh_locked = 0; - } -} #endif /* __KERNEL__ */ -- cgit v1.2.3 From c1e7c3ae59b065bf7ff24a05cb609b2f9e314db6 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 14 Dec 2009 21:45:19 +0000 Subject: bzip2/lzma/gzip: pre-boot malloc doesn't return NULL on failure The trivial malloc implementation used in the pre-boot environment by the decompressors returns a bad pointer on failure (falling through after calling error). This is doubly wrong - the callers expect malloc to return NULL on failure, second the error function is intended to be used by the decompressors to propagate errors to *their* callers. The decompressors have no access to any state set by the error function. Signed-off-by: Phillip Lougher LKML-Reference: <4b26b1ef.hIInb2AYPMtImAJO%phillip@lougher.demon.co.uk> Signed-off-by: H. 
Peter Anvin --- include/linux/decompress/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index 12ff8c3f1d05..5032b9a31ae7 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -25,7 +25,7 @@ static void *malloc(int size) void *p; if (size < 0) - error("Malloc error"); + return NULL; if (!malloc_ptr) malloc_ptr = free_mem_ptr; @@ -35,7 +35,7 @@ static void *malloc(int size) malloc_ptr += size; if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr) - error("Out of memory"); + return NULL; malloc_count++; return p; -- cgit v1.2.3 From 9065ce4500085b9ca66b19d3c4d21a73cb410173 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Nov 2009 17:05:19 -0700 Subject: PNP: add interface to retrieve ACPI device from a PNPACPI device Add pnp_acpi_device(pnp_dev), which takes a PNP device and returns the associated ACPI device (or NULL, if the device is not a PNPACPI device). This allows us to write a PNP driver that can manage both traditional PNPBIOS and ACPI devices, treating ACPI-only functionality as an optional extension. 
Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/pnp/pnpacpi/core.c | 3 ++- include/linux/pnp.h | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index b2348fc2378e..5314bf630bc4 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -144,7 +144,7 @@ static int pnpacpi_resume(struct pnp_dev *dev) } #endif -static struct pnp_protocol pnpacpi_protocol = { +struct pnp_protocol pnpacpi_protocol = { .name = "Plug and Play ACPI", .get = pnpacpi_get_resources, .set = pnpacpi_set_resources, @@ -154,6 +154,7 @@ static struct pnp_protocol pnpacpi_protocol = { .resume = pnpacpi_resume, #endif }; +EXPORT_SYMBOL(pnpacpi_protocol); static int __init pnpacpi_add_device(struct acpi_device *device) { diff --git a/include/linux/pnp.h b/include/linux/pnp.h index fddfafaed024..7c4193eb0072 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -334,6 +334,19 @@ extern struct pnp_protocol pnpbios_protocol; #define pnp_device_is_pnpbios(dev) 0 #endif +#ifdef CONFIG_PNPACPI +extern struct pnp_protocol pnpacpi_protocol; + +static inline struct acpi_device *pnp_acpi_device(struct pnp_dev *dev) +{ + if (dev->protocol == &pnpacpi_protocol) + return dev->data; + return NULL; +} +#else +#define pnp_acpi_device(dev) 0 +#endif + /* status */ #define PNP_READY 0x0000 #define PNP_ATTACHED 0x0001 -- cgit v1.2.3 From bb5b7c11263dbbe78253cd05945a6bf8f55add8e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Dec 2009 20:56:42 -0800 Subject: tcp: Revert per-route SACK/DSACK/TIMESTAMP changes. 
It creates a regression, triggering badness for SYN_RECV sockets, for example: [19148.022102] Badness at net/ipv4/inet_connection_sock.c:293 [19148.022570] NIP: c02a0914 LR: c02a0904 CTR: 00000000 [19148.023035] REGS: eeecbd30 TRAP: 0700 Not tainted (2.6.32) [19148.023496] MSR: 00029032 CR: 24002442 XER: 00000000 [19148.024012] TASK = eee9a820[1756] 'privoxy' THREAD: eeeca000 This is likely caused by the change in the 'estab' parameter passed to tcp_parse_options() when invoked by the functions in net/ipv4/tcp_minisocks.c. But even if that is fixed, the ->conn_request() changes made in this patch series are fundamentally wrong. They try to use the listening socket's 'dst' to probe the route settings. The listening socket doesn't even have a route, and you can't get the right route (the child request one) until much later after we set up all of the state, and it must be done by hand. This stuff really isn't ready, so the best thing to do is a full revert. This reverts the following commits: f55017a93f1a74d50244b1254b9a2bd7ac9bbf7d 022c3f7d82f0f1c68018696f2f027b87b9bb45c2 1aba721eba1d84a2defce45b950272cee1e6c72a cda42ebd67ee5fdf09d7057b5a4584d36fe8a335 345cda2fd695534be5a4494f1b59da9daed33663 dc343475ed062e13fc260acccaab91d7d80fd5b2 05eaade2782fb0c90d3034fd7a7d5a16266182bb 6a2a2d6bf8581216e08be15fcb563cfd6c430e1e Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 6 ++---- include/net/dst.h | 2 +- include/net/tcp.h | 3 +-- net/ipv4/syncookies.c | 27 +++++++++++++-------------- net/ipv4/tcp_input.c | 24 ++++++++---------------- net/ipv4/tcp_ipv4.c | 21 +++++++++------------ net/ipv4/tcp_minisocks.c | 10 +++++----- net/ipv4/tcp_output.c | 18 +++++------------- net/ipv6/syncookies.c | 28 +++++++++++++--------------- net/ipv6/tcp_ipv6.c | 3 +-- 10 files changed, 58 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 14fc906ed602..05330fc5b436 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -368,11 +368,9 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) #define RTAX_FEATURE_ECN 0x00000001 -#define RTAX_FEATURE_NO_SACK 0x00000002 -#define RTAX_FEATURE_NO_TSTAMP 0x00000004 +#define RTAX_FEATURE_SACK 0x00000002 +#define RTAX_FEATURE_TIMESTAMP 0x00000004 #define RTAX_FEATURE_ALLFRAG 0x00000008 -#define RTAX_FEATURE_NO_WSCALE 0x00000010 -#define RTAX_FEATURE_NO_DSACK 0x00000020 struct rta_session { __u8 proto; diff --git a/include/net/dst.h b/include/net/dst.h index 387cb3cfde7e..39c4a5963e12 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -113,7 +113,7 @@ dst_metric(const struct dst_entry *dst, int metric) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { - return (dst ? 
dst_metric(dst, RTAX_FEATURES) & feature : 0); + return dst_metric(dst, RTAX_FEATURES) & feature; } static inline u32 dst_mtu(const struct dst_entry *dst) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1b6f7d348cee..34f5cc24d903 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -408,8 +408,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, u8 **hvpp, - int estab, - struct dst_entry *dst); + int estab); extern u8 *tcp_parse_md5sig_option(struct tcphdr *th); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 26399ad2a289..66fd80ef2473 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -277,6 +277,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + + if (tcp_opt.saw_tstamp) + cookie_check_timestamp(&tcp_opt); + ret = NULL; req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) @@ -292,6 +299,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->ecn_ok = 0; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->rcv_wscale = tcp_opt.rcv_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -340,20 +353,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } } - /* check for timestamp cookie support */ - memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, &rt->u.dst); - - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); - - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->rcv_wscale = tcp_opt.rcv_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - /* Try to redo what tcp_v4_send_synack did. */ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12cab7d74dba..28e029632493 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3727,7 +3727,7 @@ old_ack: * the fast version below fails. 
*/ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - u8 **hvpp, int estab, struct dst_entry *dst) + u8 **hvpp, int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); @@ -3766,8 +3766,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && sysctl_tcp_window_scaling && - !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) { + !estab && sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { @@ -3783,8 +3782,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && sysctl_tcp_timestamps && - !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) { + (!estab && sysctl_tcp_timestamps))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -3792,8 +3790,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && sysctl_tcp_sack && - !dst_feature(dst, RTAX_FEATURE_NO_SACK)) { + !estab && sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); } @@ -3878,7 +3875,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, if (tcp_parse_aligned_timestamp(tp, th)) return 1; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); return 1; } @@ -4133,10 +4130,8 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack && - !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) 
{ int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4165,15 +4160,13 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack && - !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -5428,11 +5421,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, dst); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); if (th->ack) { /* rfc793: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 15e96030ce47..65b8ebfd078a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1262,20 +1262,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif - ireq = inet_rsk(req); - ireq->loc_addr = daddr; - ireq->rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(sk, skb); - - dst = inet_csk_route_req(sk, req); - if(!dst) - goto drop_and_free; - tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && @@ 
-1319,8 +1309,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(sk, skb); + if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; + goto drop_and_free; if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); @@ -1345,6 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 87accec8d097..f206ee5dda80 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -95,9 +95,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); int paws_reject = 0; + tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tmp_opt.tstamp_ok = 1; - tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -526,9 +526,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; - if ((th->doff > (sizeof(*th) >> 2)) && (req->ts_recent)) { - tmp_opt.tstamp_ok = 1; - tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL); + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; diff --git 
a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 93316a96d820..383ce237640f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -553,7 +553,6 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; - struct dst_entry *dst = __sk_dst_get(sk); unsigned remaining = MAX_TCP_OPTION_SPACE; u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? tcp_cookie_size_check(cvp->cookie_desired) : @@ -581,22 +580,18 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sysctl_tcp_timestamps && - !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) && - *md5 == NULL)) { + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } - if (likely(sysctl_tcp_window_scaling && - !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) { + if (likely(sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(sysctl_tcp_sack && - !dst_feature(dst, RTAX_FEATURE_NO_SACK))) { + if (likely(sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; @@ -2527,9 +2522,7 @@ static void tcp_connect_init(struct sock *sk) * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps && - (!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ? - TCPOLEN_TSTAMP_ALIGNED : 0)); + (sysctl_tcp_timestamps ? 
TCPOLEN_TSTAMP_ALIGNED : 0); #ifdef CONFIG_TCP_MD5SIG if (tp->af_specific->md5_lookup(sk, sk) != NULL) @@ -2555,8 +2548,7 @@ static void tcp_connect_init(struct sock *sk) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, - (sysctl_tcp_window_scaling && - !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)), + sysctl_tcp_window_scaling, &rcv_wscale); tp->rx_opt.rcv_wscale = rcv_wscale; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 5b9af508b8f2..7208a06576c6 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -185,6 +185,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + + if (tcp_opt.saw_tstamp) + cookie_check_timestamp(&tcp_opt); + ret = NULL; req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (!req) @@ -218,6 +225,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->expires = 0UL; req->retrans = 0; ireq->ecn_ok = 0; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->rcv_wscale = tcp_opt.rcv_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; @@ -253,21 +266,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - /* check for timestamp cookie support */ - memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, dst); - - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); - - req->ts_recent = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsval : 0; - - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->rcv_wscale = tcp_opt.rcv_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ee9cf62458d4..febfd595a40d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1169,7 +1169,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); __u32 isn = TCP_SKB_CB(skb)->when; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; @@ -1208,7 +1207,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && -- cgit v1.2.3 From 588f9ce6ca61ecb4663ee6ef2f75d2d96c73151e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: Be more aggressive at freeing non LRU caches shake_page handles more types of page caches than lru_drain_all() - per cpu page allocator pages - per CPU LRU Stops early when the page became free. Used in followon patches. 
Signed-off-by: Andi Kleen --- include/linux/mm.h | 1 + mm/memory-failure.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d65ae4ba0e0..68c84bb2ad3f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1335,6 +1335,7 @@ extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int ref); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; +extern void shake_page(struct page *p); extern atomic_long_t mce_bad_pages; #endif /* __KERNEL__ */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 50d4f8d7024a..38fcbb22eab9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -82,6 +82,28 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, return ret; } +/* + * When a unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + /* + * Could call shrink_slab here (which would also + * shrink other caches). Unfortunately that might + * also access the corrupted page, which could be fatal. + */ +} +EXPORT_SYMBOL_GPL(shake_page); + /* * Kill all processes that have a poisoned page mapped and then isolate * the page. -- cgit v1.2.3 From 82ba011b9041dd31c15e4f63797b08aa0a288e61 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: Turn ref argument into flags argument Now that "ref" is just a boolean turn it into a flags argument. First step is only a single flag that makes the code's intention more clear, but more may follow. 
Signed-off-by: Andi Kleen --- include/linux/mm.h | 5 ++++- mm/madvise.c | 2 +- mm/memory-failure.c | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 68c84bb2ad3f..135e19198cd3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1331,8 +1331,11 @@ extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, size_t size); extern void refund_locked_memory(struct mm_struct *mm, size_t size); +enum mf_flags { + MF_COUNT_INCREASED = 1 << 0, +}; extern void memory_failure(unsigned long pfn, int trapno); -extern int __memory_failure(unsigned long pfn, int trapno, int ref); +extern int __memory_failure(unsigned long pfn, int trapno, int flags); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); diff --git a/mm/madvise.c b/mm/madvise.c index 18970aec0d2f..6ca34f0cd4aa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -237,7 +237,7 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, 1); + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4253e14fa709..3338c443272c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -737,7 +737,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, ret != SWAP_SUCCESS, pfn); } -int __memory_failure(unsigned long pfn, int trapno, int ref) +int __memory_failure(unsigned long pfn, int trapno, int flags) { unsigned long lru_flag; struct page_state *ps; @@ -773,7 +773,8 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 
*/ - if (!ref && !get_page_unless_zero(compound_head(p))) { + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(compound_head(p))) { action_result(pfn, "free or high order kernel", IGNORED); return PageBuddy(compound_head(p)) ? 0 : -EBUSY; } -- cgit v1.2.3 From 847ce401df392b0704369fd3f75df614ac1414b4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: Add unpoisoning support The unpoisoning interface is useful for stress testing tools to reclaim poisoned pages (to prevent OOM). There is no hardware level unpoisoning, so this cannot be used for real memory errors, only for software injected errors. Note that it may leak pages silently - those who have been removed from LRU cache, but not isolated from page cache/swap cache at hwpoison time. Especially the stress test of dirty swap cache pages shall reboot system before exhausting memory. AK: Fix comments, add documentation, add printks, rename symbol Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 16 ++++++++-- include/linux/mm.h | 1 + include/linux/page-flags.h | 2 +- mm/hwpoison-inject.c | 36 +++++++++++++++++++---- mm/memory-failure.c | 68 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 3ffadf8da61f..f047e75acb23 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -98,10 +98,22 @@ madvise(MADV_POISON, ....) hwpoison-inject module through debugfs - /sys/debug/hwpoison/corrupt-pfn -Inject hwpoison fault at PFN echoed into this file +/sys/debug/hwpoison/ +corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file. + +unpoison-pfn + +Software-unpoison page at PFN echoed into this file. This +way a page can be reused again. +This only works for Linux injected failures, not for real +memory failures. 
+ +Note these injection interfaces are not stable and might change between +kernel versions Architecture specific MCE injector diff --git a/include/linux/mm.h b/include/linux/mm.h index 135e19198cd3..8cdb941fc7b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1336,6 +1336,7 @@ enum mf_flags { }; extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 49e907bd067f..f9df6308af95 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -275,7 +275,7 @@ PAGEFLAG_FALSE(Uncached) #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison) -TESTSETFLAG(HWPoison, hwpoison) +TESTSCFLAG(HWPoison, hwpoison) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..6e35e563bf50 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -4,7 +4,7 @@ #include #include -static struct dentry *hwpoison_dir, *corrupt_pfn; +static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { @@ -14,7 +14,16 @@ static int hwpoison_inject(void *data, u64 val) return __memory_failure(val, 18, 0); } +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { @@ -24,16 +33,31 @@ static void pfn_inject_exit(void) static int pfn_inject_init(void) { + struct dentry *dentry; + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); if (hwpoison_dir == NULL) return -ENOMEM; 
- corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. + */ + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, NULL, &hwpoison_fops); - if (corrupt_pfn == NULL) { - pfn_inject_exit(); - return -ENOMEM; - } + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; } module_init(pfn_inject_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5055b940df5f..ed6e91c87a54 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -838,6 +838,16 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * and in many cases impossible, so we just avoid it here. */ lock_page_nosync(p); + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + action_result(pfn, "unpoisoned", IGNORED); + res = 0; + goto out; + } + wait_on_page_writeback(p); /* @@ -893,3 +903,61 @@ void memory_failure(unsigned long pfn, int trapno) { __memory_failure(pfn, trapno, 0); } + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. 
+ */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + if (!get_page_unless_zero(page)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page_nosync(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(p)) { + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_dec(&mce_bad_pages); + freeit = 1; + } + unlock_page(page); + + put_page(page); + if (freeit) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); -- cgit v1.2.3 From 1a9b5b7fe0c5dad8a635288882d36785dea742f9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: mm: export stable page flags Rename get_uflags() to stable_page_flags() and make it a global function for use in the hwpoison page flags filter, which need to compare user page flags with the value provided by user space. Also move KPF_* to kernel-page-flags.h for use by user space tools. 
Acked-by: Matt Mackall Signed-off-by: Andi Kleen CC: Nick Piggin CC: Christoph Lameter Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- fs/proc/page.c | 45 +++----------------------------------- include/linux/kernel-page-flags.h | 46 +++++++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 2 ++ 3 files changed, 51 insertions(+), 42 deletions(-) create mode 100644 include/linux/kernel-page-flags.h (limited to 'include/linux') diff --git a/fs/proc/page.c b/fs/proc/page.c index 5033ce0d254b..180cf5a0bd67 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "internal.h" @@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = { * physical page flags. */ -/* These macros are used to decouple internal flags from exported ones */ - -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -/* 11-20: new additions in 2.6.31 */ -#define KPF_MMAP 11 -#define KPF_ANON 12 -#define KPF_SWAPCACHE 13 -#define KPF_SWAPBACKED 14 -#define KPF_COMPOUND_HEAD 15 -#define KPF_COMPOUND_TAIL 16 -#define KPF_HUGE 17 -#define KPF_UNEVICTABLE 18 -#define KPF_HWPOISON 19 -#define KPF_NOPAGE 20 - -#define KPF_KSM 21 - -/* kernel hacking assistances - * WARNING: subject to change, never rely on them! 
- */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 - static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -static u64 get_uflags(struct page *page) +u64 stable_page_flags(struct page *page) { u64 k; u64 u; @@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else ppage = NULL; - if (put_user(get_uflags(ppage), out)) { + if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; break; } diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h new file mode 100644 index 000000000000..bd92a89f4b0a --- /dev/null +++ b/include/linux/kernel-page-flags.h @@ -0,0 +1,46 @@ +#ifndef LINUX_KERNEL_PAGE_FLAGS_H +#define LINUX_KERNEL_PAGE_FLAGS_H + +/* + * Stable page flag bits exported to user space + */ + +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* 11-20: new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 +#define KPF_NOPAGE 20 + +#define KPF_KSM 21 + +/* kernel hacking assistances + * WARNING: subject to change, never rely on them! 
+ */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +#endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f9df6308af95..feee2ba8d06a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -282,6 +282,8 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +u64 stable_page_flags(struct page *page); + static inline int PageUptodate(struct page *page) { int ret = test_bit(PG_uptodate, &(page)->flags); -- cgit v1.2.3 From e42d9d5d47961fb5db0be65b56dd52fe7b2421f1 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: memcg: rename and export try_get_mem_cgroup_from_page() So that the hwpoison injector can get mem_cgroup for arbitrary page and thus know whether it is owned by some mem_cgroup task(s). [AK: Merged with latest git tree] CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 11 ++++------- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bf9213b2db8f..fc9bae82ac42 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -68,6 +68,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); +extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); static inline @@ -189,6 +190,11 @@ mem_cgroup_move_lists(struct page *page, enum lru_list from, 
enum lru_list to) { } +static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) { return 1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0c2066495e3..b5ac61ce7346 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1379,25 +1379,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return container_of(css, struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); - if (!PageSwapCache(page)) - return NULL; - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) mem = NULL; - } else { + } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); @@ -1743,7 +1740,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, */ if (!PageSwapCache(page)) goto charge_cur_mm; - mem = try_get_mem_cgroup_from_swapcache(page); + mem = try_get_mem_cgroup_from_page(page); if (!mem) goto charge_cur_mm; *ptr = mem; -- cgit v1.2.3 From d324236b3333e87c8825b35f2104184734020d35 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: memcg: add accessor to mem_cgroup.css So that an outside user can free the reference count grabbed by try_get_mem_cgroup_from_page(). 
CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fc9bae82ac42..2c30a1116d84 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -81,6 +81,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) return cgroup == mem; } +extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); + extern int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, @@ -206,6 +208,11 @@ static inline int task_in_mem_cgroup(struct task_struct *task, return 1; } +static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return NULL; +} + static inline int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b5ac61ce7346..9eee80d6d490 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -282,6 +282,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) return &mem->info.nodeinfo[nid]->zoneinfo[zid]; } +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return &mem->css; +} + static struct mem_cgroup_per_zone * page_cgroup_zoneinfo(struct page_cgroup *pc) { -- cgit v1.2.3 From facb6011f3993947283fa15d039dacb4ad140230 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Add soft page offline support This is a simpler, gentler variant of memory_failure() for soft page offlining controlled from user space. It doesn't kill anything, just tries to invalidate and if that doesn't work migrate the page away. 
This is useful for predictive failure analysis, where a page has a high rate of corrected errors, but hasn't gone bad yet. Instead it can be offlined early and avoided. The offlining is controlled from sysfs, including a new generic entry point for hard page offlining for symmetry too. We use the page isolate facility to prevent re-allocation race. Normally this is only used by memory hotplug. To avoid races with memory allocation I am using lock_system_sleep(). This avoids the situation where memory hotplug is about to isolate a page range and then hwpoison undoes that work. This is a big hammer currently, but the simplest solution currently. When the page is not free or LRU we try to free pages from slab and other caches. The slab freeing is currently quite dumb and does not try to focus on the specific slab cache which might own the page. This could be potentially improved later. Thanks to Fengguang Wu and Haicheng Li for some fixes. [Added fix from Andrew Morton to adapt to new migrate_pages prototype] Signed-off-by: Andi Kleen --- .../ABI/testing/sysfs-memory-page-offline | 44 +++++ drivers/base/memory.c | 61 +++++++ include/linux/mm.h | 3 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 194 ++++++++++++++++++++- 5 files changed, 297 insertions(+), 7 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-memory-page-offline (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline new file mode 100644 index 000000000000..e14703f12fdf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-memory-page-offline @@ -0,0 +1,44 @@ +What: /sys/devices/system/memory/soft_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Soft-offline the memory page containing the physical address + written into this file. Input is a hex number specifying the + physical address of the page. 
The kernel will then attempt + to soft-offline it, by moving the contents elsewhere or + dropping it if possible. The page will then be placed + on the bad page list and never be reused. + + The offlining is done in kernel specific granularity. + Normally it's the base page size of the kernel, but + this might change. + + The page must still be accessible, not poisoned. The + kernel will never kill anything for this, but rather + fail the offline. Return value is the size of the + number, or an error when the offlining failed. Reading + the file is not allowed. + +What: /sys/devices/system/memory/hard_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Hard-offline the memory page containing the physical + address written into this file. Input is a hex number + specifying the physical address of the page. The + kernel will then attempt to hard-offline the page, by + trying to drop the page or killing any owner or + triggering IO errors if needed. Note this may kill + any processes owning the page. The kernel will avoid + accessing this page, assuming it's poisoned by the + hardware. + + The offlining is done in kernel specific granularity. + Normally it's the base page size of the kernel, but + this might change. + + Return value is the size of the number, or an error when + the offlining failed. + Reading the file is not allowed.
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 989429cfed88..c4c8f2e1dd15 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -341,6 +341,64 @@ static inline int memory_probe_init(void) } #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Support for offlining pages of memory + */ + +/* Soft offline a page */ +static ssize_t +store_soft_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + if (!pfn_valid(pfn)) + return -ENXIO; + ret = soft_offline_page(pfn_to_page(pfn), 0); + return ret == 0 ? count : ret; +} + +/* Forcibly offline a page, including killing processes. */ +static ssize_t +store_hard_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + ret = __memory_failure(pfn, 0, 0); + return ret ? ret : count; +} + +static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); +static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); + +static __init int memory_fail_init(void) +{ + int err; + + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_soft_offline_page.attr); + if (!err) + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_hard_offline_page.attr); + return err; +} +#else +static inline int memory_fail_init(void) +{ + return 0; +} +#endif + /* * Note that phys_device is optional. 
It is here to allow for * differentiation between which *physical* devices each @@ -471,6 +529,9 @@ int __init memory_dev_init(void) } err = memory_probe_init(); + if (!ret) + ret = err; + err = memory_fail_init(); if (!ret) ret = err; err = block_size_init(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cdb941fc7b5..849b4a61bd8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1339,8 +1339,9 @@ extern int __memory_failure(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p); +extern void shake_page(struct page *p, int access); extern atomic_long_t mce_bad_pages; +extern int soft_offline_page(struct page *page, int flags); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c597f46ac18a..a77fe3f9e211 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b5c3b6bd511f..bcce28755832 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -41,6 +41,9 @@ #include #include #include +#include +#include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, * When a unknown page type is encountered drain as many buffers as possible * in the hope to turn the page into a LRU or free page, which we can handle. 
*/ -void shake_page(struct page *p) +void shake_page(struct page *p, int access) { if (!PageSlab(p)) { lru_add_drain_all(); @@ -211,11 +214,19 @@ void shake_page(struct page *p) if (PageLRU(p) || is_free_buddy_page(p)) return; } + /* - * Could call shrink_slab here (which would also - * shrink other caches). Unfortunately that might - * also access the corrupted page, which could be fatal. + * Only call shrink_slab here (which would also + * shrink other caches) if access is not potentially fatal. */ + if (access) { + int nr; + do { + nr = shrink_slab(1000, GFP_KERNEL, 1000); + if (page_count(p) == 0) + break; + } while (nr > 10); + } } EXPORT_SYMBOL_GPL(shake_page); @@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); if (!PageLRU(p)) { /* * shake_page could have turned it free. @@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn) return 0; } EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + return alloc_pages(GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * The lock_system_sleep prevents a race with memory hotplug, + * because the isolation assumes there's only a single user. + * This is a big hammer, a better solution would be nicer. + */ + lock_system_sleep(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free.
+ */ + set_migratetype_isolate(p); + if (!get_page_unless_zero(compound_head(p))) { + if (is_free_buddy_page(p)) { + pr_debug("get_any_page: %#lx free buddy page\n", pfn); + /* Set hwpoison bit while page is still isolated */ + SetPageHWPoison(p); + ret = 0; + } else { + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + unset_migratetype_isolate(p); + unlock_system_sleep(); + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + /* + * Page cache page we can handle? + */ + if (!PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? 
+ */ + ret = get_any_page(page, pfn, 0); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + } + if (!PageLRU(page)) { + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + + lock_page(page); + wait_on_page_writeback(page); + + /* + * Synchronized using the page lock with memory_failure() + */ + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_debug("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + + /* + * Drop count because page migration doesn't like raised + * counts. The page could get re-allocated, but if it becomes + * LRU the isolation will just fail. + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + put_page(page); + if (ret == 1) { + ret = 0; + pr_debug("soft_offline: %#lx: invalidated\n", pfn); + goto done; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. 
+ */ + ret = isolate_lru_page(page); + if (!ret) { + LIST_HEAD(pagelist); + + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } + } else { + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + if (ret) + return ret; + +done: + atomic_long_add(1, &mce_bad_pages); + SetPageHWPoison(page); + /* keep elevated page count for bad page */ + return ret; +} -- cgit v1.2.3 From 9905a43b2d563e6f89e4c63c4278ada03f2ebb14 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 14 Dec 2009 00:58:57 +0100 Subject: backlight: Constify struct backlight_ops Signed-off-by: Emese Revfy Signed-off-by: Richard Purdie --- drivers/video/backlight/adp5520_bl.c | 2 +- drivers/video/backlight/adx_bl.c | 2 +- drivers/video/backlight/atmel-pwm-bl.c | 2 +- drivers/video/backlight/backlight.c | 2 +- drivers/video/backlight/corgi_lcd.c | 2 +- drivers/video/backlight/cr_bllcd.c | 2 +- drivers/video/backlight/da903x_bl.c | 2 +- drivers/video/backlight/generic_bl.c | 2 +- drivers/video/backlight/hp680_bl.c | 2 +- drivers/video/backlight/jornada720_bl.c | 2 +- drivers/video/backlight/kb3886_bl.c | 2 +- drivers/video/backlight/locomolcd.c | 2 +- drivers/video/backlight/mbp_nvidia_bl.c | 2 +- drivers/video/backlight/omap1_bl.c | 2 +- drivers/video/backlight/progear_bl.c | 2 +- drivers/video/backlight/pwm_bl.c | 2 +- drivers/video/backlight/tosa_bl.c | 2 +- drivers/video/backlight/wm831x_bl.c | 2 +- include/linux/backlight.h | 12 ++++++------ 19 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/adp5520_bl.c b/drivers/video/backlight/adp5520_bl.c index 4c10edecfb66..86d95c228adb 100644 --- a/drivers/video/backlight/adp5520_bl.c +++ b/drivers/video/backlight/adp5520_bl.c @@ -85,7 +85,7 
@@ static int adp5520_bl_get_brightness(struct backlight_device *bl) return error ? data->current_brightness : reg_val; } -static struct backlight_ops adp5520_bl_ops = { +static const struct backlight_ops adp5520_bl_ops = { .update_status = adp5520_bl_update_status, .get_brightness = adp5520_bl_get_brightness, }; diff --git a/drivers/video/backlight/adx_bl.c b/drivers/video/backlight/adx_bl.c index 2c3bdfc620b7..d769b0bab21a 100644 --- a/drivers/video/backlight/adx_bl.c +++ b/drivers/video/backlight/adx_bl.c @@ -61,7 +61,7 @@ static int adx_backlight_check_fb(struct fb_info *fb) return 1; } -static struct backlight_ops adx_backlight_ops = { +static const struct backlight_ops adx_backlight_ops = { .options = 0, .update_status = adx_backlight_update_status, .get_brightness = adx_backlight_get_brightness, diff --git a/drivers/video/backlight/atmel-pwm-bl.c b/drivers/video/backlight/atmel-pwm-bl.c index 2cf7ba52f67c..f625ffc69ad3 100644 --- a/drivers/video/backlight/atmel-pwm-bl.c +++ b/drivers/video/backlight/atmel-pwm-bl.c @@ -113,7 +113,7 @@ static int atmel_pwm_bl_init_pwm(struct atmel_pwm_bl *pwmbl) return pwm_channel_enable(&pwmbl->pwmc); } -static struct backlight_ops atmel_pwm_bl_ops = { +static const struct backlight_ops atmel_pwm_bl_ops = { .get_brightness = atmel_pwm_bl_get_intensity, .update_status = atmel_pwm_bl_set_intensity, }; diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 6615ac7fa60a..18829cf68b1b 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -269,7 +269,7 @@ EXPORT_SYMBOL(backlight_force_update); * ERR_PTR() or a pointer to the newly allocated device. 
*/ struct backlight_device *backlight_device_register(const char *name, - struct device *parent, void *devdata, struct backlight_ops *ops) + struct device *parent, void *devdata, const struct backlight_ops *ops) { struct backlight_device *new_bd; int rc; diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c index 96774949cd30..b4bcf8043797 100644 --- a/drivers/video/backlight/corgi_lcd.c +++ b/drivers/video/backlight/corgi_lcd.c @@ -451,7 +451,7 @@ void corgi_lcd_limit_intensity(int limit) } EXPORT_SYMBOL(corgi_lcd_limit_intensity); -static struct backlight_ops corgi_bl_ops = { +static const struct backlight_ops corgi_bl_ops = { .get_brightness = corgi_bl_get_intensity, .update_status = corgi_bl_update_status, }; diff --git a/drivers/video/backlight/cr_bllcd.c b/drivers/video/backlight/cr_bllcd.c index b9fe62b475c6..2914bf104adf 100644 --- a/drivers/video/backlight/cr_bllcd.c +++ b/drivers/video/backlight/cr_bllcd.c @@ -108,7 +108,7 @@ static int cr_backlight_get_intensity(struct backlight_device *bd) return intensity; } -static struct backlight_ops cr_backlight_ops = { +static const struct backlight_ops cr_backlight_ops = { .get_brightness = cr_backlight_get_intensity, .update_status = cr_backlight_set_intensity, }; diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c index f2d76dae1eb3..74cdc640173d 100644 --- a/drivers/video/backlight/da903x_bl.c +++ b/drivers/video/backlight/da903x_bl.c @@ -95,7 +95,7 @@ static int da903x_backlight_get_brightness(struct backlight_device *bl) return data->current_brightness; } -static struct backlight_ops da903x_backlight_ops = { +static const struct backlight_ops da903x_backlight_ops = { .update_status = da903x_backlight_update_status, .get_brightness = da903x_backlight_get_brightness, }; diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c index 6d27f62fdcd0..e6d348e63596 100644 --- a/drivers/video/backlight/generic_bl.c +++ 
b/drivers/video/backlight/generic_bl.c @@ -70,7 +70,7 @@ void corgibl_limit_intensity(int limit) } EXPORT_SYMBOL(corgibl_limit_intensity); -static struct backlight_ops genericbl_ops = { +static const struct backlight_ops genericbl_ops = { .options = BL_CORE_SUSPENDRESUME, .get_brightness = genericbl_get_intensity, .update_status = genericbl_send_intensity, diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c index 7fb4eefff80d..f7cc528d5be7 100644 --- a/drivers/video/backlight/hp680_bl.c +++ b/drivers/video/backlight/hp680_bl.c @@ -98,7 +98,7 @@ static int hp680bl_get_intensity(struct backlight_device *bd) return current_intensity; } -static struct backlight_ops hp680bl_ops = { +static const struct backlight_ops hp680bl_ops = { .get_brightness = hp680bl_get_intensity, .update_status = hp680bl_set_intensity, }; diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c index 7aed2565c1bd..db9071fc5665 100644 --- a/drivers/video/backlight/jornada720_bl.c +++ b/drivers/video/backlight/jornada720_bl.c @@ -93,7 +93,7 @@ out: return ret; } -static struct backlight_ops jornada_bl_ops = { +static const struct backlight_ops jornada_bl_ops = { .get_brightness = jornada_bl_get_brightness, .update_status = jornada_bl_update_status, .options = BL_CORE_SUSPENDRESUME, diff --git a/drivers/video/backlight/kb3886_bl.c b/drivers/video/backlight/kb3886_bl.c index a38fda1742dd..939e7b830cf3 100644 --- a/drivers/video/backlight/kb3886_bl.c +++ b/drivers/video/backlight/kb3886_bl.c @@ -134,7 +134,7 @@ static int kb3886bl_get_intensity(struct backlight_device *bd) return kb3886bl_intensity; } -static struct backlight_ops kb3886bl_ops = { +static const struct backlight_ops kb3886bl_ops = { .get_brightness = kb3886bl_get_intensity, .update_status = kb3886bl_send_intensity, }; diff --git a/drivers/video/backlight/locomolcd.c b/drivers/video/backlight/locomolcd.c index 6b488b8a7eee..00a9591b0003 100644 --- 
a/drivers/video/backlight/locomolcd.c +++ b/drivers/video/backlight/locomolcd.c @@ -141,7 +141,7 @@ static int locomolcd_get_intensity(struct backlight_device *bd) return current_intensity; } -static struct backlight_ops locomobl_data = { +static const struct backlight_ops locomobl_data = { .get_brightness = locomolcd_get_intensity, .update_status = locomolcd_set_intensity, }; diff --git a/drivers/video/backlight/mbp_nvidia_bl.c b/drivers/video/backlight/mbp_nvidia_bl.c index 9edb8d7c295f..581246894733 100644 --- a/drivers/video/backlight/mbp_nvidia_bl.c +++ b/drivers/video/backlight/mbp_nvidia_bl.c @@ -33,7 +33,7 @@ struct dmi_match_data { unsigned long iostart; unsigned long iolen; /* Backlight operations structure. */ - struct backlight_ops backlight_ops; + const struct backlight_ops backlight_ops; }; /* Module parameters. */ diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c index 8693e5fcd2eb..409ca9643528 100644 --- a/drivers/video/backlight/omap1_bl.c +++ b/drivers/video/backlight/omap1_bl.c @@ -125,7 +125,7 @@ static int omapbl_get_intensity(struct backlight_device *dev) return bl->current_intensity; } -static struct backlight_ops omapbl_ops = { +static const struct backlight_ops omapbl_ops = { .get_brightness = omapbl_get_intensity, .update_status = omapbl_update_status, }; diff --git a/drivers/video/backlight/progear_bl.c b/drivers/video/backlight/progear_bl.c index 9edaf24fd82d..075786e05034 100644 --- a/drivers/video/backlight/progear_bl.c +++ b/drivers/video/backlight/progear_bl.c @@ -54,7 +54,7 @@ static int progearbl_get_intensity(struct backlight_device *bd) return intensity - HW_LEVEL_MIN; } -static struct backlight_ops progearbl_ops = { +static const struct backlight_ops progearbl_ops = { .get_brightness = progearbl_get_intensity, .update_status = progearbl_set_intensity, }; diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 887166267443..df9e0b32cf39 100644 --- 
a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -56,7 +56,7 @@ static int pwm_backlight_get_brightness(struct backlight_device *bl) return bl->props.brightness; } -static struct backlight_ops pwm_backlight_ops = { +static const struct backlight_ops pwm_backlight_ops = { .update_status = pwm_backlight_update_status, .get_brightness = pwm_backlight_get_brightness, }; diff --git a/drivers/video/backlight/tosa_bl.c b/drivers/video/backlight/tosa_bl.c index 43edbada12d1..e14ce4d469f5 100644 --- a/drivers/video/backlight/tosa_bl.c +++ b/drivers/video/backlight/tosa_bl.c @@ -72,7 +72,7 @@ static int tosa_bl_get_brightness(struct backlight_device *dev) return props->brightness; } -static struct backlight_ops bl_ops = { +static const struct backlight_ops bl_ops = { .get_brightness = tosa_bl_get_brightness, .update_status = tosa_bl_update_status, }; diff --git a/drivers/video/backlight/wm831x_bl.c b/drivers/video/backlight/wm831x_bl.c index 467bdb7efb23..e32add37a203 100644 --- a/drivers/video/backlight/wm831x_bl.c +++ b/drivers/video/backlight/wm831x_bl.c @@ -112,7 +112,7 @@ static int wm831x_backlight_get_brightness(struct backlight_device *bl) return data->current_brightness; } -static struct backlight_ops wm831x_backlight_ops = { +static const struct backlight_ops wm831x_backlight_ops = { .options = BL_CORE_SUSPENDRESUME, .update_status = wm831x_backlight_update_status, .get_brightness = wm831x_backlight_get_brightness, diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 0f5f57858a23..8c4f884db6b4 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -36,18 +36,18 @@ struct backlight_device; struct fb_info; struct backlight_ops { - unsigned int options; + const unsigned int options; #define BL_CORE_SUSPENDRESUME (1 << 0) /* Notify the backlight driver some property has changed */ - int (*update_status)(struct backlight_device *); + int (* const update_status)(struct backlight_device *); /* Return the 
current backlight brightness (accounting for power, fb_blank etc.) */ - int (*get_brightness)(struct backlight_device *); + int (* const get_brightness)(struct backlight_device *); /* Check if given framebuffer device is the one bound to this backlight; return 0 if not, !=0 if it is. If NULL, backlight always matches the fb. */ - int (*check_fb)(struct fb_info *); + int (* const check_fb)(struct fb_info *); }; /* This structure defines all the properties of a backlight */ @@ -86,7 +86,7 @@ struct backlight_device { registered this device has been unloaded, and if class_get_devdata() points to something in the body of that driver, it is also invalid. */ struct mutex ops_lock; - struct backlight_ops *ops; + const struct backlight_ops *ops; /* The framebuffer notifier block */ struct notifier_block fb_notif; @@ -103,7 +103,7 @@ static inline void backlight_update_status(struct backlight_device *bd) } extern struct backlight_device *backlight_device_register(const char *name, - struct device *dev, void *devdata, struct backlight_ops *ops); + struct device *dev, void *devdata, const struct backlight_ops *ops); extern void backlight_device_unregister(struct backlight_device *bd); extern void backlight_force_update(struct backlight_device *bd, enum backlight_update_reason reason); -- cgit v1.2.3 From 3d1e463158febf6e047897597722f768b15350cd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Aug 2009 23:56:29 +0400 Subject: get rid of init_file() Signed-off-by: Al Viro --- fs/file_table.c | 30 ++---------------------------- include/linux/file.h | 3 --- 2 files changed, 2 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index f906ac8c9a9f..602a9ee3023a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -171,32 +171,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, if (!file) return NULL; - init_file(file, mnt, dentry, mode, fop); - return file; -} -EXPORT_SYMBOL(alloc_file); - -/** - * 
init_file - initialize a 'struct file' - * @file: the already allocated 'struct file' to initialized - * @mnt: the vfsmount on which the file resides - * @dentry: the dentry representing this file - * @mode: the mode the file is opened with - * @fop: the 'struct file_operations' for this file - * - * Use this instead of setting the members directly. Doing so - * avoids making mistakes like forgetting the mntget() or - * forgetting to take a write on the mnt. - * - * Note: This is a crappy interface. It is here to make - * merging with the existing users of get_empty_filp() - * who have complex failure logic easier. All users - * of this should be moving to alloc_file(). - */ -int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) -{ - int error = 0; file->f_path.dentry = dentry; file->f_path.mnt = mntget(mnt); file->f_mapping = dentry->d_inode->i_mapping; @@ -210,13 +184,13 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, * that we can do debugging checks at __fput() */ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + int error = 0; file_take_write(file); error = mnt_clone_write(mnt); WARN_ON(error); } - return error; + return file; } -EXPORT_SYMBOL(init_file); void fput(struct file *file) { diff --git a/include/linux/file.h b/include/linux/file.h index 335a0a5c316e..6a8d3612eb2a 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -18,9 +18,6 @@ extern void drop_file_write_access(struct file *file); struct file_operations; struct vfsmount; struct dentry; -extern int init_file(struct file *, struct vfsmount *mnt, - struct dentry *dentry, fmode_t mode, - const struct file_operations *fop); extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry, fmode_t mode, const struct file_operations *fop); -- cgit v1.2.3 From 2c48b9c45579a9b5e3e74694eebf3d2451f3dbd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Aug 
2009 00:52:35 +0400 Subject: switch alloc_file() to passing struct path ... and have the caller grab both mnt and dentry; kill leak in infiniband, while we are at it. Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 15 ++++++++------- drivers/infiniband/core/uverbs_main.c | 9 +++++++-- fs/anon_inodes.c | 18 +++++++++--------- fs/file_table.c | 13 ++++++------- fs/hugetlbfs/inode.c | 15 ++++++++------- fs/notify/inotify/inotify_user.c | 8 ++++++-- fs/pipe.c | 17 +++++++++-------- include/linux/file.h | 5 +++-- ipc/shm.c | 10 +++++----- mm/shmem.c | 14 ++++++++------ net/socket.c | 17 +++++++++-------- 11 files changed, 78 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 599b233bef75..5246285a95fb 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2200,7 +2200,7 @@ pfm_alloc_file(pfm_context_t *ctx) { struct file *file; struct inode *inode; - struct dentry *dentry; + struct path path; char name[32]; struct qstr this; @@ -2225,18 +2225,19 @@ pfm_alloc_file(pfm_context_t *ctx) /* * allocate a new dcache entry */ - dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!dentry) { + path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!path.dentry) { iput(inode); return ERR_PTR(-ENOMEM); } + path.mnt = mntget(pfmfs_mnt); - dentry->d_op = &pfmfs_dentry_operations; - d_add(dentry, inode); + path.dentry->d_op = &pfmfs_dentry_operations; + d_add(path.dentry, inode); - file = alloc_file(pfmfs_mnt, dentry, FMODE_READ, &pfm_file_ops); + file = alloc_file(&path, FMODE_READ, &pfm_file_ops); if (!file) { - dput(dentry); + path_put(&path); return ERR_PTR(-ENFILE); } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index aec0fbdfe7f0..5f284ffd430e 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -492,6 +492,7 @@ struct file *ib_uverbs_alloc_event_file(struct 
ib_uverbs_file *uverbs_file, int is_async, int *fd) { struct ib_uverbs_event_file *ev_file; + struct path path; struct file *filp; int ret; @@ -519,8 +520,10 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, * system call on a uverbs file, which will already have a * module reference. */ - filp = alloc_file(uverbs_event_mnt, dget(uverbs_event_mnt->mnt_root), - FMODE_READ, fops_get(&uverbs_event_fops)); + path.mnt = uverbs_event_mnt; + path.dentry = uverbs_event_mnt->mnt_root; + path_get(&path); + filp = alloc_file(&path, FMODE_READ, fops_get(&uverbs_event_fops)); if (!filp) { ret = -ENFILE; goto err_fd; @@ -531,6 +534,8 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, return filp; err_fd: + fops_put(&uverbs_event_fops); + path_put(&path); put_unused_fd(*fd); err: diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 2ca7a7cafdbf..94f5110c4655 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -88,7 +88,7 @@ struct file *anon_inode_getfile(const char *name, void *priv, int flags) { struct qstr this; - struct dentry *dentry; + struct path path; struct file *file; int error; @@ -106,10 +106,11 @@ struct file *anon_inode_getfile(const char *name, this.name = name; this.len = strlen(name); this.hash = 0; - dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); - if (!dentry) + path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); + if (!path.dentry) goto err_module; + path.mnt = mntget(anon_inode_mnt); /* * We know the anon_inode inode count is always greater than zero, * so we can avoid doing an igrab() and we can use an open-coded @@ -117,14 +118,13 @@ struct file *anon_inode_getfile(const char *name, */ atomic_inc(&anon_inode_inode->i_count); - dentry->d_op = &anon_inodefs_dentry_operations; + path.dentry->d_op = &anon_inodefs_dentry_operations; /* Do not publish this dentry inside the global dentry hash table */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, anon_inode_inode); 
+ path.dentry->d_flags &= ~DCACHE_UNHASHED; + d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(anon_inode_mnt, dentry, - FMODE_READ | FMODE_WRITE, fops); + file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; @@ -137,7 +137,7 @@ struct file *anon_inode_getfile(const char *name, return file; err_dput: - dput(dentry); + path_put(&path); err_module: module_put(fops->owner); return ERR_PTR(error); diff --git a/fs/file_table.c b/fs/file_table.c index 602a9ee3023a..163cd28314e0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -162,8 +162,8 @@ fail: * If all the callers of init_file() are eliminated, its * code should be moved into this function. */ -struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) { struct file *file; @@ -171,9 +171,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, if (!file) return NULL; - file->f_path.dentry = dentry; - file->f_path.mnt = mntget(mnt); - file->f_mapping = dentry->d_inode->i_mapping; + file->f_path = *path; + file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; @@ -183,10 +182,10 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, * visible. 
We do this for consistency, and so * that we can do debugging checks at __fput() */ - if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { int error = 0; file_take_write(file); - error = mnt_clone_write(mnt); + error = mnt_clone_write(path->mnt); WARN_ON(error); } return file; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87a1258953b8..6bd41525cd71 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -922,7 +922,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, int error = -ENOMEM; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr quick_string; *user = NULL; @@ -944,10 +945,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; - dentry = d_alloc(root, &quick_string); - if (!dentry) + path.dentry = d_alloc(root, &quick_string); + if (!path.dentry) goto out_shm_unlock; + path.mnt = mntget(hugetlbfs_vfsmount); error = -ENOSPC; inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), current_fsgid(), S_IFREG | S_IRWXUGO, 0); @@ -960,13 +962,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, acctflag)) goto out_inode; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; error = -ENFILE; - file = alloc_file(hugetlbfs_vfsmount, dentry, - FMODE_WRITE | FMODE_READ, + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ @@ -977,7 +978,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, out_inode: iput(inode); out_dentry: - dput(dentry); + path_put(&path); out_shm_unlock: if (*user) { user_shm_unlock(size, *user); diff 
--git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 9e4f90042eaf..8271cf05c957 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -646,6 +646,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) struct fsnotify_group *group; struct user_struct *user; struct file *filp; + struct path path; int fd, ret; /* Check the IN_* constants for consistency. */ @@ -675,8 +676,10 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) atomic_inc(&user->inotify_devs); - filp = alloc_file(inotify_mnt, dget(inotify_mnt->mnt_root), - FMODE_READ, &inotify_fops); + path.mnt = inotify_mnt; + path.dentry = inotify_mnt->mnt_root; + path_get(&path); + filp = alloc_file(&path, FMODE_READ, &inotify_fops); if (!filp) goto Enfile; @@ -689,6 +692,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) Enfile: ret = -ENFILE; + path_put(&path); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); diff --git a/fs/pipe.c b/fs/pipe.c index ae17d026aaa3..81288bc2bcbb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -974,7 +974,7 @@ struct file *create_write_pipe(int flags) int err; struct inode *inode; struct file *f; - struct dentry *dentry; + struct path path; struct qstr name = { .name = "" }; err = -ENFILE; @@ -983,21 +983,22 @@ struct file *create_write_pipe(int flags) goto err; err = -ENOMEM; - dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); - if (!dentry) + path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); + if (!path.dentry) goto err_inode; + path.mnt = mntget(pipe_mnt); - dentry->d_op = &pipefs_dentry_operations; + path.dentry->d_op = &pipefs_dentry_operations; /* * We dont want to publish this dentry into global dentry hash table. 
* We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED * This permits a working /proc/$pid/fd/XXX on pipes */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, inode); + path.dentry->d_flags &= ~DCACHE_UNHASHED; + d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; f->f_mapping = inode->i_mapping; @@ -1009,7 +1010,7 @@ struct file *create_write_pipe(int flags) err_dentry: free_pipe_info(inode); - dput(dentry); + path_put(&path); return ERR_PTR(err); err_inode: diff --git a/include/linux/file.h b/include/linux/file.h index 6a8d3612eb2a..5555508fd517 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -18,8 +18,9 @@ extern void drop_file_write_access(struct file *file); struct file_operations; struct vfsmount; struct dentry; -extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop); +struct path; +extern struct file *alloc_file(struct path *, fmode_t mode, + const struct file_operations *fop); static inline void fput_light(struct file *file, int fput_needed) { diff --git a/ipc/shm.c b/ipc/shm.c index 11bec626c228..16e39230aa0d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -878,8 +878,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) if (err) goto out_unlock; - path.dentry = dget(shp->shm_file->f_path.dentry); - path.mnt = shp->shm_file->f_path.mnt; + path = shp->shm_file->f_path; + path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); shm_unlock(shp); @@ -889,8 +889,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) if (!sfd) goto out_put_dentry; - file = alloc_file(path.mnt, path.dentry, f_mode, - is_file_hugepages(shp->shm_file) ? + file = alloc_file(&path, f_mode, + is_file_hugepages(shp->shm_file) ? 
&shm_file_operations_huge : &shm_file_operations); if (!file) @@ -950,7 +950,7 @@ out_unlock: out_free: kfree(sfd); out_put_dentry: - dput(path.dentry); + path_put(&path); goto out_nattch; } diff --git a/mm/shmem.c b/mm/shmem.c index ef8f47473c5a..d2ec7f029ff4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2626,7 +2626,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags int error; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr this; if (IS_ERR(shm_mnt)) @@ -2643,16 +2644,17 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags this.len = strlen(name); this.hash = 0; /* will go */ root = shm_mnt->mnt_root; - dentry = d_alloc(root, &this); - if (!dentry) + path.dentry = d_alloc(root, &this); + if (!path.dentry) goto put_memory; + path.mnt = mntget(shm_mnt); error = -ENOSPC; inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto put_dentry; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ #ifndef CONFIG_MMU @@ -2662,7 +2664,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags #endif error = -ENFILE; - file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (!file) goto put_dentry; @@ -2671,7 +2673,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags return file; put_dentry: - dput(dentry); + path_put(&path); put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); diff --git a/net/socket.c b/net/socket.c index eaaba3510e81..dbfdfa96d29b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -358,7 +358,7 @@ static const struct dentry_operations sockfs_dentry_operations = { static int sock_alloc_file(struct socket *sock, struct file **f, int flags) { struct qstr name 
= { .name = "" }; - struct dentry *dentry; + struct path path; struct file *file; int fd; @@ -366,28 +366,29 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) if (unlikely(fd < 0)) return fd; - dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); - if (unlikely(!dentry)) { + path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); + if (unlikely(!path.dentry)) { put_unused_fd(fd); return -ENOMEM; } + path.mnt = mntget(sock_mnt); - dentry->d_op = &sockfs_dentry_operations; + path.dentry->d_op = &sockfs_dentry_operations; /* * We dont want to push this dentry into global dentry hash table. * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED * This permits a working /proc/$pid/fd/XXX on sockets */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, SOCK_INODE(sock)); + path.dentry->d_flags &= ~DCACHE_UNHASHED; + d_instantiate(path.dentry, SOCK_INODE(sock)); SOCK_INODE(sock)->i_fop = &socket_file_ops; - file = alloc_file(sock_mnt, dentry, FMODE_READ | FMODE_WRITE, + file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); if (unlikely(!file)) { /* drop dentry, keep inode */ atomic_inc(&path.dentry->d_inode->i_count); - dput(dentry); + path_put(&path); put_unused_fd(fd); return -ENFILE; } -- cgit v1.2.3 From e9496ff46a20a8592fdc7bdaaf41b45eb808d310 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Aug 2009 18:44:32 +0400 Subject: fix mismerge with Trond's stuff (create_mnt_ns() export is gone now) Signed-off-by: Al Viro --- fs/namespace.c | 3 +-- fs/nfs/super.c | 8 -------- include/linux/mnt_namespace.h | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 7d70d63ceb29..faab1273281e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2068,7 +2068,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to 
the new root filesystem mountpoint */ -struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +static struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) { struct mnt_namespace *new_ns; @@ -2080,7 +2080,6 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) } return new_ns; } -EXPORT_SYMBOL(create_mnt_ns); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907efc5508..d5b112bcf3de 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2648,21 +2648,13 @@ out_freepage: static int nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path, struct vfsmount *mnt_target) { - struct mnt_namespace *ns_private; struct nameidata nd; struct super_block *s; int ret; - ns_private = create_mnt_ns(root_mnt); - ret = PTR_ERR(ns_private); - if (IS_ERR(ns_private)) - goto out_mntput; - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, export_path, LOOKUP_FOLLOW, &nd); - put_mnt_ns(ns_private); - if (ret != 0) goto out_err; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index d74785c2393a..d9ebf1037dfa 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -23,7 +23,6 @@ struct proc_mounts { struct fs_struct; -extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt); extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); -- cgit v1.2.3 From e81e3f4dca6c54116a24aec217d2c15c6f58ada5 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 4 Dec 2009 15:47:36 -0500 Subject: fs: move get_empty_filp() deffinition to internal.h All users outside of fs/ of get_empty_filp() have been removed. This patch moves the definition from the include/ directory to internal.h so no new users crop up and removes the EXPORT_SYMBOL. 
I'd love to see open intents stop using it too, but that's a problem for another day and a smarter developer! Signed-off-by: Eric Paris Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/file_table.c | 2 ++ fs/internal.h | 1 + fs/namei.c | 2 ++ fs/open.c | 2 ++ include/linux/fs.h | 1 - 5 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 163cd28314e0..361d76be8295 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -24,6 +24,8 @@ #include +#include "internal.h" + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE diff --git a/fs/internal.h b/fs/internal.h index 515175b8b72e..f67cd141d9a8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -79,6 +79,7 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); +extern struct file *get_empty_filp(void); /* * super.c diff --git a/fs/namei.c b/fs/namei.c index 8c8b379b94a4..1fc038b117be 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,6 +35,8 @@ #include #include +#include "internal.h" + #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) /* [Feb-1997 T. 
Schoebel-Theuer] diff --git a/fs/open.c b/fs/open.c index b4b31d277f3a..d95651e8be9e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -31,6 +31,8 @@ #include #include +#include "internal.h" + int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { int retval = -ENODEV; diff --git a/include/linux/fs.h b/include/linux/fs.h index a057f48eb156..cdc23be4edde 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2189,7 +2189,6 @@ static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern struct file * get_empty_filp(void); extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 1429b3eca23818f87f9fa569a15d9816de81f698 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Dec 2009 06:38:01 -0500 Subject: Untangling ima mess, part 3: kill dead code in ima Kill the 'update' argument of ima_path_check(), kill dead code in ima. Current rules: ima counters are bumped at the same time when the file switches from put_filp() fodder to fput() one. Which happens exactly in two places - alloc_file() and __dentry_open(). Nothing else needs to do that at all. 
Signed-off-by: Al Viro --- fs/namei.c | 4 +-- fs/nfsd/vfs.c | 3 +-- include/linux/ima.h | 12 ++------- security/integrity/ima/ima_main.c | 52 +++------------------------------------ 4 files changed, 9 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index c530e5d32f12..a765e7a741f4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1686,7 +1686,7 @@ do_last: path_put(&nd.root); if (!IS_ERR(filp)) { error = ima_path_check(&filp->f_path, filp->f_mode & - (MAY_READ | MAY_WRITE | MAY_EXEC), 0); + (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) { fput(filp); filp = ERR_PTR(error); @@ -1747,7 +1747,7 @@ ok: filp = nameidata_to_filp(&nd, open_flag); if (!IS_ERR(filp)) { error = ima_path_check(&filp->f_path, filp->f_mode & - (MAY_READ | MAY_WRITE | MAY_EXEC), 0); + (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) { fput(filp); filp = ERR_PTR(error); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c9942b39654e..936f08400db6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2122,8 +2122,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ path.mnt = exp->ex_path.mnt; path.dentry = dentry; - err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), - IMA_COUNT_LEAVE); + err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); nfsd_out: return err? 
nfserrno(err) : 0; } diff --git a/include/linux/ima.h b/include/linux/ima.h index 0e3f2a4c25f6..99dc6d5cf7e5 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -13,18 +13,14 @@ #include struct linux_binprm; -#define IMA_COUNT_UPDATE 1 -#define IMA_COUNT_LEAVE 0 - #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); extern void ima_inode_free(struct inode *inode); -extern int ima_path_check(struct path *path, int mask, int update_counts); +extern int ima_path_check(struct path *path, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); extern void ima_counts_get(struct file *file); -extern void ima_counts_put(struct path *path, int mask); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -42,7 +38,7 @@ static inline void ima_inode_free(struct inode *inode) return; } -static inline int ima_path_check(struct path *path, int mask, int update_counts) +static inline int ima_path_check(struct path *path, int mask) { return 0; } @@ -62,9 +58,5 @@ static inline void ima_counts_get(struct file *file) return; } -static inline void ima_counts_put(struct path *path, int mask) -{ - return; -} #endif /* CONFIG_IMA_H */ #endif /* _LINUX_IMA_H */ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index e041233b4d2a..16dc57d247d0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -49,20 +49,13 @@ static void ima_inc_counts(struct ima_iint_cache *iint, fmode_t mode) iint->writecount++; } -/* - * Update the counts given open flags instead of fmode - */ -static void ima_inc_counts_flags(struct ima_iint_cache *iint, int flags) -{ - ima_inc_counts(iint, (__force fmode_t)((flags+1) & O_ACCMODE)); -} - /* * Decrement ima counts */ static void ima_dec_counts(struct ima_iint_cache *iint, struct inode *inode, - fmode_t mode) + struct file *file) { + mode_t 
mode = file->f_mode; BUG_ON(!mutex_is_locked(&iint->mutex)); iint->opencount--; @@ -92,12 +85,6 @@ static void ima_dec_counts(struct ima_iint_cache *iint, struct inode *inode, } } -static void ima_dec_counts_flags(struct ima_iint_cache *iint, - struct inode *inode, int flags) -{ - ima_dec_counts(iint, inode, (__force fmode_t)((flags+1) & O_ACCMODE)); -} - /** * ima_file_free - called on __fput() * @file: pointer to file structure being freed @@ -117,7 +104,7 @@ void ima_file_free(struct file *file) return; mutex_lock(&iint->mutex); - ima_dec_counts(iint, inode, file->f_mode); + ima_dec_counts(iint, inode, file); mutex_unlock(&iint->mutex); kref_put(&iint->refcount, iint_free); } @@ -183,7 +170,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, * Always return 0 and audit dentry_open failures. * (Return code will be based upon measurement appraisal.) */ -int ima_path_check(struct path *path, int mask, int update_counts) +int ima_path_check(struct path *path, int mask) { struct inode *inode = path->dentry->d_inode; struct ima_iint_cache *iint; @@ -197,8 +184,6 @@ int ima_path_check(struct path *path, int mask, int update_counts) return 0; mutex_lock(&iint->mutex); - if (update_counts) - ima_inc_counts_flags(iint, mask); rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); if (rc < 0) @@ -268,35 +253,6 @@ out: return rc; } -/* - * ima_counts_put - decrement file counts - * - * File counts are incremented in ima_path_check. On file open - * error, such as ETXTBSY, decrement the counts to prevent - * unnecessary imbalance messages. - */ -void ima_counts_put(struct path *path, int mask) -{ - struct inode *inode = path->dentry->d_inode; - struct ima_iint_cache *iint; - - /* The inode may already have been freed, freeing the iint - * with it. Verify the inode is not NULL before dereferencing - * it. 
- */ - if (!ima_initialized || !inode || !S_ISREG(inode->i_mode)) - return; - iint = ima_iint_find_get(inode); - if (!iint) - return; - - mutex_lock(&iint->mutex); - ima_dec_counts_flags(iint, inode, mask); - mutex_unlock(&iint->mutex); - - kref_put(&iint->refcount, iint_free); -} - /* * ima_counts_get - increment file counts * -- cgit v1.2.3 From 431547b3c4533b8c7fd150ab36980b9a3147797b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Nov 2009 09:52:56 +0000 Subject: sanitize xattr handler prototypes Add a flags argument to struct xattr_handler and pass it to all xattr handler methods. This allows using the same methods for multiple handlers, e.g. for the ACL methods which perform exactly the same action for the access and default ACLs, just using a different underlying attribute. With a little more groundwork it'll also allow sharing the methods for the regular user/trusted/secure handlers in extN, ocfs2 and jffs2 like it's already done for xfs in this patch. Also change the inode argument to the handlers to a dentry to allow using the handlers mechnism for filesystems that require it later, e.g. cifs. 
[with GFS2 bits updated by Steven Whitehouse ] Signed-off-by: Christoph Hellwig Reviewed-by: James Morris Acked-by: Joel Becker Signed-off-by: Al Viro --- fs/btrfs/acl.c | 47 ++++++------------------ fs/ext2/acl.c | 79 +++++++++++++--------------------------- fs/ext2/xattr.c | 11 ++++-- fs/ext2/xattr_security.c | 16 ++++---- fs/ext2/xattr_trusted.c | 16 ++++---- fs/ext2/xattr_user.c | 25 +++++++------ fs/ext3/acl.c | 74 ++++++++++++------------------------- fs/ext3/xattr.c | 31 +++++++++------- fs/ext3/xattr_security.c | 20 +++++----- fs/ext3/xattr_trusted.c | 18 ++++----- fs/ext3/xattr_user.c | 25 +++++++------ fs/ext4/acl.c | 74 ++++++++++++------------------------- fs/ext4/xattr.c | 31 +++++++++------- fs/ext4/xattr_security.c | 20 +++++----- fs/ext4/xattr_trusted.c | 20 +++++----- fs/ext4/xattr_user.c | 25 +++++++------ fs/gfs2/acl.c | 16 +++++--- fs/gfs2/inode.c | 3 +- fs/gfs2/xattr.c | 69 +++++++++++++---------------------- fs/gfs2/xattr.h | 7 ++-- fs/jffs2/acl.c | 65 +++++++++++---------------------- fs/jffs2/security.c | 18 +++++---- fs/jffs2/xattr.c | 6 ++- fs/jffs2/xattr_trusted.c | 18 +++++---- fs/jffs2/xattr_user.c | 18 +++++---- fs/ocfs2/acl.c | 87 +++++++++++++------------------------------- fs/ocfs2/xattr.c | 72 ++++++++++++++++++------------------ fs/reiserfs/xattr.c | 36 +++++++++--------- fs/reiserfs/xattr_acl.c | 69 ++++++++++------------------------- fs/reiserfs/xattr_security.c | 21 ++++++----- fs/reiserfs/xattr_trusted.c | 21 ++++++----- fs/reiserfs/xattr_user.c | 21 ++++++----- fs/xattr.c | 28 +++++++------- fs/xfs/linux-2.6/xfs_acl.c | 57 ++++++++++------------------- fs/xfs/linux-2.6/xfs_xattr.c | 71 ++++++++---------------------------- fs/xfs/xfs_acl.h | 3 +- include/linux/xattr.h | 13 ++++--- mm/shmem.c | 19 +++++----- mm/shmem_acl.c | 78 ++++++++++----------------------------- 39 files changed, 533 insertions(+), 815 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 361604244271..52cbe47022bf 
100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -73,13 +73,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_get_acl(struct inode *inode, int type, - void *value, size_t size) +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; int ret = 0; - acl = btrfs_get_acl(inode, type); + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); @@ -151,8 +151,8 @@ out: return ret; } -static int btrfs_xattr_set_acl(struct inode *inode, int type, - const void *value, size_t size) +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { int ret = 0; struct posix_acl *acl = NULL; @@ -167,38 +167,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type, } } - ret = btrfs_set_acl(inode, acl, type); + ret = btrfs_set_acl(dentry->d_inode, acl, type); posix_acl_release(acl); return ret; } - -static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - int btrfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; @@ -303,14 +278,16 @@ int btrfs_acl_chmod(struct inode *inode) struct xattr_handler btrfs_xattr_acl_default_handler = { 
.prefix = POSIX_ACL_XATTR_DEFAULT, - .get = btrfs_xattr_acl_default_get, - .set = btrfs_xattr_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = btrfs_xattr_acl_access_get, - .set = btrfs_xattr_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; #else /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a63d44256a70..a99e54318c3d 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode) * Extended attribut handlers */ static size_t -ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, } static size_t -ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext2_xattr_get_acl(struct 
dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext2_get_acl(inode, type); + acl = ext2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext2_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext2_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext2_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(dentry->d_inode)) return -EPERM; if (value) { @@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value, } else acl = NULL; - error = ext2_set_acl(inode, type, acl); + error = ext2_set_acl(dentry->d_inode, type, acl); release_and_out: posix_acl_release(acl); return error; } -static int -ext2_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int 
-ext2_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl_access, - .set = ext2_xattr_set_acl_access, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl_default, - .set = ext2_xattr_set_acl_default, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 7913531ec6d5..904f00642f84 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -249,8 +250,9 @@ cleanup: * used / required on success. 
*/ static int -ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -300,9 +302,10 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ext2_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) { error = -ERANGE; @@ -330,7 +333,7 @@ cleanup: ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext2_xattr_list(dentry->d_inode, buffer, size); + return ext2_xattr_list(dentry, buffer, size); } /* diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 70c0dbdcdcb7..c8155845ac05 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -11,8 +11,8 @@ #include "xattr.h" static size_t -ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -26,22 +26,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } static int -ext2_xattr_security_set(struct inode *inode, const char *name, - 
const void *value, size_t size, int flags) +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index e8219f8eae9f..2a26d71f4771 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -13,8 +13,8 @@ #include "xattr.h" static size_t -ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } static int -ext2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 92495d28c62f..3f6caf3684b4 100644 --- a/fs/ext2/xattr_user.c +++ 
b/fs/ext2/xattr_user.c @@ -12,13 +12,13 @@ #include "xattr.h" static size_t -ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, buffer, size); } static int -ext2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, - value, size, flags); + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext2_xattr_user_handler = { diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index c9b0df376b5f..82ba34158661 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -366,12 +366,12 @@ out: * Extended attribute handlers */ static size_t -ext3_xattr_list_acl_access(struct 
inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext3_get_acl(inode, type); + acl = ext3_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext3_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext3_xattr_get_acl_default(struct 
inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext3_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -468,34 +456,18 @@ release_and_out: return error; } -static int -ext3_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext3_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl_access, - .set = ext3_xattr_set_acl_access, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl_default, - .set = ext3_xattr_set_acl_default, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 387d92d00b97..66895ccf76c7 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *, struct mb_cache_entry 
**); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); -static int ext3_xattr_list(struct inode *inode, char *buffer, +static int ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext3_xattr_cache; @@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index) ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext3_xattr_list(dentry->d_inode, buffer, size); + return ext3_xattr_list(dentry, buffer, size); } static int @@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, ext3_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, } static int -ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -392,8 +394,9 @@ cleanup: } static int -ext3_xattr_ibody_list(struct inode *inode, char 
*buffer, size_t buffer_size) +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext3_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext3_xattr_list_entries(inode, IFIRST(header), + error = ext3_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -430,12 +433,12 @@ cleanup: * used / required on success. */ static int -ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(inode)->xattr_sem); - i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext3_xattr_block_list(inode, buffer, buffer_size); + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(inode)->xattr_sem); + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 37b81097bdf2..474348788dd9 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const 
size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext3_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index c7c41a410c4b..e5562845ed96 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, 
EXT3_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext3_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 430fe63b31b3..3bcfe9ee0a68 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); } static int -ext3_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) 
+ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext3_xattr_user_handler = { diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0df88b2a69b0..8a2a29d35a6f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -364,12 +364,12 @@ out: * Extended attribute handlers */ static size_t -ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext4_xattr_get_acl(struct dentry *dentry, const char *name, void 
*buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext4_get_acl(inode, type); + acl = ext4_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext4_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext4_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext4_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -466,34 +454,18 @@ release_and_out: return error; } -static int -ext4_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext4_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = 
POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl_access, - .set = ext4_xattr_set_acl_access, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl_default, - .set = ext4_xattr_set_acl_default, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 910bf9a59cb3..83218bebbc7c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct inode *inode, char *buffer, +static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext4_xattr_cache; @@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index) ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext4_xattr_list(dentry->d_inode, buffer, size); + return ext4_xattr_list(dentry, buffer, size); } static int @@ -325,7 +325,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -335,9 +335,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, ext4_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -350,8 
+351,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, } static int -ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -376,7 +378,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext4_xattr_cache_insert(bh); - error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -385,8 +387,9 @@ cleanup: } static int -ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -404,7 +407,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext4_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext4_xattr_list_entries(inode, IFIRST(header), + error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -423,12 +426,12 @@ cleanup: * used / required on success. 
*/ static int -ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT4_I(inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -436,11 +439,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext4_xattr_block_list(inode, buffer, buffer_size); + b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT4_I(inode)->xattr_sem); + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index ca5f89fc6cae..983c253999a7 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; const size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext4_xattr_security_set(struct inode 
*inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index ac1a52cf2a37..15b50edc6587 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext4_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler 
ext4_xattr_trusted_handler = { diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d91aa61b42aa..c4ce05746ce1 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); } static int -ext4_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext4_xattr_user_handler = { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3eb1ea846173..87ee309d4c24 
100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -126,7 +126,7 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) error = posix_acl_to_xattr(acl, data, len); if (error < 0) goto out; - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0); + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); if (!error) set_cached_acl(inode, type, acl); out: @@ -232,9 +232,10 @@ static int gfs2_acl_type(const char *name) return -EINVAL; } -static int gfs2_xattr_system_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int xtype) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int type; int error; @@ -255,9 +256,11 @@ static int gfs2_xattr_system_get(struct inode *inode, const char *name, return error; } -static int gfs2_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int xtype) { + struct inode *inode = dentry->d_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl = NULL; int error = 0, type; @@ -319,7 +322,7 @@ static int gfs2_xattr_system_set(struct inode *inode, const char *name, } set_acl: - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0); + error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); if (!error) { if (acl) set_cached_acl(inode, type, acl); @@ -334,6 +337,7 @@ out: struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, + .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, .set = gfs2_xattr_system_set, }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 26ba2a4c4a2d..3ff32fa793da 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -801,7 +801,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, 
struct gfs2_inode *ip) return err; } - err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); + err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, + GFS2_EATYPE_SECURITY); kfree(value); kfree(name); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 912f5cbc4740..8a04108e0c22 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -567,18 +567,17 @@ out: /** * gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode - * @type: The type of extended attribute * @name: The name of the extended attribute * @buffer: The buffer to write the result into * @size: The size of the buffer + * @type: The type of extended attribute * * Returns: actual size of data on success, -errno on error */ - -int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_ea_location el; int error; @@ -1119,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) /** * gfs2_xattr_remove - Remove a GFS2 extended attribute - * @inode: The inode + * @ip: The inode * @type: The type of the extended attribute * @name: The name of the extended attribute * @@ -1130,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) * Returns: 0, or errno on failure */ -static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) { - struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; @@ -1156,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) } /** - * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute - * @inode: The inode - * @type: The type of the extended attribute + * 
__gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @inode: The inode * @name: The name of the extended attribute * @value: The value of the extended attribute (NULL for remove) * @size: The size of the @value argument * @flags: Create or Replace + * @type: The type of the extended attribute * * See gfs2_xattr_remove() for details of the removal of xattrs. * * Returns: 0 or errno on failure */ -int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags) +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) { - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; unsigned int namel = strlen(name); int error; @@ -1184,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return -ERANGE; if (value == NULL) - return gfs2_xattr_remove(inode, type, name); + return gfs2_xattr_remove(ip, type, name); if (ea_check_size(sdp, namel, size)) return -ERANGE; @@ -1224,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return error; } +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, char *data) { @@ -1529,40 +1534,18 @@ out_alloc: return error; } -static int gfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); -} - -static int gfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); -} - -static int gfs2_xattr_security_get(struct inode 
*inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); -} - -static int gfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); -} - static struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = gfs2_xattr_user_get, - .set = gfs2_xattr_user_set, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; static struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = gfs2_xattr_security_get, - .set = gfs2_xattr_security_set, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; struct xattr_handler *gfs2_xattr_handlers[] = { diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index 8d6ae5813c4d..d392f8358f2f 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -53,10 +53,9 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size); -extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags); +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); extern int gfs2_ea_dealloc(struct gfs2_inode *ip); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7edb62e97419..7cdc3196476a 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode) return rc; } -static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, + size_t list_size, const 
char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); @@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t return retlen; } -static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); @@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_ return retlen; } -static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size) +static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { struct posix_acl *acl; int rc; - acl = jffs2_get_acl(inode, type); + if (name[0] != '\0') + return -EINVAL; + + acl = jffs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) @@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_ return rc; } -static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size) +static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { struct posix_acl *acl; int rc; - if (!is_owner_or_cap(inode)) + if (name[0] != '\0') + return -EINVAL; + if (!is_owner_or_cap(dentry->d_inode)) return 
-EPERM; if (value) { @@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, } else { acl = NULL; } - rc = jffs2_set_acl(inode, type, acl); + rc = jffs2_set_acl(dentry->d_inode, type, acl); out: posix_acl_release(acl); return rc; } -static int jffs2_acl_access_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_access_getxattr, - .set = jffs2_acl_access_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_default_getxattr, - .set = jffs2_acl_default_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 02c39c64ecb3..eaccee058583 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir) } /* ---- XATTR Handler for "security.*" ----------------- */ -static int jffs2_security_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); + return 
do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size); } -static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); } -static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4b107881acd5..9e75c62c85d6 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) if (!xhandle) continue; if (buffer) { - rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len); + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xhandle->flags); } else { - rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len); + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xhandle->flags); } if (rc < 0) goto out; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 8ec5765ef348..3e5a5e356e05 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -16,24 +16,26 @@ #include #include "nodelist.h" -static int jffs2_trusted_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return 
do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size); } -static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8bbeab90ada1..8544af67dffe 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -16,24 +16,26 @@ #include #include "nodelist.h" -static int jffs2_user_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size); } -static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, 
JFFS2_XPREFIX_USER, + name, buffer, size, flags); } -static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index fbeaec762103..e3e47415d851 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -331,13 +331,14 @@ cleanup: return ret; } -static size_t ocfs2_xattr_list_acl_access(struct inode *inode, +static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -348,13 +349,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode, return size; } -static size_t ocfs2_xattr_list_acl_default(struct inode *inode, +static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -365,19 +367,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode, return size; } -static int ocfs2_xattr_get_acl(struct inode *inode, - int type, - void *buffer, - size_t size) +static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); struct posix_acl *acl; int ret; + if (strcmp(name, "") != 0) + return 
-EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; - acl = ocfs2_get_acl(inode, type); + acl = ocfs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -388,35 +390,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode, return ret; } -static int ocfs2_xattr_get_acl_access(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int ocfs2_xattr_get_acl_default(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int ocfs2_xattr_set_acl(struct inode *inode, - int type, - const void *value, - size_t size) +static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl; int ret = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; @@ -442,38 +425,18 @@ cleanup: return ret; } -static int ocfs2_xattr_set_acl_access(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int ocfs2_xattr_set_acl_default(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl_access, - .set = 
ocfs2_xattr_set_acl_access, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl_default, - .set = ocfs2_xattr_set_acl_default, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fe3419068df2..43c114831c0d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -205,8 +205,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, int offset, struct ocfs2_xattr_value_root **xv, struct buffer_head **bh); -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -6978,9 +6976,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, ret = ocfs2_init_security_get(inode, dir, &si); if (!ret) { - ret = ocfs2_xattr_security_set(inode, si.name, - si.value, si.value_len, - XATTR_CREATE); + ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + si.name, si.value, si.value_len, + XATTR_CREATE); if (ret) { mlog_errno(ret); goto leave; @@ -7008,9 +7006,9 @@ leave: /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7023,23 +7021,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, 
int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); } -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); } int ocfs2_init_security_get(struct inode *inode, @@ -7076,9 +7074,9 @@ struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7091,23 +7089,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, 
size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_trusted_handler = { @@ -7120,13 +7118,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return 0; @@ -7139,31 +7137,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return 
-EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_user_handler = { diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 58aa8e75f7f5..8c7033a8b67e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -48,6 +48,7 @@ #include #include #include +#include #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -726,15 +727,14 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -746,15 +746,14 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, flags, handler->flags); } /* @@ -764,21 +763,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct inode 
*inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); } struct listxattr_buf { size_t size; size_t pos; char *buf; - struct inode *inode; + struct dentry *dentry; }; static int listxattr_filler(void *buf, const char *name, int namelen, @@ -789,17 +787,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen, if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { struct xattr_handler *handler; - handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr, + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ return 0; if (b->buf) { - size = handler->list(b->inode, b->buf + b->pos, - b->size, name, namelen); + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); if (size > b->size) return -ERANGE; } else { - size = handler->list(b->inode, NULL, 0, name, namelen); + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); } b->pos += size; @@ -820,7 +820,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) int err = 0; loff_t pos = 0; struct listxattr_buf buf = { - .inode = dentry->d_inode, + .dentry = dentry, .buf = buffer, .size = buffer ? 
size : 0, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 35d6e672a279..cc32e6ada67b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -15,8 +15,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct posix_acl *acl); static int -xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +posix_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; @@ -60,15 +62,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) } static int -xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +posix_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return -EOPNOTSUPP; - acl = reiserfs_get_acl(inode, type); + acl = reiserfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -482,30 +485,12 @@ int reiserfs_acl_chmod(struct inode *inode) return error; } -static int -posix_acl_access_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -posix_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static size_t posix_acl_access_list(struct inode *inode, char *list, +static size_t posix_acl_access_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const 
size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -514,35 +499,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = posix_acl_access_get, - .set = posix_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_access_list, }; -static int -posix_acl_default_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -posix_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static size_t posix_acl_default_list(struct inode *inode, char *list, +static size_t posix_acl_default_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -551,7 +519,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .get = posix_acl_default_get, - .set = posix_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_default_list, }; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index a92c8792c0f6..d8b5bfcbdd30 
100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -8,36 +8,37 @@ #include static int -security_get(struct inode *inode, const char *name, void *buffer, size_t size) +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -security_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t namelen) +static size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) { const size_t len = namelen + 1; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a865042f75e2..5b08aaca3daf 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -8,36 +8,37 @@ #include static int -trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if 
(!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -trusted_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e3238dc4f3db..75d59c49b911 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -7,34 +7,35 @@ #include static int -user_get(struct inode *inode, const char *name, void *buffer, size_t size) +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -user_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +user_set(struct 
dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return 0; if (list && len <= list_size) { memcpy(list, name, name_len); diff --git a/fs/xattr.c b/fs/xattr.c index 6d4f6d3449fb..46f87e828b48 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -615,12 +615,11 @@ ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; - struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr; + struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { - for_each_xattr_handler(handlers, handler) - size += handler->list(inode, NULL, 0, NULL, 0); + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, 
NULL, 0, NULL, 0, + handler->flags); + } } else { char *buf = buffer; for_each_xattr_handler(handlers, handler) { - size = handler->list(inode, buf, buffer_size, NULL, 0); + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); if (size > buffer_size) return -ERANGE; buf += size; @@ -659,14 +660,13 @@ int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; if (size == 0) value = ""; /* empty EA, do not remove */ - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, 0, handler->flags); } /* @@ -677,12 +677,12 @@ int generic_removexattr(struct dentry *dentry, const char *name) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 69e598b6986f..2512125dfa7c 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -354,37 +354,14 @@ xfs_acl_chmod(struct inode *inode) return error; } -/* - * System xattr handlers. - * - * Currently Posix ACLs are the only system namespace extended attribute - * handlers supported by XFS, so we just implement the handlers here. - * If we ever support other system extended attributes this will need - * some refactoring. 
- */ - static int -xfs_decode_acl(const char *name) -{ - if (strcmp(name, "posix_acl_access") == 0) - return ACL_TYPE_ACCESS; - else if (strcmp(name, "posix_acl_default") == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int -xfs_xattr_system_get(struct inode *inode, const char *name, - void *value, size_t size) +xfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; - int type, error; - - type = xfs_decode_acl(name); - if (type < 0) - return type; + int error; - acl = xfs_get_acl(inode, type); + acl = xfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -397,15 +374,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name, } static int -xfs_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +xfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; - int error = 0, type; + int error = 0; - type = xfs_decode_acl(name); - if (type < 0) - return type; if (flags & XATTR_CREATE) return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) @@ -462,8 +437,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name, return error; } -struct xattr_handler xfs_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .get = xfs_xattr_system_get, - .set = xfs_xattr_system_set, +struct xattr_handler xfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; + +struct xattr_handler xfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, }; diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index 497c7fb75cc1..0b1878857fc3 100644 --- 
a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -30,10 +30,10 @@ static int -__xfs_xattr_get(struct inode *inode, const char *name, +xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); int error, asize = size; if (strcmp(name, "") == 0) @@ -52,10 +52,10 @@ __xfs_xattr_get(struct inode *inode, const char *name, } static int -__xfs_xattr_set(struct inode *inode, const char *name, const void *value, +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); if (strcmp(name, "") == 0) return -EINVAL; @@ -71,75 +71,34 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value, return -xfs_attr_set(ip, name, (void *)value, size, xflags); } -static int -xfs_xattr_user_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, 0); -} - -static int -xfs_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, 0); -} - static struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = xfs_xattr_user_get, - .set = xfs_xattr_user_set, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_trusted_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); -} - -static int -xfs_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); -} - static struct xattr_handler xfs_xattr_trusted_handler = { .prefix = 
XATTR_TRUSTED_PREFIX, - .get = xfs_xattr_trusted_get, - .set = xfs_xattr_trusted_set, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_secure_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); -} - -static int -xfs_xattr_secure_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); -} - static struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = xfs_xattr_secure_get, - .set = xfs_xattr_secure_set, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_system_handler, + &xfs_xattr_acl_access_handler, + &xfs_xattr_acl_default_handler, #endif NULL }; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 947b150df8ed..00fd357c3e46 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_system_handler; +extern struct xattr_handler xfs_xattr_acl_access_handler; +extern struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 5c84af8c5f6f..fb9b7e6e1e2d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -38,12 +38,13 @@ struct dentry; struct xattr_handler { char *prefix; - size_t (*list)(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len); - int (*get)(struct inode *inode, const char *name, void *buffer, - 
size_t size); - int (*set)(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags); + int flags; /* fs private flags passed back to the handlers */ + size_t (*list)(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags); + int (*get)(struct dentry *dentry, const char *name, void *buffer, + size_t size, int handler_flags); + int (*set)(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags); }; ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t); diff --git a/mm/shmem.c b/mm/shmem.c index adf8033afd52..3cd32c2ea0a0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2042,27 +2042,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { * filesystem level, though. */ -static size_t shmem_xattr_security_list(struct inode *inode, char *list, +static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, int handler_flags) { - return security_inode_listsecurity(inode, list, list_len); + return security_inode_listsecurity(dentry->d_inode, list, list_len); } -static int shmem_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int shmem_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return xattr_getsecurity(inode, name, buffer, size); + return xattr_getsecurity(dentry->d_inode, name, buffer, size); } -static int shmem_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); + 
return security_inode_setsecurity(dentry->d_inode, name, value, + size, flags); } static struct xattr_handler shmem_xattr_security_handler = { diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index df2c87fdae50..f8d5330ec0d7 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -63,86 +63,48 @@ struct generic_acl_operations shmem_acl_ops = { .setacl = shmem_set_acl, }; -/** - * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, - * shmem_xattr_acl_access_handler - plumbing code to implement the - * system.posix_acl_access xattr using the generic acl functions. - */ - static size_t -shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +shmem_xattr_list_acl(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, - list, list_size); + return generic_acl_list(dentry->d_inode, &shmem_acl_ops, + type, list, list_size); } static int -shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, - size_t size) +shmem_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") != 0) return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, - size); + return generic_acl_get(dentry->d_inode, &shmem_acl_ops, type, + buffer, size); } static int -shmem_set_acl_access(struct inode *inode, const char *name, const void *value, - size_t size, int flags) +shmem_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { if (strcmp(name, "") != 0) return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, - size); + return generic_acl_set(dentry->d_inode, &shmem_acl_ops, type, + value, size); } struct xattr_handler shmem_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .list = shmem_list_acl_access, - .get = 
shmem_get_acl_access, - .set = shmem_set_acl_access, + .flags = ACL_TYPE_ACCESS, + .list = shmem_xattr_list_acl, + .get = shmem_xattr_get_acl, + .set = shmem_xattr_set_acl, }; -/** - * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, - * shmem_xattr_acl_default_handler - plumbing code to implement the - * system.posix_acl_default xattr using the generic acl functions. - */ - -static size_t -shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, - list, list_size); -} - -static int -shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, - size); -} - -static int -shmem_set_acl_default(struct inode *inode, const char *name, const void *value, - size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, - size); -} - struct xattr_handler shmem_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .list = shmem_list_acl_default, - .get = shmem_get_acl_default, - .set = shmem_set_acl_default, + .flags = ACL_TYPE_DEFAULT, + .list = shmem_xattr_list_acl, + .get = shmem_xattr_get_acl, + .set = shmem_xattr_set_acl, }; /** -- cgit v1.2.3 From 1c7c474c31aea6d5cb2fb35f31d9e9e91ae466b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2009 16:44:44 +0100 Subject: make generic_acl slightly more generic Now that we cache the ACL pointers in the generic inode all the generic_acl cruft can go away and generic_acl.c can directly implement xattr handlers dealing with the full Posix ACL semantics for in-memory filesystems. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/generic_acl.c | 158 +++++++++++++++++++++++++------------------- include/linux/generic_acl.h | 41 +++--------- include/linux/shmem_fs.h | 16 ----- mm/Makefile | 1 - mm/shmem.c | 17 +++-- mm/shmem_acl.c | 133 ------------------------------------- 6 files changed, 109 insertions(+), 257 deletions(-) delete mode 100644 mm/shmem_acl.c (limited to 'include/linux') diff --git a/fs/generic_acl.c b/fs/generic_acl.c index e0b53aa7bbec..55458031e501 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -1,62 +1,58 @@ /* - * fs/generic_acl.c - * * (C) 2005 Andreas Gruenbacher * * This file is released under the GPL. + * + * Generic ACL support for in-memory filesystems. */ #include #include #include +#include +#include -/** - * generic_acl_list - Generic xattr_handler->list() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -size_t -generic_acl_list(struct inode *inode, struct generic_acl_operations *ops, - int type, char *list, size_t list_size) + +static size_t +generic_acl_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { struct posix_acl *acl; - const char *name; + const char *xname; size_t size; - acl = ops->getacl(inode, type); + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return 0; posix_acl_release(acl); - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - - default: - return 0; + switch (type) { + case ACL_TYPE_ACCESS: + xname = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xname = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return 0; } - size = strlen(name) + 1; + size = strlen(xname) + 1; if (list && size <= list_size) - memcpy(list, name, size); + memcpy(list, xname, size); return size; } -/** - * generic_acl_get - Generic xattr_handler->get() operation - * @ops: Filesystem specific getacl and 
setacl callbacks - */ -int -generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, - int type, void *buffer, size_t size) +static int +generic_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - acl = ops->getacl(inode, type); + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return -ENODATA; error = posix_acl_to_xattr(acl, buffer, size); @@ -65,17 +61,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, return error; } -/** - * generic_acl_set - Generic xattr_handler->set() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -int -generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, - int type, const void *value, size_t size) +static int +generic_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; int error; + if (strcmp(name, "") != 0) + return -EINVAL; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -91,28 +86,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, error = posix_acl_valid(acl); if (error) goto failed; - switch(type) { - case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - goto failed; - inode->i_mode = mode; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto 
failed; + } + break; } } - ops->setacl(inode, type, acl); + set_cached_acl(inode, type, acl); error = 0; failed: posix_acl_release(acl); @@ -121,14 +115,12 @@ failed: /** * generic_acl_init - Take care of acl inheritance at @inode create time - * @ops: Filesystem specific getacl and setacl callbacks * * Files created inside a directory with a default ACL inherit the * directory's default ACL. */ int -generic_acl_init(struct inode *inode, struct inode *dir, - struct generic_acl_operations *ops) +generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; mode_t mode = inode->i_mode; @@ -136,7 +128,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) - acl = ops->getacl(dir, ACL_TYPE_DEFAULT); + acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { struct posix_acl *clone; @@ -145,7 +137,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, error = -ENOMEM; if (!clone) goto cleanup; - ops->setacl(inode, ACL_TYPE_DEFAULT, clone); + set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); posix_acl_release(clone); } clone = posix_acl_clone(acl, GFP_KERNEL); @@ -156,7 +148,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, if (error >= 0) { inode->i_mode = mode; if (error > 0) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); } posix_acl_release(clone); } @@ -169,20 +161,19 @@ cleanup: /** * generic_acl_chmod - change the access acl of @inode upon chmod() - * @ops: FIlesystem specific getacl and setacl callbacks * * A chmod also changes the permissions of the owner, group/mask, and * other ACL entries. 
*/ int -generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) +generic_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int error = 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = ops->getacl(inode, ACL_TYPE_ACCESS); + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { clone = posix_acl_clone(acl, GFP_KERNEL); posix_acl_release(acl); @@ -190,8 +181,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); if (!error) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); posix_acl_release(clone); } return error; } + +int +generic_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +struct xattr_handler generic_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; + +struct xattr_handler generic_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 886f5faa08cb..ca666d18ed67 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -1,36 +1,15 @@ -/* - * include/linux/generic_acl.h - * - * (C) 2005 Andreas Gruenbacher - * - * This file is released under the GPL. - */ +#ifndef LINUX_GENERIC_ACL_H +#define LINUX_GENERIC_ACL_H -#ifndef GENERIC_ACL_H -#define GENERIC_ACL_H +#include -#include -#include +struct inode; -/** - * struct generic_acl_operations - filesystem operations - * - * Filesystems must make these operations available to the generic - * operations. 
- */ -struct generic_acl_operations { - struct posix_acl *(*getacl)(struct inode *, int); - void (*setacl)(struct inode *, int, struct posix_acl *); -}; +extern struct xattr_handler generic_acl_access_handler; +extern struct xattr_handler generic_acl_default_handler; -size_t generic_acl_list(struct inode *, struct generic_acl_operations *, int, - char *, size_t); -int generic_acl_get(struct inode *, struct generic_acl_operations *, int, - void *, size_t); -int generic_acl_set(struct inode *, struct generic_acl_operations *, int, - const void *, size_t); -int generic_acl_init(struct inode *, struct inode *, - struct generic_acl_operations *); -int generic_acl_chmod(struct inode *, struct generic_acl_operations *); +int generic_acl_init(struct inode *, struct inode *); +int generic_acl_chmod(struct inode *); +int generic_check_acl(struct inode *inode, int mask); -#endif +#endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index deee7afd8d66..e164291fb3e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -41,20 +41,4 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) extern int init_tmpfs(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); -#ifdef CONFIG_TMPFS_POSIX_ACL -int shmem_check_acl(struct inode *, int); -int shmem_acl_init(struct inode *, struct inode *); - -extern struct xattr_handler shmem_xattr_acl_access_handler; -extern struct xattr_handler shmem_xattr_acl_default_handler; - -extern struct generic_acl_operations shmem_acl_ops; - -#else -static inline int shmem_acl_init(struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_TMPFS_POSIX_ACL */ - #endif diff --git a/mm/Makefile b/mm/Makefile index 82131d0f8d85..7a68d2ab5560 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o 
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o diff --git a/mm/shmem.c b/mm/shmem.c index 3cd32c2ea0a0..f8485062f3ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -41,6 +41,7 @@ static struct vfsmount *shm_mnt; #include #include +#include #include #include #include @@ -809,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) error = inode_setattr(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL if (!error && (attr->ia_valid & ATTR_MODE)) - error = generic_acl_chmod(inode, &shmem_acl_ops); + error = generic_acl_chmod(inode); #endif if (page) page_cache_release(page); @@ -1823,11 +1824,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } } - error = shmem_acl_init(inode, dir); +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); if (error) { iput(inode); return error; } +#endif if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; if (S_ISDIR(mode)) @@ -2074,8 +2077,8 @@ static struct xattr_handler shmem_xattr_security_handler = { }; static struct xattr_handler *shmem_xattr_handlers[] = { - &shmem_xattr_acl_access_handler, - &shmem_xattr_acl_default_handler, + &generic_acl_access_handler, + &generic_acl_default_handler, &shmem_xattr_security_handler, NULL }; @@ -2454,7 +2457,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; @@ -2477,7 +2480,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; @@ -2488,7 +2491,7 @@ static const struct inode_operations 
shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index f8d5330ec0d7..000000000000 --- a/mm/shmem_acl.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * mm/shmem_acl.c - * - * (C) 2005 Andreas Gruenbacher - * - * This file is released under the GPL. - */ - -#include -#include -#include -#include - -/** - * shmem_get_acl - generic_acl_operations->getacl() operation - */ -static struct posix_acl * -shmem_get_acl(struct inode *inode, int type) -{ - struct posix_acl *acl = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = posix_acl_dup(inode->i_acl); - break; - - case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(inode->i_default_acl); - break; - } - spin_unlock(&inode->i_lock); - - return acl; -} - -/** - * shmem_set_acl - generic_acl_operations->setacl() operation - */ -static void -shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) -{ - struct posix_acl *free = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - free = inode->i_acl; - inode->i_acl = posix_acl_dup(acl); - break; - - case ACL_TYPE_DEFAULT: - free = inode->i_default_acl; - inode->i_default_acl = posix_acl_dup(acl); - break; - } - spin_unlock(&inode->i_lock); - posix_acl_release(free); -} - -struct generic_acl_operations shmem_acl_ops = { - .getacl = shmem_get_acl, - .setacl = shmem_set_acl, -}; - -static size_t -shmem_xattr_list_acl(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - return generic_acl_list(dentry->d_inode, &shmem_acl_ops, - type, list, list_size); -} - -static int -shmem_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return 
generic_acl_get(dentry->d_inode, &shmem_acl_ops, type, - buffer, size); -} - -static int -shmem_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(dentry->d_inode, &shmem_acl_ops, type, - value, size); -} - -struct xattr_handler shmem_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = shmem_xattr_list_acl, - .get = shmem_xattr_get_acl, - .set = shmem_xattr_set_acl, -}; - -struct xattr_handler shmem_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = shmem_xattr_list_acl, - .get = shmem_xattr_get_acl, - .set = shmem_xattr_set_acl, -}; - -/** - * shmem_acl_init - Inizialize the acl(s) of a new inode - */ -int -shmem_acl_init(struct inode *inode, struct inode *dir) -{ - return generic_acl_init(inode, dir, &shmem_acl_ops); -} - -/** - * shmem_check_acl - check_acl() callback for generic_permission() - */ -int -shmem_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; -} -- cgit v1.2.3 From 1e431f5ce78f3ae8254d725060288b78ff74f086 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2009 16:44:53 +0100 Subject: cleanup blockdev_direct_IO locking Currently the locking in blockdev_direct_IO is a mess, we have three different locking types and very confusing checks for some of them. The most complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be used. This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case is unused anyway, and the write side is almost identical to DIO_NO_LOCKING. 
The difference is that DIO_NO_LOCKING always sets the create argument for the get_blocks callback to zero, but we can easily move that to the actual get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode: gfs already ignores the create argument and thus is fine with the new version, ocfs2 only errors out if create were ever set, and we can remove this dead code now, the block device code only ever uses create for an error message if we are fully beyond the device which can never happen, and last but not least XFS will need the new behaviour for writes. Now we can replace the lock_type variable with a flags one, where no flag means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first flag. Separate out the check for not allowing to fill holes into a separate flag, although for now both flags always get set at the same time. Also revamp the documentation of the locking scheme to actually make sense. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/direct-io.c | 129 ++++++++++++++++++-------------------------- fs/ocfs2/aops.c | 34 ++---------- fs/xfs/linux-2.6/xfs_aops.c | 20 +++---- include/linux/fs.h | 22 +++----- 4 files changed, 71 insertions(+), 134 deletions(-) (limited to 'include/linux') diff --git a/fs/direct-io.c b/fs/direct-io.c index b912270942fa..7dde0df8e8b6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -53,13 +53,6 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. - * - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. - * This determines whether we need to do the fancy locking which prevents - * direct-IO from being able to read uninitialised disk blocks. If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is - * not held for the entire direct write (taken briefly, initially, during a - * direct read though, but its never held for the duration of a direct-IO).
*/ struct dio { @@ -68,7 +61,7 @@ struct dio { struct inode *inode; int rw; loff_t i_size; /* i_size when submitted */ - int lock_type; /* doesn't change */ + int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) + + if (dio->flags & DIO_LOCKING) /* lockdep: non-owner release */ up_read_non_owner(&dio->inode->i_alloc_sem); @@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *dio) map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; + /* + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we + * forbid block creations: only overwrites are permitted. + * We will return early to the caller once we see an + * unmapped buffer head returned, and the caller will fall + * back to buffered I/O. + * + * Otherwise the decision is left to the get_blocks method, + * which may decide to handle it or also return an unmapped + * buffer head. + */ create = dio->rw & WRITE; - if (dio->lock_type == DIO_LOCKING) { + if (dio->flags & DIO_SKIP_HOLES) { if (dio->block_in_file < (i_size_read(dio->inode) >> dio->blkbits)) create = 0; - } else if (dio->lock_type == DIO_NO_LOCKING) { - create = 0; } - /* - * For writes inside i_size we forbid block creations: only - * overwrites are permitted. We fall back to buffered writes - * at a higher level for inside-i_size block-instantiating - * writes. - */ ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } @@ -1039,7 +1036,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * we can let i_mutex go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. 
*/ - if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) + if (rw == READ && (dio->flags & DIO_LOCKING)) mutex_unlock(&dio->inode->i_mutex); /* @@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, /* * This is a library function for use by filesystem drivers. - * The locking rules are governed by the dio_lock_type parameter. * - * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_mutex is not held on entry; it is never taken. + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). * - * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even - * though it is internally dropped. - * For reads, i_mutex is not held on entry, but it is taken and dropped before - * returning. - * - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of - * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_mutex, return without it, never touch it. - * For reads we are called under i_mutex and return with i_mutex held, even - * though it may be internally dropped. - * - * Additional i_alloc_sem locking requirements described inline below. + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. 
*/ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int dio_lock_type) + int flags) { int seg; size_t size; @@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; - int acquire_i_mutex = 0; if (rw & WRITE) rw = WRITE_ODIRECT_PLUG; @@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (!dio) goto out; - /* - * For block device access DIO_NO_LOCKING is used, - * neither readers nor writers do any locking at all - * For regular files using DIO_LOCKING, - * readers need to grab i_mutex and i_alloc_sem - * writers need to grab i_alloc_sem only (i_mutex is already held) - * For regular files using DIO_OWN_LOCKING, - * neither readers nor writers take any locks here - */ - dio->lock_type = dio_lock_type; - if (dio_lock_type != DIO_NO_LOCKING) { + dio->flags = flags; + if (dio->flags & DIO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ if (rw == READ && end > offset) { - struct address_space *mapping; + struct address_space *mapping = + iocb->ki_filp->f_mapping; - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } + /* will be released by direct_io_worker */ + mutex_lock(&inode->i_mutex); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { + mutex_unlock(&inode->i_mutex); kfree(dio); goto out; } - - if (dio_lock_type == DIO_OWN_LOCKING) { - mutex_unlock(&inode->i_mutex); - acquire_i_mutex = 1; - } } - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); + /* + * Will be released at I/O completion, possibly in a + * different thread. 
+ */ + down_read_non_owner(&inode->i_alloc_sem); } /* @@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. - * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by - * it's own meaner. + * + * NOTE: filesystems with their own locking have to handle this + * on their own. */ - if (unlikely(retval < 0 && (rw & WRITE))) { - loff_t isize = i_size_read(inode); - - if (end > isize && dio_lock_type == DIO_LOCKING) - vmtruncate(inode, isize); + if (dio->flags & DIO_LOCKING) { + if (unlikely((rw & WRITE) && retval < 0)) { + loff_t isize = i_size_read(inode); + if (end > isize ) + vmtruncate(inode, isize); + } } - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) - mutex_lock(&inode->i_mutex); return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index deb2b132ae5e..3dae4a13f6e4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -547,6 +547,9 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); + * + * Note that we never bother to allocate blocks here, and thus ignore the + * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - /* - * Any write past EOF is not allowed because we'd be extending. 
- */ - if (create && (iblock + max_blocks) > inode_blocks) { - ret = -EIO; - goto bail; - } - /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, @@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { - ocfs2_error(inode->i_sb, - "Inode %llu has a hole at block %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)iblock); - ret = -EROFS; - goto bail; - } - /* We should already CoW the refcounted extent. */ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); /* @@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); - else { - /* - * ocfs2_prepare_inode_for_write() should have caught - * the case where we'd be filling a hole and triggered - * a buffered write instead. - */ - if (create) { - ret = -EIO; - mlog_errno(ret); - goto bail; - } - + else clear_buffer_mapped(bh_result); - } /* make sure we don't map more than max_blocks blocks here as that's all the kernel will handle at this point. 
*/ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d798c54296eb..66abe36c1213 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1474,19 +1474,13 @@ xfs_vm_direct_IO( bdev = xfs_find_bdev_for_inode(XFS_I(inode)); - if (rw == WRITE) { - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - bdev, iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } else { - iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - bdev, iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } + iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? + IOMAP_UNWRITTEN : IOMAP_READ); + + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct); if (unlikely(ret != -EIOCBQUEUED && iocb->private)) xfs_destroy_ioend(iocb->private); diff --git a/include/linux/fs.h b/include/linux/fs.h index cdc23be4edde..7c8ff12d1995 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2263,9 +2263,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int lock_type); enum { - DIO_LOCKING = 1, /* need locking between buffered and direct access */ - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ + /* need locking between buffered and direct access */ + DIO_LOCKING = 0x01, + + /* filesystem does not support filling holes */ + DIO_SKIP_HOLES = 0x02, }; static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, @@ -2274,7 +2276,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_LOCKING); + nr_segs, get_block, end_io, + DIO_LOCKING | DIO_SKIP_HOLES); } static 
inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, @@ -2283,16 +2286,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_NO_LOCKING); -} - -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) -{ - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_OWN_LOCKING); + nr_segs, get_block, end_io, 0); } #endif -- cgit v1.2.3 From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:40 +0100 Subject: sched: Add pre and post wakeup hooks As will be apparent in the next patch, we need a pre wakeup hook for sched_fair task migration, hence rename the post wakeup hook and add one pre wakeup.
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.114746117@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- kernel/sched.c | 12 ++++++++---- kernel/sched_rt.c | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c858f38e81a..2c9fa1ccebff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1091,7 +1091,8 @@ struct sched_class { enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); diff --git a/kernel/sched.c b/kernel/sched.c index 297dc441ff96..6c571bdd5658 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,6 +2412,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2475,8 +2479,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2666,8 +2670,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + 
p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea2828164e..f48328ac216f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif -- cgit v1.2.3 From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:41 +0100 Subject: sched: Remove the cfs_rq dependency from set_task_cpu() In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to subtract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we lose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock.
We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge. This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.191697025@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 6 +----- kernel/sched_fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 46 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c9fa1ccebff..973b2b89f86d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,7 +1116,7 @@ struct sched_class { struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*moved_group) (struct task_struct *p); + void (*moved_group) (struct task_struct *p, int on_rq); #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 6c571bdd5658..f92ce63edfff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1d2715620c..42ac3c9f66f6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. 
+ */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.3 From 70023de88c58a81a730ab4d13c51a30e537ec76e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 29 Oct 2009 11:04:28 +0800 Subject: ACPI: Add a generic API for _OSC -v2 v2->v1: .improve debug info as suggested by
Bjorn,Kenji .API is using uuid string as suggested by Alexey Add an API to execute _OSC. A lot of devices can have this method, so add a generic API. Signed-off-by: Shaohua Li Signed-off-by: Len Brown --- drivers/acpi/bus.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 9 ++++ 2 files changed, 131 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 741191524353..12240be58f27 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -344,6 +344,128 @@ bool acpi_bus_can_wakeup(acpi_handle handle) EXPORT_SYMBOL(acpi_bus_can_wakeup); +static void acpi_print_osc_error(acpi_handle handle, + struct acpi_osc_context *context, char *error) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER}; + int i; + + if (ACPI_FAILURE(acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer))) + printk(KERN_DEBUG "%s\n", error); + else { + printk(KERN_DEBUG "%s:%s\n", (char *)buffer.pointer, error); + kfree(buffer.pointer); + } + printk(KERN_DEBUG"_OSC request data:"); + for (i = 0; i < context->cap.length; i += sizeof(u32)) + printk("%x ", *((u32 *)(context->cap.pointer + i))); + printk("\n"); +} + +static u8 hex_val(unsigned char c) +{ + return isdigit(c) ? 
c - '0' : toupper(c) - 'A' + 10; +} + +static acpi_status acpi_str_to_uuid(char *str, u8 *uuid) +{ + int i; + static int opc_map_to_uuid[16] = {6, 4, 2, 0, 11, 9, 16, 14, 19, 21, + 24, 26, 28, 30, 32, 34}; + + if (strlen(str) != 36) + return AE_BAD_PARAMETER; + for (i = 0; i < 36; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (str[i] != '-') + return AE_BAD_PARAMETER; + } else if (!isxdigit(str[i])) + return AE_BAD_PARAMETER; + } + for (i = 0; i < 16; i++) { + uuid[i] = hex_val(str[opc_map_to_uuid[i]]) << 4; + uuid[i] |= hex_val(str[opc_map_to_uuid[i] + 1]); + } + return AE_OK; +} + +acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) +{ + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u8 uuid[16]; + u32 errors; + + if (!context) + return AE_ERROR; + if (ACPI_FAILURE(acpi_str_to_uuid(context->uuid_str, uuid))) + return AE_ERROR; + context->ret.length = ACPI_ALLOCATE_BUFFER; + context->ret.pointer = NULL; + + /* Setting up input parameters */ + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = uuid; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = context->rev; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = context->cap.length/sizeof(u32); + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = context->cap.length; + in_params[3].buffer.pointer = context->cap.pointer; + + status = acpi_evaluate_object(handle, "_OSC", &input, &context->ret); + if (ACPI_FAILURE(status)) + return status; + + /* return buffer should have the same length as cap buffer */ + if (context->ret.length != context->cap.length) + return AE_NULL_OBJECT; + + out_obj = context->ret.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + acpi_print_osc_error(handle, context, + "_OSC evaluation returned wrong type"); + status = 
AE_TYPE; + goto out_kfree; + } + /* Need to ignore the bit0 in result code */ + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + if (errors & OSC_REQUEST_ERROR) + acpi_print_osc_error(handle, context, + "_OSC request failed"); + if (errors & OSC_INVALID_UUID_ERROR) + acpi_print_osc_error(handle, context, + "_OSC invalid UUID"); + if (errors & OSC_INVALID_REVISION_ERROR) + acpi_print_osc_error(handle, context, + "_OSC invalid revision"); + if (errors & OSC_CAPABILITIES_MASK_ERROR) { + if (((u32 *)context->cap.pointer)[OSC_QUERY_TYPE] + & OSC_QUERY_ENABLE) + goto out_success; + status = AE_SUPPORT; + goto out_kfree; + } + status = AE_ERROR; + goto out_kfree; + } +out_success: + return AE_OK; + +out_kfree: + kfree(context->ret.pointer); + context->ret.pointer = NULL; + return status; +} +EXPORT_SYMBOL(acpi_run_osc); + /* -------------------------------------------------------------------------- Event Management -------------------------------------------------------------------------- */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dfcd920c3e54..3247e09db20d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -253,6 +253,13 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ +struct acpi_osc_context { + char *uuid_str; /* uuid string */ + int rev; + struct acpi_buffer cap; /* arg2/arg3 */ + struct acpi_buffer ret; /* free by caller if success */ +}; + #define OSC_QUERY_TYPE 0 #define OSC_SUPPORT_TYPE 1 #define OSC_CONTROL_TYPE 2 @@ -265,6 +272,8 @@ void __init acpi_s4_no_nvs(void); #define OSC_INVALID_REVISION_ERROR 8 #define OSC_CAPABILITIES_MASK_ERROR 16 +acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); + /* _OSC DW1 Definition (OS Support Fields) */ #define OSC_EXT_PCI_CONFIG_SUPPORT 1 #define OSC_ACTIVE_STATE_PWR_SUPPORT 2 -- cgit v1.2.3 From 3a9622dc4659af44a8098a233f65c51e495ff0a5 Mon Sep 17 00:00:00 2001 From: Shaohua 
Li Date: Thu, 29 Oct 2009 11:04:50 +0800 Subject: ACPI: cleanup pci_root _OSC code. Signed-off-by: Shaohua Li Signed-off-by: Len Brown --- drivers/acpi/pci_root.c | 76 +++++++++---------------------------------------- include/linux/acpi.h | 5 ++-- 2 files changed, 17 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 1af808171d46..101cce3681d1 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -202,72 +202,24 @@ static void acpi_pci_bridge_scan(struct acpi_device *device) } } -static u8 OSC_UUID[16] = {0x5B, 0x4D, 0xDB, 0x33, 0xF7, 0x1F, 0x1C, 0x40, - 0x96, 0x57, 0x74, 0x41, 0xC0, 0x3D, 0xD7, 0x66}; +static u8 pci_osc_uuid_str[] = "33DB4D5B-1FF7-401C-9657-7441C03DD766"; static acpi_status acpi_pci_run_osc(acpi_handle handle, const u32 *capbuf, u32 *retval) { + struct acpi_osc_context context = { + .uuid_str = pci_osc_uuid_str, + .rev = 1, + .cap.length = 12, + .cap.pointer = (void *)capbuf, + }; acpi_status status; - struct acpi_object_list input; - union acpi_object in_params[4]; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *out_obj; - u32 errors; - - /* Setting up input parameters */ - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 3; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 12; - in_params[3].buffer.pointer = (u8 *)capbuf; - - status = acpi_evaluate_object(handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return status; - if (!output.length) - return AE_NULL_OBJECT; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - printk(KERN_DEBUG "_OSC evaluation returned wrong type\n"); - status = AE_TYPE; - goto 
out_kfree; - } - /* Need to ignore the bit0 in result code */ - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - if (errors & OSC_REQUEST_ERROR) - printk(KERN_DEBUG "_OSC request failed\n"); - if (errors & OSC_INVALID_UUID_ERROR) - printk(KERN_DEBUG "_OSC invalid UUID\n"); - if (errors & OSC_INVALID_REVISION_ERROR) - printk(KERN_DEBUG "_OSC invalid revision\n"); - if (errors & OSC_CAPABILITIES_MASK_ERROR) { - if (capbuf[OSC_QUERY_TYPE] & OSC_QUERY_ENABLE) - goto out_success; - printk(KERN_DEBUG - "Firmware did not grant requested _OSC control\n"); - status = AE_SUPPORT; - goto out_kfree; - } - status = AE_ERROR; - goto out_kfree; + status = acpi_run_osc(handle, &context); + if (ACPI_SUCCESS(status)) { + *retval = *((u32 *)(context.ret.pointer + 8)); + kfree(context.ret.pointer); } -out_success: - *retval = *((u32 *)(out_obj->buffer.pointer + 8)); - status = AE_OK; - -out_kfree: - kfree(output.pointer); return status; } @@ -277,10 +229,10 @@ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 flags) u32 support_set, result, capbuf[3]; /* do _OSC query for all possible controls */ - support_set = root->osc_support_set | (flags & OSC_SUPPORT_MASKS); + support_set = root->osc_support_set | (flags & OSC_PCI_SUPPORT_MASKS); capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_TYPE] = support_set; - capbuf[OSC_CONTROL_TYPE] = OSC_CONTROL_MASKS; + capbuf[OSC_CONTROL_TYPE] = OSC_PCI_CONTROL_MASKS; status = acpi_pci_run_osc(root->device->handle, capbuf, &result); if (ACPI_SUCCESS(status)) { @@ -427,7 +379,7 @@ acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags) if (ACPI_FAILURE(status)) return status; - control_req = (flags & OSC_CONTROL_MASKS); + control_req = (flags & OSC_PCI_CONTROL_MASKS); if (!control_req) return AE_TYPE; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3247e09db20d..535beecc37cf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -263,7 +263,6 @@ struct 
acpi_osc_context { #define OSC_QUERY_TYPE 0 #define OSC_SUPPORT_TYPE 1 #define OSC_CONTROL_TYPE 2 -#define OSC_SUPPORT_MASKS 0x1f /* _OSC DW0 Definition */ #define OSC_QUERY_ENABLE 1 @@ -274,12 +273,14 @@ struct acpi_osc_context { acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); +/* PCI defined _OSC bits */ /* _OSC DW1 Definition (OS Support Fields) */ #define OSC_EXT_PCI_CONFIG_SUPPORT 1 #define OSC_ACTIVE_STATE_PWR_SUPPORT 2 #define OSC_CLOCK_PWR_CAPABILITY_SUPPORT 4 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 8 #define OSC_MSI_SUPPORT 16 +#define OSC_PCI_SUPPORT_MASKS 0x1f /* _OSC DW1 Definition (OS Control Fields) */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 1 @@ -288,7 +289,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_PCI_EXPRESS_AER_CONTROL 8 #define OSC_PCI_EXPRESS_CAP_STRUCTURE_CONTROL 16 -#define OSC_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ +#define OSC_PCI_CONTROL_MASKS (OSC_PCI_EXPRESS_NATIVE_HP_CONTROL | \ OSC_SHPC_NATIVE_HP_CONTROL | \ OSC_PCI_EXPRESS_PME_CONTROL | \ OSC_PCI_EXPRESS_AER_CONTROL | \ -- cgit v1.2.3 From 3563ff964fdc36358cef0330936fdac28e65142a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 29 Oct 2009 11:05:05 +0800 Subject: ACPI: Add platform-wide _OSC support. 
Signed-off-by: Shaohua Li Signed-off-by: Len Brown --- drivers/acpi/bus.c | 26 ++++++++++++++++++++++++++ include/linux/acpi.h | 7 +++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 12240be58f27..65f7e335f122 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -466,6 +466,30 @@ out_kfree: } EXPORT_SYMBOL(acpi_run_osc); +static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; +static void acpi_bus_osc_support(void) +{ + u32 capbuf[2]; + struct acpi_osc_context context = { + .uuid_str = sb_uuid_str, + .rev = 1, + .cap.length = 8, + .cap.pointer = capbuf, + }; + acpi_handle handle; + + capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE; + capbuf[OSC_SUPPORT_TYPE] = OSC_SB_PR3_SUPPORT; /* _PR3 is in use */ +#ifdef CONFIG_ACPI_PROCESSOR_AGGREGATOR + capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PAD_SUPPORT; +#endif + if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) + return; + if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) + kfree(context.ret.pointer); + /* do we need to check the returned cap? Sounds no */ +} + /* -------------------------------------------------------------------------- Event Management -------------------------------------------------------------------------- */ @@ -856,6 +880,8 @@ static int __init acpi_bus_init(void) status = acpi_ec_ecdt_probe(); /* Ignore result. Not having an ECDT is not fatal. 
*/ + acpi_bus_osc_support(); + status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { printk(KERN_ERR PREFIX "Unable to initialize ACPI objects\n"); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 535beecc37cf..e11090d462d2 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -273,6 +273,13 @@ struct acpi_osc_context { acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); +/* platform-wide _OSC bits */ +#define OSC_SB_PAD_SUPPORT 1 +#define OSC_SB_PPC_OST_SUPPORT 2 +#define OSC_SB_PR3_SUPPORT 4 +#define OSC_SB_CPUHP_OST_SUPPORT 8 +#define OSC_SB_APEI_SUPPORT 16 + /* PCI defined _OSC bits */ /* _OSC DW1 Definition (OS Support Fields) */ #define OSC_EXT_PCI_CONFIG_SUPPORT 1 -- cgit v1.2.3 From 2ee1abad73a12df5521cd3f017f081f1f684a361 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Nov 2009 18:03:15 +0000 Subject: xfs: improve metadata I/O merging in the elevator Change all async metadata buffers to use [READ|WRITE]_META I/O types so that the I/O doesn't get issued immediately. This allows merging of adjacent metadata requests but still prioritises them over bulk data. This shows a 10-15% improvement in sequential create speed of small files. Don't include the log buffers in this classification - leave them as sync types so they are issued immediately. 
Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++++- fs/xfs/linux-2.6/xfs_buf.h | 1 + fs/xfs/xfs_log.c | 2 ++ include/linux/fs.h | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b4c7d4248aac..162359b664ca 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1149,10 +1149,14 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; - } else if (bp->b_flags & _XBF_RUN_QUEUES) { + } else if (bp->b_flags & XBF_LOG_BUFFER) { ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); bp->b_flags &= ~_XBF_RUN_QUEUES; rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; } else { rw = (bp->b_flags & XBF_WRITE) ? WRITE : (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a509f4addc2a..a34c7b54822d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -55,6 +55,7 @@ typedef enum { XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ XBF_ORDERED = (1 << 11), /* use ordered writes */ XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ + XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ /* flags used only as arguments to access routines */ XBF_LOCK = (1 << 14), /* lock requested */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4cb1792040e3..600b5b06aaeb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1441,6 +1441,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; /* * Do an ordered write for the log block. 
* Its unnecessary to flush the first split block in the log wrap case. @@ -1478,6 +1479,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); dptr = XFS_BUF_PTR(bp); diff --git a/include/linux/fs.h b/include/linux/fs.h index b23a7018eb90..cf7fc8a7fe6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -152,6 +152,7 @@ struct inodes_stat_t { #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) #define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) +#define WRITE_META (WRITE | (1 << BIO_RW_META)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -- cgit v1.2.3 From 6e1415467614e854fee660ff6648bd10fa976e95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Dec 2009 19:27:45 +0000 Subject: NOMMU: Optimise away the {dac_,}mmap_min_addr tests In NOMMU mode clamp dac_mmap_min_addr to zero to cause the tests on it to be skipped by the compiler. We do this as the minimum mmap address doesn't make any sense in NOMMU mode. mmap_min_addr and round_hint_to_min() can be discarded entirely in NOMMU mode. 
Signed-off-by: David Howells Acked-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 7 +++++++ kernel/sysctl.c | 2 ++ mm/Kconfig | 1 + security/Makefile | 3 ++- 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 466cbadbd1ef..2c627d361c02 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -95,8 +95,13 @@ struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); +#ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; +#else +#define dac_mmap_min_addr 0UL +#endif + /* * Values used in the task_security_ops calls */ @@ -121,6 +126,7 @@ struct request_sock; #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 +#ifdef CONFIG_MMU /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -135,6 +141,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) } extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_SECURITY diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012a..856a24eadf7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_MMU { .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, @@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = mmap_min_addr_handler, }, +#endif #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", diff --git a/mm/Kconfig b/mm/Kconfig index 43ea8c3a2bbf..ee9f3e0f2b69 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -221,6 +221,7 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + 
depends on MMU default 4096 help This is the portion of low virtual memory which should be protected diff --git a/security/Makefile b/security/Makefile index bb44e350c618..da20a193c8dd 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,8 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o min_addr.o +obj-y += commoncap.o +obj-$(CONFIG_MMU) += min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o -- cgit v1.2.3 From 47376ceba54600cec4dd9e7c4fe8b98e4269633a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 23:25:50 +0100 Subject: reiserfs: Fix reiserfs lock <-> inode mutex dependency inversion The reiserfs lock -> inode mutex dependency gets inverted when we relax the lock while walking to the tree. To fix this, use a specialized version of reiserfs_mutex_lock_safe that takes care of mutex subclasses. Then we can grab the inode mutex with I_MUTEX_XATTR subclass without any reiserfs lock dependency. This fixes the following report: [ INFO: possible circular locking dependency detected ] 2.6.32-06793-gf405425-dirty #2 ------------------------------------------------------- mv/18566 is trying to acquire lock: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28= /0x40 but task is already holding lock: (&sb->s_type->i_mutex_key#5/3){+.+.+.}, at: [] reiserfs_for_each_xattr+0x10c/0x380 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&sb->s_type->i_mutex_key#5/3){+.+.+.}: [] validate_chain+0xa23/0xf70 [] __lock_acquire+0x4e5/0xa70 [] lock_acquire+0x7a/0xa0 [] mutex_lock_nested+0x5f/0x2b0 [] reiserfs_for_each_xattr+0x84/0x380 [] reiserfs_delete_xattrs+0x15/0x50 [] reiserfs_delete_inode+0x8f/0x140 [] generic_delete_inode+0x9c/0x150 [] generic_drop_inode+0x3d/0x60 [] iput+0x47/0x50 [] do_unlinkat+0xdb/0x160 [] sys_unlink+0x10/0x20 [] sysenter_do_call+0x12/0x36 -> #0 (&REISERFS_SB(s)->lock){+.+.+.}: [] validate_chain+0xf68/0xf70 [] __lock_acquire+0x4e5/0xa70 [] lock_acquire+0x7a/0xa0 [] mutex_lock_nested+0x5f/0x2b0 [] reiserfs_write_lock+0x28/0x40 [] search_by_key+0x1f7b/0x21b0 [] search_by_entry_key+0x1f/0x3b0 [] reiserfs_find_entry+0x77/0x400 [] reiserfs_lookup+0x85/0x130 [] __lookup_hash+0xb4/0x110 [] lookup_one_len+0xb3/0x100 [] reiserfs_for_each_xattr+0x120/0x380 [] reiserfs_delete_xattrs+0x15/0x50 [] reiserfs_delete_inode+0x8f/0x140 [] generic_delete_inode+0x9c/0x150 [] generic_drop_inode+0x3d/0x60 [] iput+0x47/0x50 [] dentry_iput+0x6f/0xf0 [] d_kill+0x24/0x50 [] dput+0x5b/0x120 [] sys_renameat+0x1b9/0x230 [] sys_rename+0x28/0x30 [] sysenter_do_call+0x12/0x36 other info that might help us debug this: 2 locks held by mv/18566: #0: (&sb->s_type->i_mutex_key#5/1){+.+.+.}, at: [] lock_rename+0xcc/0xd0 #1: (&sb->s_type->i_mutex_key#5/3){+.+.+.}, at: [] reiserfs_for_each_xattr+0x10c/0x380 stack backtrace: Pid: 18566, comm: mv Tainted: G C 2.6.32-06793-gf405425-dirty #2 Call Trace: [] ? printk+0x18/0x1e [] print_circular_bug+0xc0/0xd0 [] validate_chain+0xf68/0xf70 [] ? trace_hardirqs_off+0xb/0x10 [] __lock_acquire+0x4e5/0xa70 [] lock_acquire+0x7a/0xa0 [] ? reiserfs_write_lock+0x28/0x40 [] mutex_lock_nested+0x5f/0x2b0 [] ? reiserfs_write_lock+0x28/0x40 [] ? reiserfs_write_lock+0x28/0x40 [] ? schedule+0x27a/0x440 [] reiserfs_write_lock+0x28/0x40 [] search_by_key+0x1f7b/0x21b0 [] ? __lock_acquire+0x506/0xa70 [] ? 
lock_release_non_nested+0x1e7/0x340 [] ? reiserfs_write_lock+0x28/0x40 [] ? trace_hardirqs_on_caller+0x124/0x170 [] ? trace_hardirqs_on+0xb/0x10 [] ? T.316+0x15/0x1a0 [] ? sched_clock_cpu+0x9d/0x100 [] search_by_entry_key+0x1f/0x3b0 [] ? __mutex_unlock_slowpath+0x9a/0x120 [] ? trace_hardirqs_on_caller+0x124/0x170 [] reiserfs_find_entry+0x77/0x400 [] reiserfs_lookup+0x85/0x130 [] ? sched_clock_cpu+0x9d/0x100 [] __lookup_hash+0xb4/0x110 [] lookup_one_len+0xb3/0x100 [] reiserfs_for_each_xattr+0x120/0x380 [] ? delete_one_xattr+0x0/0x1c0 [] ? math_error+0x22/0x150 [] ? reiserfs_write_lock+0x28/0x40 [] reiserfs_delete_xattrs+0x15/0x50 [] ? reiserfs_write_lock+0x28/0x40 [] reiserfs_delete_inode+0x8f/0x140 [] ? generic_delete_inode+0x5f/0x150 [] ? reiserfs_delete_inode+0x0/0x140 [] generic_delete_inode+0x9c/0x150 [] generic_drop_inode+0x3d/0x60 [] iput+0x47/0x50 [] dentry_iput+0x6f/0xf0 [] d_kill+0x24/0x50 [] dput+0x5b/0x120 [] sys_renameat+0x1b9/0x230 [] ? sched_clock_cpu+0x9d/0x100 [] ? trace_hardirqs_off+0xb/0x10 [] ? cpu_clock+0x4e/0x60 [] ? do_page_fault+0x155/0x370 [] ? up_read+0x16/0x30 [] ? 
do_page_fault+0x155/0x370 [] sys_rename+0x28/0x30 [] sysenter_do_call+0x12/0x36 Reported-by: Alexander Beregalov Signed-off-by: Frederic Weisbecker Cc: Chris Mason Cc: Ingo Molnar Cc: Thomas Gleixner --- fs/reiserfs/xattr.c | 3 ++- include/linux/reiserfs_fs.h | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 58aa8e75f7f5..8891cd88a3f4 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -243,7 +243,8 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + reiserfs_mutex_lock_nested_safe(&dir->d_inode->i_mutex, I_MUTEX_XATTR, + inode->i_sb); buf.xadir = dir; err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); while ((err == 0 || err == -ENOSPC) && buf.count) { diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index a05b4a20768d..4351b49e2b1e 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -97,6 +97,15 @@ static inline void reiserfs_mutex_lock_safe(struct mutex *m, reiserfs_write_lock(s); } +static inline void +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, + struct super_block *s) +{ + reiserfs_write_unlock(s); + mutex_lock_nested(m, subclass); + reiserfs_write_lock(s); +} + /* * When we schedule, we usually want to also release the write lock, * according to the previous bkl based locking scheme of reiserfs. -- cgit v1.2.3 From 329962503692b42d8088f31584e42d52db179d52 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Dec 2009 17:59:02 -0800 Subject: x86: Fix checking of SRAT when node 0 ram is not from 0 Found one system that boot from socket1 instead of socket0, SRAT get rejected... 
[ 0.000000] SRAT: Node 1 PXM 0 0-a0000 [ 0.000000] SRAT: Node 1 PXM 0 100000-80000000 [ 0.000000] SRAT: Node 1 PXM 0 100000000-2080000000 [ 0.000000] SRAT: Node 0 PXM 1 2080000000-4080000000 [ 0.000000] SRAT: Node 2 PXM 2 4080000000-6080000000 [ 0.000000] SRAT: Node 3 PXM 3 6080000000-8080000000 [ 0.000000] SRAT: Node 4 PXM 4 8080000000-a080000000 [ 0.000000] SRAT: Node 5 PXM 5 a080000000-c080000000 [ 0.000000] SRAT: Node 6 PXM 6 c080000000-e080000000 [ 0.000000] SRAT: Node 7 PXM 7 e080000000-10080000000 ... [ 0.000000] NUMA: Allocated memnodemap from 500000 - 701040 [ 0.000000] NUMA: Using 20 for the hash shift. [ 0.000000] Adding active range (0, 0x2080000, 0x4080000) 0 entries of 3200 used [ 0.000000] Adding active range (1, 0x0, 0x96) 1 entries of 3200 used [ 0.000000] Adding active range (1, 0x100, 0x7f750) 2 entries of 3200 used [ 0.000000] Adding active range (1, 0x100000, 0x2080000) 3 entries of 3200 used [ 0.000000] Adding active range (2, 0x4080000, 0x6080000) 4 entries of 3200 used [ 0.000000] Adding active range (3, 0x6080000, 0x8080000) 5 entries of 3200 used [ 0.000000] Adding active range (4, 0x8080000, 0xa080000) 6 entries of 3200 used [ 0.000000] Adding active range (5, 0xa080000, 0xc080000) 7 entries of 3200 used [ 0.000000] Adding active range (6, 0xc080000, 0xe080000) 8 entries of 3200 used [ 0.000000] Adding active range (7, 0xe080000, 0x10080000) 9 entries of 3200 used [ 0.000000] SRAT: PXMs only cover 917504MB of your 1048566MB e820 RAM. Not used. [ 0.000000] SRAT: SRAT not used. The early_node_map is not sorted because node0, with a non-zero start, comes first, so try to sort it right away after all regions are registered. This also fixes a regression caused by 8716273c (x86: Export srat physical topology) -v2: make it more solid to handle cross node case like node0 [0,4g), [8,12g) and node1 [4g, 8g), [12g, 16g) -v3: update comments. Reported-and-tested-by: Jens Axboe Signed-off-by: Yinghai Lu LKML-Reference: <4B2579D2.3010201@kernel.org> Signed-off-by: H.
Peter Anvin --- arch/x86/mm/srat_32.c | 2 ++ arch/x86/mm/srat_64.c | 4 +++- include/linux/mm.h | 3 +++ mm/page_alloc.c | 4 ++-- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 6f8aa33031c7..9324f13492d5 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void) e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } + /* for out of order entries in SRAT */ + sort_node_map(); for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d89075489664..a27124185fc1 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -317,7 +317,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); + pxmram -= __absent_pages_in_range(i, s, e); if ((long)pxmram < 0) pxmram = 0; } @@ -373,6 +373,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for_each_node_mask(i, nodes_parsed) e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ + sort_node_map(); if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; diff --git a/include/linux/mm.h b/include/linux/mm.h index 24c395694f4d..20a2036f3a94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1022,6 +1022,9 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn, extern void remove_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); +void sort_node_map(void); +unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, + unsigned long end_pfn); extern unsigned long 
absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac63f41e..873c86308b4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3573,7 +3573,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -static unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -4102,7 +4102,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) } /* sort the node_map by start_pfn */ -static void __init sort_node_map(void) +void __init sort_node_map(void) { sort(early_node_map, (size_t)nr_nodemap_entries, sizeof(struct node_active_region), -- cgit v1.2.3 From a4636818f8e0991f32d9528f39cf4f3d6a7d30a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:29 -0600 Subject: cpumask: rename tsk_cpumask to tsk_cpus_allowed No one uses this wrapper yet, and Ingo asked that it be kept consistent with current task_struct usage.
(One user crept in via linux-next: fixed) Signed-off-by: Rusty Russell Cc: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- include/linux/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..f125e5c551c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1136,7 +1136,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(oldmask, tsk_cpumask(current)); + cpumask_copy(oldmask, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 244c287a5ac1..4d7adb282bdd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1555,7 +1555,7 @@ struct task_struct { }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) +#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT -- cgit v1.2.3 From 2d1c861871d767153538a77c498752b36d4bb4b8 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 9 Dec 2009 17:52:13 +1100 Subject: PCI/cardbus: Add a fixup hook and fix powerpc The cardbus code creates PCI devices without ever going through the necessary fixup bits and pieces that normal PCI devices go through. There's in fact a commented out call to pcibios_fixup_bus() in there, it's commented because ... it doesn't work. I could make pcibios_fixup_bus() do the right thing on powerpc easily but I felt it cleaner instead to provide a specific hook pci_fixup_cardbus for which a weak empty implementation is provided by the PCI core. 
This fixes cardbus on powerbooks and probably all other PowerPC platforms which was broken completely for ever on some platforms and since 2.6.31 on others such as PowerBooks when we made the DMA ops mandatory (since those are setup by the fixups). Acked-by: Dominik Brodowski Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Jesse Barnes --- arch/powerpc/kernel/pci-common.c | 13 +++++++++++++ drivers/pci/pci.c | 5 +++++ drivers/pcmcia/cardbus.c | 2 +- include/linux/pci.h | 3 +++ 4 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index e8dfdbd9327a..cadbed679fbb 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1107,6 +1107,12 @@ void __devinit pcibios_setup_bus_devices(struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { struct dev_archdata *sd = &dev->dev.archdata; + /* Cardbus can call us to add new devices to a bus, so ignore + * those who are already fully discovered + */ + if (dev->is_added) + continue; + /* Setup OF node pointer in archdata */ sd->of_node = pci_device_to_OF_node(dev); @@ -1147,6 +1153,13 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus) } EXPORT_SYMBOL(pcibios_fixup_bus); +void __devinit pci_fixup_cardbus(struct pci_bus *bus) +{ + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} + + static int skip_isa_ioresource_align(struct pci_dev *dev) { if ((ppc_pci_flags & PPC_PCI_CAN_SKIP_ISA_ALIGN) && diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d50522bf16b1..864e703cf737 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2798,6 +2798,11 @@ int __attribute__ ((weak)) pci_ext_cfg_avail(struct pci_dev *dev) return 1; } +void __weak pci_fixup_cardbus(struct pci_bus *bus) +{ +} +EXPORT_SYMBOL(pci_fixup_cardbus); + static int __init pci_setup(char *str) { while (str) { diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c index 
cdf50f3bc2df..d99f846451a3 100644 --- a/drivers/pcmcia/cardbus.c +++ b/drivers/pcmcia/cardbus.c @@ -222,7 +222,7 @@ int __ref cb_alloc(struct pcmcia_socket *s) unsigned int max, pass; s->functions = pci_scan_slot(bus, PCI_DEVFN(0, 0)); -/* pcibios_fixup_bus(bus); */ + pci_fixup_cardbus(bus); max = bus->secondary; for (pass = 0; pass < 2; pass++) diff --git a/include/linux/pci.h b/include/linux/pci.h index bf1e67080849..5da0690d9cee 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -566,6 +566,9 @@ void pcibios_align_resource(void *, struct resource *, resource_size_t, resource_size_t); void pcibios_update_irq(struct pci_dev *, int irq); +/* Weak but can be overriden by arch */ +void pci_fixup_cardbus(struct pci_bus *); + /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); -- cgit v1.2.3 From 4f924ba5b5aaf1477daafeae779495d39549186d Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Thu, 17 Dec 2009 00:08:33 +0900 Subject: sony-laptop: add AVMode key mapping Some models are equipped with an "AVMode" function key that sends sony-laptop: Unknown event: 0x100 0xa1 sony-laptop: Unknown event: 0x100 0x21 for press and release respectively. Cc: "Matthew W. S. 
Bell" Cc: Dmitry Torokhov Signed-off-by: Mattia Dongili Signed-off-by: Len Brown --- drivers/platform/x86/sony-laptop.c | 4 ++++ include/linux/sonypi.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index a2a742c8ff7e..9710f70040ba 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -232,6 +232,7 @@ static int sony_laptop_input_index[] = { 56, /* 69 SONYPI_EVENT_VOLUME_INC_PRESSED */ 57, /* 70 SONYPI_EVENT_VOLUME_DEC_PRESSED */ -1, /* 71 SONYPI_EVENT_BRIGHTNESS_PRESSED */ + 58, /* 72 SONYPI_EVENT_MEDIA_PRESSED */ }; static int sony_laptop_input_keycode_map[] = { @@ -293,6 +294,7 @@ static int sony_laptop_input_keycode_map[] = { KEY_F15, /* 55 SONYPI_EVENT_SETTINGKEY_PRESSED */ KEY_VOLUMEUP, /* 56 SONYPI_EVENT_VOLUME_INC_PRESSED */ KEY_VOLUMEDOWN, /* 57 SONYPI_EVENT_VOLUME_DEC_PRESSED */ + KEY_MEDIA, /* 58 SONYPI_EVENT_MEDIA_PRESSED */ }; /* release buttons after a short delay if pressed */ @@ -890,6 +892,8 @@ static struct sony_nc_event sony_100_events[] = { { 0x0C, SONYPI_EVENT_FNKEY_RELEASED }, { 0x9f, SONYPI_EVENT_CD_EJECT_PRESSED }, { 0x1f, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0xa1, SONYPI_EVENT_MEDIA_PRESSED }, + { 0x21, SONYPI_EVENT_ANYBUTTON_RELEASED }, { 0, 0 }, }; diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 34c4475ac4a2..4f95c1aac2fd 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -111,6 +111,7 @@ #define SONYPI_EVENT_VOLUME_INC_PRESSED 69 #define SONYPI_EVENT_VOLUME_DEC_PRESSED 70 #define SONYPI_EVENT_BRIGHTNESS_PRESSED 71 +#define SONYPI_EVENT_MEDIA_PRESSED 72 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 234da7bcdc7aaa935846534c3b726dbc79a9cdd5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 20:21:05 +0100 Subject: sched: Teach might_sleep() about preemptible RCU In practice, it is harmless to 
voluntarily sleep in a rcu_read_lock() section if we are running under preempt rcu, but it is illegal if we build a kernel running non-preemptable rcu. Currently, might_sleep() doesn't notice sleepable operations under rcu_read_lock() sections if we are running under preemptable rcu because preempt_count() is left untouched after rcu_read_lock() in this case. But we want developers who test their changes under such config to notice the "sleeping while atomic" issues. So we add rcu_read_lock_nesting to preempt_count() in might_sleep() checks. [ v2: Handle rcu-tiny ] Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Cc: Peter Zijlstra LKML-Reference: <1260991265-8451-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 11 +++++++++++ kernel/sched.c | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index c4ba9a78721e..96cc307ed9f4 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -101,4 +101,9 @@ static inline void exit_rcu(void) { } +static inline int rcu_preempt_depth(void) +{ + return 0; +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c93eee5911b0..8044b1b94333 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,6 +45,12 @@ extern void __rcu_read_unlock(void); extern void synchronize_rcu(void); extern void exit_rcu(void); +/* + * Defined as macro as it is a very low level header + * included from areas that don't even know about current + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static inline void __rcu_read_lock(void) @@ -63,6 +69,11 @@ static inline void exit_rcu(void) { } +static inline int rcu_preempt_depth(void) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ static inline void
__rcu_read_lock_bh(void) diff --git a/kernel/sched.c b/kernel/sched.c index af7dfa74e6bb..7be88a7be047 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9682,7 +9682,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -- cgit v1.2.3 From 5b74ed4729ad2b2017453add68104a83206caefb Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 16 Dec 2009 10:09:45 -0500 Subject: perf events: Remove unused perf_counter.h header file Since nothing includes the file and it's also not exported to user space, remove it. Signed-off-by: Robert P. J. Day Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 444 ------------------------------------------- 1 file changed, 444 deletions(-) delete mode 100644 include/linux/perf_counter.h (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h deleted file mode 100644 index e3fb25606706..000000000000 --- a/include/linux/perf_counter.h +++ /dev/null @@ -1,444 +0,0 @@ -/* - * NOTE: this file will be removed in a future kernel release, it is - * provided as a courtesy copy of user-space code that relies on the - * old (pre-rename) symbols and constants. - * - * Performance events: - * - * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra - * - * Data type definitions, declarations, prototypes. 
- * - * Started by: Thomas Gleixner and Ingo Molnar - * - * For licencing details see kernel-base/COPYING - */ -#ifndef _LINUX_PERF_COUNTER_H -#define _LINUX_PERF_COUNTER_H - -#include -#include -#include - -/* - * User-space ABI bits: - */ - -/* - * attr.type - */ -enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, - - PERF_TYPE_MAX, /* non-ABI */ -}; - -/* - * Generalized performance counter event types, used by the - * attr.event_id parameter of the sys_perf_counter_open() - * syscall: - */ -enum perf_hw_id { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_HW_CPU_CYCLES = 0, - PERF_COUNT_HW_INSTRUCTIONS = 1, - PERF_COUNT_HW_CACHE_REFERENCES = 2, - PERF_COUNT_HW_CACHE_MISSES = 3, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_HW_BRANCH_MISSES = 5, - PERF_COUNT_HW_BUS_CYCLES = 6, - - PERF_COUNT_HW_MAX, /* non-ABI */ -}; - -/* - * Generalized hardware cache counters: - * - * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x - * { read, write, prefetch } x - * { accesses, misses } - */ -enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - - PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, - PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ -}; - -/* - * Special "software" counters provided by the kernel, even if the hardware - * does not support performance counters. 
These counters measure various - * physical and sw events of the kernel (and allow the profiling of them as - * well): - */ -enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, - PERF_COUNT_SW_EMULATION_FAULTS = 8, - - PERF_COUNT_SW_MAX, /* non-ABI */ -}; - -/* - * Bits that can be set in attr.sample_type to request information - * in the overflow packets. - */ -enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_READ = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, - PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_RAW = 1U << 10, - - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ -}; - -/* - * The format of the data returned by read() on a perf counter fd, - * as specified by attr.read_format: - * - * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP - * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP - * }; - */ -enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, - PERF_FORMAT_GROUP = 1U << 3, - - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ -}; - -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_attr { - - /* - * Major 
type: hardware/software/tracepoint/etc. - */ - __u32 type; - - /* - * Size of the attr structure, for fwd/bwd compat. - */ - __u32 size; - - /* - * Type specific configuration information. - */ - __u64 config; - - union { - __u64 sample_period; - __u64 sample_freq; - }; - - __u64 sample_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - mmap : 1, /* include mmap data */ - comm : 1, /* include comm data */ - freq : 1, /* use freq, not period */ - inherit_stat : 1, /* per task counts */ - enable_on_exec : 1, /* next exec enables */ - task : 1, /* trace fork/exit */ - watermark : 1, /* wakeup_watermark */ - - __reserved_1 : 49; - - union { - __u32 wakeup_events; /* wakeup every n events */ - __u32 wakeup_watermark; /* bytes before wakeup */ - }; - __u32 __reserved_2; - - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) -#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) -#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) -#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_COUNTER_IOC_SET_FILTER _IOW('$', 6, char *) - -enum perf_counter_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, -}; - -/* - * Structure of the page that can be mapped via mmap - */ -struct perf_counter_mmap_page { - __u32 version; /* version number of this structure */ - __u32 compat_version; /* lowest version this is compat with */ - - /* - * Bits needed to read the hw counters in user-space. 
- * - * u32 seq; - * s64 count; - * - * do { - * seq = pc->lock; - * - * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; - * - * barrier(); - * } while (pc->lock != seq); - * - * NOTE: for obvious reason this only works on self-monitoring - * processes. - */ - __u32 lock; /* seqlock for synchronization */ - __u32 index; /* hardware counter identifier */ - __s64 offset; /* add to hardware counter value */ - __u64 time_enabled; /* time counter active */ - __u64 time_running; /* time counter on cpu */ - - /* - * Hole for extension of the self monitor capabilities - */ - - __u64 __reserved[123]; /* align to 1k */ - - /* - * Control data for the mmap() data buffer. - * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_counter_wakeup(). - * - * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. - */ - __u64 data_head; /* head in the data section */ - __u64 data_tail; /* user-space written tail */ -}; - -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) - -struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; -}; - -enum perf_event_type { - - /* - * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. 
They have the following structure: - * - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; - * }; - */ - PERF_EVENT_MMAP = 1, - - /* - * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; - * }; - */ - PERF_EVENT_LOST = 2, - - /* - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * char comm[]; - * }; - */ - PERF_EVENT_COMM = 3, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * u64 time; - * }; - */ - PERF_EVENT_EXIT = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; - * u64 stream_id; - * }; - */ - PERF_EVENT_THROTTLE = 5, - PERF_EVENT_UNTHROTTLE = 6, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * u64 time; - * }; - */ - PERF_EVENT_FORK = 7, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, tid; - * - * struct read_format values; - * }; - */ - PERF_EVENT_READ = 8, - - /* - * struct { - * struct perf_event_header header; - * - * { u64 ip; } && PERF_SAMPLE_IP - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 addr; } && PERF_SAMPLE_ADDR - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD - * - * { struct read_format values; } && PERF_SAMPLE_READ - * - * { u64 nr, - * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN - * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
- * # - * - * { u32 size; - * char data[size];}&& PERF_SAMPLE_RAW - * }; - */ - PERF_EVENT_SAMPLE = 9, - - PERF_EVENT_MAX, /* non-ABI */ -}; - -enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, - - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, - - PERF_CONTEXT_MAX = (__u64)-4095, -}; - -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) - -/* - * In case some app still references the old symbols: - */ - -#define __NR_perf_counter_open __NR_perf_event_open - -#define PR_TASK_PERF_COUNTERS_DISABLE PR_TASK_PERF_EVENTS_DISABLE -#define PR_TASK_PERF_COUNTERS_ENABLE PR_TASK_PERF_EVENTS_ENABLE - -#endif /* _LINUX_PERF_COUNTER_H */ -- cgit v1.2.3 From 27f37e4bfed803be338dcc78845d4a67eefb40a0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 25 Sep 2009 09:39:26 +0200 Subject: regulator: add driver for MAX8660/8661 Tested with a MX25-based custom board. Signed-off-by: Wolfram Sang Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/Kconfig | 7 + drivers/regulator/Makefile | 1 + drivers/regulator/max8660.c | 510 ++++++++++++++++++++++++++++++++++++++ include/linux/regulator/max8660.h | 57 +++++ 4 files changed, 575 insertions(+) create mode 100644 drivers/regulator/max8660.c create mode 100644 include/linux/regulator/max8660.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 7cfdd65bebb4..9e0aa14dc6af 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -69,6 +69,13 @@ config REGULATOR_MAX1586 regulator via I2C bus. The provided regulator is suitable for PXA27x chips to control VCC_CORE and VCC_USIM voltages. +config REGULATOR_MAX8660 + tristate "Maxim 8660/8661 voltage regulator" + depends on I2C + help + This driver controls a Maxim 8660/8661 voltage output + regulator via I2C bus. 
+ config REGULATOR_TWL4030 bool "TI TWL4030/TWL5030/TWL6030/TPS695x0 PMIC" depends on TWL4030_CORE diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 9ae3cc44e668..12285e41beec 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o obj-$(CONFIG_REGULATOR_LP3971) += lp3971.o obj-$(CONFIG_REGULATOR_MAX1586) += max1586.o obj-$(CONFIG_REGULATOR_TWL4030) += twl-regulator.o +obj-$(CONFIG_REGULATOR_MAX8660) += max8660.o obj-$(CONFIG_REGULATOR_WM831X) += wm831x-dcdc.o obj-$(CONFIG_REGULATOR_WM831X) += wm831x-isink.o obj-$(CONFIG_REGULATOR_WM831X) += wm831x-ldo.o diff --git a/drivers/regulator/max8660.c b/drivers/regulator/max8660.c new file mode 100644 index 000000000000..acc2fb7b6087 --- /dev/null +++ b/drivers/regulator/max8660.c @@ -0,0 +1,510 @@ +/* + * max8660.c -- Voltage regulation for the Maxim 8660/8661 + * + * based on max1586.c and wm8400-regulator.c + * + * Copyright (C) 2009 Wolfram Sang, Pengutronix e.K. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place, Suite 330, Boston, MA 02111-1307 USA + * + * Some info: + * + * Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX8660-MAX8661.pdf + * + * This chip is a bit nasty because it is a write-only device. Thus, the driver + * uses shadow registers to keep track of its values. 
The main problem appears + * to be the initialization: When Linux boots up, we cannot know if the chip is + * in the default state or not, so we would have to pass such information in + * platform_data. As this adds a bit of complexity to the driver, this is left + * out for now until it is really needed. + * + * [A|S|M]DTV1 registers are currently not used, only [A|S|M]DTV2 are. + * + * If the driver is feature complete, it might be worth checking whether one + * set of functions for V3-V7 is sufficient. For maximum flexibility during + * development, they are separated for now. + * + */ + +#include +#include +#include +#include +#include +#include + +#define MAX8660_DCDC_MIN_UV 725000 +#define MAX8660_DCDC_MAX_UV 1800000 +#define MAX8660_DCDC_STEP 25000 +#define MAX8660_DCDC_MAX_SEL 0x2b + +#define MAX8660_LDO5_MIN_UV 1700000 +#define MAX8660_LDO5_MAX_UV 2000000 +#define MAX8660_LDO5_STEP 25000 +#define MAX8660_LDO5_MAX_SEL 0x0c + +#define MAX8660_LDO67_MIN_UV 1800000 +#define MAX8660_LDO67_MAX_UV 3300000 +#define MAX8660_LDO67_STEP 100000 +#define MAX8660_LDO67_MAX_SEL 0x0f + +enum { + MAX8660_OVER1, + MAX8660_OVER2, + MAX8660_VCC1, + MAX8660_ADTV1, + MAX8660_ADTV2, + MAX8660_SDTV1, + MAX8660_SDTV2, + MAX8660_MDTV1, + MAX8660_MDTV2, + MAX8660_L12VCR, + MAX8660_FPWM, + MAX8660_N_REGS, /* not a real register */ +}; + +struct max8660 { + struct i2c_client *client; + u8 shadow_regs[MAX8660_N_REGS]; /* as chip is write only */ + struct regulator_dev *rdev[]; +}; + +static int max8660_write(struct max8660 *max8660, u8 reg, u8 mask, u8 val) +{ + static const u8 max8660_addresses[MAX8660_N_REGS] = + { 0x10, 0x12, 0x20, 0x23, 0x24, 0x29, 0x2a, 0x32, 0x33, 0x39, 0x80 }; + + int ret; + u8 reg_val = (max8660->shadow_regs[reg] & mask) | val; + dev_vdbg(&max8660->client->dev, "Writing reg %02x with %02x\n", + max8660_addresses[reg], reg_val); + + ret = i2c_smbus_write_byte_data(max8660->client, + max8660_addresses[reg], reg_val); + if (ret == 0) + max8660->shadow_regs[reg] = reg_val; + 
+ return ret; +} + + +/* + * DCDC functions + */ + +static int max8660_dcdc_is_enabled(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 val = max8660->shadow_regs[MAX8660_OVER1]; + u8 mask = (rdev_get_id(rdev) == MAX8660_V3) ? 1 : 4; + return !!(val & mask); +} + +static int max8660_dcdc_enable(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 bit = (rdev_get_id(rdev) == MAX8660_V3) ? 1 : 4; + return max8660_write(max8660, MAX8660_OVER1, 0xff, bit); +} + +static int max8660_dcdc_disable(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 mask = (rdev_get_id(rdev) == MAX8660_V3) ? ~1 : ~4; + return max8660_write(max8660, MAX8660_OVER1, mask, 0); +} + +static int max8660_dcdc_list(struct regulator_dev *rdev, unsigned selector) +{ + if (selector > MAX8660_DCDC_MAX_SEL) + return -EINVAL; + return MAX8660_DCDC_MIN_UV + selector * MAX8660_DCDC_STEP; +} + +static int max8660_dcdc_get(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 reg = (rdev_get_id(rdev) == MAX8660_V3) ? MAX8660_ADTV2 : MAX8660_SDTV2; + u8 selector = max8660->shadow_regs[reg]; + return MAX8660_DCDC_MIN_UV + selector * MAX8660_DCDC_STEP; +} + +static int max8660_dcdc_set(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 reg, selector, bits; + int ret; + + if (min_uV < MAX8660_DCDC_MIN_UV || min_uV > MAX8660_DCDC_MAX_UV) + return -EINVAL; + if (max_uV < MAX8660_DCDC_MIN_UV || max_uV > MAX8660_DCDC_MAX_UV) + return -EINVAL; + + selector = (min_uV - (MAX8660_DCDC_MIN_UV - MAX8660_DCDC_STEP + 1)) + / MAX8660_DCDC_STEP; + + ret = max8660_dcdc_list(rdev, selector); + if (ret < 0 || ret > max_uV) + return -EINVAL; + + reg = (rdev_get_id(rdev) == MAX8660_V3) ? 
MAX8660_ADTV2 : MAX8660_SDTV2; + ret = max8660_write(max8660, reg, 0, selector); + if (ret) + return ret; + + /* Select target voltage register and activate regulation */ + bits = (rdev_get_id(rdev) == MAX8660_V3) ? 0x03 : 0x30; + return max8660_write(max8660, MAX8660_VCC1, 0xff, bits); +} + +static struct regulator_ops max8660_dcdc_ops = { + .is_enabled = max8660_dcdc_is_enabled, + .list_voltage = max8660_dcdc_list, + .set_voltage = max8660_dcdc_set, + .get_voltage = max8660_dcdc_get, +}; + + +/* + * LDO5 functions + */ + +static int max8660_ldo5_list(struct regulator_dev *rdev, unsigned selector) +{ + if (selector > MAX8660_LDO5_MAX_SEL) + return -EINVAL; + return MAX8660_LDO5_MIN_UV + selector * MAX8660_LDO5_STEP; +} + +static int max8660_ldo5_get(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 selector = max8660->shadow_regs[MAX8660_MDTV2]; + + return MAX8660_LDO5_MIN_UV + selector * MAX8660_LDO5_STEP; +} + +static int max8660_ldo5_set(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 selector; + int ret; + + if (min_uV < MAX8660_LDO5_MIN_UV || min_uV > MAX8660_LDO5_MAX_UV) + return -EINVAL; + if (max_uV < MAX8660_LDO5_MIN_UV || max_uV > MAX8660_LDO5_MAX_UV) + return -EINVAL; + + selector = (min_uV - (MAX8660_LDO5_MIN_UV - MAX8660_LDO5_STEP + 1)) + / MAX8660_LDO5_STEP; + ret = max8660_ldo5_list(rdev, selector); + if (ret < 0 || ret > max_uV) + return -EINVAL; + + ret = max8660_write(max8660, MAX8660_MDTV2, 0, selector); + if (ret) + return ret; + + /* Select target voltage register and activate regulation */ + return max8660_write(max8660, MAX8660_VCC1, 0xff, 0xc0); +} + +static struct regulator_ops max8660_ldo5_ops = { + .list_voltage = max8660_ldo5_list, + .set_voltage = max8660_ldo5_set, + .get_voltage = max8660_ldo5_get, +}; + + +/* + * LDO67 functions + */ + +static int max8660_ldo67_is_enabled(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = 
rdev_get_drvdata(rdev); + u8 val = max8660->shadow_regs[MAX8660_OVER2]; + u8 mask = (rdev_get_id(rdev) == MAX8660_V6) ? 2 : 4; + return !!(val & mask); +} + +static int max8660_ldo67_enable(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 bit = (rdev_get_id(rdev) == MAX8660_V6) ? 2 : 4; + return max8660_write(max8660, MAX8660_OVER2, 0xff, bit); +} + +static int max8660_ldo67_disable(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 mask = (rdev_get_id(rdev) == MAX8660_V6) ? ~2 : ~4; + return max8660_write(max8660, MAX8660_OVER2, mask, 0); +} + +static int max8660_ldo67_list(struct regulator_dev *rdev, unsigned selector) +{ + if (selector > MAX8660_LDO67_MAX_SEL) + return -EINVAL; + return MAX8660_LDO67_MIN_UV + selector * MAX8660_LDO67_STEP; +} + +static int max8660_ldo67_get(struct regulator_dev *rdev) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 shift = (rdev_get_id(rdev) == MAX8660_V6) ? 0 : 4; + u8 selector = (max8660->shadow_regs[MAX8660_L12VCR] >> shift) & 0xf; + + return MAX8660_LDO67_MIN_UV + selector * MAX8660_LDO67_STEP; +} + +static int max8660_ldo67_set(struct regulator_dev *rdev, int min_uV, int max_uV) +{ + struct max8660 *max8660 = rdev_get_drvdata(rdev); + u8 selector; + int ret; + + if (min_uV < MAX8660_LDO67_MIN_UV || min_uV > MAX8660_LDO67_MAX_UV) + return -EINVAL; + if (max_uV < MAX8660_LDO67_MIN_UV || max_uV > MAX8660_LDO67_MAX_UV) + return -EINVAL; + + selector = (min_uV - (MAX8660_LDO67_MIN_UV - MAX8660_LDO67_STEP + 1)) + / MAX8660_LDO67_STEP; + + ret = max8660_ldo67_list(rdev, selector); + if (ret < 0 || ret > max_uV) + return -EINVAL; + + if (rdev_get_id(rdev) == MAX8660_V6) + return max8660_write(max8660, MAX8660_L12VCR, 0xf0, selector); + else + return max8660_write(max8660, MAX8660_L12VCR, 0x0f, selector << 4); +} + +static struct regulator_ops max8660_ldo67_ops = { + .is_enabled = max8660_ldo67_is_enabled, + .enable = max8660_ldo67_enable, + 
.disable = max8660_ldo67_disable, + .list_voltage = max8660_ldo67_list, + .get_voltage = max8660_ldo67_get, + .set_voltage = max8660_ldo67_set, +}; + +static struct regulator_desc max8660_reg[] = { + { + .name = "V3(DCDC)", + .id = MAX8660_V3, + .ops = &max8660_dcdc_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX8660_DCDC_MAX_SEL + 1, + .owner = THIS_MODULE, + }, + { + .name = "V4(DCDC)", + .id = MAX8660_V4, + .ops = &max8660_dcdc_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX8660_DCDC_MAX_SEL + 1, + .owner = THIS_MODULE, + }, + { + .name = "V5(LDO)", + .id = MAX8660_V5, + .ops = &max8660_ldo5_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX8660_LDO5_MAX_SEL + 1, + .owner = THIS_MODULE, + }, + { + .name = "V6(LDO)", + .id = MAX8660_V6, + .ops = &max8660_ldo67_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX8660_LDO67_MAX_SEL + 1, + .owner = THIS_MODULE, + }, + { + .name = "V7(LDO)", + .id = MAX8660_V7, + .ops = &max8660_ldo67_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = MAX8660_LDO67_MAX_SEL + 1, + .owner = THIS_MODULE, + }, +}; + +static int max8660_probe(struct i2c_client *client, + const struct i2c_device_id *i2c_id) +{ + struct regulator_dev **rdev; + struct max8660_platform_data *pdata = client->dev.platform_data; + struct max8660 *max8660; + int boot_on, i, id, ret = -EINVAL; + + if (pdata->num_subdevs > MAX8660_V_END) { + dev_err(&client->dev, "Too much regulators found!\n"); + goto out; + } + + max8660 = kzalloc(sizeof(struct max8660) + + sizeof(struct regulator_dev *) * MAX8660_V_END, + GFP_KERNEL); + if (!max8660) { + ret = -ENOMEM; + goto out; + } + + max8660->client = client; + rdev = max8660->rdev; + + if (pdata->en34_is_high) { + /* Simulate always on */ + max8660->shadow_regs[MAX8660_OVER1] = 5; + } else { + /* Otherwise devices can be toggled via software */ + max8660_dcdc_ops.enable = max8660_dcdc_enable; + max8660_dcdc_ops.disable = max8660_dcdc_disable; + } + + /* + * First, set up shadow registers to prevent glitches. 
As some + * registers are shared between regulators, everything must be properly + * set up for all regulators in advance. + */ + max8660->shadow_regs[MAX8660_ADTV1] = + max8660->shadow_regs[MAX8660_ADTV2] = + max8660->shadow_regs[MAX8660_SDTV1] = + max8660->shadow_regs[MAX8660_SDTV2] = 0x1b; + max8660->shadow_regs[MAX8660_MDTV1] = + max8660->shadow_regs[MAX8660_MDTV2] = 0x04; + + for (i = 0; i < pdata->num_subdevs; i++) { + + if (!pdata->subdevs[i].platform_data) + goto err_free; + + boot_on = pdata->subdevs[i].platform_data->constraints.boot_on; + + switch (pdata->subdevs[i].id) { + case MAX8660_V3: + if (boot_on) + max8660->shadow_regs[MAX8660_OVER1] |= 1; + break; + + case MAX8660_V4: + if (boot_on) + max8660->shadow_regs[MAX8660_OVER1] |= 4; + break; + + case MAX8660_V5: + break; + + case MAX8660_V6: + if (boot_on) + max8660->shadow_regs[MAX8660_OVER2] |= 2; + break; + + case MAX8660_V7: + if (!strcmp(i2c_id->name, "max8661")) { + dev_err(&client->dev, "Regulator not on this chip!\n"); + goto err_free; + } + + if (boot_on) + max8660->shadow_regs[MAX8660_OVER2] |= 4; + break; + + default: + dev_err(&client->dev, "invalid regulator %s\n", + pdata->subdevs[i].name); + goto err_free; + } + } + + /* Finally register devices */ + for (i = 0; i < pdata->num_subdevs; i++) { + + id = pdata->subdevs[i].id; + + rdev[i] = regulator_register(&max8660_reg[id], &client->dev, + pdata->subdevs[i].platform_data, + max8660); + if (IS_ERR(rdev[i])) { + ret = PTR_ERR(rdev[i]); + dev_err(&client->dev, "failed to register %s\n", + max8660_reg[id].name); + goto err_unregister; + } + } + + i2c_set_clientdata(client, rdev); + dev_info(&client->dev, "Maxim 8660/8661 regulator driver loaded\n"); + return 0; + +err_unregister: + while (--i >= 0) + regulator_unregister(rdev[i]); +err_free: + kfree(max8660); +out: + return ret; +} + +static int max8660_remove(struct i2c_client *client) +{ + struct regulator_dev **rdev = i2c_get_clientdata(client); + int i; + + for (i = 0; i < MAX8660_V_END; 
i++) + if (rdev[i]) + regulator_unregister(rdev[i]); + kfree(rdev); + i2c_set_clientdata(client, NULL); + + return 0; +} + +static const struct i2c_device_id max8660_id[] = { + { "max8660", 0 }, + { "max8661", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, max8660_id); + +static struct i2c_driver max8660_driver = { + .probe = max8660_probe, + .remove = max8660_remove, + .driver = { + .name = "max8660", + }, + .id_table = max8660_id, +}; + +static int __init max8660_init(void) +{ + return i2c_add_driver(&max8660_driver); +} +subsys_initcall(max8660_init); + +static void __exit max8660_exit(void) +{ + i2c_del_driver(&max8660_driver); +} +module_exit(max8660_exit); + +/* Module information */ +MODULE_DESCRIPTION("MAXIM 8660/8661 voltage regulator driver"); +MODULE_AUTHOR("Wolfram Sang"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/regulator/max8660.h b/include/linux/regulator/max8660.h new file mode 100644 index 000000000000..9936763621c7 --- /dev/null +++ b/include/linux/regulator/max8660.h @@ -0,0 +1,57 @@ +/* + * max8660.h -- Voltage regulation for the Maxim 8660/8661 + * + * Copyright (C) 2009 Wolfram Sang, Pengutronix e.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __LINUX_REGULATOR_MAX8660_H +#define __LINUX_REGULATOR_MAX8660_H + +#include + +enum { + MAX8660_V3, + MAX8660_V4, + MAX8660_V5, + MAX8660_V6, + MAX8660_V7, + MAX8660_V_END, +}; + +/** + * max8660_subdev_data - regulator subdev data + * @id: regulator id + * @name: regulator name + * @platform_data: regulator init data + */ +struct max8660_subdev_data { + int id; + char *name; + struct regulator_init_data *platform_data; +}; + +/** + * max8660_platform_data - platform data for max8660 + * @num_subdevs: number of regulators used + * @subdevs: pointer to regulators used + * @en34_is_high: if EN34 is driven high, regulators cannot be en-/disabled. + */ +struct max8660_platform_data { + int num_subdevs; + struct max8660_subdev_data *subdevs; + unsigned en34_is_high:1; +}; +#endif -- cgit v1.2.3 From e24a04c44cf312e88b50006a91ad7ffc1c0d97a5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 22 Sep 2009 08:47:11 -0700 Subject: regulator: Implement WM831x BuckWise DC-DC convertor DVS support The BuckWise DC-DC convertors in WM831x devices support switching to a second output voltage using the logic level on one of the device pins. This is intended to allow rapid voltage switching for uses like cpufreq, replacing the I2C or SPI write used to configure the voltage of the regulator with a much faster GPIO status change. This is implemented by keeping the DVS voltage configured as the maximum voltage permitted for the regulator. If a request is made for the maximum voltage then the GPIO is used to switch to the DVS voltage, otherwise the normal ON voltage is updated and used. 
This follows the idiom used by most cpufreq drivers, which drop the minimum voltage as the core frequency is dropped but use a constant maximum - raising the voltage should normally be fast, but lowering it may be slower. Configuration of the DVS MFP on the device should be done externally, for example via OTP. Support is present in the hardware for monitoring the status of the transition using a second GPIO. This is not currently implemented but platform data is provided for it - the driver currently assumes that the device will be configured to transition immediately - but platform data is provided to reduce merge issues once it is. Signed-off-by: Mark Brown Acked-by: Samuel Ortiz Signed-off-by: Liam Girdwood --- drivers/regulator/wm831x-dcdc.c | 207 +++++++++++++++++++++++++++++++++++---- include/linux/mfd/wm831x/pdata.h | 17 ++++ 2 files changed, 206 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c index 2eefc1a0cf08..0a6577577e8d 100644 --- a/drivers/regulator/wm831x-dcdc.c +++ b/drivers/regulator/wm831x-dcdc.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -39,6 +41,7 @@ #define WM831X_DCDC_CONTROL_2 1 #define WM831X_DCDC_ON_CONFIG 2 #define WM831X_DCDC_SLEEP_CONTROL 3 +#define WM831X_DCDC_DVS_CONTROL 4 /* * Shared @@ -50,6 +53,10 @@ struct wm831x_dcdc { int base; struct wm831x *wm831x; struct regulator_dev *regulator; + int dvs_gpio; + int dvs_gpio_state; + int on_vsel; + int dvs_vsel; }; static int wm831x_dcdc_is_enabled(struct regulator_dev *rdev) @@ -240,11 +247,9 @@ static int wm831x_buckv_list_voltage(struct regulator_dev *rdev, return -EINVAL; } -static int wm831x_buckv_set_voltage_int(struct regulator_dev *rdev, int reg, - int min_uV, int max_uV) +static int wm831x_buckv_select_min_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV) { - struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); - struct wm831x *wm831x = 
dcdc->wm831x; u16 vsel; if (min_uV < 600000) @@ -257,39 +262,126 @@ static int wm831x_buckv_set_voltage_int(struct regulator_dev *rdev, int reg, if (wm831x_buckv_list_voltage(rdev, vsel) > max_uV) return -EINVAL; - return wm831x_set_bits(wm831x, reg, WM831X_DC1_ON_VSEL_MASK, vsel); + return vsel; +} + +static int wm831x_buckv_select_max_voltage(struct regulator_dev *rdev, + int min_uV, int max_uV) +{ + u16 vsel; + + if (max_uV < 600000 || max_uV > 1800000) + return -EINVAL; + + vsel = ((max_uV - 600000) / 12500) + 8; + + if (wm831x_buckv_list_voltage(rdev, vsel) < min_uV || + wm831x_buckv_list_voltage(rdev, vsel) < max_uV) + return -EINVAL; + + return vsel; +} + +static int wm831x_buckv_set_dvs(struct regulator_dev *rdev, int state) +{ + struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); + + if (state == dcdc->dvs_gpio_state) + return 0; + + dcdc->dvs_gpio_state = state; + gpio_set_value(dcdc->dvs_gpio, state); + + /* Should wait for DVS state change to be asserted if we have + * a GPIO for it, for now assume the device is configured + * for the fastest possible transition. 
+ */ + + return 0; } static int wm831x_buckv_set_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) + int min_uV, int max_uV) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); - u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG; + struct wm831x *wm831x = dcdc->wm831x; + int on_reg = dcdc->base + WM831X_DCDC_ON_CONFIG; + int dvs_reg = dcdc->base + WM831X_DCDC_DVS_CONTROL; + int vsel, ret; + + vsel = wm831x_buckv_select_min_voltage(rdev, min_uV, max_uV); + if (vsel < 0) + return vsel; + + /* If this value is already set then do a GPIO update if we can */ + if (dcdc->dvs_gpio && dcdc->on_vsel == vsel) + return wm831x_buckv_set_dvs(rdev, 0); + + if (dcdc->dvs_gpio && dcdc->dvs_vsel == vsel) + return wm831x_buckv_set_dvs(rdev, 1); + + /* Always set the ON status to the minimum voltage */ + ret = wm831x_set_bits(wm831x, on_reg, WM831X_DC1_ON_VSEL_MASK, vsel); + if (ret < 0) + return ret; + dcdc->on_vsel = vsel; + + if (!dcdc->dvs_gpio) + return ret; + + /* Kick the voltage transition now */ + ret = wm831x_buckv_set_dvs(rdev, 0); + if (ret < 0) + return ret; + + /* Set the high voltage as the DVS voltage. This is optimised + * for CPUfreq usage, most processors will keep the maximum + * voltage constant and lower the minimum with the frequency. 
*/ + vsel = wm831x_buckv_select_max_voltage(rdev, min_uV, max_uV); + if (vsel < 0) { + /* This should never happen - at worst the same vsel + * should be chosen */ + WARN_ON(vsel < 0); + return 0; + } + + /* Don't bother if it's the same VSEL we're already using */ + if (vsel == dcdc->on_vsel) + return 0; - return wm831x_buckv_set_voltage_int(rdev, reg, min_uV, max_uV); + ret = wm831x_set_bits(wm831x, dvs_reg, WM831X_DC1_DVS_VSEL_MASK, vsel); + if (ret == 0) + dcdc->dvs_vsel = vsel; + else + dev_warn(wm831x->dev, "Failed to set DCDC DVS VSEL: %d\n", + ret); + + return 0; } static int wm831x_buckv_set_suspend_voltage(struct regulator_dev *rdev, - int uV) + int uV) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); + struct wm831x *wm831x = dcdc->wm831x; u16 reg = dcdc->base + WM831X_DCDC_SLEEP_CONTROL; + int vsel; + + vsel = wm831x_buckv_select_min_voltage(rdev, uV, uV); + if (vsel < 0) + return vsel; - return wm831x_buckv_set_voltage_int(rdev, reg, uV, uV); + return wm831x_set_bits(wm831x, reg, WM831X_DC1_SLP_VSEL_MASK, vsel); } static int wm831x_buckv_get_voltage(struct regulator_dev *rdev) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); - struct wm831x *wm831x = dcdc->wm831x; - u16 reg = dcdc->base + WM831X_DCDC_ON_CONFIG; - int val; - val = wm831x_reg_read(wm831x, reg); - if (val < 0) - return val; - - return wm831x_buckv_list_voltage(rdev, val & WM831X_DC1_ON_VSEL_MASK); + if (dcdc->dvs_gpio && dcdc->dvs_gpio_state) + return wm831x_buckv_list_voltage(rdev, dcdc->dvs_vsel); + else + return wm831x_buckv_list_voltage(rdev, dcdc->on_vsel); } /* Current limit options */ @@ -346,6 +438,64 @@ static struct regulator_ops wm831x_buckv_ops = { .set_suspend_mode = wm831x_dcdc_set_suspend_mode, }; +/* + * Set up DVS control. We just log errors since we can still run + * (with reduced performance) if we fail. 
+ */ +static __devinit void wm831x_buckv_dvs_init(struct wm831x_dcdc *dcdc, + struct wm831x_buckv_pdata *pdata) +{ + struct wm831x *wm831x = dcdc->wm831x; + int ret; + u16 ctrl; + + if (!pdata || !pdata->dvs_gpio) + return; + + switch (pdata->dvs_control_src) { + case 1: + ctrl = 2 << WM831X_DC1_DVS_SRC_SHIFT; + break; + case 2: + ctrl = 3 << WM831X_DC1_DVS_SRC_SHIFT; + break; + default: + dev_err(wm831x->dev, "Invalid DVS control source %d for %s\n", + pdata->dvs_control_src, dcdc->name); + return; + } + + ret = wm831x_set_bits(wm831x, dcdc->base + WM831X_DCDC_DVS_CONTROL, + WM831X_DC1_DVS_SRC_MASK, ctrl); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to set %s DVS source: %d\n", + dcdc->name, ret); + return; + } + + ret = gpio_request(pdata->dvs_gpio, "DCDC DVS"); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to get %s DVS GPIO: %d\n", + dcdc->name, ret); + return; + } + + /* gpiolib won't let us read the GPIO status so pick the higher + * of the two existing voltages so we take it as platform data. 
+ */ + dcdc->dvs_gpio_state = pdata->dvs_init_state; + + ret = gpio_direction_output(pdata->dvs_gpio, dcdc->dvs_gpio_state); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to enable %s DVS GPIO: %d\n", + dcdc->name, ret); + gpio_free(pdata->dvs_gpio); + return; + } + + dcdc->dvs_gpio = pdata->dvs_gpio; +} + static __devinit int wm831x_buckv_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); @@ -384,6 +534,23 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev) dcdc->desc.ops = &wm831x_buckv_ops; dcdc->desc.owner = THIS_MODULE; + ret = wm831x_reg_read(wm831x, dcdc->base + WM831X_DCDC_ON_CONFIG); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to read ON VSEL: %d\n", ret); + goto err; + } + dcdc->on_vsel = ret & WM831X_DC1_ON_VSEL_MASK; + + ret = wm831x_reg_read(wm831x, dcdc->base + WM831X_DCDC_ON_CONFIG); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to read DVS VSEL: %d\n", ret); + goto err; + } + dcdc->dvs_vsel = ret & WM831X_DC1_DVS_VSEL_MASK; + + if (pdata->dcdc[id]) + wm831x_buckv_dvs_init(dcdc, pdata->dcdc[id]->driver_data); + dcdc->regulator = regulator_register(&dcdc->desc, &pdev->dev, pdata->dcdc[id], dcdc); if (IS_ERR(dcdc->regulator)) { @@ -422,6 +589,8 @@ err_uv: err_regulator: regulator_unregister(dcdc->regulator); err: + if (dcdc->dvs_gpio) + gpio_free(dcdc->dvs_gpio); kfree(dcdc); return ret; } @@ -434,6 +603,8 @@ static __devexit int wm831x_buckv_remove(struct platform_device *pdev) wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "HC"), dcdc); wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc); regulator_unregister(dcdc->regulator); + if (dcdc->dvs_gpio) + gpio_free(dcdc->dvs_gpio); kfree(dcdc); return 0; diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h index 415c228743d5..fd322aca33ba 100644 --- a/include/linux/mfd/wm831x/pdata.h +++ b/include/linux/mfd/wm831x/pdata.h @@ -41,6 +41,23 @@ struct wm831x_battery_pdata { int timeout; 
/** Charge cycle timeout, in minutes */ }; +/** + * Configuration for the WM831x DC-DC BuckWise convertors. This + * should be passed as driver_data in the regulator_init_data. + * + * Currently all the configuration is for the fast DVS switching + * support of the devices. This allows MFPs on the device to be + * configured as an input to switch between two output voltages, + * allowing voltage transitions without the expense of an access over + * I2C or SPI buses. + */ +struct wm831x_buckv_pdata { + int dvs_gpio; /** CPU GPIO to use for DVS switching */ + int dvs_control_src; /** Hardware DVS source to use (1 or 2) */ + int dvs_init_state; /** DVS state to expect on startup */ + int dvs_state_gpio; /** CPU GPIO to use for monitoring status */ +}; + /* Sources for status LED configuration. Values are register values * plus 1 to allow for a zero default for preserve. */ -- cgit v1.2.3 From 638f85c54f4fed0f8f1fbc23745a8f334112e892 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 22 Oct 2009 16:31:33 +0100 Subject: regulator: Handle regulators without suspend mode configuration Since some regulators in the system may not support suspend mode configuration we need to allow some regulators to have a missing suspend mode configuration. Do this by requiring that disabled regulators are explicitly flagged and then skip over regulators that have no state specified. Try to avoid surprises by warning the if we could set the state but no configuration is provided. This also ensures that an all zeros configuration generates a warning rather than silently disabling the regulator. 
Reported-by: Joonyoung Shim Signed-off-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 25 ++++++++++++++++++++++--- include/linux/regulator/machine.h | 6 +++++- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 7d0c0d7d90ca..2dab0d9e11f5 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -581,10 +581,29 @@ static int suspend_set_state(struct regulator_dev *rdev, struct regulator_state *rstate) { int ret = 0; + bool can_set_state; - /* enable & disable are mandatory for suspend control */ - if (!rdev->desc->ops->set_suspend_enable || - !rdev->desc->ops->set_suspend_disable) { + can_set_state = rdev->desc->ops->set_suspend_enable && + rdev->desc->ops->set_suspend_disable; + + /* If we have no suspend mode configration don't set anything; + * only warn if the driver actually makes the suspend mode + * configurable. + */ + if (!rstate->enabled && !rstate->disabled) { + if (can_set_state) + printk(KERN_WARNING "%s: No configuration for %s\n", + __func__, rdev_get_name(rdev)); + return 0; + } + + if (rstate->enabled && rstate->disabled) { + printk(KERN_ERR "%s: invalid configuration for %s\n", + __func__, rdev_get_name(rdev)); + return -EINVAL; + } + + if (!can_set_state) { printk(KERN_ERR "%s: no way to set suspend state\n", __func__); return -EINVAL; diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 87f5f176d4ef..234a8476cba8 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -43,16 +43,20 @@ struct regulator; /** * struct regulator_state - regulator state during low power system states * - * This describes a regulators state during a system wide low power state. + * This describes a regulators state during a system wide low power + * state. One of enabled or disabled must be set for the + * configuration to be applied. 
* * @uV: Operating voltage during suspend. * @mode: Operating mode during suspend. * @enabled: Enabled during suspend. + * @disabled: Disabled during suspend. */ struct regulator_state { int uV; /* suspend voltage */ unsigned int mode; /* suspend regulator operating mode */ int enabled; /* is regulator enabled in this suspend state */ + int disabled; /* is the regulator disbled in this suspend state */ }; /** -- cgit v1.2.3 From b56daf13eb77ee24f48f0bb34c2492f46a432ec4 Mon Sep 17 00:00:00 2001 From: Liam Girdwood Date: Wed, 11 Nov 2009 14:16:10 +0000 Subject: regulator: consumer.h - fix build when consumer.h is #included first. consumer.h requires device.h for stand alone build. Signed-off-by: Liam Girdwood --- include/linux/regulator/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 490c5b37b6d7..030d92255c7a 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -35,6 +35,8 @@ #ifndef __LINUX_REGULATOR_CONSUMER_H_ #define __LINUX_REGULATOR_CONSUMER_H_ +#include + /* * Regulator operating modes. * -- cgit v1.2.3 From d4cc6a2eee98faebf2c7d3ebc4b35541c1d47d21 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Mon, 7 Dec 2009 15:08:13 +0100 Subject: leds: Add LED class driver for regulator driven LEDs. This driver provides an interface for controlling LEDs (or vibrators) connected to PMICs for which there is a regulator framework driver. This driver can be used, for instance, to control vibrator on all Motorola EZX phones using the pcap-regulator driver services. 
Signed-off-by: Antonio Ospite Reviewed-by: Mark Brown Signed-off-by: Richard Purdie --- drivers/leds/Kconfig | 6 + drivers/leds/Makefile | 1 + drivers/leds/leds-regulator.c | 242 +++++++++++++++++++++++++++++++++++++++++ include/linux/leds-regulator.h | 46 ++++++++ 4 files changed, 295 insertions(+) create mode 100644 drivers/leds/leds-regulator.c create mode 100644 include/linux/leds-regulator.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index f12a99676628..8a0e1ec95e4a 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -229,6 +229,12 @@ config LEDS_PWM help This option enables support for pwm driven LEDs +config LEDS_REGULATOR + tristate "REGULATOR driven LED support" + depends on LEDS_CLASS && REGULATOR + help + This option enables support for regulator driven LEDs. + config LEDS_BD2802 tristate "LED driver for BD2802 RGB LED" depends on LEDS_CLASS && I2C diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 176f0c674751..9e63869d7c0d 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_WM831X_STATUS) += leds-wm831x-status.o obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o obj-$(CONFIG_LEDS_PWM) += leds-pwm.o +obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o obj-$(CONFIG_LEDS_INTEL_SS4200) += leds-ss4200.o obj-$(CONFIG_LEDS_LT3593) += leds-lt3593.o obj-$(CONFIG_LEDS_ADP5520) += leds-adp5520.o diff --git a/drivers/leds/leds-regulator.c b/drivers/leds/leds-regulator.c new file mode 100644 index 000000000000..7f00de3ef922 --- /dev/null +++ b/drivers/leds/leds-regulator.c @@ -0,0 +1,242 @@ +/* + * leds-regulator.c - LED class driver for regulator driven LEDs. + * + * Copyright (C) 2009 Antonio Ospite + * + * Inspired by leds-wm8350 driver. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define to_regulator_led(led_cdev) \ + container_of(led_cdev, struct regulator_led, cdev) + +struct regulator_led { + struct led_classdev cdev; + enum led_brightness value; + int enabled; + struct mutex mutex; + struct work_struct work; + + struct regulator *vcc; +}; + +static inline int led_regulator_get_max_brightness(struct regulator *supply) +{ + int ret; + int voltage = regulator_list_voltage(supply, 0); + + if (voltage <= 0) + return 1; + + /* even if regulator can't change voltages, + * we still assume it can change status + * and the LED can be turned on and off. + */ + ret = regulator_set_voltage(supply, voltage, voltage); + if (ret < 0) + return 1; + + return regulator_count_voltages(supply); +} + +static int led_regulator_get_voltage(struct regulator *supply, + enum led_brightness brightness) +{ + if (brightness == 0) + return -EINVAL; + + return regulator_list_voltage(supply, brightness - 1); +} + + +static void regulator_led_enable(struct regulator_led *led) +{ + int ret; + + if (led->enabled) + return; + + ret = regulator_enable(led->vcc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable vcc: %d\n", ret); + return; + } + + led->enabled = 1; +} + +static void regulator_led_disable(struct regulator_led *led) +{ + int ret; + + if (!led->enabled) + return; + + ret = regulator_disable(led->vcc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable vcc: %d\n", ret); + return; + } + + led->enabled = 0; +} + +static void regulator_led_set_value(struct regulator_led *led) +{ + int voltage; + int ret; + + mutex_lock(&led->mutex); + + if (led->value == LED_OFF) { + regulator_led_disable(led); + goto out; + } + + if (led->cdev.max_brightness > 1) { + voltage = 
led_regulator_get_voltage(led->vcc, led->value); + dev_dbg(led->cdev.dev, "brightness: %d voltage: %d\n", + led->value, voltage); + + ret = regulator_set_voltage(led->vcc, voltage, voltage); + if (ret != 0) + dev_err(led->cdev.dev, "Failed to set voltage %d: %d\n", + voltage, ret); + } + + regulator_led_enable(led); + +out: + mutex_unlock(&led->mutex); +} + +static void led_work(struct work_struct *work) +{ + struct regulator_led *led; + + led = container_of(work, struct regulator_led, work); + regulator_led_set_value(led); +} + +static void regulator_led_brightness_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct regulator_led *led = to_regulator_led(led_cdev); + + led->value = value; + schedule_work(&led->work); +} + +static int __devinit regulator_led_probe(struct platform_device *pdev) +{ + struct led_regulator_platform_data *pdata = pdev->dev.platform_data; + struct regulator_led *led; + struct regulator *vcc; + int ret = 0; + + if (pdata == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + return -ENODEV; + } + + vcc = regulator_get_exclusive(&pdev->dev, "vled"); + if (IS_ERR(vcc)) { + dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name); + return PTR_ERR(vcc); + } + + led = kzalloc(sizeof(*led), GFP_KERNEL); + if (led == NULL) { + ret = -ENOMEM; + goto err_vcc; + } + + led->cdev.max_brightness = led_regulator_get_max_brightness(vcc); + if (pdata->brightness > led->cdev.max_brightness) { + dev_err(&pdev->dev, "Invalid default brightness %d\n", + pdata->brightness); + ret = -EINVAL; + goto err_led; + } + led->value = pdata->brightness; + + led->cdev.brightness_set = regulator_led_brightness_set; + led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; + led->vcc = vcc; + + mutex_init(&led->mutex); + INIT_WORK(&led->work, led_work); + + platform_set_drvdata(pdev, led); + + ret = led_classdev_register(&pdev->dev, &led->cdev); + if (ret < 0) { + cancel_work_sync(&led->work); + goto err_led; + } + + /* to 
expose the default value to userspace */ + led->cdev.brightness = led->value; + + /* Set the default led status */ + regulator_led_set_value(led); + + return 0; + +err_led: + kfree(led); +err_vcc: + regulator_put(vcc); + return ret; +} + +static int __devexit regulator_led_remove(struct platform_device *pdev) +{ + struct regulator_led *led = platform_get_drvdata(pdev); + + led_classdev_unregister(&led->cdev); + cancel_work_sync(&led->work); + regulator_led_disable(led); + regulator_put(led->vcc); + kfree(led); + return 0; +} + +static struct platform_driver regulator_led_driver = { + .driver = { + .name = "leds-regulator", + .owner = THIS_MODULE, + }, + .probe = regulator_led_probe, + .remove = __devexit_p(regulator_led_remove), +}; + +static int __init regulator_led_init(void) +{ + return platform_driver_register(&regulator_led_driver); +} +module_init(regulator_led_init); + +static void __exit regulator_led_exit(void) +{ + platform_driver_unregister(&regulator_led_driver); +} +module_exit(regulator_led_exit); + +MODULE_AUTHOR("Antonio Ospite "); +MODULE_DESCRIPTION("Regulator driven LED driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-regulator"); diff --git a/include/linux/leds-regulator.h b/include/linux/leds-regulator.h new file mode 100644 index 000000000000..5a8eb389aab8 --- /dev/null +++ b/include/linux/leds-regulator.h @@ -0,0 +1,46 @@ +/* + * leds-regulator.h - platform data structure for regulator driven LEDs. + * + * Copyright (C) 2009 Antonio Ospite + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + */ + +#ifndef __LINUX_LEDS_REGULATOR_H +#define __LINUX_LEDS_REGULATOR_H + +/* + * Use "vled" as supply id when declaring the regulator consumer: + * + * static struct regulator_consumer_supply pcap_regulator_VVIB_consumers [] = { + * { .dev_name = "leds-regulator.0", supply = "vled" }, + * }; + * + * If you have several regulator driven LEDs, you can append a numerical id to + * .dev_name as done above, and use the same id when declaring the platform + * device: + * + * static struct led_regulator_platform_data a780_vibrator_data = { + * .name = "a780::vibrator", + * }; + * + * static struct platform_device a780_vibrator = { + * .name = "leds-regulator", + * .id = 0, + * .dev = { + * .platform_data = &a780_vibrator_data, + * }, + * }; + */ + +#include + +struct led_regulator_platform_data { + char *name; /* LED name as expected by LED class */ + enum led_brightness brightness; /* initial brightness value */ +}; + +#endif /* __LINUX_LEDS_REGULATOR_H */ -- cgit v1.2.3 From 9695fff8f84d7ab849139750036e443b85804edd Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 28 Nov 2009 12:55:51 +0100 Subject: leds: leds-pca9532.h- indent with tabs, not spaces Signed-off-by: Antonio Ospite Signed-off-by: Richard Purdie --- include/linux/leds-pca9532.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index 96eea90f01a8..f158eb1149aa 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -32,7 +32,7 @@ struct pca9532_led { struct i2c_client *client; char *name; struct led_classdev ldev; - struct work_struct work; + struct work_struct work; enum pca9532_type type; enum pca9532_state state; }; -- cgit v1.2.3 From 1998111582f5d726ca4dbf9d68935d9e7c994374 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sun, 29 Nov 2009 13:12:21 +0100 Subject: leds: leds-lp3944.h - remove unneeded includes These were needed in the first version of the 
driver because we used to expose workqueue and led class details in the header file, now we don't. Signed-off-by: Antonio Ospite Signed-off-by: Richard Purdie --- include/linux/leds-lp3944.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds-lp3944.h b/include/linux/leds-lp3944.h index afc9f9fd70f5..2618aa9063bc 100644 --- a/include/linux/leds-lp3944.h +++ b/include/linux/leds-lp3944.h @@ -12,9 +12,6 @@ #ifndef __LINUX_LEDS_LP3944_H #define __LINUX_LEDS_LP3944_H -#include -#include - #define LP3944_LED0 0 #define LP3944_LED1 1 #define LP3944_LED2 2 -- cgit v1.2.3 From cfc3899fcd0b3b990b29d3d33f75f4edf715e7d1 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 10 Nov 2009 17:20:40 +0000 Subject: backlight: Pass device through notify callback in the pwm driver Add the device to the notify callback's arguments in the PWM backlight driver. This brings the notify callback into line with the other callbacks defined by this driver. Signed-off-by: Ben Dooks Signed-off-by: Simtec Linux Team Signed-off-by: Richard Purdie --- drivers/video/backlight/pwm_bl.c | 9 ++++++--- include/linux/pwm_backlight.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index df9e0b32cf39..9d2ec2a1cce8 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -22,8 +22,10 @@ struct pwm_bl_data { struct pwm_device *pwm; + struct device *dev; unsigned int period; - int (*notify)(int brightness); + int (*notify)(struct device *, + int brightness); }; static int pwm_backlight_update_status(struct backlight_device *bl) @@ -39,7 +41,7 @@ static int pwm_backlight_update_status(struct backlight_device *bl) brightness = 0; if (pb->notify) - brightness = pb->notify(brightness); + brightness = pb->notify(pb->dev, brightness); if (brightness == 0) { pwm_config(pb->pwm, 0, pb->period); @@ -88,6 +90,7 @@ static int 
pwm_backlight_probe(struct platform_device *pdev) pb->period = data->pwm_period_ns; pb->notify = data->notify; + pb->dev = &pdev->dev; pb->pwm = pwm_request(data->pwm_id, "backlight"); if (IS_ERR(pb->pwm)) { @@ -146,7 +149,7 @@ static int pwm_backlight_suspend(struct platform_device *pdev, struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); if (pb->notify) - pb->notify(0); + pb->notify(pb->dev, 0); pwm_config(pb->pwm, 0, pb->period); pwm_disable(pb->pwm); return 0; diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h index 7a9754c96775..01b3d759f1fc 100644 --- a/include/linux/pwm_backlight.h +++ b/include/linux/pwm_backlight.h @@ -10,7 +10,7 @@ struct platform_pwm_backlight_data { unsigned int dft_brightness; unsigned int pwm_period_ns; int (*init)(struct device *dev); - int (*notify)(int brightness); + int (*notify)(struct device *dev, int brightness); void (*exit)(struct device *dev); }; -- cgit v1.2.3 From 733421516b42c44b9e21f1793c430cc801ef8324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:27 +0100 Subject: sched: Move TASK_STATE_TO_CHAR_STR near the TASK_state bits So that we don't keep forgetting about it. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121829.815779372@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 973b2b89f86d..c28ed1b1d7c2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -193,6 +193,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) @@ -2595,8 +2597,6 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" - #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 44d90df6b757c59651ddd55f1a84f28132b50d29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:28 +0100 Subject: sched: Add missing state chars to TASK_STATE_TO_CHAR_STR We grew 3 new task states since the last time someone touched it. 
Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121829.892737686@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c28ed1b1d7c2..94858df38a87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -193,7 +193,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) -- cgit v1.2.3 From e1781538cf5c870ab696e9b8f0a5c498d3900f2f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:30 +0100 Subject: sched: Assert task state bits at build time Since everybody is lazy and prone to forgetting things, make the compiler help us a bit. Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.060186433@chello.nl> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 18 ++++++++++-------- include/linux/sched.h | 4 ++++ 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 96361e8fa3a8..f560325c444f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,14 +134,14 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. 
*/ static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ - "x (dead)", /* 64 */ + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ }; @@ -151,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk) unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; const char **p = &task_state_array[0]; + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + while (state) { p++; state >>= 1; diff --git a/include/linux/sched.h b/include/linux/sched.h index 94858df38a87..37543876ddf5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -192,9 +192,13 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#define TASK_STATE_MAX 512 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +extern char ___assert_task_state[1 - 2*!!( + sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; + /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -- cgit v1.2.3 From e24c745272072fd2abe55209f1949b7b7ee602a7 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 14 Dec 2009 14:20:22 -0800 Subject: spi: controller driver for Designware SPI core Driver for the Designware SPI core, it supports multipul interfaces like PCI/APB etc. User can use "dw_apb_ssi_db.pdf" from Synopsys as HW datasheet. 
[randy.dunlap@oracle.com: fix build] [akpm@linux-foundation.org: build fix] Signed-off-by: Feng Tang Cc: David Brownell Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Grant Likely --- drivers/spi/Kconfig | 10 + drivers/spi/Makefile | 2 + drivers/spi/dw_spi.c | 944 +++++++++++++++++++++++++++++++++++++++++++++ drivers/spi/dw_spi_pci.c | 169 ++++++++ include/linux/spi/dw_spi.h | 212 ++++++++++ 5 files changed, 1337 insertions(+) create mode 100644 drivers/spi/dw_spi.c create mode 100644 drivers/spi/dw_spi_pci.c create mode 100644 include/linux/spi/dw_spi.h (limited to 'include/linux') diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 07e5453f7b18..d7c1741c4c5b 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -292,6 +292,16 @@ config SPI_NUC900 # Add new SPI master controllers in alphabetical order above this line # +config SPI_DESIGNWARE + bool "DesignWare SPI controller core support" + depends on SPI_MASTER + help + general driver for SPI controller core from DesignWare + +config SPI_DW_PCI + tristate "PCI interface driver for DW SPI core" + depends on SPI_DESIGNWARE && PCI + # # There are lots of SPI device types, with sensors and memory # being probably the most widely used ones. 
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index ed8c1675b52f..a909e39f7e7c 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -16,6 +16,8 @@ obj-$(CONFIG_SPI_BFIN) += spi_bfin5xx.o obj-$(CONFIG_SPI_BITBANG) += spi_bitbang.o obj-$(CONFIG_SPI_AU1550) += au1550_spi.o obj-$(CONFIG_SPI_BUTTERFLY) += spi_butterfly.o +obj-$(CONFIG_SPI_DESIGNWARE) += dw_spi.o +obj-$(CONFIG_SPI_DW_PCI) += dw_spi_pci.o obj-$(CONFIG_SPI_GPIO) += spi_gpio.o obj-$(CONFIG_SPI_IMX) += spi_imx.o obj-$(CONFIG_SPI_LM70_LLP) += spi_lm70llp.o diff --git a/drivers/spi/dw_spi.c b/drivers/spi/dw_spi.c new file mode 100644 index 000000000000..31620fae77be --- /dev/null +++ b/drivers/spi/dw_spi.c @@ -0,0 +1,944 @@ +/* + * dw_spi.c - Designware SPI core controller driver (refer pxa2xx_spi.c) + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_DEBUG_FS +#include +#endif + +#define START_STATE ((void *)0) +#define RUNNING_STATE ((void *)1) +#define DONE_STATE ((void *)2) +#define ERROR_STATE ((void *)-1) + +#define QUEUE_RUNNING 0 +#define QUEUE_STOPPED 1 + +#define MRST_SPI_DEASSERT 0 +#define MRST_SPI_ASSERT 1 + +/* Slave spi_dev related */ +struct chip_data { + u16 cr0; + u8 cs; /* chip select pin */ + u8 n_bytes; /* current is a 1/2/4 byte op */ + u8 tmode; /* TR/TO/RO/EEPROM */ + u8 type; /* SPI/SSP/MicroWire */ + + u8 poll_mode; /* 1 means use poll mode */ + + u32 dma_width; + u32 rx_threshold; + u32 tx_threshold; + u8 enable_dma; + u8 bits_per_word; + u16 clk_div; /* baud rate divider */ + u32 speed_hz; /* baud rate */ + int (*write)(struct dw_spi *dws); + int (*read)(struct dw_spi *dws); + void (*cs_control)(u32 command); +}; + +#ifdef CONFIG_DEBUG_FS +static int spi_show_regs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define SPI_REGS_BUFSIZE 1024 +static ssize_t spi_show_regs(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dw_spi *dws; + char *buf; + u32 len = 0; + ssize_t ret; + + dws = file->private_data; + + buf = kzalloc(SPI_REGS_BUFSIZE, GFP_KERNEL); + if (!buf) + return 0; + + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "MRST SPI0 registers:\n"); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "=================================\n"); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "CTRL0: \t\t0x%08x\n", dw_readl(dws, ctrl0)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "CTRL1: \t\t0x%08x\n", dw_readl(dws, ctrl1)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "SSIENR: \t0x%08x\n", dw_readl(dws, ssienr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "SER: \t\t0x%08x\n", dw_readl(dws, ser)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "BAUDR: 
\t\t0x%08x\n", dw_readl(dws, baudr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "TXFTLR: \t0x%08x\n", dw_readl(dws, txfltr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "RXFTLR: \t0x%08x\n", dw_readl(dws, rxfltr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "TXFLR: \t\t0x%08x\n", dw_readl(dws, txflr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "RXFLR: \t\t0x%08x\n", dw_readl(dws, rxflr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "SR: \t\t0x%08x\n", dw_readl(dws, sr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "IMR: \t\t0x%08x\n", dw_readl(dws, imr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "ISR: \t\t0x%08x\n", dw_readl(dws, isr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "DMACR: \t\t0x%08x\n", dw_readl(dws, dmacr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "DMATDLR: \t0x%08x\n", dw_readl(dws, dmatdlr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "DMARDLR: \t0x%08x\n", dw_readl(dws, dmardlr)); + len += snprintf(buf + len, SPI_REGS_BUFSIZE - len, + "=================================\n"); + + ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); + kfree(buf); + return ret; +} + +static const struct file_operations mrst_spi_regs_ops = { + .owner = THIS_MODULE, + .open = spi_show_regs_open, + .read = spi_show_regs, +}; + +static int mrst_spi_debugfs_init(struct dw_spi *dws) +{ + dws->debugfs = debugfs_create_dir("mrst_spi", NULL); + if (!dws->debugfs) + return -ENOMEM; + + debugfs_create_file("registers", S_IFREG | S_IRUGO, + dws->debugfs, (void *)dws, &mrst_spi_regs_ops); + return 0; +} + +static void mrst_spi_debugfs_remove(struct dw_spi *dws) +{ + if (dws->debugfs) + debugfs_remove_recursive(dws->debugfs); +} + +#else +static inline int mrst_spi_debugfs_init(struct dw_spi *dws) +{ +} + +static inline void mrst_spi_debugfs_remove(struct dw_spi *dws) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +static void wait_till_not_busy(struct 
dw_spi *dws) +{ + unsigned long end = jiffies + usecs_to_jiffies(1000); + + while (time_before(jiffies, end)) { + if (!(dw_readw(dws, sr) & SR_BUSY)) + return; + } + dev_err(&dws->master->dev, + "DW SPI: Stutus keeps busy for 1000us after a read/write!\n"); +} + +static void flush(struct dw_spi *dws) +{ + while (dw_readw(dws, sr) & SR_RF_NOT_EMPT) + dw_readw(dws, dr); + + wait_till_not_busy(dws); +} + +static void null_cs_control(u32 command) +{ +} + +static int null_writer(struct dw_spi *dws) +{ + u8 n_bytes = dws->n_bytes; + + if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL) + || (dws->tx == dws->tx_end)) + return 0; + dw_writew(dws, dr, 0); + dws->tx += n_bytes; + + wait_till_not_busy(dws); + return 1; +} + +static int null_reader(struct dw_spi *dws) +{ + u8 n_bytes = dws->n_bytes; + + while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT) + && (dws->rx < dws->rx_end)) { + dw_readw(dws, dr); + dws->rx += n_bytes; + } + wait_till_not_busy(dws); + return dws->rx == dws->rx_end; +} + +static int u8_writer(struct dw_spi *dws) +{ + if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL) + || (dws->tx == dws->tx_end)) + return 0; + + dw_writew(dws, dr, *(u8 *)(dws->tx)); + ++dws->tx; + + wait_till_not_busy(dws); + return 1; +} + +static int u8_reader(struct dw_spi *dws) +{ + while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT) + && (dws->rx < dws->rx_end)) { + *(u8 *)(dws->rx) = dw_readw(dws, dr); + ++dws->rx; + } + + wait_till_not_busy(dws); + return dws->rx == dws->rx_end; +} + +static int u16_writer(struct dw_spi *dws) +{ + if (!(dw_readw(dws, sr) & SR_TF_NOT_FULL) + || (dws->tx == dws->tx_end)) + return 0; + + dw_writew(dws, dr, *(u16 *)(dws->tx)); + dws->tx += 2; + + wait_till_not_busy(dws); + return 1; +} + +static int u16_reader(struct dw_spi *dws) +{ + u16 temp; + + while ((dw_readw(dws, sr) & SR_RF_NOT_EMPT) + && (dws->rx < dws->rx_end)) { + temp = dw_readw(dws, dr); + *(u16 *)(dws->rx) = temp; + dws->rx += 2; + } + + wait_till_not_busy(dws); + return dws->rx == dws->rx_end; +} + +static void 
*next_transfer(struct dw_spi *dws) +{ + struct spi_message *msg = dws->cur_msg; + struct spi_transfer *trans = dws->cur_transfer; + + /* Move to next transfer */ + if (trans->transfer_list.next != &msg->transfers) { + dws->cur_transfer = + list_entry(trans->transfer_list.next, + struct spi_transfer, + transfer_list); + return RUNNING_STATE; + } else + return DONE_STATE; +} + +/* + * Note: first step is the protocol driver prepares + * a dma-capable memory, and this func just need translate + * the virt addr to physical + */ +static int map_dma_buffers(struct dw_spi *dws) +{ + if (!dws->cur_msg->is_dma_mapped || !dws->dma_inited + || !dws->cur_chip->enable_dma) + return 0; + + if (dws->cur_transfer->tx_dma) + dws->tx_dma = dws->cur_transfer->tx_dma; + + if (dws->cur_transfer->rx_dma) + dws->rx_dma = dws->cur_transfer->rx_dma; + + return 1; +} + +/* Caller already set message->status; dma and pio irqs are blocked */ +static void giveback(struct dw_spi *dws) +{ + struct spi_transfer *last_transfer; + unsigned long flags; + struct spi_message *msg; + + spin_lock_irqsave(&dws->lock, flags); + msg = dws->cur_msg; + dws->cur_msg = NULL; + dws->cur_transfer = NULL; + dws->prev_chip = dws->cur_chip; + dws->cur_chip = NULL; + dws->dma_mapped = 0; + queue_work(dws->workqueue, &dws->pump_messages); + spin_unlock_irqrestore(&dws->lock, flags); + + last_transfer = list_entry(msg->transfers.prev, + struct spi_transfer, + transfer_list); + + if (!last_transfer->cs_change) + dws->cs_control(MRST_SPI_DEASSERT); + + msg->state = NULL; + if (msg->complete) + msg->complete(msg->context); +} + +static void int_error_stop(struct dw_spi *dws, const char *msg) +{ + /* Stop and reset hw */ + flush(dws); + spi_enable_chip(dws, 0); + + dev_err(&dws->master->dev, "%s\n", msg); + dws->cur_msg->state = ERROR_STATE; + tasklet_schedule(&dws->pump_transfers); +} + +static void transfer_complete(struct dw_spi *dws) +{ + /* Update total byte transfered return count actual bytes read */ + 
dws->cur_msg->actual_length += dws->len; + + /* Move to next transfer */ + dws->cur_msg->state = next_transfer(dws); + + /* Handle end of message */ + if (dws->cur_msg->state == DONE_STATE) { + dws->cur_msg->status = 0; + giveback(dws); + } else + tasklet_schedule(&dws->pump_transfers); +} + +static irqreturn_t interrupt_transfer(struct dw_spi *dws) +{ + u16 irq_status, irq_mask = 0x3f; + + irq_status = dw_readw(dws, isr) & irq_mask; + /* Error handling */ + if (irq_status & (SPI_INT_TXOI | SPI_INT_RXOI | SPI_INT_RXUI)) { + dw_readw(dws, txoicr); + dw_readw(dws, rxoicr); + dw_readw(dws, rxuicr); + int_error_stop(dws, "interrupt_transfer: fifo overrun"); + return IRQ_HANDLED; + } + + /* INT comes from tx */ + if (dws->tx && (irq_status & SPI_INT_TXEI)) { + while (dws->tx < dws->tx_end) + dws->write(dws); + + if (dws->tx == dws->tx_end) { + spi_mask_intr(dws, SPI_INT_TXEI); + transfer_complete(dws); + } + } + + /* INT comes from rx */ + if (dws->rx && (irq_status & SPI_INT_RXFI)) { + if (dws->read(dws)) + transfer_complete(dws); + } + return IRQ_HANDLED; +} + +static irqreturn_t dw_spi_irq(int irq, void *dev_id) +{ + struct dw_spi *dws = dev_id; + + if (!dws->cur_msg) { + spi_mask_intr(dws, SPI_INT_TXEI); + /* Never fail */ + return IRQ_HANDLED; + } + + return dws->transfer_handler(dws); +} + +/* Must be called inside pump_transfers() */ +static void poll_transfer(struct dw_spi *dws) +{ + if (dws->tx) { + while (dws->write(dws)) + dws->read(dws); + } + + dws->read(dws); + transfer_complete(dws); +} + +static void dma_transfer(struct dw_spi *dws, int cs_change) +{ +} + +static void pump_transfers(unsigned long data) +{ + struct dw_spi *dws = (struct dw_spi *)data; + struct spi_message *message = NULL; + struct spi_transfer *transfer = NULL; + struct spi_transfer *previous = NULL; + struct spi_device *spi = NULL; + struct chip_data *chip = NULL; + u8 bits = 0; + u8 imask = 0; + u8 cs_change = 0; + u16 clk_div = 0; + u32 speed = 0; + u32 cr0 = 0; + + /* Get current 
state information */ + message = dws->cur_msg; + transfer = dws->cur_transfer; + chip = dws->cur_chip; + spi = message->spi; + + if (message->state == ERROR_STATE) { + message->status = -EIO; + goto early_exit; + } + + /* Handle end of message */ + if (message->state == DONE_STATE) { + message->status = 0; + goto early_exit; + } + + /* Delay if requested at end of transfer*/ + if (message->state == RUNNING_STATE) { + previous = list_entry(transfer->transfer_list.prev, + struct spi_transfer, + transfer_list); + if (previous->delay_usecs) + udelay(previous->delay_usecs); + } + + dws->n_bytes = chip->n_bytes; + dws->dma_width = chip->dma_width; + dws->cs_control = chip->cs_control; + + dws->rx_dma = transfer->rx_dma; + dws->tx_dma = transfer->tx_dma; + dws->tx = (void *)transfer->tx_buf; + dws->tx_end = dws->tx + transfer->len; + dws->rx = transfer->rx_buf; + dws->rx_end = dws->rx + transfer->len; + dws->write = dws->tx ? chip->write : null_writer; + dws->read = dws->rx ? chip->read : null_reader; + dws->cs_change = transfer->cs_change; + dws->len = dws->cur_transfer->len; + if (chip != dws->prev_chip) + cs_change = 1; + + cr0 = chip->cr0; + + /* Handle per transfer options for bpw and speed */ + if (transfer->speed_hz) { + speed = chip->speed_hz; + + if (transfer->speed_hz != speed) { + speed = transfer->speed_hz; + if (speed > dws->max_freq) { + printk(KERN_ERR "MRST SPI0: unsupported" + "freq: %dHz\n", speed); + message->status = -EIO; + goto early_exit; + } + + /* clk_div doesn't support odd number */ + clk_div = dws->max_freq / speed; + clk_div = (clk_div >> 1) << 1; + + chip->speed_hz = speed; + chip->clk_div = clk_div; + } + } + if (transfer->bits_per_word) { + bits = transfer->bits_per_word; + + switch (bits) { + case 8: + dws->n_bytes = 1; + dws->dma_width = 1; + dws->read = (dws->read != null_reader) ? + u8_reader : null_reader; + dws->write = (dws->write != null_writer) ? 
+ u8_writer : null_writer; + break; + case 16: + dws->n_bytes = 2; + dws->dma_width = 2; + dws->read = (dws->read != null_reader) ? + u16_reader : null_reader; + dws->write = (dws->write != null_writer) ? + u16_writer : null_writer; + break; + default: + printk(KERN_ERR "MRST SPI0: unsupported bits:" + "%db\n", bits); + message->status = -EIO; + goto early_exit; + } + + cr0 = (bits - 1) + | (chip->type << SPI_FRF_OFFSET) + | (spi->mode << SPI_MODE_OFFSET) + | (chip->tmode << SPI_TMOD_OFFSET); + } + message->state = RUNNING_STATE; + + /* Check if current transfer is a DMA transaction */ + dws->dma_mapped = map_dma_buffers(dws); + + if (!dws->dma_mapped && !chip->poll_mode) { + if (dws->rx) + imask |= SPI_INT_RXFI; + if (dws->tx) + imask |= SPI_INT_TXEI; + dws->transfer_handler = interrupt_transfer; + } + + /* + * Reprogram registers only if + * 1. chip select changes + * 2. clk_div is changed + * 3. control value changes + */ + if (dw_readw(dws, ctrl0) != cr0 || cs_change || clk_div) { + spi_enable_chip(dws, 0); + + if (dw_readw(dws, ctrl0) != cr0) + dw_writew(dws, ctrl0, cr0); + + /* Set the interrupt mask, for poll mode just diable all int */ + spi_mask_intr(dws, 0xff); + if (!chip->poll_mode) + spi_umask_intr(dws, imask); + + spi_set_clk(dws, clk_div ? 
clk_div : chip->clk_div); + spi_chip_sel(dws, spi->chip_select); + spi_enable_chip(dws, 1); + + if (cs_change) + dws->prev_chip = chip; + } + + if (dws->dma_mapped) + dma_transfer(dws, cs_change); + + if (chip->poll_mode) + poll_transfer(dws); + + return; + +early_exit: + giveback(dws); + return; +} + +static void pump_messages(struct work_struct *work) +{ + struct dw_spi *dws = + container_of(work, struct dw_spi, pump_messages); + unsigned long flags; + + /* Lock queue and check for queue work */ + spin_lock_irqsave(&dws->lock, flags); + if (list_empty(&dws->queue) || dws->run == QUEUE_STOPPED) { + dws->busy = 0; + spin_unlock_irqrestore(&dws->lock, flags); + return; + } + + /* Make sure we are not already running a message */ + if (dws->cur_msg) { + spin_unlock_irqrestore(&dws->lock, flags); + return; + } + + /* Extract head of queue */ + dws->cur_msg = list_entry(dws->queue.next, struct spi_message, queue); + list_del_init(&dws->cur_msg->queue); + + /* Initial message state*/ + dws->cur_msg->state = START_STATE; + dws->cur_transfer = list_entry(dws->cur_msg->transfers.next, + struct spi_transfer, + transfer_list); + dws->cur_chip = spi_get_ctldata(dws->cur_msg->spi); + + /* Mark as busy and launch transfers */ + tasklet_schedule(&dws->pump_transfers); + + dws->busy = 1; + spin_unlock_irqrestore(&dws->lock, flags); +} + +/* spi_device use this to queue in their spi_msg */ +static int dw_spi_transfer(struct spi_device *spi, struct spi_message *msg) +{ + struct dw_spi *dws = spi_master_get_devdata(spi->master); + unsigned long flags; + + spin_lock_irqsave(&dws->lock, flags); + + if (dws->run == QUEUE_STOPPED) { + spin_unlock_irqrestore(&dws->lock, flags); + return -ESHUTDOWN; + } + + msg->actual_length = 0; + msg->status = -EINPROGRESS; + msg->state = START_STATE; + + list_add_tail(&msg->queue, &dws->queue); + + if (dws->run == QUEUE_RUNNING && !dws->busy) { + + if (dws->cur_transfer || dws->cur_msg) + queue_work(dws->workqueue, + &dws->pump_messages); + else { + 
/* If no other data transaction in air, just go */ + spin_unlock_irqrestore(&dws->lock, flags); + pump_messages(&dws->pump_messages); + return 0; + } + } + + spin_unlock_irqrestore(&dws->lock, flags); + return 0; +} + +/* This may be called twice for each spi dev */ +static int dw_spi_setup(struct spi_device *spi) +{ + struct dw_spi_chip *chip_info = NULL; + struct chip_data *chip; + + if (spi->bits_per_word != 8 && spi->bits_per_word != 16) + return -EINVAL; + + /* Only alloc on first setup */ + chip = spi_get_ctldata(spi); + if (!chip) { + chip = kzalloc(sizeof(struct chip_data), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->cs_control = null_cs_control; + chip->enable_dma = 0; + } + + /* + * Protocol drivers may change the chip settings, so... + * if chip_info exists, use it + */ + chip_info = spi->controller_data; + + /* chip_info doesn't always exist */ + if (chip_info) { + if (chip_info->cs_control) + chip->cs_control = chip_info->cs_control; + + chip->poll_mode = chip_info->poll_mode; + chip->type = chip_info->type; + + chip->rx_threshold = 0; + chip->tx_threshold = 0; + + chip->enable_dma = chip_info->enable_dma; + } + + if (spi->bits_per_word <= 8) { + chip->n_bytes = 1; + chip->dma_width = 1; + chip->read = u8_reader; + chip->write = u8_writer; + } else if (spi->bits_per_word <= 16) { + chip->n_bytes = 2; + chip->dma_width = 2; + chip->read = u16_reader; + chip->write = u16_writer; + } else { + /* Never take >16b case for MRST SPIC */ + dev_err(&spi->dev, "invalid wordsize\n"); + return -EINVAL; + } + chip->bits_per_word = spi->bits_per_word; + + chip->speed_hz = spi->max_speed_hz; + if (chip->speed_hz) + chip->clk_div = 25000000 / chip->speed_hz; + else + chip->clk_div = 8; /* default value */ + + chip->tmode = 0; /* Tx & Rx */ + /* Default SPI mode is SCPOL = 0, SCPH = 0 */ + chip->cr0 = (chip->bits_per_word - 1) + | (chip->type << SPI_FRF_OFFSET) + | (spi->mode << SPI_MODE_OFFSET) + | (chip->tmode << SPI_TMOD_OFFSET); + + spi_set_ctldata(spi, 
chip); + return 0; +} + +static void dw_spi_cleanup(struct spi_device *spi) +{ + struct chip_data *chip = spi_get_ctldata(spi); + kfree(chip); +} + +static int __init init_queue(struct dw_spi *dws) +{ + INIT_LIST_HEAD(&dws->queue); + spin_lock_init(&dws->lock); + + dws->run = QUEUE_STOPPED; + dws->busy = 0; + + tasklet_init(&dws->pump_transfers, + pump_transfers, (unsigned long)dws); + + INIT_WORK(&dws->pump_messages, pump_messages); + dws->workqueue = create_singlethread_workqueue( + dev_name(dws->master->dev.parent)); + if (dws->workqueue == NULL) + return -EBUSY; + + return 0; +} + +static int start_queue(struct dw_spi *dws) +{ + unsigned long flags; + + spin_lock_irqsave(&dws->lock, flags); + + if (dws->run == QUEUE_RUNNING || dws->busy) { + spin_unlock_irqrestore(&dws->lock, flags); + return -EBUSY; + } + + dws->run = QUEUE_RUNNING; + dws->cur_msg = NULL; + dws->cur_transfer = NULL; + dws->cur_chip = NULL; + dws->prev_chip = NULL; + spin_unlock_irqrestore(&dws->lock, flags); + + queue_work(dws->workqueue, &dws->pump_messages); + + return 0; +} + +static int stop_queue(struct dw_spi *dws) +{ + unsigned long flags; + unsigned limit = 50; + int status = 0; + + spin_lock_irqsave(&dws->lock, flags); + dws->run = QUEUE_STOPPED; + while (!list_empty(&dws->queue) && dws->busy && limit--) { + spin_unlock_irqrestore(&dws->lock, flags); + msleep(10); + spin_lock_irqsave(&dws->lock, flags); + } + + if (!list_empty(&dws->queue) || dws->busy) + status = -EBUSY; + spin_unlock_irqrestore(&dws->lock, flags); + + return status; +} + +static int destroy_queue(struct dw_spi *dws) +{ + int status; + + status = stop_queue(dws); + if (status != 0) + return status; + destroy_workqueue(dws->workqueue); + return 0; +} + +/* Restart the controller, disable all interrupts, clean rx fifo */ +static void spi_hw_init(struct dw_spi *dws) +{ + spi_enable_chip(dws, 0); + spi_mask_intr(dws, 0xff); + spi_enable_chip(dws, 1); + flush(dws); +} + +int __devinit dw_spi_add_host(struct dw_spi *dws) 
+{ + struct spi_master *master; + int ret; + + BUG_ON(dws == NULL); + + master = spi_alloc_master(dws->parent_dev, 0); + if (!master) { + ret = -ENOMEM; + goto exit; + } + + dws->master = master; + dws->type = SSI_MOTO_SPI; + dws->prev_chip = NULL; + dws->dma_inited = 0; + dws->dma_addr = (dma_addr_t)(dws->paddr + 0x60); + + ret = request_irq(dws->irq, dw_spi_irq, 0, + "dw_spi", dws); + if (ret < 0) { + dev_err(&master->dev, "can not get IRQ\n"); + goto err_free_master; + } + + master->mode_bits = SPI_CPOL | SPI_CPHA; + master->bus_num = dws->bus_num; + master->num_chipselect = dws->num_cs; + master->cleanup = dw_spi_cleanup; + master->setup = dw_spi_setup; + master->transfer = dw_spi_transfer; + + dws->dma_inited = 0; + + /* Basic HW init */ + spi_hw_init(dws); + + /* Initial and start queue */ + ret = init_queue(dws); + if (ret) { + dev_err(&master->dev, "problem initializing queue\n"); + goto err_diable_hw; + } + ret = start_queue(dws); + if (ret) { + dev_err(&master->dev, "problem starting queue\n"); + goto err_diable_hw; + } + + spi_master_set_devdata(master, dws); + ret = spi_register_master(master); + if (ret) { + dev_err(&master->dev, "problem registering spi master\n"); + goto err_queue_alloc; + } + + mrst_spi_debugfs_init(dws); + return 0; + +err_queue_alloc: + destroy_queue(dws); +err_diable_hw: + spi_enable_chip(dws, 0); + free_irq(dws->irq, dws); +err_free_master: + spi_master_put(master); +exit: + return ret; +} +EXPORT_SYMBOL(dw_spi_add_host); + +/* + * Tear down a host set up by dw_spi_add_host(): remove debugfs entries, + * destroy the message queue, disable the controller and its clock, free + * the IRQ and unregister the SPI master. A NULL dws is ignored. + */ +void __devexit dw_spi_remove_host(struct dw_spi *dws) +{ + int status = 0; + + if (!dws) + return; + mrst_spi_debugfs_remove(dws); + + /* Remove the queue */ + status = destroy_queue(dws); + if (status != 0) + dev_err(&dws->master->dev, "dw_spi_remove: workqueue will not " + "complete, message memory not freed\n"); + + spi_enable_chip(dws, 0); + /* Disable clk */ + spi_set_clk(dws, 0); + free_irq(dws->irq, dws); + + /* Disconnect from the SPI framework */ + spi_unregister_master(dws->master); +} +/* Exported like add/suspend/resume so bus glue built as a module can call it */ +EXPORT_SYMBOL(dw_spi_remove_host); + +int 
dw_spi_suspend_host(struct dw_spi *dws) +{ + int ret = 0; + + ret = stop_queue(dws); + if (ret) + return ret; + spi_enable_chip(dws, 0); + spi_set_clk(dws, 0); + return ret; +} +EXPORT_SYMBOL(dw_spi_suspend_host); + +int dw_spi_resume_host(struct dw_spi *dws) +{ + int ret; + + spi_hw_init(dws); + ret = start_queue(dws); + if (ret) + dev_err(&dws->master->dev, "fail to start queue (%d)\n", ret); + return ret; +} +EXPORT_SYMBOL(dw_spi_resume_host); + +MODULE_AUTHOR("Feng Tang "); +MODULE_DESCRIPTION("Driver for DesignWare SPI controller core"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/spi/dw_spi_pci.c b/drivers/spi/dw_spi_pci.c new file mode 100644 index 000000000000..34ba69161734 --- /dev/null +++ b/drivers/spi/dw_spi_pci.c @@ -0,0 +1,169 @@ +/* + * dw_spi_pci.c - PCI interface driver for DW SPI Core + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include +#include +#include + +#define DRIVER_NAME "dw_spi_pci" + +struct dw_spi_pci { + struct pci_dev *pdev; + struct dw_spi dws; +}; + +static int __devinit spi_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct dw_spi_pci *dwpci; + struct dw_spi *dws; + int pci_bar = 0; + int ret; + + printk(KERN_INFO "DW: found PCI SPI controller(ID: %04x:%04x)\n", + pdev->vendor, pdev->device); + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + dwpci = kzalloc(sizeof(struct dw_spi_pci), GFP_KERNEL); + if (!dwpci) { + ret = -ENOMEM; + goto err_disable; + } + + dwpci->pdev = pdev; + dws = &dwpci->dws; + + /* Get basic io resource and map it */ + dws->paddr = pci_resource_start(pdev, pci_bar); + dws->iolen = pci_resource_len(pdev, pci_bar); + + ret = pci_request_region(pdev, pci_bar, dev_name(&pdev->dev)); + if (ret) + goto err_kfree; + + dws->regs = ioremap_nocache((unsigned long)dws->paddr, + pci_resource_len(pdev, pci_bar)); + if (!dws->regs) { + ret = -ENOMEM; + goto err_release_reg; + } + + dws->parent_dev = &pdev->dev; + dws->bus_num = 0; + dws->num_cs = 4; + dws->max_freq = 25000000; /* for Moorestown */ + dws->irq = pdev->irq; + + ret = dw_spi_add_host(dws); + if (ret) + goto err_unmap; + + /* PCI hook and SPI hook use the same drv data */ + pci_set_drvdata(pdev, dwpci); + return 0; + +err_unmap: + iounmap(dws->regs); +err_release_reg: + pci_release_region(pdev, pci_bar); +err_kfree: + kfree(dwpci); +err_disable: + pci_disable_device(pdev); + return ret; +} + +/* PCI remove: undo spi_pci_probe() in reverse order */ +static void __devexit spi_pci_remove(struct pci_dev *pdev) +{ + struct dw_spi_pci *dwpci = pci_get_drvdata(pdev); + + pci_set_drvdata(pdev, NULL); + /* + * Shut the SPI host down (queue, IRQ, SPI master) while the registers + * are still mapped and before dwpci, which embeds the dw_spi, is freed. + */ + dw_spi_remove_host(&dwpci->dws); + iounmap(dwpci->dws.regs); + pci_release_region(pdev, 0); + kfree(dwpci); + pci_disable_device(pdev); +} + +#ifdef CONFIG_PM +static int spi_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct dw_spi_pci *dwpci = pci_get_drvdata(pdev); + int ret; + + ret = dw_spi_suspend_host(&dwpci->dws); + if (ret) 
+ return ret; + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return ret; +} + +static int spi_resume(struct pci_dev *pdev) +{ + struct dw_spi_pci *dwpci = pci_get_drvdata(pdev); + int ret; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + ret = pci_enable_device(pdev); + if (ret) + return ret; + return dw_spi_resume_host(&dwpci->dws); +} +#else +#define spi_suspend NULL +#define spi_resume NULL +#endif + +static const struct pci_device_id pci_ids[] __devinitdata = { + /* Intel Moorestown platform SPI controller 0 */ + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0800) }, + {}, +}; + +static struct pci_driver dw_spi_driver = { + .name = DRIVER_NAME, + .id_table = pci_ids, + .probe = spi_pci_probe, + .remove = __devexit_p(spi_pci_remove), + .suspend = spi_suspend, + .resume = spi_resume, +}; + +static int __init mrst_spi_init(void) +{ + return pci_register_driver(&dw_spi_driver); +} + +static void __exit mrst_spi_exit(void) +{ + pci_unregister_driver(&dw_spi_driver); +} + +module_init(mrst_spi_init); +module_exit(mrst_spi_exit); + +MODULE_AUTHOR("Feng Tang "); +MODULE_DESCRIPTION("PCI interface driver for DW SPI Core"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/spi/dw_spi.h b/include/linux/spi/dw_spi.h new file mode 100644 index 000000000000..51b3e771a9a3 --- /dev/null +++ b/include/linux/spi/dw_spi.h @@ -0,0 +1,212 @@ +#ifndef DW_SPI_HEADER_H +#define DW_SPI_HEADER_H +#include + +/* Bit fields in CTRLR0 */ +#define SPI_DFS_OFFSET 0 + +#define SPI_FRF_OFFSET 4 +#define SPI_FRF_SPI 0x0 +#define SPI_FRF_SSP 0x1 +#define SPI_FRF_MICROWIRE 0x2 +#define SPI_FRF_RESV 0x3 + +#define SPI_MODE_OFFSET 6 +#define SPI_SCPH_OFFSET 6 +#define SPI_SCOL_OFFSET 7 +#define SPI_TMOD_OFFSET 8 +#define SPI_TMOD_TR 0x0 /* xmit & recv */ +#define SPI_TMOD_TO 0x1 /* xmit only */ +#define SPI_TMOD_RO 0x2 /* recv only */ +#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ + +#define SPI_SLVOE_OFFSET 10 
+#define SPI_SRL_OFFSET 11 +#define SPI_CFS_OFFSET 12 + +/* Bit fields in SR, 7 bits */ +#define SR_MASK 0x7f /* cover 7 bits */ +#define SR_BUSY (1 << 0) +#define SR_TF_NOT_FULL (1 << 1) +#define SR_TF_EMPT (1 << 2) +#define SR_RF_NOT_EMPT (1 << 3) +#define SR_RF_FULL (1 << 4) +#define SR_TX_ERR (1 << 5) +#define SR_DCOL (1 << 6) + +/* Bit fields in ISR, IMR, RISR, 7 bits */ +#define SPI_INT_TXEI (1 << 0) +#define SPI_INT_TXOI (1 << 1) +#define SPI_INT_RXUI (1 << 2) +#define SPI_INT_RXOI (1 << 3) +#define SPI_INT_RXFI (1 << 4) +#define SPI_INT_MSTI (1 << 5) + +/* TX RX interrupt level threshhold, max can be 256 */ +#define SPI_INT_THRESHOLD 32 + +enum dw_ssi_type { + SSI_MOTO_SPI = 0, + SSI_TI_SSP, + SSI_NS_MICROWIRE, +}; + +struct dw_spi_reg { + u32 ctrl0; + u32 ctrl1; + u32 ssienr; + u32 mwcr; + u32 ser; + u32 baudr; + u32 txfltr; + u32 rxfltr; + u32 txflr; + u32 rxflr; + u32 sr; + u32 imr; + u32 isr; + u32 risr; + u32 txoicr; + u32 rxoicr; + u32 rxuicr; + u32 msticr; + u32 icr; + u32 dmacr; + u32 dmatdlr; + u32 dmardlr; + u32 idr; + u32 version; + u32 dr; /* Currently oper as 32 bits, + though only low 16 bits matters */ +} __packed; + +struct dw_spi { + struct spi_master *master; + struct spi_device *cur_dev; + struct device *parent_dev; + enum dw_ssi_type type; + + void __iomem *regs; + unsigned long paddr; + u32 iolen; + int irq; + u32 max_freq; /* max bus freq supported */ + + u16 bus_num; + u16 num_cs; /* supported slave numbers */ + + /* Driver message queue */ + struct workqueue_struct *workqueue; + struct work_struct pump_messages; + spinlock_t lock; + struct list_head queue; + int busy; + int run; + + /* Message Transfer pump */ + struct tasklet_struct pump_transfers; + + /* Current message transfer state info */ + struct spi_message *cur_msg; + struct spi_transfer *cur_transfer; + struct chip_data *cur_chip; + struct chip_data *prev_chip; + size_t len; + void *tx; + void *tx_end; + void *rx; + void *rx_end; + int dma_mapped; + dma_addr_t rx_dma; + 
dma_addr_t tx_dma; + size_t rx_map_len; + size_t tx_map_len; + u8 n_bytes; /* current is a 1/2 bytes op */ + u8 max_bits_per_word; /* maxim is 16b */ + u32 dma_width; + int cs_change; + int (*write)(struct dw_spi *dws); + int (*read)(struct dw_spi *dws); + irqreturn_t (*transfer_handler)(struct dw_spi *dws); + void (*cs_control)(u32 command); + + /* Dma info */ + int dma_inited; + struct dma_chan *txchan; + struct dma_chan *rxchan; + int txdma_done; + int rxdma_done; + u64 tx_param; + u64 rx_param; + struct device *dma_dev; + dma_addr_t dma_addr; + + /* Bus interface info */ + void *priv; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif +}; + +#define dw_readl(dw, name) \ + __raw_readl(&(((struct dw_spi_reg *)dw->regs)->name)) +#define dw_writel(dw, name, val) \ + __raw_writel((val), &(((struct dw_spi_reg *)dw->regs)->name)) +#define dw_readw(dw, name) \ + __raw_readw(&(((struct dw_spi_reg *)dw->regs)->name)) +#define dw_writew(dw, name, val) \ + __raw_writew((val), &(((struct dw_spi_reg *)dw->regs)->name)) + +static inline void spi_enable_chip(struct dw_spi *dws, int enable) +{ + dw_writel(dws, ssienr, (enable ? 
1 : 0)); +} + +static inline void spi_set_clk(struct dw_spi *dws, u16 div) +{ + dw_writel(dws, baudr, div); +} + +/* Select slave @cs by writing its bit to SER; out-of-range values are ignored */ +static inline void spi_chip_sel(struct dw_spi *dws, u16 cs) +{ + /* num_cs is a count, so the valid selects are 0 .. num_cs - 1 */ + if (cs >= dws->num_cs) + return; + dw_writel(dws, ser, 1 << cs); +} + +/* Disable IRQ bits */ +static inline void spi_mask_intr(struct dw_spi *dws, u32 mask) +{ + u32 new_mask; + + new_mask = dw_readl(dws, imr) & ~mask; + dw_writel(dws, imr, new_mask); +} + +/* Enable IRQ bits */ +static inline void spi_umask_intr(struct dw_spi *dws, u32 mask) +{ + u32 new_mask; + + new_mask = dw_readl(dws, imr) | mask; + dw_writel(dws, imr, new_mask); +} + +/* + * Each SPI slave device that works with the dw_spi controller should + * have such a structure describing its working mode (PIO/DMA etc), + * which can be saved in the "controller_data" member of the + * struct spi_device + */ +struct dw_spi_chip { + u8 poll_mode; /* non-zero selects controller polling mode */ + u8 type; /* SPI/SSP/Microwire */ + u8 enable_dma; + void (*cs_control)(u32 command); +}; + +extern int dw_spi_add_host(struct dw_spi *dws); +extern void dw_spi_remove_host(struct dw_spi *dws); +extern int dw_spi_suspend_host(struct dw_spi *dws); +extern int dw_spi_resume_host(struct dw_spi *dws); +#endif /* DW_SPI_HEADER_H */ -- cgit v1.2.3 From 9afa2fb6c13501e5b3536d15344fce4e5442c469 Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Wed, 2 Dec 2009 19:51:54 -0500 Subject: fsstack/ecryptfs: remove unused get_nlinks param to fsstack_copy_attr_all This get_nlinks parameter was never used by the only mainline user, ecryptfs; and it has never been used by unionfs or wrapfs either. 
Acked-by: Dustin Kirkland Acked-by: Tyler Hicks Signed-off-by: Erez Zadok Signed-off-by: Al Viro --- fs/ecryptfs/dentry.c | 2 +- fs/ecryptfs/inode.c | 6 +++--- fs/ecryptfs/main.c | 2 +- fs/stack.c | 17 +++-------------- include/linux/fs_stack.h | 4 +--- 5 files changed, 9 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 2dda5ade75bc..8f006a0d6076 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -62,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); - fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); + fsstack_copy_attr_all(dentry->d_inode, lower_inode); } out: return rc; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 056fed62d0de..429ca0b3ba08 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -626,9 +626,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dentry->d_parent); @@ -967,7 +967,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) rc = notify_change(lower_dentry, ia); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c7b1ee..567bc4b9f70a 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -189,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct 
dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); diff --git a/fs/stack.c b/fs/stack.c index 67716f6a1a4a..0e20e43ad740 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -14,11 +14,8 @@ void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); -/* copy all attributes; get_nlinks is optional way to override the i_nlink - * copying - */ -void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)) +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) { dest->i_mode = src->i_mode; dest->i_uid = src->i_uid; @@ -29,14 +26,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - - /* - * Update the nlinks AFTER updating the above fields, because the - * get_links callback may depend on them. 
- */ - if (!get_nlinks) - dest->i_nlink = src->i_nlink; - else - dest->i_nlink = (*get_nlinks)(dest); + dest->i_nlink = src->i_nlink; } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index bb516ceeefc9..aa60311900dd 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -8,9 +8,7 @@ #include /* externs for fs/stack.c */ -extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)); - +extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src); extern void fsstack_copy_inode_size(struct inode *dst, const struct inode *src); /* inlines */ -- cgit v1.2.3 From 1b8ab8159ef8f818f870a1d2e3b6953d80eefd3f Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Thu, 3 Dec 2009 21:56:09 -0500 Subject: VFS/fsstack: handle 32-bit smp + preempt + large files in fsstack_copy_inode_size Copy the inode size and blocks from one inode to another correctly on 32-bit systems with CONFIG_SMP, CONFIG_PREEMPT, or CONFIG_LBDAF. Use proper inode spinlocks only when i_size/i_blocks cannot fit in one 32-bit word. 
Signed-off-by: Hugh Dickins Signed-off-by: Erez Zadok Signed-off-by: Al Viro --- fs/stack.c | 54 +++++++++++++++++++++++++++++++++++++++++++++--- include/linux/fs_stack.h | 2 +- 2 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/stack.c b/fs/stack.c index 0e20e43ad740..4a6f7f440658 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -7,10 +7,58 @@ * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems */ -void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) { - i_size_write(dst, i_size_read((struct inode *)src)); - dst->i_blocks = src->i_blocks; + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. 
+ */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. + */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index aa60311900dd..da317c7163ab 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -9,7 +9,7 @@ /* externs for fs/stack.c */ extern void fsstack_copy_attr_all(struct inode *dest, const struct inode *src); -extern void fsstack_copy_inode_size(struct inode *dst, const struct inode *src); +extern void fsstack_copy_inode_size(struct inode *dst, struct inode *src); /* inlines */ static inline void fsstack_copy_attr_atime(struct inode *dest, -- cgit v1.2.3 From 7a0ad10c367ab57c899d340372f37880cbe6ab52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Dec 2009 14:24:40 +0100 Subject: fold do_sync_file_range into sys_sync_file_range We recently got rid of all callers of do_sync_file_range as they're better served with vfs_fsync or the filemap_write_and_wait.
Now that do_sync_file_range is down to a single caller fold it into it so that people don't start using it again accidentally. While at it also switch it from using __filemap_fdatawrite_range(..., WB_SYNC_ALL) to the more clear filemap_fdatawrite_range(). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/sync.c | 59 +++++++++++++++++++++--------------------------------- include/linux/fs.h | 4 ---- 2 files changed, 23 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/sync.c b/fs/sync.c index 36752a683481..418727a2a239 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -355,6 +355,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, { int ret; struct file *file; + struct address_space *mapping; loff_t endbyte; /* inclusive */ int fput_needed; umode_t i_mode; @@ -405,7 +406,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, !S_ISLNK(i_mode)) goto out_put; - ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + out_put: fput_light(file, fput_needed); out: @@ -437,38 +459,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags, } SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); #endif - -/* - * `endbyte' is inclusive - */ -int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags) -{ - int ret; - - if (!mapping) { - ret = -EINVAL; - goto out; - } - - ret = 0; - if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, 
offset, endbyte); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WRITE) { - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_ALL); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); - } -out: - return ret; -} -EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 66bc0a54b284..77a975089d9a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,10 +1095,6 @@ struct file_lock { extern void send_sigio(struct fown_struct *fown, int fd, int band); -/* fs/sync.c */ -extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags); - #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, struct flock __user *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, -- cgit v1.2.3 From eaff8079d4f1016a12e34ab323737314f24127dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Dec 2009 14:25:01 +0100 Subject: kill I_LOCK After I_SYNC was split from I_LOCK the leftover is always used together with I_NEW and thus superfluous. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/gfs2/inode.c | 2 +- fs/inode.c | 26 +++++++++++++------------- fs/jfs/jfs_txnmgr.c | 2 +- fs/ntfs/inode.c | 6 +++--- fs/ubifs/file.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 2 +- fs/xfs/xfs_iget.c | 4 ++-- include/linux/fs.h | 36 ++++++++++++++++-------------------- include/linux/writeback.h | 3 +-- 9 files changed, 39 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3ff32fa793da..6e220f4eee7d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -125,7 +125,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb, * directory entry when gfs2_inode_lookup() is invoked. Part of the code * segment inside gfs2_inode_lookup code needs to get moved around.
* - * Clean up I_LOCK and I_NEW as well. + * Clears I_NEW as well. **/ void gfs2_set_iop(struct inode *inode) diff --git a/fs/inode.c b/fs/inode.c index 06c1f02de611..03dfeb2e3928 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -113,7 +113,7 @@ static void wake_up_inode(struct inode *inode) * Prevent speculative execution through spin_unlock(&inode_lock); */ smp_mb(); - wake_up_bit(&inode->i_state, __I_LOCK); + wake_up_bit(&inode->i_state, __I_NEW); } /** @@ -690,17 +690,17 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock when clearing I_LOCK, + * This is special! We do not need the spinlock when clearing I_NEW, * because we're guaranteed that nobody else tries to do anything about * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_LOCK). + * there can be no old holders that haven't tested I_NEW). * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_LOCK after the other inode initialisation has + * see the clearing of I_NEW after the other inode initialisation has * completed. 
*/ smp_mb(); - WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); - inode->i_state &= ~(I_LOCK|I_NEW); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; wake_up_inode(inode); } EXPORT_SYMBOL(unlock_new_inode); @@ -731,7 +731,7 @@ static struct inode *get_new_inode(struct super_block *sb, goto set_failed; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -778,7 +778,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (!old) { inode->i_ino = ino; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1083,7 +1083,7 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1120,7 +1120,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; @@ -1510,7 +1510,7 @@ EXPORT_SYMBOL(inode_wait); * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * - * It doesn't matter if I_LOCK is not set initially, a call to + * It doesn't matter if I_NEW is not set initially, a call to * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. 
@@ -1518,8 +1518,8 @@ EXPORT_SYMBOL(inode_wait); static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - wq = bit_waitqueue(&inode->i_state, __I_LOCK); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); schedule(); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f26e4d03ada5..d945ea76b445 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ /* * I believe this code is no longer needed. Splitting I_LOCK - * into two bits, I_LOCK and I_SYNC should prevent this + * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9938034762cc..dc2505abb6d7 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -530,7 +530,7 @@ err_corrupt_attr: * the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * i_flags is set to 0 and we have no business touching it. Only an ioctl() * is allowed to write to them. We should of course be honouring them but @@ -1207,7 +1207,7 @@ err_out: * necessary fields in @vi as well as initializing the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will @@ -1474,7 +1474,7 @@ err_out: * normal directory inodes. 
* * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 39849f887e72..16a6444330ec 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -45,7 +45,7 @@ * * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> - * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not * set as well. However, UBIFS disables readahead. */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 1d5b298ba8b2..225946012d0b 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -794,7 +794,7 @@ xfs_setup_inode( struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW|I_LOCK; + inode->i_state = I_NEW; inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0de36c2a46f1..fa402a6bbbcf 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -91,7 +91,7 @@ xfs_inode_alloc( ip->i_new_size = 0; /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW|I_LOCK; + VFS_I(ip)->i_state = I_NEW; return ip; } @@ -217,7 +217,7 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; } else { /* If the VFS inode is being torn down, pause and try again. 
*/ if (!igrab(inode)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 77a975089d9a..cca191933ff6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1587,7 +1587,7 @@ struct super_operations { * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * - * Two bits are used for locking and completion notification, I_LOCK and I_SYNC. + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync(). i_atime is the usual cause. @@ -1596,8 +1596,14 @@ struct super_operations { * don't have to write inode on fdatasync() when only * mtime has changed in it. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_NEW get_new_inode() sets i_state to I_LOCK|I_NEW. Both - * are cleared by unlock_new_inode(), called from iget(). + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. @@ -1611,20 +1617,11 @@ struct super_operations { * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, - * if appropriate. I_LOCK is used for waiting. + * if appropriate. I_NEW is used for waiting. * - * I_LOCK Serves as both a mutex and completion notification. - * New inodes set I_LOCK. 
If two processes both create - * the same inode, one of them will release its inode and - * wait for I_LOCK to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_LOCK, without I_LOCK actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_SYNC Similar to I_LOCK, but limited in scope to writeback - * of inode dirty data. Having a separate lock for this - * purpose reduces latency and prevents some filesystem- - * specific deadlocks. + * I_SYNC Synchonized write of dirty inode data. The bits is + * set during data writeback, and cleared with a wakeup + * on the bit address once it is done. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on @@ -1633,13 +1630,12 @@ struct super_operations { #define I_DIRTY_SYNC 1 #define I_DIRTY_DATASYNC 2 #define I_DIRTY_PAGES 4 -#define I_NEW 8 +#define __I_NEW 3 +#define I_NEW (1 << __I_NEW) #define I_WILL_FREE 16 #define I_FREEING 32 #define I_CLEAR 64 -#define __I_LOCK 7 -#define I_LOCK (1 << __I_LOCK) -#define __I_SYNC 8 +#define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 705f01fe413a..c18c008f4bbf 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,8 +79,7 @@ void wakeup_flusher_threads(long nr_pages); static inline void wait_on_inode(struct inode *inode) { might_sleep(); - wait_on_bit(&inode->i_state, __I_LOCK, inode_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); } static inline void inode_sync_wait(struct inode *inode) { -- cgit v1.2.3 From a2770d86b33024f71df269fde2de096df89d6a48 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2009 12:51:05 -0800 Subject: Revert "fix mismerge with Trond's stuff 
(create_mnt_ns() export is gone now)" This reverts commit e9496ff46a20a8592fdc7bdaaf41b45eb808d310. Quoth Al: "it's dependent on a lot of other stuff not currently in mainline and badly broken with current fs/namespace.c. Sorry, badly out-of-order cherry-pick from old queue. PS: there's a large pending series reworking the refcounting and lifetime rules for vfsmounts that will, among other things, allow to rip a subtree away _without_ dissolving connections in it, to be garbage-collected when all active references are gone. It's considerably saner wrt "is the subtree busy" logics, but it's nowhere near being ready for merge at the moment; this changeset is one of the things becoming possible with that sucker, but it certainly shouldn't have been picked during this cycle. My apologies..." Noticed-by: Eric Paris Requested-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namespace.c | 3 ++- fs/nfs/super.c | 8 ++++++++ include/linux/mnt_namespace.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index faab1273281e..7d70d63ceb29 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2068,7 +2068,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) { struct mnt_namespace *new_ns; @@ -2080,6 +2080,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) } return new_ns; } +EXPORT_SYMBOL(create_mnt_ns); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d5b112bcf3de..ce907efc5508 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2648,13 +2648,21 @@ out_freepage: 
static int nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path, struct vfsmount *mnt_target) { + struct mnt_namespace *ns_private; struct nameidata nd; struct super_block *s; int ret; + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, export_path, LOOKUP_FOLLOW, &nd); + put_mnt_ns(ns_private); + if (ret != 0) goto out_err; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index d9ebf1037dfa..d74785c2393a 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -23,6 +23,7 @@ struct proc_mounts { struct fs_struct; +extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt); extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); -- cgit v1.2.3 From b6e3224fb20954f155e41ec5709b2ab70b50ae2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2009 13:23:24 -0800 Subject: Revert "task_struct: make journal_info conditional" This reverts commit e4c570c4cb7a95dbfafa3d016d2739bf3fdfe319, as requested by Alexey: "I think I gave a good enough arguments to not merge it. To iterate: * patch makes impossible to start using ext3 on EXT3_FS=n kernels without reboot. * this is done only for one pointer on task_struct" None of config options which define task_struct are tristate directly or effectively." 
Requested-by: Alexey Dobriyan Acked-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 4 ---- fs/btrfs/Kconfig | 1 - fs/ext4/Kconfig | 1 - fs/gfs2/Kconfig | 1 - fs/jbd/Kconfig | 1 - fs/jbd2/Kconfig | 1 - fs/nilfs2/Kconfig | 1 - fs/reiserfs/Kconfig | 1 - include/linux/init_task.h | 8 +------- include/linux/sched.h | 2 -- 10 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/Kconfig b/fs/Kconfig index f8fccaaad628..64d44efad7a5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -6,10 +6,6 @@ menu "File systems" if BLOCK -config FS_JOURNAL_INFO - bool - default n - source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 402afe0a0bfb..7bb3c020e570 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,7 +4,6 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE - select FS_JOURNAL_INFO help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e5f6774846e4..9acf7e808139 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,7 +2,6 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 - select FS_JOURNAL_INFO help This is the next generation of the ext3 filesystem. diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index b192c661caa6..4dcddf83326f 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -10,7 +10,6 @@ config GFS2_FS select SLOW_WORK select QUOTA select QUOTACTL - select FS_JOURNAL_INFO help A cluster filesystem. diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig index a8408983abd4..4e28beeed157 100644 --- a/fs/jbd/Kconfig +++ b/fs/jbd/Kconfig @@ -1,6 +1,5 @@ config JBD tristate - select FS_JOURNAL_INFO help This is a generic journalling layer for block devices. 
It is currently used by the ext3 file system, but it could also be diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 0f7d1ceafdfd..f32f346f4b0a 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,7 +1,6 @@ config JBD2 tristate select CRC32 - select FS_JOURNAL_INFO help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 1225af7b2166..251da07b2a1d 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -2,7 +2,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" depends on EXPERIMENTAL select CRC32 - select FS_JOURNAL_INFO help NILFS2 is a log-structured file system (LFS) supporting continuous snapshotting. In addition to versioning capability of the entire diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index ac7cd75c86f8..513f431038f9 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,7 +1,6 @@ config REISERFS_FS tristate "Reiserfs support" select CRC32 - select FS_JOURNAL_INFO help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5ed8b9c50355..abec69b63d7e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -111,12 +111,6 @@ extern struct cred init_cred; # define INIT_PERF_EVENTS(tsk) #endif -#ifdef CONFIG_FS_JOURNAL_INFO -#define INIT_JOURNAL_INFO .journal_info = NULL, -#else -#define INIT_JOURNAL_INFO -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -168,6 +162,7 @@ extern struct cred init_cred; .signal = {{0}}}, \ .blocked = {{0}}, \ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ + .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ @@ -178,7 +173,6 @@ extern struct cred init_cred; [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ - INIT_JOURNAL_INFO \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 244c287a5ac1..211ed32befbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1446,10 +1446,8 @@ struct task_struct { gfp_t lockdep_reclaim_gfp; #endif -#ifdef CONFIG_FS_JOURNAL_INFO /* journalling filesystem info */ void *journal_info; -#endif /* stacked block device info */ struct bio *bio_list, **bio_tail; -- cgit v1.2.3 From 5d0bb2c4238e333ae18c5cd23f75e02a3dac3519 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Thu, 17 Dec 2009 15:27:11 -0800 Subject: vt: don't export vt_kmsg_redirect() to userspace Fix following warning in linux-next by guarding the function definition (both the "extern" and the inline) with #ifdef __KERNEL__. usr/include/linux/vt.h:89: userspace cannot call function or variable defined in the kernel Introduced by commit 5ada918b82399eef3afd6a71e3637697d6bd719f ("vt: introduce and use vt_kmsg_redirect() function"). 
Signed-off-by: Bernhard Walle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vt.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vt.h b/include/linux/vt.h index 3fb9944e50a6..d5dd0bc408fd 100644 --- a/include/linux/vt.h +++ b/include/linux/vt.h @@ -84,6 +84,8 @@ struct vt_setactivate { #define VT_SETACTIVATE 0x560F /* Activate and set the mode of a console */ +#ifdef __KERNEL__ + #ifdef CONFIG_VT_CONSOLE extern int vt_kmsg_redirect(int new); @@ -97,6 +99,8 @@ static inline int vt_kmsg_redirect(int new) #endif +#endif /* __KERNEL__ */ + #define vt_get_kmsg_redirect() vt_kmsg_redirect(-1) #endif /* _LINUX_VT_H */ -- cgit v1.2.3 From f6151dfea21496d43dbaba32cfcd9c9f404769bc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Dec 2009 15:27:16 -0800 Subject: mm: introduce coredump parameter structure Introduce coredump parameter data structure (struct coredump_params) to simplify binfmt->core_dump() arguments. 
Signed-off-by: Masami Hiramatsu Suggested-by: Ingo Molnar Cc: Hidehiro Kawai Cc: Oleg Nesterov Cc: Roland McGrath Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 13 +++++++------ fs/binfmt_elf.c | 24 +++++++++++++----------- fs/binfmt_elf_fdpic.c | 29 +++++++++++++++-------------- fs/binfmt_flat.c | 6 +++--- fs/binfmt_som.c | 2 +- fs/exec.c | 38 +++++++++++++++++++++----------------- include/linux/binfmts.h | 10 +++++++++- 7 files changed, 69 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index b639dcf7c778..346b69405363 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -32,7 +32,7 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(struct coredump_params *cprm); static struct linux_binfmt aout_format = { .module = THIS_MODULE, @@ -89,8 +89,9 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(struct coredump_params *cprm) { + struct file *file = cprm->file; mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; @@ -108,16 +109,16 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = signr; - aout_dump_thread(regs, &dump); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. 
*/ - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize + 1) * PAGE_SIZE > limit) + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 97b6e9efeb7f..edd90c49003c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -45,7 +45,7 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, * don't even try. */ #ifdef CONFIG_ELF_CORE -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif @@ -1272,8 +1272,9 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, @@ -1901,7 +1902,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; @@ -1947,7 +1948,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un * notes. This also sets up the file header. 
*/ if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, signr, regs)) + &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2009,14 +2010,14 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un #endif /* write out the notes section */ - if (!write_note_info(&info, file, &foffset)) + if (!write_note_info(&info, cprm->file, &foffset)) goto end_coredump; - if (elf_coredump_extra_notes_write(file, &foffset)) + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) goto end_coredump; /* Align to page */ - if (!dump_seek(file, dataoff - foffset)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; @@ -2033,12 +2034,13 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - stop = ((size += PAGE_SIZE) > limit) || - !dump_write(file, kaddr, PAGE_SIZE); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); kunmap(page); page_cache_release(page); } else - stop = !dump_seek(file, PAGE_SIZE); + stop = !dump_seek(cprm->file, PAGE_SIZE); if (stop) goto end_coredump; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7b055385db8e..c25256a5c5b0 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -76,7 +76,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); #ifdef CONFIG_ELF_CORE -static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); +static int elf_fdpic_core_dump(struct coredump_params *cprm); #endif static struct linux_binfmt elf_fdpic_format = { @@ -1326,8 +1326,9 @@ static int writenote(struct memelfnote *men, struct file *file) #undef DUMP_WRITE #undef DUMP_SEEK -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define 
DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) @@ -1582,8 +1583,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, - struct file *file, unsigned long limit) +static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; @@ -1642,7 +1642,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto cleanup; #endif - if (signr) { + if (cprm->signr) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1661,14 +1661,14 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + sz = elf_dump_thread_status(cprm->signr, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, signr); - elf_core_copy_regs(&prstatus->pr_reg, regs); + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS @@ -1703,7 +1703,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* Try to dump the FPU. 
*/ if ((prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, regs, fpu))) + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) fill_note(notes + numnote++, "CORE", NT_PRFPREG, sizeof(*fpu), fpu); #ifdef ELF_CORE_COPY_XFPREGS @@ -1774,7 +1774,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, file)) + if (!writenote(notes + i, cprm->file)) goto end_coredump; /* write out the thread status notes section */ @@ -1783,14 +1783,15 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], file)) + if (!writenote(&tmp->notes[i], cprm->file)) goto end_coredump; } - if (!dump_seek(file, dataoff)) + if (!dump_seek(cprm->file, dataoff)) goto end_coredump; - if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + mm_flags) < 0) goto end_coredump; #ifdef ELF_CORE_WRITE_EXTRA_DATA diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a2796651e756..d4a00ea1054c 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, @@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. 
*/ -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) signr); + current->comm, current->pid, (int) cprm->signr); return(1); } diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index eff74b9c9e77..2a9b5330cc5e 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -43,7 +43,7 @@ static int load_som_library(struct file *); * don't even try. */ #if 0 -static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); +static int som_core_dump(struct coredump_params *cprm); #else #define som_core_dump NULL #endif diff --git a/fs/exec.c b/fs/exec.c index 77db9a97a773..632b02e34ec7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1763,17 +1763,20 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; struct inode * inode; - struct file * file; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; int ispipe = 0; - unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; int dump_count = 0; static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + }; audit_core_dumps(signr); @@ -1829,15 +1832,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (core_limit < binfmt->min_coredump)) + if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { - if (core_limit == 0) { + if (cprm.limit == 0) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * core_limit of 0 here as a speacial value. 
Any + * cprm.limit of 0 here as a speacial value. Any * non-zero limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash @@ -1870,25 +1873,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - core_limit = RLIM_INFINITY; + cprm.limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &file)) { + &cprm.file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); goto fail_dropcount; } } else - file = filp_open(corename, + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(file)) + if (IS_ERR(cprm.file)) goto fail_dropcount; - inode = file->f_path.dentry->d_inode; + inode = cprm.file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_path.dentry)) + if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) goto close_fail; /* AK: actually i see no reason to not allow this for named pipes etc., @@ -1901,21 +1904,22 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ if (inode->i_uid != current_fsuid()) goto close_fail; - if (!file->f_op) + if (!cprm.file->f_op) goto close_fail; - if (!file->f_op->write) + if (!cprm.file->f_op->write) goto close_fail; - if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) + if (!ispipe && + do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) goto close_fail; - retval = binfmt->core_dump(signr, regs, file, core_limit); + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; close_fail: if (ispipe && core_pipe_limit) - wait_for_dump_helpers(file); - filp_close(file, NULL); + wait_for_dump_helpers(cprm.file); + filp_close(cprm.file, NULL); fail_dropcount: if (dump_count) atomic_dec(&core_dump_count); diff 
--git a/include/linux/binfmts.h b/include/linux/binfmts.h index aece486ac734..cd4349bdc34e 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -68,6 +68,14 @@ struct linux_binprm{ #define BINPRM_MAX_RECURSION 4 +/* Function parameter for binfmt->coredump */ +struct coredump_params { + long signr; + struct pt_regs *regs; + struct file *file; + unsigned long limit; +}; + /* * This structure defines the functions that are used to load the binary formats that * linux accepts. @@ -77,7 +85,7 @@ struct linux_binfmt { struct module *module; int (*load_binary)(struct linux_binprm *, struct pt_regs * regs); int (*load_shlib)(struct file *); - int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); + int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ int hasvdso; }; -- cgit v1.2.3 From 925cc71e512a29e2594bcc17dc58d0a0e9c4d524 Mon Sep 17 00:00:00 2001 From: Robert Jennings Date: Thu, 17 Dec 2009 14:44:38 +0000 Subject: mm: Add notifier in pageblock isolation for balloon drivers Memory balloon drivers can allocate a large amount of memory which is not movable but could be freed to accommodate memory hotplug remove. Prior to calling the memory hotplug notifier chain the memory in the pageblock is isolated. Currently, if the migrate type is not MIGRATE_MOVABLE the isolation will not proceed, causing the memory removal for that page range to fail. Rather than failing pageblock isolation if the migratetype is not MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock, and not on the LRU, are owned by a registered balloon driver (or other entity) using a notifier chain. If all of the non-movable pages are owned by a balloon, they can be freed later through the memory notifier chain and the range can still be isolated in set_migratetype_isolate(). 
Signed-off-by: Robert Jennings Cc: Mel Gorman Cc: Ingo Molnar Cc: Brian King Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Gerald Schaefer Cc: KAMEZAWA Hiroyuki Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- drivers/base/memory.c | 19 +++++++++++++++++ include/linux/memory.h | 27 ++++++++++++++++++++++++ mm/page_alloc.c | 57 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 96 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c4c8f2e1dd15..d7d77d4a402c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_memory_notifier); +static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); + +int register_memory_isolate_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&memory_isolate_chain, nb); +} +EXPORT_SYMBOL(register_memory_isolate_notifier); + +void unregister_memory_isolate_notifier(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&memory_isolate_chain, nb); +} +EXPORT_SYMBOL(unregister_memory_isolate_notifier); + /* * register_memory - Setup a sysfs device for a memory block */ @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +int memory_isolate_notify(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&memory_isolate_chain, val, v); +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. 
diff --git a/include/linux/memory.h b/include/linux/memory.h index 37fa19b34ef5..1adfe779eb99 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -50,6 +50,19 @@ struct memory_notify { int status_change_nid; }; +/* + * During pageblock isolation, count the number of pages within the + * range [start_pfn, start_pfn + nr_pages) which are owned by code + * in the notifier chain. + */ +#define MEM_ISOLATE_COUNT (1<<0) + +struct memory_isolate_notify { + unsigned long start_pfn; /* Start of range to check */ + unsigned int nr_pages; /* # pages in range to check */ + unsigned int pages_found; /* # pages owned found by callbacks */ +}; + struct notifier_block; struct mem_section; @@ -76,14 +89,28 @@ static inline int memory_notify(unsigned long val, void *v) { return 0; } +static inline int register_memory_isolate_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb) +{ +} +static inline int memory_isolate_notify(unsigned long val, void *v) +{ + return 0; +} #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); +extern int register_memory_isolate_notifier(struct notifier_block *nb); +extern void unregister_memory_isolate_notifier(struct notifier_block *nb); extern int register_new_memory(int, struct mem_section *); extern int unregister_memory_section(struct mem_section *); extern int memory_dev_init(void); extern int remove_memory_block(unsigned long, struct mem_section *, int); extern int memory_notify(unsigned long val, void *v); +extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< #include #include +#include #include #include @@ -5008,23 +5009,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, int set_migratetype_isolate(struct page 
*page) { struct zone *zone; - unsigned long flags; + struct page *curr_page; + unsigned long flags, pfn, iter; + unsigned long immobile = 0; + struct memory_isolate_notify arg; + int notifier_ret; int ret = -EBUSY; int zone_idx; zone = page_zone(page); zone_idx = zone_idx(zone); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || + zone_idx == ZONE_MOVABLE) { + ret = 0; + goto out; + } + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + /* - * In future, more migrate types will be able to be isolation target. + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. 
*/ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && - zone_idx != ZONE_MOVABLE) + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret || !arg.pages_found) goto out; - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - ret = 0; + + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { + if (!pfn_valid_within(pfn)) + continue; + + curr_page = pfn_to_page(iter); + if (!page_count(curr_page) || PageLRU(curr_page)) + continue; + + immobile++; + } + + if (arg.pages_found == immobile) + ret = 0; + out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(); -- cgit v1.2.3 From 8c0414cd524e9f1c483ffb3ff1c2d860f5c567c8 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 3 Dec 2009 12:46:51 -0800 Subject: fiemap: Add new extent flag FIEMAP_EXTENT_SHARED Some filesystems may allow multiple files to point to a particular extent. This patch adds flag FIEMAP_EXTENT_SHARED to denote extents that are shared with other inodes. Signed-off-by: Sunil Mushran Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- include/linux/fiemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index 934e22d65801..d830747f5c0b 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -62,5 +62,7 @@ struct fiemap { #define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively * support extents. Result * merged for efficiency. */ +#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other + * files. 
*/ #endif /* _LINUX_FIEMAP_H */ -- cgit v1.2.3 From 622e99bf0d54c4517cb0524540cd77257db8621a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 18 Dec 2009 17:43:20 +0100 Subject: [S390] rename NT_PRXSTATUS to NT_S390_HIGHREGS The elf notes number for the upper register halves is s390 specific. Change the name of the elf notes to include S390. Signed-off-by: Martin Schwidefsky Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ptrace.c | 2 +- include/linux/elf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 653c6a178740..13815d39f7dd 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -959,7 +959,7 @@ static const struct user_regset s390_compat_regsets[] = { .set = s390_fpregs_set, }, [REGSET_GENERAL_EXTENDED] = { - .core_note_type = NT_PRXSTATUS, + .core_note_type = NT_S390_HIGH_GPRS, .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), diff --git a/include/linux/elf.h b/include/linux/elf.h index 90a4ed0ea0e5..0cc4d55151b7 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -361,7 +361,7 @@ typedef struct elf64_shdr { #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ -#define NT_PRXSTATUS 0x300 /* s390 upper register halves */ +#define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ /* Note header in a PT_NOTE section */ -- cgit v1.2.3 From 9dfc6e68bfe6ee452efb1a4e9ca26a9007f2b864 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:20 -0600 Subject: SLUB: Use this_cpu operations in slub Using per cpu allocations removes the need for the per cpu arrays in the kmem_cache struct. These could get quite big if we have to support systems with thousands of cpus. 
The use of this_cpu_xx operations results in: 1. The size of kmem_cache for SMP configuration shrinks since we will only need 1 pointer instead of NR_CPUS. The same pointer can be used by all processors. Reduces cache footprint of the allocator. 2. We can dynamically size kmem_cache according to the actual nodes in the system meaning less memory overhead for configurations that may potentially support up to 1k NUMA nodes / 4k cpus. 3. We can remove the diddle widdle with allocating and releasing of kmem_cache_cpu structures when bringing up and shutting down cpus. The cpu alloc logic will do it all for us. Removes some portions of the cpu hotplug functionality. 4. Fastpath performance increases since per cpu pointer lookups and address calculations are avoided. V7-V8 - Convert missed get_cpu_slab() under CONFIG_SLUB_STATS Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 6 +- mm/slub.c | 202 +++++++++++------------------------------------ 2 files changed, 49 insertions(+), 159 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1e14beb23f9b..17ebe0f89bf3 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -69,6 +69,7 @@ struct kmem_cache_order_objects { * Slab cache management. 
*/ struct kmem_cache { + struct kmem_cache_cpu *cpu_slab; /* Used for retriving partial slabs etc */ unsigned long flags; int size; /* The size of an object including meta data */ @@ -104,11 +105,6 @@ struct kmem_cache { int remote_node_defrag_ratio; struct kmem_cache_node *node[MAX_NUMNODES]; #endif -#ifdef CONFIG_SMP - struct kmem_cache_cpu *cpu_slab[NR_CPUS]; -#else - struct kmem_cache_cpu cpu_slab; -#endif }; /* diff --git a/mm/slub.c b/mm/slub.c index 8d71aaf888d7..d6c9ecf629d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -242,15 +242,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -1124,7 +1115,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; - stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + stat(this_cpu_ptr(s->cpu_slab), ORDER_FALLBACK); } if (kmemcheck_enabled @@ -1422,7 +1413,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); __ClearPageSlubFrozen(page); if (page->inuse) { @@ -1454,7 +1445,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + stat(__this_cpu_ptr(s->cpu_slab), FREE_SLAB); discard_slab(s, page); } } @@ -1507,7 +1498,7 @@ static inline void flush_slab(struct 
kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (likely(c && c->page)) flush_slab(s, c); @@ -1673,7 +1664,7 @@ new_slab: local_irq_disable(); if (new) { - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); stat(c, ALLOC_SLAB); if (c->page) flush_slab(s, c); @@ -1711,7 +1702,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned int objsize; + unsigned long objsize; gfpflags &= gfp_allowed_mask; @@ -1722,14 +1713,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, return NULL; local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); + object = c->freelist; objsize = c->objsize; - if (unlikely(!c->freelist || !node_match(c, node))) + if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = c->freelist; c->freelist = object[c->offset]; stat(c, ALLOC_FASTPATH); } @@ -1800,7 +1791,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void **object = (void *)x; struct kmem_cache_cpu *c; - c = get_cpu_slab(s, raw_smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); stat(c, FREE_SLOWPATH); slab_lock(page); @@ -1872,7 +1863,7 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); + c = __this_cpu_ptr(s->cpu_slab); kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) @@ -2095,130 +2086,28 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. 
- * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. - */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], - kmem_cache_cpu); - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) -{ - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); - - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } - - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; -} - -static void free_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} - -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - int cpu; - - 
for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); +static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[SLUB_PAGE_SHIFT]); - if (c) - continue; - - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } - return 1; -} - -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) - return; - - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); -} - -static void __init init_alloc_cpu(void) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { int cpu; - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } + if (s < kmalloc_caches + SLUB_PAGE_SHIFT && s >= kmalloc_caches) + /* + * Boot time creation of the kmalloc array. Use static per cpu data + * since the per cpu allocator is not available yet. 
+ */ + s->cpu_slab = per_cpu_var(kmalloc_percpu) + (s - kmalloc_caches); + else + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} + if (!s->cpu_slab) + return 0; -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); + for_each_possible_cpu(cpu) + init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); return 1; } -#endif #ifdef CONFIG_NUMA /* @@ -2609,9 +2498,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - + free_percpu(s->cpu_slab); /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2760,7 +2648,19 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) realsize = kmalloc_caches[index].objsize; text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (flags & __GFP_WAIT) + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + else { + int i; + + s = NULL; + for (i = 0; i < SLUB_PAGE_SHIFT; i++) + if (kmalloc_caches[i].size) { + s = kmalloc_caches + i; + break; + } + } /* * Must defer sysfs creation to a workqueue because we don't know @@ -3176,8 +3076,6 @@ void __init kmem_cache_init(void) int i; int caches = 0; - init_alloc_cpu(); - #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -3261,8 +3159,10 @@ void __init kmem_cache_init(void) #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#endif +#ifdef CONFIG_NUMA + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); #else kmem_size = sizeof(struct kmem_cache); #endif @@ -3365,7 +3265,7 @@ struct kmem_cache 
*kmem_cache_create(const char *name, size_t size, * per cpu structures */ for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; + per_cpu_ptr(s->cpu_slab, cpu)->objsize = s->objsize; s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); @@ -3422,11 +3322,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); + init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); up_read(&slub_lock); break; @@ -3436,13 +3334,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3928,7 +3822,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (!c || c->node < 0) continue; @@ -4353,7 +4247,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, cpu)->stat[si]; + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -4376,7 +4270,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ -- cgit v1.2.3 From 756dee75872a2a764b478e18076360b8a4ec9045 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:21 -0600 
Subject: SLUB: Get rid of dynamic DMA kmalloc cache allocation Dynamic DMA kmalloc cache allocation is troublesome since the new percpu allocator does not support allocations in atomic contexts. Reserve some statically allocated kmalloc_cpu structures instead. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 +++++++++++-------- mm/slub.c | 24 ++++++++++-------------- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 17ebe0f89bf3..a78fb4ac2015 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -131,11 +131,21 @@ struct kmem_cache { #define SLUB_PAGE_SHIFT (PAGE_SHIFT + 2) +#ifdef CONFIG_ZONE_DMA +#define SLUB_DMA __GFP_DMA +/* Reserve extra caches for potential DMA use */ +#define KMALLOC_CACHES (2 * SLUB_PAGE_SHIFT - 6) +#else +/* Disable DMA functionality */ +#define SLUB_DMA (__force gfp_t)0 +#define KMALLOC_CACHES SLUB_PAGE_SHIFT +#endif + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. 
*/ -extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; +extern struct kmem_cache kmalloc_caches[KMALLOC_CACHES]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -203,13 +213,6 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) return &kmalloc_caches[index]; } -#ifdef CONFIG_ZONE_DMA -#define SLUB_DMA __GFP_DMA -#else -/* Disable DMA functionality */ -#define SLUB_DMA (__force gfp_t)0 -#endif - void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); diff --git a/mm/slub.c b/mm/slub.c index d6c9ecf629d5..cdb7f0214af0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2092,7 +2092,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { int cpu; - if (s < kmalloc_caches + SLUB_PAGE_SHIFT && s >= kmalloc_caches) + if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) /* * Boot time creation of the kmalloc array. Use static per cpu data * since the per cpu allocator is not available yet. 
@@ -2539,7 +2539,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2629,6 +2629,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) char *text; size_t realsize; unsigned long slabflags; + int i; s = kmalloc_caches_dma[index]; if (s) @@ -2649,18 +2650,13 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - if (flags & __GFP_WAIT) - s = kmalloc(kmem_size, flags & ~SLUB_DMA); - else { - int i; + s = NULL; + for (i = 0; i < KMALLOC_CACHES; i++) + if (!kmalloc_caches[i].size) + break; - s = NULL; - for (i = 0; i < SLUB_PAGE_SHIFT; i++) - if (kmalloc_caches[i].size) { - s = kmalloc_caches + i; - break; - } - } + BUG_ON(i >= KMALLOC_CACHES); + s = kmalloc_caches + i; /* * Must defer sysfs creation to a workqueue because we don't know @@ -2674,7 +2670,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (!s || !text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - kfree(s); + s->size = 0; kfree(text); goto unlock_out; } -- cgit v1.2.3 From ff12059ed14b0773d7bbef86f98218ada6c20770 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Dec 2009 16:26:22 -0600 Subject: SLUB: this_cpu: Remove slub kmem_cache fields Remove the fields in struct kmem_cache_cpu that were used to cache data from struct kmem_cache when they were in different cachelines. The cacheline that holds the per cpu array pointer now also holds these values. We can cut down the struct kmem_cache_cpu size to almost half. 
The get_freepointer() and set_freepointer() functions that used to be only intended for the slow path now are also useful for the hot path since access to the size field does not require accessing an additional cacheline anymore. This results in consistent use of functions for setting the freepointer of objects throughout SLUB. Also we initialize all possible kmem_cache_cpu structures when a slab is created. No need to initialize them when a processor or node comes online. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 -- mm/slub.c | 76 +++++++++++------------------------------------- 2 files changed, 17 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a78fb4ac2015..0249d4175bac 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -38,8 +38,6 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to first free per cpu object */ struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ - unsigned int offset; /* Freepointer offset (in word units) */ - unsigned int objsize; /* Size of an object (from kmem_cache) */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif diff --git a/mm/slub.c b/mm/slub.c index cdb7f0214af0..30d2dde27563 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -260,13 +260,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. - * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. 
- */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -1473,10 +1466,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* Retrieve object from cpu_freelist */ object = c->freelist; - c->freelist = c->freelist[c->offset]; + c->freelist = get_freepointer(s, c->freelist); /* And put onto the regular freelist */ - object[c->offset] = page->freelist; + set_freepointer(s, object, page->freelist); page->freelist = object; page->inuse--; } @@ -1635,7 +1628,7 @@ load_freelist: if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); @@ -1681,7 +1674,7 @@ debug: goto another_slab; c->page->inuse++; - c->page->freelist = object[c->offset]; + c->page->freelist = get_freepointer(s, object); c->node = -1; goto unlock_out; } @@ -1702,7 +1695,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned long objsize; gfpflags &= gfp_allowed_mask; @@ -1715,22 +1707,21 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); object = c->freelist; - objsize = c->objsize; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); stat(c, ALLOC_FASTPATH); } local_irq_restore(flags); if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, objsize); + memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); - kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); return 
object; } @@ -1785,7 +1776,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; @@ -1799,7 +1790,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, goto debug; checks_ok: - prior = object[offset] = page->freelist; + prior = page->freelist; + set_freepointer(s, object, prior); page->freelist = object; page->inuse--; @@ -1864,16 +1856,16 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - kmemcheck_slab_free(s, object, c->objsize); - debug_check_no_locks_freed(object, c->objsize); + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, c->objsize); + debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; + set_freepointer(s, object, c->freelist); c->freelist = object; stat(c, FREE_FASTPATH); } else - __slab_free(s, page, x, addr, c->offset); + __slab_free(s, page, x, addr); local_irq_restore(flags); } @@ -2060,19 +2052,6 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -#ifdef CONFIG_SLUB_STATS - memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); -#endif -} - static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { @@ -2090,8 +2069,6 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[SLUB_PAGE_SHIFT]); static inline 
int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { - int cpu; - if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) /* * Boot time creation of the kmalloc array. Use static per cpu data @@ -2104,8 +2081,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) if (!s->cpu_slab) return 0; - for_each_possible_cpu(cpu) - init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); return 1; } @@ -2391,6 +2366,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) @@ -3247,22 +3223,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; - s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->objsize = s->objsize; - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); @@ -3316,14 +3282,6 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - init_kmem_cache_cpu(s, per_cpu_ptr(s->cpu_slab, cpu)); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: -- cgit v1.2.3 From 9a418af5df03ad133cd8c8f6742b75e542db6392 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 17 Dec 2009 13:55:48 +0100 Subject: mac80211: fix peer HT capabilities I noticed yesterday, because Jeff had noticed a speed regression, cf. 
bug http://bugzilla.intellinuxwireless.org/show_bug.cgi?id=2138 that the SM PS settings for peers were wrong. Instead of overwriting the SM PS settings with the local bits, we need to keep the remote bits. The bug was part of the original HT code from over two years ago, but unfortunately nobody noticed that it makes no sense -- we shouldn't be overwriting the peer's setting with our own but rather keep it intact when masking the peer capabilities with our own. While fixing that, I noticed that the masking of capabilities is completely useless for most of the bits, so also fix those other bits. Finally, I also noticed that PSMP_SUPPORT no longer exists in the final 802.11n version, so also remove that. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/rt2800lib.c | 3 +-- include/linux/ieee80211.h | 2 +- net/mac80211/ht.c | 25 ++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/rt2x00/rt2800lib.c b/drivers/net/wireless/rt2x00/rt2800lib.c index 6bf6c0f12f35..27bf887f1453 100644 --- a/drivers/net/wireless/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/rt2x00/rt2800lib.c @@ -2080,8 +2080,7 @@ int rt2800_probe_hw_mode(struct rt2x00_dev *rt2x00dev) IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_TX_STBC | - IEEE80211_HT_CAP_RX_STBC | - IEEE80211_HT_CAP_PSMP_SUPPORT; + IEEE80211_HT_CAP_RX_STBC; spec->ht.ampdu_factor = 3; spec->ht.ampdu_density = 4; spec->ht.mcs.tx_params = diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d9724a28c0c2..163c840437d6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -832,7 +832,7 @@ struct ieee80211_ht_cap { #define IEEE80211_HT_CAP_DELAY_BA 0x0400 #define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 #define IEEE80211_HT_CAP_DSSSCCK40 0x1000 -#define IEEE80211_HT_CAP_PSMP_SUPPORT 0x2000 +#define IEEE80211_HT_CAP_RESERVED 0x2000 #define 
IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 #define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 3787455fb696..d7dcee680728 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -34,9 +34,28 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, ht_cap->ht_supported = true; - ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info) & sband->ht_cap.cap; - ht_cap->cap &= ~IEEE80211_HT_CAP_SM_PS; - ht_cap->cap |= sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS; + /* + * The bits listed in this expression should be + * the same for the peer and us, if the station + * advertises more then we can't use those thus + * we mask them out. + */ + ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info) & + (sband->ht_cap.cap | + ~(IEEE80211_HT_CAP_LDPC_CODING | + IEEE80211_HT_CAP_SUP_WIDTH_20_40 | + IEEE80211_HT_CAP_GRN_FLD | + IEEE80211_HT_CAP_SGI_20 | + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_DSSSCCK40)); + /* + * The STBC bits are asymmetric -- if we don't have + * TX then mask out the peer's RX and vice versa. + */ + if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_TX_STBC)) + ht_cap->cap &= ~IEEE80211_HT_CAP_RX_STBC; + if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC)) + ht_cap->cap &= ~IEEE80211_HT_CAP_TX_STBC; ampdu_info = ht_cap_ie->ampdu_params_info; ht_cap->ampdu_factor = -- cgit v1.2.3 From 482928d59db668b8d82a48717f78986d8cea72e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:10:39 -0500 Subject: Fix f_flags/f_mode in case of lookup_instantiate_filp() from open(pathname, 3) Just set f_flags when shoving struct file into nameidata; don't postpone that until __dentry_open(). do_filp_open() has correct value; lookup_instantiate_filp() doesn't - we lose the difference between O_RDWR and 3 by that point. We still set .intent.open.flags, so no fs code needs to be changed. 
Signed-off-by: Al Viro --- fs/internal.h | 7 +++++++ fs/namei.c | 6 ++++-- fs/open.c | 13 ++++++------- include/linux/namei.h | 2 -- 4 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index f67cd141d9a8..e96a1667d749 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -85,3 +85,10 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); diff --git a/fs/namei.c b/fs/namei.c index dad4b80257db..d517f73aa36b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1640,6 +1640,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) return ERR_PTR(-ENFILE); nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = 0; error = do_path_lookup(dfd, pathname, @@ -1685,6 +1686,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) goto exit_parent; nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; @@ -1725,7 +1727,7 @@ do_last: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); @@ -1789,7 +1791,7 @@ ok: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); if (!IS_ERR(filp)) { error = ima_path_check(&filp->f_path, filp->f_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); diff --git a/fs/open.c b/fs/open.c index ca69241796bd..6daee28f6e8f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -821,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode, } static struct file *__dentry_open(struct dentry *dentry, 
struct vfsmount *mnt, - int flags, struct file *f, + struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; - f->f_flags = flags; - f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = (__force fmode_t)((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { @@ -930,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry if (IS_ERR(dentry)) goto out_err; nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.flags - 1, nd->intent.open.file, open, cred); out: @@ -949,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp); * * Note that this function destroys the original nameidata */ -struct file *nameidata_to_filp(struct nameidata *nd, int flags) +struct file *nameidata_to_filp(struct nameidata *nd) { const struct cred *cred = current_cred(); struct file *filp; @@ -958,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? 
*/ if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); else path_put(&nd->path); @@ -997,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(error); } - return __dentry_open(dentry, mnt, flags, f, NULL, cred); + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); } EXPORT_SYMBOL(dentry_open); diff --git a/include/linux/namei.h b/include/linux/namei.h index 028946750289..05b441d93642 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -72,8 +72,6 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)); -extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); -extern void release_open_intent(struct nameidata *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -- cgit v1.2.3 From 5300990c0370e804e49d9a59d928c5d53fb73487 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:15:07 -0500 Subject: Sanitize f_flags helpers * pull ACC_MODE to fs.h; we have several copies all over the place * nightmarish expression calculating f_mode by f_flags deserves a helper too (OPEN_FMODE(flags)) Signed-off-by: Al Viro --- fs/anon_inodes.c | 10 +--------- fs/namei.c | 2 -- fs/open.c | 2 +- include/linux/fs.h | 3 +++ kernel/auditsc.c | 1 - security/tomoyo/file.c | 1 - 6 files changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 598237e97221..9f0bf13291e5 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -89,19 +89,11 @@ struct file *anon_inode_getfile(const char *name, struct qstr this; struct path path; struct file *file; - fmode_t mode; int error; if (IS_ERR(anon_inode_inode)) return ERR_PTR(-ENODEV); - 
switch (flags & O_ACCMODE) { - case O_RDONLY: mode = FMODE_READ; break; - case O_WRONLY: mode = FMODE_WRITE; break; - case O_RDWR: mode = FMODE_READ | FMODE_WRITE; break; - default: return ERR_PTR(-EINVAL); - } - if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); @@ -129,7 +121,7 @@ struct file *anon_inode_getfile(const char *name, d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(&path, mode, fops); + file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; diff --git a/fs/namei.c b/fs/namei.c index d517f73aa36b..68921d9b5302 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -37,8 +37,6 @@ #include "internal.h" -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) - /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs diff --git a/fs/open.c b/fs/open.c index 6daee28f6e8f..040cef72bc00 100644 --- a/fs/open.c +++ b/fs/open.c @@ -828,7 +828,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, struct inode *inode; int error; - f->f_mode = (__force fmode_t)((f->f_flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { diff --git a/include/linux/fs.h b/include/linux/fs.h index cca191933ff6..9e13b533aaef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2464,5 +2464,8 @@ int proc_nr_files(struct ctl_table *table, int write, int __init get_filesystem_list(char *buf); +#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) + #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 267e484f0198..fc0f928167e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -250,7 
+250,6 @@ struct audit_context { #endif }; -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c index 8346938809b1..9a6c58881c0a 100644 --- a/security/tomoyo/file.c +++ b/security/tomoyo/file.c @@ -12,7 +12,6 @@ #include "common.h" #include "tomoyo.h" #include "realpath.h" -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) /* * tomoyo_globally_readable_file_entry is a structure which is used for holding -- cgit v1.2.3 From 95ebc3a7930d5965b00bbedbf36bfd3eb9124d65 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 28 Oct 2009 01:46:33 +0100 Subject: Remove obsolete comment in fs.h This question was determined to be a bug which was fixed in commit 4a3b0a49. Signed-off-by: Andreas Gruenbacher Cc: Jan Blunck Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e13b533aaef..7e3012e0ac06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1624,8 +1624,6 @@ struct super_operations { * on the bit address once it is done. * * Q: What is the difference between I_WILL_FREE and I_FREEING? - * Q: igrab() only checks on (I_FREEING|I_WILL_FREE). Should it also check on - * I_CLEAR? If not, why? */ #define I_DIRTY_SYNC 1 #define I_DIRTY_DATASYNC 2 -- cgit v1.2.3 From 0f78231bffb868a30e8533aace142213266bb811 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 1 Dec 2009 13:37:02 +0100 Subject: mac80211: enable spatial multiplexing powersave Enable spatial multiplexing in mac80211 by telling the driver what to do and, where necessary, sending action frames to the AP to update the requested SMPS mode. Also includes a trivial implementation for hwsim that just logs the requested mode. For now, the userspace interface is in debugfs only, and lets you toggle the requested mode at any time.
Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/mac80211_hwsim.c | 24 ++++++-- include/linux/ieee80211.h | 25 +++++++- include/net/mac80211.h | 59 ++++++++++++++++++ net/mac80211/cfg.c | 49 +++++++++++++++ net/mac80211/debugfs_netdev.c | 111 +++++++++++++++++++++++++++++++++- net/mac80211/driver-trace.h | 2 + net/mac80211/ht.c | 47 ++++++++++++++ net/mac80211/ieee80211_i.h | 14 +++++ net/mac80211/main.c | 24 ++++++++ net/mac80211/mlme.c | 63 +++++++++++++++++-- net/mac80211/status.c | 38 ++++++++++++ net/mac80211/util.c | 74 +++++++++++++++++++++++ 12 files changed, 518 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 88e41176e7fd..92c669ebb358 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -618,12 +618,26 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; - - printk(KERN_DEBUG "%s:%s (freq=%d idle=%d ps=%d)\n", + static const char *chantypes[4] = { + [NL80211_CHAN_NO_HT] = "noht", + [NL80211_CHAN_HT20] = "ht20", + [NL80211_CHAN_HT40MINUS] = "ht40-", + [NL80211_CHAN_HT40PLUS] = "ht40+", + }; + static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { + [IEEE80211_SMPS_AUTOMATIC] = "auto", + [IEEE80211_SMPS_OFF] = "off", + [IEEE80211_SMPS_STATIC] = "static", + [IEEE80211_SMPS_DYNAMIC] = "dynamic", + }; + + printk(KERN_DEBUG "%s:%s (freq=%d/%s idle=%d ps=%d smps=%s)\n", wiphy_name(hw->wiphy), __func__, conf->channel->center_freq, + chantypes[conf->channel_type], !!(conf->flags & IEEE80211_CONF_IDLE), - !!(conf->flags & IEEE80211_CONF_PS)); + !!(conf->flags & IEEE80211_CONF_PS), + smps_modes[conf->smps_mode]); data->idle = !!(conf->flags & IEEE80211_CONF_IDLE); @@ -1082,7 +1096,9 @@ static int __init init_mac80211_hwsim(void) BIT(NL80211_IFTYPE_MESH_POINT); 
hw->flags = IEEE80211_HW_MFP_CAPABLE | - IEEE80211_HW_SIGNAL_DBM; + IEEE80211_HW_SIGNAL_DBM | + IEEE80211_HW_SUPPORTS_STATIC_SMPS | + IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS; /* ask mac80211 to reserve space for magic */ hw->vif_data_size = sizeof(struct hwsim_vif_priv); diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d9724a28c0c2..e8d43d0ff2c3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -707,6 +707,10 @@ struct ieee80211_mgmt { u8 action; u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; } __attribute__ ((packed)) sa_query; + struct { + u8 action; + u8 smps_control; + } __attribute__ ((packed)) ht_smps; } u; } __attribute__ ((packed)) action; } u; @@ -824,6 +828,7 @@ struct ieee80211_ht_cap { #define IEEE80211_HT_CAP_LDPC_CODING 0x0001 #define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 #define IEEE80211_HT_CAP_SM_PS 0x000C +#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 #define IEEE80211_HT_CAP_GRN_FLD 0x0010 #define IEEE80211_HT_CAP_SGI_20 0x0020 #define IEEE80211_HT_CAP_SGI_40 0x0040 @@ -839,6 +844,7 @@ struct ieee80211_ht_cap { /* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ #define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 #define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C +#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 /* * Maximum length of AMPDU that the STA can receive. 
@@ -922,12 +928,17 @@ struct ieee80211_ht_info { #define IEEE80211_MAX_AMPDU_BUF 0x40 -/* Spatial Multiplexing Power Save Modes */ +/* Spatial Multiplexing Power Save Modes (for capability) */ #define WLAN_HT_CAP_SM_PS_STATIC 0 #define WLAN_HT_CAP_SM_PS_DYNAMIC 1 #define WLAN_HT_CAP_SM_PS_INVALID 2 #define WLAN_HT_CAP_SM_PS_DISABLED 3 +/* for SM power control field lower two bits */ +#define WLAN_HT_SMPS_CONTROL_DISABLED 0 +#define WLAN_HT_SMPS_CONTROL_STATIC 1 +#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -1150,6 +1161,18 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; +/* HT action codes */ +enum ieee80211_ht_actioncode { + WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, + WLAN_HT_ACTION_SMPS = 1, + WLAN_HT_ACTION_PSMP = 2, + WLAN_HT_ACTION_PCO_PHASE = 3, + WLAN_HT_ACTION_CSI = 4, + WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, + WLAN_HT_ACTION_COMPRESSED_BF = 6, + WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, +}; + /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e94cc526b0f6..e6b6bf81d5b9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -597,8 +597,10 @@ enum ieee80211_conf_flags { * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed + * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed */ enum ieee80211_conf_changed { + IEEE80211_CONF_CHANGE_SMPS = BIT(1), IEEE80211_CONF_CHANGE_LISTEN_INTERVAL = BIT(2), IEEE80211_CONF_CHANGE_MONITOR = BIT(3), IEEE80211_CONF_CHANGE_PS = BIT(4), @@ -608,6 +610,21 @@ enum ieee80211_conf_changed { IEEE80211_CONF_CHANGE_IDLE = BIT(8), }; +/** + * enum ieee80211_smps_mode - spatial multiplexing power save mode + * + * @ + */ +enum ieee80211_smps_mode { + IEEE80211_SMPS_AUTOMATIC, + 
IEEE80211_SMPS_OFF, + IEEE80211_SMPS_STATIC, + IEEE80211_SMPS_DYNAMIC, + + /* keep last */ + IEEE80211_SMPS_NUM_MODES, +}; + /** * struct ieee80211_conf - configuration of the device * @@ -636,6 +653,10 @@ enum ieee80211_conf_changed { * @short_frame_max_tx_count: Maximum number of transmissions for a "short" * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the * number of transmissions not the number of retries + * + * @smps_mode: spatial multiplexing powersave mode; note that + * %IEEE80211_SMPS_STATIC is used when the device is not + * configured for an HT channel */ struct ieee80211_conf { u32 flags; @@ -648,6 +669,7 @@ struct ieee80211_conf { struct ieee80211_channel *channel; enum nl80211_channel_type channel_type; + enum ieee80211_smps_mode smps_mode; }; /** @@ -930,6 +952,16 @@ enum ieee80211_tkip_key_type { * @IEEE80211_HW_BEACON_FILTER: * Hardware supports dropping of irrelevant beacon frames to * avoid waking up cpu. + * + * @IEEE80211_HW_SUPPORTS_STATIC_SMPS: + * Hardware supports static spatial multiplexing powersave, + * ie. can turn off all but one chain even on HT connections + * that should be using more chains. + * + * @IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS: + * Hardware supports dynamic spatial multiplexing powersave, + * ie. can turn off all but one chain and then wake the rest + * up as required after, for example, rts/cts handshake. */ enum ieee80211_hw_flags { IEEE80211_HW_HAS_RATE_CONTROL = 1<<0, @@ -947,6 +979,8 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS = 1<<12, IEEE80211_HW_MFP_CAPABLE = 1<<13, IEEE80211_HW_BEACON_FILTER = 1<<14, + IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, + IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, }; /** @@ -1214,6 +1248,31 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, * signal strength threshold checking. 
*/ +/** + * DOC: Spatial multiplexing power save + * + * SMPS (Spatial multiplexing power save) is a mechanism to conserve + * power in an 802.11n implementation. For details on the mechanism + * and rationale, please refer to 802.11 (as amended by 802.11n-2009) + * "11.2.3 SM power save". + * + * The mac80211 implementation is capable of sending action frames + * to update the AP about the station's SMPS mode, and will instruct + * the driver to enter the specific mode. It will also announce the + * requested SMPS mode during the association handshake. Hardware + * support for this feature is required, and can be indicated by + * hardware flags. + * + * The default mode will be "automatic", which nl80211/cfg80211 + * defines to be dynamic SMPS in (regular) powersave, and SMPS + * turned off otherwise. + * + * To support this feature, the driver must set the appropriate + * hardware support flags, and handle the SMPS flag to the config() + * operation. It will then with this mechanism be instructed to + * enter the requested SMPS mode while associated to an HT AP. + */ + /** * DOC: Frame filtering * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fcfa1bf776a7..8c35418d1c96 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1318,6 +1318,50 @@ static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len) } #endif +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + const u8 *ap; + enum ieee80211_smps_mode old_req; + int err; + + old_req = sdata->u.mgd.req_smps; + sdata->u.mgd.req_smps = smps_mode; + + if (old_req == smps_mode && + smps_mode != IEEE80211_SMPS_AUTOMATIC) + return 0; + + /* + * If not associated, or current association is not an HT + * association, there's no need to send an action frame. 
+ */ + if (!sdata->u.mgd.associated || + sdata->local->oper_channel_type == NL80211_CHAN_NO_HT) { + mutex_lock(&sdata->local->iflist_mtx); + ieee80211_recalc_smps(sdata->local, sdata); + mutex_unlock(&sdata->local->iflist_mtx); + return 0; + } + + ap = sdata->u.mgd.associated->cbss.bssid; + + if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (sdata->u.mgd.powersave) + smps_mode = IEEE80211_SMPS_DYNAMIC; + else + smps_mode = IEEE80211_SMPS_OFF; + } + + /* send SM PS frame to AP */ + err = ieee80211_send_smps_action(sdata, smps_mode, + ap, ap); + if (err) + sdata->u.mgd.req_smps = old_req; + + return err; +} + static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout) { @@ -1335,6 +1379,11 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, sdata->u.mgd.powersave = enabled; conf->dynamic_ps_timeout = timeout; + /* no change, but if automatic follow powersave */ + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.req_smps); + mutex_unlock(&sdata->u.mgd.mtx); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 5d9c797635a9..355983503885 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -41,6 +41,30 @@ static ssize_t ieee80211_if_read( return ret; } +static ssize_t ieee80211_if_write( + struct ieee80211_sub_if_data *sdata, + const char __user *userbuf, + size_t count, loff_t *ppos, + ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int)) +{ + u8 *buf; + ssize_t ret = -ENODEV; + + buf = kzalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + rtnl_lock(); + if (sdata->dev->reg_state == NETREG_REGISTERED) + ret = (*write)(sdata, buf, count); + rtnl_unlock(); + + return ret; +} + #define IEEE80211_IF_FMT(name, 
field, format_string) \ static ssize_t ieee80211_if_fmt_##name( \ const struct ieee80211_sub_if_data *sdata, char *buf, \ @@ -71,7 +95,7 @@ static ssize_t ieee80211_if_fmt_##name( \ return scnprintf(buf, buflen, "%pM\n", sdata->field); \ } -#define __IEEE80211_IF_FILE(name) \ +#define __IEEE80211_IF_FILE(name, _write) \ static ssize_t ieee80211_if_read_##name(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ @@ -82,12 +106,24 @@ static ssize_t ieee80211_if_read_##name(struct file *file, \ } \ static const struct file_operations name##_ops = { \ .read = ieee80211_if_read_##name, \ + .write = (_write), \ .open = mac80211_open_file_generic, \ } +#define __IEEE80211_IF_FILE_W(name) \ +static ssize_t ieee80211_if_write_##name(struct file *file, \ + const char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + return ieee80211_if_write(file->private_data, userbuf, count, \ + ppos, ieee80211_if_parse_##name); \ +} \ +__IEEE80211_IF_FILE(name, ieee80211_if_write_##name) + + #define IEEE80211_IF_FILE(name, field, format) \ IEEE80211_IF_FMT_##format(name, field) \ - __IEEE80211_IF_FILE(name) + __IEEE80211_IF_FILE(name, NULL) /* common attributes */ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); @@ -99,6 +135,70 @@ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); IEEE80211_IF_FILE(capab, u.mgd.capab, HEX); +static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_local *local = sdata->local; + int err; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + smps_mode == IEEE80211_SMPS_STATIC) + return -EINVAL; + + /* auto should be dynamic if in PS mode */ + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + (smps_mode == IEEE80211_SMPS_DYNAMIC || + smps_mode == IEEE80211_SMPS_AUTOMATIC)) + return -EINVAL; + + /* supported only on managed interfaces for now */ + if (sdata->vif.type != 
NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + mutex_lock(&local->iflist_mtx); + err = __ieee80211_request_smps(sdata, smps_mode); + mutex_unlock(&local->iflist_mtx); + + return err; +} + +static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { + [IEEE80211_SMPS_AUTOMATIC] = "auto", + [IEEE80211_SMPS_OFF] = "off", + [IEEE80211_SMPS_STATIC] = "static", + [IEEE80211_SMPS_DYNAMIC] = "dynamic", +}; + +static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + return snprintf(buf, buflen, "request: %s\nused: %s\n", + smps_modes[sdata->u.mgd.req_smps], + smps_modes[sdata->u.mgd.ap_smps]); +} + +static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata, + const char *buf, int buflen) +{ + enum ieee80211_smps_mode mode; + + for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) { + if (strncmp(buf, smps_modes[mode], buflen) == 0) { + int err = ieee80211_set_smps(sdata, mode); + if (!err) + return buflen; + return err; + } + } + + return -EINVAL; +} + +__IEEE80211_IF_FILE_W(smps); + /* AP attributes */ IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC); IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC); @@ -109,7 +209,7 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( return scnprintf(buf, buflen, "%u\n", skb_queue_len(&sdata->u.ap.ps_bc_buf)); } -__IEEE80211_IF_FILE(num_buffered_multicast); +__IEEE80211_IF_FILE(num_buffered_multicast, NULL); /* WDS attributes */ IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); @@ -158,6 +258,10 @@ IEEE80211_IF_FILE(dot11MeshHWMPRootMode, debugfs_create_file(#name, 0400, sdata->debugfs.dir, \ sdata, &name##_ops); +#define DEBUGFS_ADD_MODE(name, mode) \ + debugfs_create_file(#name, mode, sdata->debugfs.dir, \ + sdata, &name##_ops); + static void add_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(drop_unencrypted, sta); @@ -167,6 +271,7 @@ static void 
add_sta_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(bssid, sta); DEBUGFS_ADD(aid, sta); DEBUGFS_ADD(capab, sta); + DEBUGFS_ADD_MODE(smps, 0600); } static void add_ap_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index ee2d19a25ce1..7a849b920165 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -140,6 +140,7 @@ TRACE_EVENT(drv_config, __field(u8, short_frame_max_tx_count) __field(int, center_freq) __field(int, channel_type) + __field(int, smps) ), TP_fast_assign( @@ -155,6 +156,7 @@ TRACE_EVENT(drv_config, __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; __entry->center_freq = local->hw.conf.channel->center_freq; __entry->channel_type = local->hw.conf.channel_type; + __entry->smps = local->hw.conf.smps_mode; ), TP_printk( diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 45ebd062a2fb..63b8f86b7f16 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -166,3 +166,50 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, spin_unlock_bh(&sta->lock); } } + +int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *action_frame; + + /* 27 = header + category + action + smps mode */ + skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom); + action_frame = (void *)skb_put(skb, 27); + memcpy(action_frame->da, da, ETH_ALEN); + memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(action_frame->bssid, bssid, ETH_ALEN); + action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + action_frame->u.action.category = WLAN_CATEGORY_HT; + action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + switch (smps) { + case 
IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DISABLED; + break; + case IEEE80211_SMPS_STATIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_STATIC; + break; + case IEEE80211_SMPS_DYNAMIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DYNAMIC; + break; + } + + /* we'll do more on status of this frame */ + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); + + return 0; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 178e329f9257..e63aecbddfbe 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -297,6 +297,8 @@ struct ieee80211_if_managed { unsigned long timers_running; /* used for quiesce/restart */ bool powersave; /* powersave requested for this iface */ + enum ieee80211_smps_mode req_smps, /* requested smps mode */ + ap_smps; /* smps mode AP thinks we're in */ unsigned long request; @@ -587,6 +589,9 @@ struct ieee80211_local { /* used for uploading changed mc list */ struct work_struct reconfig_filter; + /* used to reconfigure hardware SM PS */ + struct work_struct recalc_smps; + /* aggregated multicast list */ struct dev_addr_list *mc_list; int mc_count; @@ -760,6 +765,8 @@ struct ieee80211_local { int user_power_level; /* in dBm */ int power_constr_level; /* in dBm */ + enum ieee80211_smps_mode smps_mode; + struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS @@ -978,6 +985,9 @@ void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u1 void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code); +int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid); void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data 
*sdata, u8 *da, u16 tid, u16 initiator, u16 reason); @@ -1088,6 +1098,10 @@ void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_local *local, struct ieee802_11_elems *elems, enum ieee80211_band band); +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode); +void ieee80211_recalc_smps(struct ieee80211_local *local, + struct ieee80211_sub_if_data *forsdata); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 98320a94c270..e1293e8ed83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -113,6 +113,18 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) changed |= IEEE80211_CONF_CHANGE_CHANNEL; } + if (!conf_is_ht(&local->hw.conf)) { + /* + * mac80211.h documents that this is only valid + * when the channel is set to an HT type, and + * that otherwise STATIC is used. + */ + local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC; + } else if (local->hw.conf.smps_mode != local->smps_mode) { + local->hw.conf.smps_mode = local->smps_mode; + changed |= IEEE80211_CONF_CHANGE_SMPS; + } + if (scan_chan) power = chan->max_power; else @@ -297,6 +309,16 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_restart_hw); +static void ieee80211_recalc_smps_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, recalc_smps); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_smps(local, NULL); + mutex_unlock(&local->iflist_mtx); +} + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -370,6 +392,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, INIT_WORK(&local->restart_work, ieee80211_restart_work); INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter); + INIT_WORK(&local->recalc_smps, ieee80211_recalc_smps_work); + 
local->smps_mode = IEEE80211_SMPS_OFF; INIT_WORK(&local->dynamic_ps_enable_work, ieee80211_dynamic_ps_enable_work); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cd5dcc3d8c2b..0a762a9ba4df 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -398,6 +398,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, __le16 tmp; u32 flags = local->hw.conf.channel->flags; + /* determine capability flags */ + switch (ht_info->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: if (flags & IEEE80211_CHAN_NO_HT40PLUS) { @@ -413,17 +415,64 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, break; } - tmp = cpu_to_le16(cap); - pos = skb_put(skb, sizeof(struct ieee80211_ht_cap)+2); + /* set SM PS mode properly */ + cap &= ~IEEE80211_HT_CAP_SM_PS; + /* new association always uses requested smps mode */ + if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + ifmgd->ap_smps = IEEE80211_SMPS_DYNAMIC; + else + ifmgd->ap_smps = IEEE80211_SMPS_OFF; + } else + ifmgd->ap_smps = ifmgd->req_smps; + + switch (ifmgd->ap_smps) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_STATIC: + cap |= WLAN_HT_CAP_SM_PS_STATIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + } + + /* reserve and fill IE */ + + pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); + + /* capability flags */ + tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); - /* TODO: needs a define here for << 2 */ + + /* AMPDU parameters */ *pos++ = sband->ht_cap.ampdu_factor | - 
(sband->ht_cap.ampdu_density << 2); + (sband->ht_cap.ampdu_density << + IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); + + /* MCS set */ memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); + pos += sizeof(sband->ht_cap.mcs); + + /* extended capabilities */ + pos += sizeof(__le16); + + /* BF capabilities */ + pos += sizeof(__le32); + + /* antenna selection */ + pos += sizeof(u8); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; @@ -932,6 +981,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->iflist_mtx); ieee80211_recalc_ps(local, -1); + ieee80211_recalc_smps(local, sdata); mutex_unlock(&local->iflist_mtx); netif_start_queue(sdata->dev); @@ -2327,6 +2377,11 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->flags |= IEEE80211_STA_WMM_ENABLED; mutex_init(&ifmgd->mtx); + + if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC; + else + ifmgd->req_smps = IEEE80211_SMPS_OFF; } /* scan finished notification */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index b4608f11a40f..0c0850d37dda 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -134,6 +134,40 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, dev_kfree_skb(skb); } +static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *) skb->data; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + + if (ieee80211_is_action(mgmt->frame_control) && + sdata->vif.type == NL80211_IFTYPE_STATION && + mgmt->u.action.category == WLAN_CATEGORY_HT && + mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS) { + /* + * This update looks racy, but isn't -- if we come + * here we've definitely got a station that we're + * talking to, and on a managed interface that can + * only be the AP. 
And the only other place updating + * this variable is before we're associated. + */ + switch (mgmt->u.action.u.ht_smps.smps_control) { + case WLAN_HT_SMPS_CONTROL_DYNAMIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_SMPS_CONTROL_STATIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_SMPS_CONTROL_DISABLED: + default: /* shouldn't happen since we don't send that */ + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_OFF; + break; + } + + ieee80211_queue_work(&local->hw, &local->recalc_smps); + } +} + void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct sk_buff *skb2; @@ -210,6 +244,10 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) rate_control_tx_status(local, sband, sta, skb); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, skb); + + if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && + (info->flags & IEEE80211_TX_STAT_ACK)) + ieee80211_frame_acked(sta, skb); } rcu_read_unlock(); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d54dbe8e09ba..086ef6257b4b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1170,3 +1170,77 @@ int ieee80211_reconfig(struct ieee80211_local *local) return 0; } +static int check_mgd_smps(struct ieee80211_if_managed *ifmgd, + enum ieee80211_smps_mode *smps_mode) +{ + if (ifmgd->associated) { + *smps_mode = ifmgd->ap_smps; + + if (*smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + *smps_mode = IEEE80211_SMPS_DYNAMIC; + else + *smps_mode = IEEE80211_SMPS_OFF; + } + + return 1; + } + + return 0; +} + +/* must hold iflist_mtx */ +void ieee80211_recalc_smps(struct ieee80211_local *local, + struct ieee80211_sub_if_data *forsdata) +{ + struct ieee80211_sub_if_data *sdata; + enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF; + int count = 0; + + if (forsdata) + WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx)); + + WARN_ON(!mutex_is_locked(&local->iflist_mtx)); + 
+ /* + * This function could be improved to handle multiple + * interfaces better, but right now it makes any + * non-station interfaces force SM PS to be turned + * off. If there are multiple station interfaces it + * could also use the best possible mode, e.g. if + * one is in static and the other in dynamic then + * dynamic is ok. + */ + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!netif_running(sdata->dev)) + continue; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + goto set; + if (sdata != forsdata) { + /* + * This nested is ok -- we are holding the iflist_mtx + * so can't get here twice or so. But it's required + * since normally we acquire it first and then the + * iflist_mtx. + */ + mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING); + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); + } else + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + + if (count > 1) { + smps_mode = IEEE80211_SMPS_OFF; + break; + } + } + + if (smps_mode == local->smps_mode) + return; + + set: + local->smps_mode = smps_mode; + /* changed flag is auto-detected for this */ + ieee80211_hw_config(local, 0); +} -- cgit v1.2.3 From 9da3e068142ec7856b2f13261dcf0660fad32b61 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Mon, 7 Dec 2009 15:57:50 -0500 Subject: mac80211: only bother printing highest data rate on debugfs if its set IEEE-802.11n spec says the RX highest data rate field does not specify the highest supported RX data rate if its not set. Ignore it if not set then. Refer to section 7.3.56.4 Cc: johannes@sipsolutions.net Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. 
Linville --- include/linux/ieee80211.h | 5 ++++- net/mac80211/debugfs_sta.c | 12 ++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e8d43d0ff2c3..098bedcde9bb 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -775,7 +775,10 @@ struct ieee80211_bar { /** * struct ieee80211_mcs_info - MCS information * @rx_mask: RX mask - * @rx_highest: highest supported RX rate + * @rx_highest: highest supported RX rate. If set represents + * the highest supported RX data rate in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported. * @tx_params: TX parameters */ struct ieee80211_mcs_info { diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index c833b6ce9902..0d4a759ba72c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -218,11 +218,19 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n", htc->ampdu_factor, htc->ampdu_density); p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:"); + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) p += scnprintf(p, sizeof(buf)+buf-p, " %.2x", htc->mcs.rx_mask[i]); - p += scnprintf(p, sizeof(buf)+buf-p, "\nMCS rx highest: %d\n", - le16_to_cpu(htc->mcs.rx_highest)); + p += scnprintf(p, sizeof(buf)+buf-p, "\n"); + + /* If not set this is meaningless */ + if (le16_to_cpu(htc->mcs.rx_highest)) { + p += scnprintf(p, sizeof(buf)+buf-p, + "MCS rx highest: %d Mbps\n", + le16_to_cpu(htc->mcs.rx_highest)); + } + p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n", htc->mcs.tx_params); } -- cgit v1.2.3 From 45465487897a1c6d508b14b904dc5777f7ec7e04 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:26 -0800 Subject: kfifo: move struct kfifo in place This is a new generic kernel FIFO 
implementation. The current kernel fifo API is not very widely used, because it has too many constraints. Only 17 files in the current 2.6.31-rc5 used it. FIFOs are, like lists, a very basic thing, and a kfifo API which handles most use cases would save a lot of development time and memory resources. I think these are the reasons why kfifo is not in use: - The API is too simple, important functions are missing - A fifo can be only allocated dynamically - There is a requirement of a spinlock whether you need it or not - There is no support for data records inside a fifo So I decided to extend the kfifo in a more generic way without blowing up the API too much. The new API has the following benefits: - Generic usage: For kernel internal use and/or device driver. - Provide an API for most use cases. - Slim API: The whole API provides 25 functions. - Linux style habit. - DECLARE_KFIFO, DEFINE_KFIFO and INIT_KFIFO Macros - Direct copy_to_user from the fifo and copy_from_user into the fifo. - The kfifo itself is an in place member of the using data structure, this saves an indirection access and does not waste the kernel allocator. - Lockless access: if only one reader and one writer is active on the fifo, which is the common use case, no additional locking is necessary. - Remove spinlock - give the user the freedom of choice what kind of locking to use if one is required. - Ability to handle records. Three types of records are supported: - Variable length records between 0-255 bytes, with a record size field of 1 byte. - Variable length records between 0-65535 bytes, with a record size field of 2 bytes. - Fixed size records, with no record size field. - Preserve memory resource. - Performance! - Easy to use! This patch: Since most users want to have the kfifo as part of another object, reorganize the code to allow including struct kfifo in another data structure.
This requires changing the kfifo_alloc and kfifo_init prototypes so that we pass an existing kfifo pointer into them. This patch changes the implementation and all existing users. [akpm@linux-foundation.org: fix warning] Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/nozomi.c | 21 +++++----- drivers/char/sonypi.c | 40 +++++++++--------- drivers/infiniband/hw/cxgb3/cxio_hal.h | 9 ++-- drivers/infiniband/hw/cxgb3/cxio_resource.c | 60 +++++++++++++------------- drivers/media/video/meye.c | 48 ++++++++++----------- drivers/media/video/meye.h | 4 +- drivers/net/wireless/libertas/cmd.c | 4 +- drivers/net/wireless/libertas/dev.h | 4 +- drivers/net/wireless/libertas/main.c | 16 ++++--- drivers/platform/x86/fujitsu-laptop.c | 18 ++++---- drivers/platform/x86/sony-laptop.c | 46 ++++++++++---------- drivers/scsi/libiscsi.c | 22 ++++------ drivers/scsi/libiscsi_tcp.c | 29 +++++++------ drivers/scsi/libsrp.c | 13 +++--- drivers/usb/host/fhci-sched.c | 10 ++--- drivers/usb/host/fhci-tds.c | 35 ++++++++-------- drivers/usb/host/fhci.h | 10 ++--- drivers/usb/serial/usb-serial.c | 5 +-- include/linux/kfifo.h | 11 ++--- include/scsi/libiscsi.h | 3 +- include/scsi/libiscsi_tcp.h | 2 +- include/scsi/libsrp.h | 2 +- kernel/kfifo.c | 65 +++++++++++++++-------------- net/dccp/probe.c | 20 ++++----- 24 files changed, 238 insertions(+), 259 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index d3400b20444f..0f39bec28b45 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -358,7 +358,7 @@ struct port { u8 update_flow_control; struct ctrl_ul ctrl_ul; struct ctrl_dl ctrl_dl; - struct kfifo *fifo_ul; + struct kfifo fifo_ul; void __iomem *dl_addr[2]; u32 dl_size[2]; u8 toggle_dl; @@ -685,8 +685,8 @@ static int nozomi_read_config_table(struct nozomi *dc) 
dump_table(dc); for (i = PORT_MDM; i < MAX_PORT; i++) { - dc->port[i].fifo_ul = - kfifo_alloc(FIFO_BUFFER_SIZE_UL, GFP_ATOMIC, NULL); + kfifo_alloc(&dc->port[i].fifo_ul, + FIFO_BUFFER_SIZE_UL, GFP_ATOMIC, NULL); memset(&dc->port[i].ctrl_dl, 0, sizeof(struct ctrl_dl)); memset(&dc->port[i].ctrl_ul, 0, sizeof(struct ctrl_ul)); } @@ -798,7 +798,7 @@ static int send_data(enum port_type index, struct nozomi *dc) struct tty_struct *tty = tty_port_tty_get(&port->port); /* Get data from tty and place in buf for now */ - size = __kfifo_get(port->fifo_ul, dc->send_buf, + size = __kfifo_get(&port->fifo_ul, dc->send_buf, ul_size < SEND_BUF_MAX ? ul_size : SEND_BUF_MAX); if (size == 0) { @@ -988,11 +988,11 @@ static int receive_flow_control(struct nozomi *dc) } else if (old_ctrl.CTS == 0 && ctrl_dl.CTS == 1) { - if (__kfifo_len(dc->port[port].fifo_ul)) { + if (__kfifo_len(&dc->port[port].fifo_ul)) { DBG1("Enable interrupt (0x%04X) on port: %d", enable_ier, port); DBG1("Data in buffer [%d], enable transmit! 
", - __kfifo_len(dc->port[port].fifo_ul)); + __kfifo_len(&dc->port[port].fifo_ul)); enable_transmit_ul(port, dc); } else { DBG1("No data in buffer..."); @@ -1536,8 +1536,7 @@ static void __devexit nozomi_card_exit(struct pci_dev *pdev) free_irq(pdev->irq, dc); for (i = 0; i < MAX_PORT; i++) - if (dc->port[i].fifo_ul) - kfifo_free(dc->port[i].fifo_ul); + kfifo_free(&dc->port[i].fifo_ul); kfree(dc->send_buf); @@ -1673,7 +1672,7 @@ static int ntty_write(struct tty_struct *tty, const unsigned char *buffer, goto exit; } - rval = __kfifo_put(port->fifo_ul, (unsigned char *)buffer, count); + rval = __kfifo_put(&port->fifo_ul, (unsigned char *)buffer, count); /* notify card */ if (unlikely(dc == NULL)) { @@ -1721,7 +1720,7 @@ static int ntty_write_room(struct tty_struct *tty) if (!port->port.count) goto exit; - room = port->fifo_ul->size - __kfifo_len(port->fifo_ul); + room = port->fifo_ul.size - __kfifo_len(&port->fifo_ul); exit: mutex_unlock(&port->tty_sem); @@ -1878,7 +1877,7 @@ static s32 ntty_chars_in_buffer(struct tty_struct *tty) goto exit_in_buffer; } - rval = __kfifo_len(port->fifo_ul); + rval = __kfifo_len(&port->fifo_ul); exit_in_buffer: return rval; diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 8c262aaf7c26..9e6efb1f029f 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -487,7 +487,7 @@ static struct sonypi_device { int camera_power; int bluetooth_power; struct mutex lock; - struct kfifo *fifo; + struct kfifo fifo; spinlock_t fifo_lock; wait_queue_head_t fifo_proc_list; struct fasync_struct *fifo_async; @@ -496,7 +496,7 @@ static struct sonypi_device { struct input_dev *input_jog_dev; struct input_dev *input_key_dev; struct work_struct input_work; - struct kfifo *input_fifo; + struct kfifo input_fifo; spinlock_t input_fifo_lock; } sonypi_device; @@ -777,7 +777,7 @@ static void input_keyrelease(struct work_struct *work) { struct sonypi_keypress kp; - while (kfifo_get(sonypi_device.input_fifo, (unsigned char *)&kp, + while 
(kfifo_get(&sonypi_device.input_fifo, (unsigned char *)&kp, sizeof(kp)) == sizeof(kp)) { msleep(10); input_report_key(kp.dev, kp.key, 0); @@ -827,7 +827,7 @@ static void sonypi_report_input_event(u8 event) if (kp.dev) { input_report_key(kp.dev, kp.key, 1); input_sync(kp.dev); - kfifo_put(sonypi_device.input_fifo, + kfifo_put(&sonypi_device.input_fifo, (unsigned char *)&kp, sizeof(kp)); schedule_work(&sonypi_device.input_work); } @@ -880,7 +880,7 @@ found: acpi_bus_generate_proc_event(sonypi_acpi_device, 1, event); #endif - kfifo_put(sonypi_device.fifo, (unsigned char *)&event, sizeof(event)); + kfifo_put(&sonypi_device.fifo, (unsigned char *)&event, sizeof(event)); kill_fasync(&sonypi_device.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_device.fifo_proc_list); @@ -906,7 +906,7 @@ static int sonypi_misc_open(struct inode *inode, struct file *file) mutex_lock(&sonypi_device.lock); /* Flush input queue on first open */ if (!sonypi_device.open_count) - kfifo_reset(sonypi_device.fifo); + kfifo_reset(&sonypi_device.fifo); sonypi_device.open_count++; mutex_unlock(&sonypi_device.lock); unlock_kernel(); @@ -919,17 +919,17 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, ssize_t ret; unsigned char c; - if ((kfifo_len(sonypi_device.fifo) == 0) && + if ((kfifo_len(&sonypi_device.fifo) == 0) && (file->f_flags & O_NONBLOCK)) return -EAGAIN; ret = wait_event_interruptible(sonypi_device.fifo_proc_list, - kfifo_len(sonypi_device.fifo) != 0); + kfifo_len(&sonypi_device.fifo) != 0); if (ret) return ret; while (ret < count && - (kfifo_get(sonypi_device.fifo, &c, sizeof(c)) == sizeof(c))) { + (kfifo_get(&sonypi_device.fifo, &c, sizeof(c)) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; ret++; @@ -946,7 +946,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, static unsigned int sonypi_misc_poll(struct file *file, poll_table *wait) { poll_wait(file, &sonypi_device.fifo_proc_list, wait); - if 
(kfifo_len(sonypi_device.fifo)) + if (kfifo_len(&sonypi_device.fifo)) return POLLIN | POLLRDNORM; return 0; } @@ -1313,11 +1313,11 @@ static int __devinit sonypi_probe(struct platform_device *dev) "http://www.linux.it/~malattia/wiki/index.php/Sony_drivers\n"); spin_lock_init(&sonypi_device.fifo_lock); - sonypi_device.fifo = kfifo_alloc(SONYPI_BUF_SIZE, GFP_KERNEL, + error = kfifo_alloc(&sonypi_device.fifo, SONYPI_BUF_SIZE, GFP_KERNEL, &sonypi_device.fifo_lock); - if (IS_ERR(sonypi_device.fifo)) { + if (error) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); - return PTR_ERR(sonypi_device.fifo); + return error; } init_waitqueue_head(&sonypi_device.fifo_proc_list); @@ -1393,12 +1393,10 @@ static int __devinit sonypi_probe(struct platform_device *dev) } spin_lock_init(&sonypi_device.input_fifo_lock); - sonypi_device.input_fifo = - kfifo_alloc(SONYPI_BUF_SIZE, GFP_KERNEL, - &sonypi_device.input_fifo_lock); - if (IS_ERR(sonypi_device.input_fifo)) { + error = kfifo_alloc(&sonypi_device.input_fifo, SONYPI_BUF_SIZE, + GFP_KERNEL, &sonypi_device.input_fifo_lock); + if (error) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); - error = PTR_ERR(sonypi_device.input_fifo); goto err_inpdev_unregister; } @@ -1423,7 +1421,7 @@ static int __devinit sonypi_probe(struct platform_device *dev) pci_disable_device(pcidev); err_put_pcidev: pci_dev_put(pcidev); - kfifo_free(sonypi_device.fifo); + kfifo_free(&sonypi_device.fifo); return error; } @@ -1438,7 +1436,7 @@ static int __devexit sonypi_remove(struct platform_device *dev) if (useinput) { input_unregister_device(sonypi_device.input_key_dev); input_unregister_device(sonypi_device.input_jog_dev); - kfifo_free(sonypi_device.input_fifo); + kfifo_free(&sonypi_device.input_fifo); } misc_deregister(&sonypi_misc_device); @@ -1451,7 +1449,7 @@ static int __devexit sonypi_remove(struct platform_device *dev) pci_dev_put(sonypi_device.dev); } - kfifo_free(sonypi_device.fifo); + kfifo_free(&sonypi_device.fifo); return 0; } diff --git 
a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index bfd03bf8be54..f3d440cc68f2 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -34,6 +34,7 @@ #include #include +#include #include "t3_cpl.h" #include "t3cdev.h" @@ -75,13 +76,13 @@ struct cxio_hal_ctrl_qp { }; struct cxio_hal_resource { - struct kfifo *tpt_fifo; + struct kfifo tpt_fifo; spinlock_t tpt_fifo_lock; - struct kfifo *qpid_fifo; + struct kfifo qpid_fifo; spinlock_t qpid_fifo_lock; - struct kfifo *cqid_fifo; + struct kfifo cqid_fifo; spinlock_t cqid_fifo_lock; - struct kfifo *pdid_fifo; + struct kfifo pdid_fifo; spinlock_t pdid_fifo_lock; }; diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index bd233c087653..65072bdfc1bf 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -39,12 +39,12 @@ #include "cxio_resource.h" #include "cxio_hal.h" -static struct kfifo *rhdl_fifo; +static struct kfifo rhdl_fifo; static spinlock_t rhdl_fifo_lock; #define RANDOM_SIZE 16 -static int __cxio_init_resource_fifo(struct kfifo **fifo, +static int __cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t *fifo_lock, u32 nr, u32 skip_low, u32 skip_high, @@ -55,12 +55,11 @@ static int __cxio_init_resource_fifo(struct kfifo **fifo, u32 rarray[16]; spin_lock_init(fifo_lock); - *fifo = kfifo_alloc(nr * sizeof(u32), GFP_KERNEL, fifo_lock); - if (IS_ERR(*fifo)) + if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL, fifo_lock)) return -ENOMEM; for (i = 0; i < skip_low + skip_high; i++) - __kfifo_put(*fifo, (unsigned char *) &entry, sizeof(u32)); + __kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)); if (random) { j = 0; random_bytes = random32(); @@ -72,33 +71,33 @@ static int __cxio_init_resource_fifo(struct kfifo **fifo, random_bytes = random32(); } idx = (random_bytes >> (j * 2)) & 0xF; - __kfifo_put(*fifo, + __kfifo_put(fifo, 
(unsigned char *) &rarray[idx], sizeof(u32)); rarray[idx] = i; j++; } for (i = 0; i < RANDOM_SIZE; i++) - __kfifo_put(*fifo, + __kfifo_put(fifo, (unsigned char *) &rarray[i], sizeof(u32)); } else for (i = skip_low; i < nr - skip_high; i++) - __kfifo_put(*fifo, (unsigned char *) &i, sizeof(u32)); + __kfifo_put(fifo, (unsigned char *) &i, sizeof(u32)); for (i = 0; i < skip_low + skip_high; i++) - kfifo_get(*fifo, (unsigned char *) &entry, sizeof(u32)); + kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32)); return 0; } -static int cxio_init_resource_fifo(struct kfifo **fifo, spinlock_t * fifo_lock, +static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock, u32 nr, u32 skip_low, u32 skip_high) { return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, skip_high, 0)); } -static int cxio_init_resource_fifo_random(struct kfifo **fifo, +static int cxio_init_resource_fifo_random(struct kfifo *fifo, spinlock_t * fifo_lock, u32 nr, u32 skip_low, u32 skip_high) { @@ -113,15 +112,14 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); - rdev_p->rscp->qpid_fifo = kfifo_alloc(T3_MAX_NUM_QP * sizeof(u32), + if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32), GFP_KERNEL, - &rdev_p->rscp->qpid_fifo_lock); - if (IS_ERR(rdev_p->rscp->qpid_fifo)) + &rdev_p->rscp->qpid_fifo_lock)) return -ENOMEM; for (i = 16; i < T3_MAX_NUM_QP; i++) if (!(i & rdev_p->qpmask)) - __kfifo_put(rdev_p->rscp->qpid_fifo, + __kfifo_put(&rdev_p->rscp->qpid_fifo, (unsigned char *) &i, sizeof(u32)); return 0; } @@ -134,7 +132,7 @@ int cxio_hal_init_rhdl_resource(u32 nr_rhdl) void cxio_hal_destroy_rhdl_resource(void) { - kfifo_free(rhdl_fifo); + kfifo_free(&rhdl_fifo); } /* nr_* must be power of 2 */ @@ -167,11 +165,11 @@ int cxio_hal_init_resource(struct cxio_rdev *rdev_p, goto pdid_err; return 0; pdid_err: - kfifo_free(rscp->cqid_fifo); + kfifo_free(&rscp->cqid_fifo); cqid_err: - kfifo_free(rscp->qpid_fifo); + 
kfifo_free(&rscp->qpid_fifo); qpid_err: - kfifo_free(rscp->tpt_fifo); + kfifo_free(&rscp->tpt_fifo); tpt_err: return -ENOMEM; } @@ -195,17 +193,17 @@ static void cxio_hal_put_resource(struct kfifo *fifo, u32 entry) u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(rscp->tpt_fifo); + return cxio_hal_get_resource(&rscp->tpt_fifo); } void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) { - cxio_hal_put_resource(rscp->tpt_fifo, stag); + cxio_hal_put_resource(&rscp->tpt_fifo, stag); } u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) { - u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo); + u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo); PDBG("%s qpid 0x%x\n", __func__, qpid); return qpid; } @@ -213,35 +211,35 @@ u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) { PDBG("%s qpid 0x%x\n", __func__, qpid); - cxio_hal_put_resource(rscp->qpid_fifo, qpid); + cxio_hal_put_resource(&rscp->qpid_fifo, qpid); } u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(rscp->cqid_fifo); + return cxio_hal_get_resource(&rscp->cqid_fifo); } void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) { - cxio_hal_put_resource(rscp->cqid_fifo, cqid); + cxio_hal_put_resource(&rscp->cqid_fifo, cqid); } u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(rscp->pdid_fifo); + return cxio_hal_get_resource(&rscp->pdid_fifo); } void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) { - cxio_hal_put_resource(rscp->pdid_fifo, pdid); + cxio_hal_put_resource(&rscp->pdid_fifo, pdid); } void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) { - kfifo_free(rscp->tpt_fifo); - kfifo_free(rscp->cqid_fifo); - kfifo_free(rscp->qpid_fifo); - kfifo_free(rscp->pdid_fifo); + kfifo_free(&rscp->tpt_fifo); + kfifo_free(&rscp->cqid_fifo); + kfifo_free(&rscp->qpid_fifo); + kfifo_free(&rscp->pdid_fifo); 
kfree(rscp); } diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 6ffa64cd1c6d..dacbbb839b9e 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -800,7 +800,7 @@ again: return IRQ_HANDLED; if (meye.mchip_mode == MCHIP_HIC_MODE_CONT_OUT) { - if (kfifo_get(meye.grabq, (unsigned char *)&reqnr, + if (kfifo_get(&meye.grabq, (unsigned char *)&reqnr, sizeof(int)) != sizeof(int)) { mchip_free_frame(); return IRQ_HANDLED; @@ -811,7 +811,7 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put(meye.doneq, (unsigned char *)&reqnr, sizeof(int)); + kfifo_put(&meye.doneq, (unsigned char *)&reqnr, sizeof(int)); wake_up_interruptible(&meye.proc_list); } else { int size; @@ -820,7 +820,7 @@ again: mchip_free_frame(); goto again; } - if (kfifo_get(meye.grabq, (unsigned char *)&reqnr, + if (kfifo_get(&meye.grabq, (unsigned char *)&reqnr, sizeof(int)) != sizeof(int)) { mchip_free_frame(); goto again; @@ -831,7 +831,7 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put(meye.doneq, (unsigned char *)&reqnr, sizeof(int)); + kfifo_put(&meye.doneq, (unsigned char *)&reqnr, sizeof(int)); wake_up_interruptible(&meye.proc_list); } mchip_free_frame(); @@ -859,8 +859,8 @@ static int meye_open(struct file *file) for (i = 0; i < MEYE_MAX_BUFNBRS; i++) meye.grab_buffer[i].state = MEYE_BUF_UNUSED; - kfifo_reset(meye.grabq); - kfifo_reset(meye.doneq); + kfifo_reset(&meye.grabq); + kfifo_reset(&meye.doneq); return 0; } @@ -933,7 +933,7 @@ static int meyeioc_qbuf_capt(int *nb) mchip_cont_compression_start(); meye.grab_buffer[*nb].state = MEYE_BUF_USING; - kfifo_put(meye.grabq, (unsigned char *)nb, sizeof(int)); + kfifo_put(&meye.grabq, (unsigned char *)nb, sizeof(int)); mutex_unlock(&meye.lock); return 0; @@ -965,7 +965,7 @@ 
static int meyeioc_sync(struct file *file, void *fh, int *i) /* fall through */ case MEYE_BUF_DONE: meye.grab_buffer[*i].state = MEYE_BUF_UNUSED; - kfifo_get(meye.doneq, (unsigned char *)&unused, sizeof(int)); + kfifo_get(&meye.doneq, (unsigned char *)&unused, sizeof(int)); } *i = meye.grab_buffer[*i].size; mutex_unlock(&meye.lock); @@ -1452,7 +1452,7 @@ static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) buf->flags |= V4L2_BUF_FLAG_QUEUED; buf->flags &= ~V4L2_BUF_FLAG_DONE; meye.grab_buffer[buf->index].state = MEYE_BUF_USING; - kfifo_put(meye.grabq, (unsigned char *)&buf->index, sizeof(int)); + kfifo_put(&meye.grabq, (unsigned char *)&buf->index, sizeof(int)); mutex_unlock(&meye.lock); return 0; @@ -1467,18 +1467,18 @@ static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) mutex_lock(&meye.lock); - if (kfifo_len(meye.doneq) == 0 && file->f_flags & O_NONBLOCK) { + if (kfifo_len(&meye.doneq) == 0 && file->f_flags & O_NONBLOCK) { mutex_unlock(&meye.lock); return -EAGAIN; } if (wait_event_interruptible(meye.proc_list, - kfifo_len(meye.doneq) != 0) < 0) { + kfifo_len(&meye.doneq) != 0) < 0) { mutex_unlock(&meye.lock); return -EINTR; } - if (!kfifo_get(meye.doneq, (unsigned char *)&reqnr, + if (!kfifo_get(&meye.doneq, (unsigned char *)&reqnr, sizeof(int))) { mutex_unlock(&meye.lock); return -EBUSY; @@ -1529,8 +1529,8 @@ static int vidioc_streamoff(struct file *file, void *fh, enum v4l2_buf_type i) { mutex_lock(&meye.lock); mchip_hic_stop(); - kfifo_reset(meye.grabq); - kfifo_reset(meye.doneq); + kfifo_reset(&meye.grabq); + kfifo_reset(&meye.doneq); for (i = 0; i < MEYE_MAX_BUFNBRS; i++) meye.grab_buffer[i].state = MEYE_BUF_UNUSED; @@ -1572,7 +1572,7 @@ static unsigned int meye_poll(struct file *file, poll_table *wait) mutex_lock(&meye.lock); poll_wait(file, &meye.proc_list, wait); - if (kfifo_len(meye.doneq)) + if (kfifo_len(&meye.doneq)) res = POLLIN | POLLRDNORM; mutex_unlock(&meye.lock); return res; @@ -1745,16 +1745,14 
@@ static int __devinit meye_probe(struct pci_dev *pcidev, } spin_lock_init(&meye.grabq_lock); - meye.grabq = kfifo_alloc(sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, - &meye.grabq_lock); - if (IS_ERR(meye.grabq)) { + if (kfifo_alloc(&meye.grabq, sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, + &meye.grabq_lock)) { printk(KERN_ERR "meye: fifo allocation failed\n"); goto outkfifoalloc1; } spin_lock_init(&meye.doneq_lock); - meye.doneq = kfifo_alloc(sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, - &meye.doneq_lock); - if (IS_ERR(meye.doneq)) { + if (kfifo_alloc(&meye.doneq, sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, + &meye.doneq_lock)) { printk(KERN_ERR "meye: fifo allocation failed\n"); goto outkfifoalloc2; } @@ -1868,9 +1866,9 @@ outregions: outenabledev: sony_pic_camera_command(SONY_PIC_COMMAND_SETCAMERA, 0); outsonypienable: - kfifo_free(meye.doneq); + kfifo_free(&meye.doneq); outkfifoalloc2: - kfifo_free(meye.grabq); + kfifo_free(&meye.grabq); outkfifoalloc1: vfree(meye.grab_temp); outvmalloc: @@ -1901,8 +1899,8 @@ static void __devexit meye_remove(struct pci_dev *pcidev) sony_pic_camera_command(SONY_PIC_COMMAND_SETCAMERA, 0); - kfifo_free(meye.doneq); - kfifo_free(meye.grabq); + kfifo_free(&meye.doneq); + kfifo_free(&meye.grabq); vfree(meye.grab_temp); diff --git a/drivers/media/video/meye.h b/drivers/media/video/meye.h index 5f70a106ba2b..1321ad5d6597 100644 --- a/drivers/media/video/meye.h +++ b/drivers/media/video/meye.h @@ -303,9 +303,9 @@ struct meye { struct meye_grab_buffer grab_buffer[MEYE_MAX_BUFNBRS]; int vma_use_count[MEYE_MAX_BUFNBRS]; /* mmap count */ struct mutex lock; /* mutex for open/mmap... 
*/ - struct kfifo *grabq; /* queue for buffers to be grabbed */ + struct kfifo grabq; /* queue for buffers to be grabbed */ spinlock_t grabq_lock; /* lock protecting the queue */ - struct kfifo *doneq; /* queue for grabbed buffers */ + struct kfifo doneq; /* queue for grabbed buffers */ spinlock_t doneq_lock; /* lock protecting the queue */ wait_queue_head_t proc_list; /* wait queue */ struct video_device *video_dev; /* video device parameters */ diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index b9b371bfa30f..ffed17f4f506 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1365,7 +1365,7 @@ static void lbs_send_confirmsleep(struct lbs_private *priv) priv->dnld_sent = DNLD_RES_RECEIVED; /* If nothing to do, go back to sleep (?) */ - if (!__kfifo_len(priv->event_fifo) && !priv->resp_len[priv->resp_idx]) + if (!__kfifo_len(&priv->event_fifo) && !priv->resp_len[priv->resp_idx]) priv->psstate = PS_STATE_SLEEP; spin_unlock_irqrestore(&priv->driver_lock, flags); @@ -1439,7 +1439,7 @@ void lbs_ps_confirm_sleep(struct lbs_private *priv) } /* Pending events or command responses? 
*/ - if (__kfifo_len(priv->event_fifo) || priv->resp_len[priv->resp_idx]) { + if (__kfifo_len(&priv->event_fifo) || priv->resp_len[priv->resp_idx]) { allowed = 0; lbs_deb_host("pending events or command responses\n"); } diff --git a/drivers/net/wireless/libertas/dev.h b/drivers/net/wireless/libertas/dev.h index 6a8d2b291d8c..05bb298dfae9 100644 --- a/drivers/net/wireless/libertas/dev.h +++ b/drivers/net/wireless/libertas/dev.h @@ -10,7 +10,7 @@ #include "scan.h" #include "assoc.h" - +#include /** sleep_params */ struct sleep_params { @@ -120,7 +120,7 @@ struct lbs_private { u32 resp_len[2]; /* Events sent from hardware to driver */ - struct kfifo *event_fifo; + struct kfifo event_fifo; /** thread to service interrupts */ struct task_struct *main_thread; diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index db38a5a719fa..403909287414 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -459,7 +459,7 @@ static int lbs_thread(void *data) else if (!list_empty(&priv->cmdpendingq) && !(priv->wakeup_dev_required)) shouldsleep = 0; /* We have a command to send */ - else if (__kfifo_len(priv->event_fifo)) + else if (__kfifo_len(&priv->event_fifo)) shouldsleep = 0; /* We have an event to process */ else shouldsleep = 1; /* No command */ @@ -511,9 +511,9 @@ static int lbs_thread(void *data) /* Process hardware events, e.g. 
card removed, link lost */ spin_lock_irq(&priv->driver_lock); - while (__kfifo_len(priv->event_fifo)) { + while (__kfifo_len(&priv->event_fifo)) { u32 event; - __kfifo_get(priv->event_fifo, (unsigned char *) &event, + __kfifo_get(&priv->event_fifo, (unsigned char *) &event, sizeof(event)); spin_unlock_irq(&priv->driver_lock); lbs_process_event(priv, event); @@ -883,10 +883,9 @@ static int lbs_init_adapter(struct lbs_private *priv) priv->resp_len[0] = priv->resp_len[1] = 0; /* Create the event FIFO */ - priv->event_fifo = kfifo_alloc(sizeof(u32) * 16, GFP_KERNEL, NULL); - if (IS_ERR(priv->event_fifo)) { + ret = kfifo_alloc(&priv->event_fifo, sizeof(u32) * 16, GFP_KERNEL, NULL); + if (ret) { lbs_pr_err("Out of memory allocating event FIFO buffer\n"); - ret = -ENOMEM; goto out; } @@ -901,8 +900,7 @@ static void lbs_free_adapter(struct lbs_private *priv) lbs_deb_enter(LBS_DEB_MAIN); lbs_free_cmd_buffer(priv); - if (priv->event_fifo) - kfifo_free(priv->event_fifo); + kfifo_free(&priv->event_fifo); del_timer(&priv->command_timer); del_timer(&priv->auto_deepsleep_timer); kfree(priv->networks); @@ -1177,7 +1175,7 @@ void lbs_queue_event(struct lbs_private *priv, u32 event) if (priv->psstate == PS_STATE_SLEEP) priv->psstate = PS_STATE_AWAKE; - __kfifo_put(priv->event_fifo, (unsigned char *) &event, sizeof(u32)); + __kfifo_put(&priv->event_fifo, (unsigned char *) &event, sizeof(u32)); wake_up_interruptible(&priv->waitq); diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index bcd4ba8be7db..f999fba0e25e 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -164,7 +164,7 @@ struct fujitsu_hotkey_t { struct input_dev *input; char phys[32]; struct platform_device *pf_device; - struct kfifo *fifo; + struct kfifo fifo; spinlock_t fifo_lock; int rfkill_supported; int rfkill_state; @@ -824,12 +824,10 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) /* kfifo */ 
spin_lock_init(&fujitsu_hotkey->fifo_lock); - fujitsu_hotkey->fifo = - kfifo_alloc(RINGBUFFERSIZE * sizeof(int), GFP_KERNEL, - &fujitsu_hotkey->fifo_lock); - if (IS_ERR(fujitsu_hotkey->fifo)) { + error = kfifo_alloc(&fujitsu_hotkey->fifo, RINGBUFFERSIZE * sizeof(int), + GFP_KERNEL, &fujitsu_hotkey->fifo_lock); + if (error) { printk(KERN_ERR "kfifo_alloc failed\n"); - error = PTR_ERR(fujitsu_hotkey->fifo); goto err_stop; } @@ -934,7 +932,7 @@ err_unregister_input_dev: err_free_input_dev: input_free_device(input); err_free_fifo: - kfifo_free(fujitsu_hotkey->fifo); + kfifo_free(&fujitsu_hotkey->fifo); err_stop: return result; } @@ -956,7 +954,7 @@ static int acpi_fujitsu_hotkey_remove(struct acpi_device *device, int type) input_free_device(input); - kfifo_free(fujitsu_hotkey->fifo); + kfifo_free(&fujitsu_hotkey->fifo); fujitsu_hotkey->acpi_handle = NULL; @@ -1008,7 +1006,7 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) vdbg_printk(FUJLAPTOP_DBG_TRACE, "Push keycode into ringbuffer [%d]\n", keycode); - status = kfifo_put(fujitsu_hotkey->fifo, + status = kfifo_put(&fujitsu_hotkey->fifo, (unsigned char *)&keycode, sizeof(keycode)); if (status != sizeof(keycode)) { @@ -1022,7 +1020,7 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) } else if (keycode == 0) { while ((status = kfifo_get - (fujitsu_hotkey->fifo, (unsigned char *) + (&fujitsu_hotkey->fifo, (unsigned char *) &keycode_r, sizeof (keycode_r))) == sizeof(keycode_r)) { diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 7a2cc8a5c975..04625a048e74 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -142,7 +142,7 @@ struct sony_laptop_input_s { atomic_t users; struct input_dev *jog_dev; struct input_dev *key_dev; - struct kfifo *fifo; + struct kfifo fifo; spinlock_t fifo_lock; struct workqueue_struct *wq; }; @@ -300,7 +300,7 @@ static void do_sony_laptop_release_key(struct 
work_struct *work) { struct sony_laptop_keypress kp; - while (kfifo_get(sony_laptop_input.fifo, (unsigned char *)&kp, + while (kfifo_get(&sony_laptop_input.fifo, (unsigned char *)&kp, sizeof(kp)) == sizeof(kp)) { msleep(10); input_report_key(kp.dev, kp.key, 0); @@ -362,7 +362,7 @@ static void sony_laptop_report_input_event(u8 event) /* we emit the scancode so we can always remap the key */ input_event(kp.dev, EV_MSC, MSC_SCAN, event); input_sync(kp.dev); - kfifo_put(sony_laptop_input.fifo, + kfifo_put(&sony_laptop_input.fifo, (unsigned char *)&kp, sizeof(kp)); if (!work_pending(&sony_laptop_release_key_work)) @@ -385,12 +385,11 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device) /* kfifo */ spin_lock_init(&sony_laptop_input.fifo_lock); - sony_laptop_input.fifo = - kfifo_alloc(SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, + error = + kfifo_alloc(&sony_laptop_input.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, &sony_laptop_input.fifo_lock); - if (IS_ERR(sony_laptop_input.fifo)) { + if (error) { printk(KERN_ERR DRV_PFX "kfifo_alloc failed\n"); - error = PTR_ERR(sony_laptop_input.fifo); goto err_dec_users; } @@ -474,7 +473,7 @@ err_destroy_wq: destroy_workqueue(sony_laptop_input.wq); err_free_kfifo: - kfifo_free(sony_laptop_input.fifo); + kfifo_free(&sony_laptop_input.fifo); err_dec_users: atomic_dec(&sony_laptop_input.users); @@ -500,7 +499,7 @@ static void sony_laptop_remove_input(void) } destroy_workqueue(sony_laptop_input.wq); - kfifo_free(sony_laptop_input.fifo); + kfifo_free(&sony_laptop_input.fifo); } /*********** Platform Device ***********/ @@ -2079,7 +2078,7 @@ static struct attribute_group spic_attribute_group = { struct sonypi_compat_s { struct fasync_struct *fifo_async; - struct kfifo *fifo; + struct kfifo fifo; spinlock_t fifo_lock; wait_queue_head_t fifo_proc_list; atomic_t open_count; @@ -2104,12 +2103,12 @@ static int sonypi_misc_open(struct inode *inode, struct file *file) /* Flush input queue on first open */ unsigned long flags; - 
spin_lock_irqsave(sonypi_compat.fifo->lock, flags); + spin_lock_irqsave(&sonypi_compat.fifo_lock, flags); if (atomic_inc_return(&sonypi_compat.open_count) == 1) - __kfifo_reset(sonypi_compat.fifo); + __kfifo_reset(&sonypi_compat.fifo); - spin_unlock_irqrestore(sonypi_compat.fifo->lock, flags); + spin_unlock_irqrestore(&sonypi_compat.fifo_lock, flags); return 0; } @@ -2120,17 +2119,17 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, ssize_t ret; unsigned char c; - if ((kfifo_len(sonypi_compat.fifo) == 0) && + if ((kfifo_len(&sonypi_compat.fifo) == 0) && (file->f_flags & O_NONBLOCK)) return -EAGAIN; ret = wait_event_interruptible(sonypi_compat.fifo_proc_list, - kfifo_len(sonypi_compat.fifo) != 0); + kfifo_len(&sonypi_compat.fifo) != 0); if (ret) return ret; while (ret < count && - (kfifo_get(sonypi_compat.fifo, &c, sizeof(c)) == sizeof(c))) { + (kfifo_get(&sonypi_compat.fifo, &c, sizeof(c)) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; ret++; @@ -2147,7 +2146,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, static unsigned int sonypi_misc_poll(struct file *file, poll_table *wait) { poll_wait(file, &sonypi_compat.fifo_proc_list, wait); - if (kfifo_len(sonypi_compat.fifo)) + if (kfifo_len(&sonypi_compat.fifo)) return POLLIN | POLLRDNORM; return 0; } @@ -2309,7 +2308,7 @@ static struct miscdevice sonypi_misc_device = { static void sonypi_compat_report_event(u8 event) { - kfifo_put(sonypi_compat.fifo, (unsigned char *)&event, sizeof(event)); + kfifo_put(&sonypi_compat.fifo, (unsigned char *)&event, sizeof(event)); kill_fasync(&sonypi_compat.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_compat.fifo_proc_list); } @@ -2319,11 +2318,12 @@ static int sonypi_compat_init(void) int error; spin_lock_init(&sonypi_compat.fifo_lock); - sonypi_compat.fifo = kfifo_alloc(SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, + error = + kfifo_alloc(&sonypi_compat.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, &sonypi_compat.fifo_lock); - if 
(IS_ERR(sonypi_compat.fifo)) { + if (error) { printk(KERN_ERR DRV_PFX "kfifo_alloc failed\n"); - return PTR_ERR(sonypi_compat.fifo); + return error; } init_waitqueue_head(&sonypi_compat.fifo_proc_list); @@ -2342,14 +2342,14 @@ static int sonypi_compat_init(void) return 0; err_free_kfifo: - kfifo_free(sonypi_compat.fifo); + kfifo_free(&sonypi_compat.fifo); return error; } static void sonypi_compat_exit(void) { misc_deregister(&sonypi_misc_device); - kfifo_free(sonypi_compat.fifo); + kfifo_free(&sonypi_compat.fifo); } #else static int sonypi_compat_init(void) { return 0; } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index b7689f3d05f5..cf0aa7e90be9 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -517,7 +517,7 @@ static void iscsi_free_task(struct iscsi_task *task) if (conn->login_task == task) return; - __kfifo_put(session->cmdpool.queue, (void*)&task, sizeof(void*)); + __kfifo_put(&session->cmdpool.queue, (void*)&task, sizeof(void*)); if (sc) { task->sc = NULL; @@ -737,7 +737,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, BUG_ON(conn->c_stage == ISCSI_CONN_INITIAL_STAGE); BUG_ON(conn->c_stage == ISCSI_CONN_STOPPED); - if (!__kfifo_get(session->cmdpool.queue, + if (!__kfifo_get(&session->cmdpool.queue, (void*)&task, sizeof(void*))) return NULL; } @@ -1567,7 +1567,7 @@ static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn, { struct iscsi_task *task; - if (!__kfifo_get(conn->session->cmdpool.queue, + if (!__kfifo_get(&conn->session->cmdpool.queue, (void *) &task, sizeof(void *))) return NULL; @@ -2461,12 +2461,7 @@ iscsi_pool_init(struct iscsi_pool *q, int max, void ***items, int item_size) if (q->pool == NULL) return -ENOMEM; - q->queue = kfifo_init((void*)q->pool, max * sizeof(void*), - GFP_KERNEL, NULL); - if (IS_ERR(q->queue)) { - q->queue = NULL; - goto enomem; - } + kfifo_init(&q->queue, (void*)q->pool, max * sizeof(void*), NULL); for (i = 0; i < max; i++) { q->pool[i] = 
kzalloc(item_size, GFP_KERNEL); @@ -2474,7 +2469,7 @@ iscsi_pool_init(struct iscsi_pool *q, int max, void ***items, int item_size) q->max = i; goto enomem; } - __kfifo_put(q->queue, (void*)&q->pool[i], sizeof(void*)); + __kfifo_put(&q->queue, (void*)&q->pool[i], sizeof(void*)); } if (items) { @@ -2497,7 +2492,6 @@ void iscsi_pool_free(struct iscsi_pool *q) for (i = 0; i < q->max; i++) kfree(q->pool[i]); kfree(q->pool); - kfree(q->queue); } EXPORT_SYMBOL_GPL(iscsi_pool_free); @@ -2825,7 +2819,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, /* allocate login_task used for the login/text sequences */ spin_lock_bh(&session->lock); - if (!__kfifo_get(session->cmdpool.queue, + if (!__kfifo_get(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*))) { spin_unlock_bh(&session->lock); @@ -2845,7 +2839,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, return cls_conn; login_task_data_alloc_fail: - __kfifo_put(session->cmdpool.queue, (void*)&conn->login_task, + __kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); login_task_alloc_fail: iscsi_destroy_conn(cls_conn); @@ -2908,7 +2902,7 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) free_pages((unsigned long) conn->data, get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); kfree(conn->persistent_address); - __kfifo_put(session->cmdpool.queue, (void*)&conn->login_task, + __kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); if (session->leadconn == conn) session->leadconn = NULL; diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index ca25ee5190b0..a83ee56a185e 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -445,15 +445,15 @@ void iscsi_tcp_cleanup_task(struct iscsi_task *task) return; /* flush task's r2t queues */ - while (__kfifo_get(tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { - __kfifo_put(tcp_task->r2tpool.queue, (void*)&r2t, + while 
(__kfifo_get(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { + __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); ISCSI_DBG_TCP(task->conn, "pending r2t dropped\n"); } r2t = tcp_task->r2t; if (r2t != NULL) { - __kfifo_put(tcp_task->r2tpool.queue, (void*)&r2t, + __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); tcp_task->r2t = NULL; } @@ -541,7 +541,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) return 0; } - rc = __kfifo_get(tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); + rc = __kfifo_get(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); if (!rc) { iscsi_conn_printk(KERN_ERR, conn, "Could not allocate R2T. " "Target has sent more R2Ts than it " @@ -554,7 +554,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) if (r2t->data_length == 0) { iscsi_conn_printk(KERN_ERR, conn, "invalid R2T with zero data len\n"); - __kfifo_put(tcp_task->r2tpool.queue, (void*)&r2t, + __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -570,7 +570,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) "invalid R2T with data len %u at offset %u " "and total length %d\n", r2t->data_length, r2t->data_offset, scsi_out(task->sc)->length); - __kfifo_put(tcp_task->r2tpool.queue, (void*)&r2t, + __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -580,7 +580,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) r2t->sent = 0; tcp_task->exp_datasn = r2tsn + 1; - __kfifo_put(tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); + __kfifo_put(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); conn->r2t_pdus_cnt++; iscsi_requeue_task(task); @@ -951,7 +951,7 @@ int iscsi_tcp_task_init(struct iscsi_task *task) return conn->session->tt->init_pdu(task, 0, task->data_count); } - BUG_ON(__kfifo_len(tcp_task->r2tqueue)); + 
BUG_ON(__kfifo_len(&tcp_task->r2tqueue)); tcp_task->exp_datasn = 0; /* Prepare PDU, optionally w/ immediate data */ @@ -982,7 +982,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) if (r2t->data_length <= r2t->sent) { ISCSI_DBG_TCP(task->conn, " done with r2t %p\n", r2t); - __kfifo_put(tcp_task->r2tpool.queue, + __kfifo_put(&tcp_task->r2tpool.queue, (void *)&tcp_task->r2t, sizeof(void *)); tcp_task->r2t = r2t = NULL; @@ -990,7 +990,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) } if (r2t == NULL) { - __kfifo_get(tcp_task->r2tqueue, + __kfifo_get(&tcp_task->r2tqueue, (void *)&tcp_task->r2t, sizeof(void *)); r2t = tcp_task->r2t; } @@ -1127,9 +1127,8 @@ int iscsi_tcp_r2tpool_alloc(struct iscsi_session *session) } /* R2T xmit queue */ - tcp_task->r2tqueue = kfifo_alloc( - session->max_r2t * 4 * sizeof(void*), GFP_KERNEL, NULL); - if (tcp_task->r2tqueue == ERR_PTR(-ENOMEM)) { + if (kfifo_alloc(&tcp_task->r2tqueue, + session->max_r2t * 4 * sizeof(void*), GFP_KERNEL, NULL)) { iscsi_pool_free(&tcp_task->r2tpool); goto r2t_alloc_fail; } @@ -1142,7 +1141,7 @@ r2t_alloc_fail: struct iscsi_task *task = session->cmds[i]; struct iscsi_tcp_task *tcp_task = task->dd_data; - kfifo_free(tcp_task->r2tqueue); + kfifo_free(&tcp_task->r2tqueue); iscsi_pool_free(&tcp_task->r2tpool); } return -ENOMEM; @@ -1157,7 +1156,7 @@ void iscsi_tcp_r2tpool_free(struct iscsi_session *session) struct iscsi_task *task = session->cmds[i]; struct iscsi_tcp_task *tcp_task = task->dd_data; - kfifo_free(tcp_task->r2tqueue); + kfifo_free(&tcp_task->r2tqueue); iscsi_pool_free(&tcp_task->r2tpool); } } diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index 9ad38e81e343..b1b5e51ca8e3 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -58,19 +58,16 @@ static int srp_iu_pool_alloc(struct srp_queue *q, size_t max, goto free_pool; spin_lock_init(&q->lock); - q->queue = kfifo_init((void *) q->pool, max * sizeof(void *), - 
GFP_KERNEL, &q->lock); - if (IS_ERR(q->queue)) - goto free_item; + kfifo_init(&q->queue, (void *) q->pool, max * sizeof(void *), + &q->lock); for (i = 0, iue = q->items; i < max; i++) { - __kfifo_put(q->queue, (void *) &iue, sizeof(void *)); + __kfifo_put(&q->queue, (void *) &iue, sizeof(void *)); iue->sbuf = ring[i]; iue++; } return 0; -free_item: kfree(q->items); free_pool: kfree(q->pool); @@ -167,7 +164,7 @@ struct iu_entry *srp_iu_get(struct srp_target *target) { struct iu_entry *iue = NULL; - kfifo_get(target->iu_queue.queue, (void *) &iue, sizeof(void *)); + kfifo_get(&target->iu_queue.queue, (void *) &iue, sizeof(void *)); if (!iue) return iue; iue->target = target; @@ -179,7 +176,7 @@ EXPORT_SYMBOL_GPL(srp_iu_get); void srp_iu_put(struct iu_entry *iue) { - kfifo_put(iue->target->iu_queue.queue, (void *) &iue, sizeof(void *)); + kfifo_put(&iue->target->iu_queue.queue, (void *) &iue, sizeof(void *)); } EXPORT_SYMBOL_GPL(srp_iu_put); diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c index 00a29855d0c4..ff43747a614f 100644 --- a/drivers/usb/host/fhci-sched.c +++ b/drivers/usb/host/fhci-sched.c @@ -37,7 +37,7 @@ static void recycle_frame(struct fhci_usb *usb, struct packet *pkt) pkt->info = 0; pkt->priv_data = NULL; - cq_put(usb->ep0->empty_frame_Q, pkt); + cq_put(&usb->ep0->empty_frame_Q, pkt); } /* confirm submitted packet */ @@ -57,7 +57,7 @@ void fhci_transaction_confirm(struct fhci_usb *usb, struct packet *pkt) if ((td->data + td->actual_len) && trans_len) memcpy(td->data + td->actual_len, pkt->data, trans_len); - cq_put(usb->ep0->dummy_packets_Q, pkt->data); + cq_put(&usb->ep0->dummy_packets_Q, pkt->data); } recycle_frame(usb, pkt); @@ -213,7 +213,7 @@ static int add_packet(struct fhci_usb *usb, struct ed *ed, struct td *td) } /* update frame object fields before transmitting */ - pkt = cq_get(usb->ep0->empty_frame_Q); + pkt = cq_get(&usb->ep0->empty_frame_Q); if (!pkt) { fhci_dbg(usb->fhci, "there is no empty frame\n"); return -1; 
@@ -222,7 +222,7 @@ static int add_packet(struct fhci_usb *usb, struct ed *ed, struct td *td) pkt->info = 0; if (data == NULL) { - data = cq_get(usb->ep0->dummy_packets_Q); + data = cq_get(&usb->ep0->dummy_packets_Q); BUG_ON(!data); pkt->info = PKT_DUMMY_PACKET; } @@ -246,7 +246,7 @@ static int add_packet(struct fhci_usb *usb, struct ed *ed, struct td *td) list_del_init(&td->frame_lh); td->status = USB_TD_OK; if (pkt->info & PKT_DUMMY_PACKET) - cq_put(usb->ep0->dummy_packets_Q, pkt->data); + cq_put(&usb->ep0->dummy_packets_Q, pkt->data); recycle_frame(usb, pkt); usb->actual_frame->total_bytes -= (len + PROTOCOL_OVERHEAD); fhci_err(usb->fhci, "host transaction failed\n"); diff --git a/drivers/usb/host/fhci-tds.c b/drivers/usb/host/fhci-tds.c index b40332290319..d224ab467a40 100644 --- a/drivers/usb/host/fhci-tds.c +++ b/drivers/usb/host/fhci-tds.c @@ -106,33 +106,33 @@ void fhci_ep0_free(struct fhci_usb *usb) cpm_muram_free(cpm_muram_offset(ep->td_base)); if (ep->conf_frame_Q) { - size = cq_howmany(ep->conf_frame_Q); + size = cq_howmany(&ep->conf_frame_Q); for (; size; size--) { - struct packet *pkt = cq_get(ep->conf_frame_Q); + struct packet *pkt = cq_get(&ep->conf_frame_Q); kfree(pkt); } - cq_delete(ep->conf_frame_Q); + cq_delete(&ep->conf_frame_Q); } if (ep->empty_frame_Q) { - size = cq_howmany(ep->empty_frame_Q); + size = cq_howmany(&ep->empty_frame_Q); for (; size; size--) { - struct packet *pkt = cq_get(ep->empty_frame_Q); + struct packet *pkt = cq_get(&ep->empty_frame_Q); kfree(pkt); } - cq_delete(ep->empty_frame_Q); + cq_delete(&ep->empty_frame_Q); } if (ep->dummy_packets_Q) { - size = cq_howmany(ep->dummy_packets_Q); + size = cq_howmany(&ep->dummy_packets_Q); for (; size; size--) { - u8 *buff = cq_get(ep->dummy_packets_Q); + u8 *buff = cq_get(&ep->dummy_packets_Q); kfree(buff); } - cq_delete(ep->dummy_packets_Q); + cq_delete(&ep->dummy_packets_Q); } kfree(ep); @@ -175,10 +175,9 @@ u32 fhci_create_ep(struct fhci_usb *usb, enum fhci_mem_alloc data_mem, 
ep->td_base = cpm_muram_addr(ep_offset); /* zero all queue pointers */ - ep->conf_frame_Q = cq_new(ring_len + 2); - ep->empty_frame_Q = cq_new(ring_len + 2); - ep->dummy_packets_Q = cq_new(ring_len + 2); - if (!ep->conf_frame_Q || !ep->empty_frame_Q || !ep->dummy_packets_Q) { + if (cq_new(&ep->conf_frame_Q, ring_len + 2) || + cq_new(&ep->empty_frame_Q, ring_len + 2) || + cq_new(&ep->dummy_packets_Q, ring_len + 2)) { err_for = "frame_queues"; goto err; } @@ -199,8 +198,8 @@ u32 fhci_create_ep(struct fhci_usb *usb, enum fhci_mem_alloc data_mem, err_for = "buffer"; goto err; } - cq_put(ep->empty_frame_Q, pkt); - cq_put(ep->dummy_packets_Q, buff); + cq_put(&ep->empty_frame_Q, pkt); + cq_put(&ep->dummy_packets_Q, buff); } /* we put the endpoint parameter RAM right behind the TD ring */ @@ -319,7 +318,7 @@ static void fhci_td_transaction_confirm(struct fhci_usb *usb) if ((buf == DUMMY2_BD_BUFFER) && !(td_status & ~TD_W)) continue; - pkt = cq_get(ep->conf_frame_Q); + pkt = cq_get(&ep->conf_frame_Q); if (!pkt) fhci_err(usb->fhci, "no frame to confirm\n"); @@ -460,9 +459,9 @@ u32 fhci_host_transaction(struct fhci_usb *usb, out_be16(&td->length, pkt->len); /* put the frame to the confirmation queue */ - cq_put(ep->conf_frame_Q, pkt); + cq_put(&ep->conf_frame_Q, pkt); - if (cq_howmany(ep->conf_frame_Q) == 1) + if (cq_howmany(&ep->conf_frame_Q) == 1) out_8(&usb->fhci->regs->usb_comm, USB_CMD_STR_FIFO); return 0; diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h index 7116284ed21a..2277428ef5d3 100644 --- a/drivers/usb/host/fhci.h +++ b/drivers/usb/host/fhci.h @@ -423,9 +423,9 @@ struct endpoint { struct usb_td __iomem *td_base; /* first TD in the ring */ struct usb_td __iomem *conf_td; /* next TD for confirm after transac */ struct usb_td __iomem *empty_td;/* next TD for new transaction req. 
*/ - struct kfifo *empty_frame_Q; /* Empty frames list to use */ - struct kfifo *conf_frame_Q; /* frames passed to TDs,waiting for tx */ - struct kfifo *dummy_packets_Q;/* dummy packets for the CRC overun */ + struct kfifo empty_frame_Q; /* Empty frames list to use */ + struct kfifo conf_frame_Q; /* frames passed to TDs,waiting for tx */ + struct kfifo dummy_packets_Q;/* dummy packets for the CRC overun */ bool already_pushed_dummy_bd; }; @@ -493,9 +493,9 @@ static inline struct usb_hcd *fhci_to_hcd(struct fhci_hcd *fhci) } /* fifo of pointers */ -static inline struct kfifo *cq_new(int size) +static inline int cq_new(struct kfifo *fifo, int size) { - return kfifo_alloc(size * sizeof(void *), GFP_KERNEL, NULL); + return kfifo_alloc(fifo, size * sizeof(void *), GFP_KERNEL, NULL); } static inline void cq_delete(struct kfifo *kfifo) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 4543f359be75..44b72d47fac2 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -939,9 +939,8 @@ int usb_serial_probe(struct usb_interface *interface, dev_err(&interface->dev, "No free urbs available\n"); goto probe_error; } - port->write_fifo = kfifo_alloc(PAGE_SIZE, GFP_KERNEL, - &port->lock); - if (IS_ERR(port->write_fifo)) + if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL, + &port->lock)) goto probe_error; buffer_size = le16_to_cpu(endpoint->wMaxPacketSize); port->bulk_out_size = buffer_size; diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index ad6bdf5a5970..c3f8d82efd34 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -1,6 +1,7 @@ /* - * A simple kernel FIFO implementation. + * A generic kernel FIFO implementation. 
* + * Copyright (C) 2009 Stefani Seibold * Copyright (C) 2004 Stelian Pop * * This program is free software; you can redistribute it and/or modify @@ -32,10 +33,10 @@ struct kfifo { spinlock_t *lock; /* protects concurrent modifications */ }; -extern struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock); -extern struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, - spinlock_t *lock); +extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, + unsigned int size, spinlock_t *lock); +extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, + gfp_t gfp_mask, spinlock_t *lock); extern void kfifo_free(struct kfifo *fifo); extern unsigned int __kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len); diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 7394e3bc8f4b..ff92b46f5153 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -231,7 +232,7 @@ struct iscsi_conn { }; struct iscsi_pool { - struct kfifo *queue; /* FIFO Queue */ + struct kfifo queue; /* FIFO Queue */ void **pool; /* Pool of elements */ int max; /* Max number of elements */ }; diff --git a/include/scsi/libiscsi_tcp.h b/include/scsi/libiscsi_tcp.h index 9e3182e659db..741ae7ed4394 100644 --- a/include/scsi/libiscsi_tcp.h +++ b/include/scsi/libiscsi_tcp.h @@ -80,7 +80,7 @@ struct iscsi_tcp_task { int data_offset; struct iscsi_r2t_info *r2t; /* in progress solict R2T */ struct iscsi_pool r2tpool; - struct kfifo *r2tqueue; + struct kfifo r2tqueue; void *dd_data; }; diff --git a/include/scsi/libsrp.h b/include/scsi/libsrp.h index ba615e4c1d7c..07e3adde21d9 100644 --- a/include/scsi/libsrp.h +++ b/include/scsi/libsrp.h @@ -21,7 +21,7 @@ struct srp_buf { struct srp_queue { void *pool; void *items; - struct kfifo *queue; + struct kfifo queue; spinlock_t lock; }; diff --git a/kernel/kfifo.c 
b/kernel/kfifo.c index 3765ff3c1bbe..8da6bb9782bb 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,6 +1,7 @@ /* - * A simple kernel FIFO implementation. + * A generic kernel FIFO implementation. * + * Copyright (C) 2009 Stefani Seibold * Copyright (C) 2004 Stelian Pop * * This program is free software; you can redistribute it and/or modify @@ -26,49 +27,51 @@ #include #include +static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, + unsigned int size, spinlock_t *lock) +{ + fifo->buffer = buffer; + fifo->size = size; + fifo->lock = lock; + + kfifo_reset(fifo); +} + /** - * kfifo_init - allocates a new FIFO using a preallocated buffer + * kfifo_init - initialize a FIFO using a preallocated buffer + * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * - * Do NOT pass the kfifo to kfifo_free() after use! Simply free the - * &struct kfifo with kfree(). */ -struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, + spinlock_t *lock) { - struct kfifo *fifo; - /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - fifo = kmalloc(sizeof(struct kfifo), gfp_mask); - if (!fifo) - return ERR_PTR(-ENOMEM); - - fifo->buffer = buffer; - fifo->size = size; - fifo->in = fifo->out = 0; - fifo->lock = lock; - - return fifo; + _kfifo_init(fifo, buffer, size, lock); } EXPORT_SYMBOL(kfifo_init); /** - * kfifo_alloc - allocates a new FIFO and its internal buffer - * @size: the size of the internal buffer to be allocated. + * kfifo_alloc - allocates a new FIFO internal buffer + * @fifo: the fifo to assign then new buffer + * @size: the size of the buffer to be allocated, this have to be a power of 2. 
* @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * + * This function dynamically allocates a new fifo internal buffer + * * The size will be rounded-up to a power of 2. + * The buffer will be release with kfifo_free(). + * Return 0 if no error, otherwise the an error code */ -struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, + spinlock_t *lock) { unsigned char *buffer; - struct kfifo *ret; /* * round up to the next power of 2, since our 'let the indices @@ -80,26 +83,24 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) } buffer = kmalloc(size, gfp_mask); - if (!buffer) - return ERR_PTR(-ENOMEM); - - ret = kfifo_init(buffer, size, gfp_mask, lock); + if (!buffer) { + _kfifo_init(fifo, 0, 0, NULL); + return -ENOMEM; + } - if (IS_ERR(ret)) - kfree(buffer); + _kfifo_init(fifo, buffer, size, lock); - return ret; + return 0; } EXPORT_SYMBOL(kfifo_alloc); /** - * kfifo_free - frees the FIFO + * kfifo_free - frees the FIFO internal buffer * @fifo: the fifo to be freed. */ void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); - kfree(fifo); } EXPORT_SYMBOL(kfifo_free); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index dc328425fa20..6230ceb0823e 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -43,7 +43,7 @@ static int bufsize = 64 * 1024; static const char procname[] = "dccpprobe"; static struct { - struct kfifo *fifo; + struct kfifo fifo; spinlock_t lock; wait_queue_head_t wait; struct timespec tstart; @@ -67,7 +67,7 @@ static void printl(const char *fmt, ...) 
len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); - kfifo_put(dccpw.fifo, tbuf, len); + kfifo_put(&dccpw.fifo, tbuf, len); wake_up(&dccpw.wait); } @@ -109,7 +109,7 @@ static struct jprobe dccp_send_probe = { static int dccpprobe_open(struct inode *inode, struct file *file) { - kfifo_reset(dccpw.fifo); + kfifo_reset(&dccpw.fifo); getnstimeofday(&dccpw.tstart); return 0; } @@ -131,11 +131,11 @@ static ssize_t dccpprobe_read(struct file *file, char __user *buf, return -ENOMEM; error = wait_event_interruptible(dccpw.wait, - __kfifo_len(dccpw.fifo) != 0); + __kfifo_len(&dccpw.fifo) != 0); if (error) goto out_free; - cnt = kfifo_get(dccpw.fifo, tbuf, len); + cnt = kfifo_get(&dccpw.fifo, tbuf, len); error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; out_free: @@ -156,10 +156,8 @@ static __init int dccpprobe_init(void) init_waitqueue_head(&dccpw.wait); spin_lock_init(&dccpw.lock); - dccpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &dccpw.lock); - if (IS_ERR(dccpw.fifo)) - return PTR_ERR(dccpw.fifo); - + if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL, &dccpw.lock)) + return ret; if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops)) goto err0; @@ -172,14 +170,14 @@ static __init int dccpprobe_init(void) err1: proc_net_remove(&init_net, procname); err0: - kfifo_free(dccpw.fifo); + kfifo_free(&dccpw.fifo); return ret; } module_init(dccpprobe_init); static __exit void dccpprobe_exit(void) { - kfifo_free(dccpw.fifo); + kfifo_free(&dccpw.fifo); proc_net_remove(&init_net, procname); unregister_jprobe(&dccp_send_probe); -- cgit v1.2.3 From c1e13f25674ed564948ecb7dfe5f83e578892896 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:27 -0800 Subject: kfifo: move out spinlock Move the pointer to the spinlock out of struct kfifo. Most users in tree do not actually use a spinlock, so the few exceptions now have to call kfifo_{get,put}_locked, which takes an extra argument to a spinlock. 
Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/nozomi.c | 2 +- drivers/char/sonypi.c | 21 ++++---- drivers/infiniband/hw/cxgb3/cxio_resource.c | 36 +++++++------ drivers/media/video/meye.c | 35 +++++++------ drivers/net/wireless/libertas/main.c | 2 +- drivers/platform/x86/fujitsu-laptop.c | 18 ++++--- drivers/platform/x86/sony-laptop.c | 22 ++++---- drivers/scsi/libiscsi.c | 2 +- drivers/scsi/libiscsi_tcp.c | 2 +- drivers/scsi/libsrp.c | 9 ++-- drivers/usb/host/fhci.h | 2 +- drivers/usb/serial/generic.c | 4 +- drivers/usb/serial/usb-serial.c | 3 +- include/linux/kfifo.h | 80 +++++++++++++---------------- kernel/kfifo.c | 17 +++--- net/dccp/probe.c | 6 +-- 16 files changed, 131 insertions(+), 130 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index 0f39bec28b45..935b30d80adf 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -686,7 +686,7 @@ static int nozomi_read_config_table(struct nozomi *dc) for (i = PORT_MDM; i < MAX_PORT; i++) { kfifo_alloc(&dc->port[i].fifo_ul, - FIFO_BUFFER_SIZE_UL, GFP_ATOMIC, NULL); + FIFO_BUFFER_SIZE_UL, GFP_ATOMIC); memset(&dc->port[i].ctrl_dl, 0, sizeof(struct ctrl_dl)); memset(&dc->port[i].ctrl_ul, 0, sizeof(struct ctrl_ul)); } diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 9e6efb1f029f..dbcb3bd192c7 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -777,8 +777,9 @@ static void input_keyrelease(struct work_struct *work) { struct sonypi_keypress kp; - while (kfifo_get(&sonypi_device.input_fifo, (unsigned char *)&kp, - sizeof(kp)) == sizeof(kp)) { + while (kfifo_get_locked(&sonypi_device.input_fifo, (unsigned char *)&kp, + sizeof(kp), &sonypi_device.input_fifo_lock) + == sizeof(kp)) { msleep(10); input_report_key(kp.dev, kp.key, 0); input_sync(kp.dev); @@ -827,8 +828,9 @@ 
static void sonypi_report_input_event(u8 event) if (kp.dev) { input_report_key(kp.dev, kp.key, 1); input_sync(kp.dev); - kfifo_put(&sonypi_device.input_fifo, - (unsigned char *)&kp, sizeof(kp)); + kfifo_put_locked(&sonypi_device.input_fifo, + (unsigned char *)&kp, sizeof(kp), + &sonypi_device.input_fifo_lock); schedule_work(&sonypi_device.input_work); } } @@ -880,7 +882,8 @@ found: acpi_bus_generate_proc_event(sonypi_acpi_device, 1, event); #endif - kfifo_put(&sonypi_device.fifo, (unsigned char *)&event, sizeof(event)); + kfifo_put_locked(&sonypi_device.fifo, (unsigned char *)&event, + sizeof(event), &sonypi_device.fifo_lock); kill_fasync(&sonypi_device.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_device.fifo_proc_list); @@ -929,7 +932,8 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, return ret; while (ret < count && - (kfifo_get(&sonypi_device.fifo, &c, sizeof(c)) == sizeof(c))) { + (kfifo_get_locked(&sonypi_device.fifo, &c, sizeof(c), + &sonypi_device.fifo_lock) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; ret++; @@ -1313,8 +1317,7 @@ static int __devinit sonypi_probe(struct platform_device *dev) "http://www.linux.it/~malattia/wiki/index.php/Sony_drivers\n"); spin_lock_init(&sonypi_device.fifo_lock); - error = kfifo_alloc(&sonypi_device.fifo, SONYPI_BUF_SIZE, GFP_KERNEL, - &sonypi_device.fifo_lock); + error = kfifo_alloc(&sonypi_device.fifo, SONYPI_BUF_SIZE, GFP_KERNEL); if (error) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); return error; @@ -1394,7 +1397,7 @@ static int __devinit sonypi_probe(struct platform_device *dev) spin_lock_init(&sonypi_device.input_fifo_lock); error = kfifo_alloc(&sonypi_device.input_fifo, SONYPI_BUF_SIZE, - GFP_KERNEL, &sonypi_device.input_fifo_lock); + GFP_KERNEL); if (error) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); goto err_inpdev_unregister; diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 
65072bdfc1bf..98f24e6d906e 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -55,7 +55,7 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, u32 rarray[16]; spin_lock_init(fifo_lock); - if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL, fifo_lock)) + if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL)) return -ENOMEM; for (i = 0; i < skip_low + skip_high; i++) @@ -86,7 +86,8 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, __kfifo_put(fifo, (unsigned char *) &i, sizeof(u32)); for (i = 0; i < skip_low + skip_high; i++) - kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32)); + kfifo_get_locked(fifo, (unsigned char *) &entry, + sizeof(u32), fifo_lock); return 0; } @@ -113,8 +114,7 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32), - GFP_KERNEL, - &rdev_p->rscp->qpid_fifo_lock)) + GFP_KERNEL)) return -ENOMEM; for (i = 16; i < T3_MAX_NUM_QP; i++) @@ -177,33 +177,37 @@ tpt_err: /* * returns 0 if no resource available */ -static u32 cxio_hal_get_resource(struct kfifo *fifo) +static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock) { u32 entry; - if (kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32))) + if (kfifo_get_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) return entry; else return 0; /* fifo emptry */ } -static void cxio_hal_put_resource(struct kfifo *fifo, u32 entry) +static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock, + u32 entry) { - BUG_ON(kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)) == 0); + BUG_ON( + kfifo_put_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) + == 0); } u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(&rscp->tpt_fifo); + return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock); } void cxio_hal_put_stag(struct 
cxio_hal_resource *rscp, u32 stag) { - cxio_hal_put_resource(&rscp->tpt_fifo, stag); + cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag); } u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) { - u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo); + u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo, + &rscp->qpid_fifo_lock); PDBG("%s qpid 0x%x\n", __func__, qpid); return qpid; } @@ -211,27 +215,27 @@ u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) { PDBG("%s qpid 0x%x\n", __func__, qpid); - cxio_hal_put_resource(&rscp->qpid_fifo, qpid); + cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid); } u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(&rscp->cqid_fifo); + return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock); } void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) { - cxio_hal_put_resource(&rscp->cqid_fifo, cqid); + cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid); } u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) { - return cxio_hal_get_resource(&rscp->pdid_fifo); + return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock); } void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) { - cxio_hal_put_resource(&rscp->pdid_fifo, pdid); + cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid); } void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index dacbbb839b9e..38bcedfd9fec 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -800,8 +800,8 @@ again: return IRQ_HANDLED; if (meye.mchip_mode == MCHIP_HIC_MODE_CONT_OUT) { - if (kfifo_get(&meye.grabq, (unsigned char *)&reqnr, - sizeof(int)) != sizeof(int)) { + if (kfifo_get_locked(&meye.grabq, (unsigned char *)&reqnr, + sizeof(int), &meye.grabq_lock) != sizeof(int)) { 
mchip_free_frame(); return IRQ_HANDLED; } @@ -811,7 +811,8 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put(&meye.doneq, (unsigned char *)&reqnr, sizeof(int)); + kfifo_put_locked(&meye.doneq, (unsigned char *)&reqnr, + sizeof(int), &meye.doneq_lock); wake_up_interruptible(&meye.proc_list); } else { int size; @@ -820,8 +821,8 @@ again: mchip_free_frame(); goto again; } - if (kfifo_get(&meye.grabq, (unsigned char *)&reqnr, - sizeof(int)) != sizeof(int)) { + if (kfifo_get_locked(&meye.grabq, (unsigned char *)&reqnr, + sizeof(int), &meye.grabq_lock) != sizeof(int)) { mchip_free_frame(); goto again; } @@ -831,7 +832,8 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put(&meye.doneq, (unsigned char *)&reqnr, sizeof(int)); + kfifo_put_locked(&meye.doneq, (unsigned char *)&reqnr, + sizeof(int), &meye.doneq_lock); wake_up_interruptible(&meye.proc_list); } mchip_free_frame(); @@ -933,7 +935,8 @@ static int meyeioc_qbuf_capt(int *nb) mchip_cont_compression_start(); meye.grab_buffer[*nb].state = MEYE_BUF_USING; - kfifo_put(&meye.grabq, (unsigned char *)nb, sizeof(int)); + kfifo_put_locked(&meye.grabq, (unsigned char *)nb, sizeof(int), + &meye.grabq_lock); mutex_unlock(&meye.lock); return 0; @@ -965,7 +968,8 @@ static int meyeioc_sync(struct file *file, void *fh, int *i) /* fall through */ case MEYE_BUF_DONE: meye.grab_buffer[*i].state = MEYE_BUF_UNUSED; - kfifo_get(&meye.doneq, (unsigned char *)&unused, sizeof(int)); + kfifo_get_locked(&meye.doneq, (unsigned char *)&unused, + sizeof(int), &meye.doneq_lock); } *i = meye.grab_buffer[*i].size; mutex_unlock(&meye.lock); @@ -1452,7 +1456,8 @@ static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) buf->flags |= V4L2_BUF_FLAG_QUEUED; buf->flags &= ~V4L2_BUF_FLAG_DONE; 
meye.grab_buffer[buf->index].state = MEYE_BUF_USING; - kfifo_put(&meye.grabq, (unsigned char *)&buf->index, sizeof(int)); + kfifo_put_locked(&meye.grabq, (unsigned char *)&buf->index, + sizeof(int), &meye.grabq_lock); mutex_unlock(&meye.lock); return 0; @@ -1478,8 +1483,8 @@ static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) return -EINTR; } - if (!kfifo_get(&meye.doneq, (unsigned char *)&reqnr, - sizeof(int))) { + if (!kfifo_get_locked(&meye.doneq, (unsigned char *)&reqnr, + sizeof(int), &meye.doneq_lock)) { mutex_unlock(&meye.lock); return -EBUSY; } @@ -1745,14 +1750,14 @@ static int __devinit meye_probe(struct pci_dev *pcidev, } spin_lock_init(&meye.grabq_lock); - if (kfifo_alloc(&meye.grabq, sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, - &meye.grabq_lock)) { + if (kfifo_alloc(&meye.grabq, sizeof(int) * MEYE_MAX_BUFNBRS, + GFP_KERNEL)) { printk(KERN_ERR "meye: fifo allocation failed\n"); goto outkfifoalloc1; } spin_lock_init(&meye.doneq_lock); - if (kfifo_alloc(&meye.doneq, sizeof(int) * MEYE_MAX_BUFNBRS, GFP_KERNEL, - &meye.doneq_lock)) { + if (kfifo_alloc(&meye.doneq, sizeof(int) * MEYE_MAX_BUFNBRS, + GFP_KERNEL)) { printk(KERN_ERR "meye: fifo allocation failed\n"); goto outkfifoalloc2; } diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index 403909287414..2cc7ecd8d123 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -883,7 +883,7 @@ static int lbs_init_adapter(struct lbs_private *priv) priv->resp_len[0] = priv->resp_len[1] = 0; /* Create the event FIFO */ - ret = kfifo_alloc(&priv->event_fifo, sizeof(u32) * 16, GFP_KERNEL, NULL); + ret = kfifo_alloc(&priv->event_fifo, sizeof(u32) * 16, GFP_KERNEL); if (ret) { lbs_pr_err("Out of memory allocating event FIFO buffer\n"); goto out; diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index f999fba0e25e..13dc7bedcfce 100644 --- 
a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -825,7 +825,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) /* kfifo */ spin_lock_init(&fujitsu_hotkey->fifo_lock); error = kfifo_alloc(&fujitsu_hotkey->fifo, RINGBUFFERSIZE * sizeof(int), - GFP_KERNEL, &fujitsu_hotkey->fifo_lock); + GFP_KERNEL); if (error) { printk(KERN_ERR "kfifo_alloc failed\n"); goto err_stop; @@ -1006,9 +1006,10 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) vdbg_printk(FUJLAPTOP_DBG_TRACE, "Push keycode into ringbuffer [%d]\n", keycode); - status = kfifo_put(&fujitsu_hotkey->fifo, + status = kfifo_put_locked(&fujitsu_hotkey->fifo, (unsigned char *)&keycode, - sizeof(keycode)); + sizeof(keycode), + &fujitsu_hotkey->fifo_lock); if (status != sizeof(keycode)) { vdbg_printk(FUJLAPTOP_DBG_WARN, "Could not push keycode [0x%x]\n", @@ -1019,11 +1020,12 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) } } else if (keycode == 0) { while ((status = - kfifo_get - (&fujitsu_hotkey->fifo, (unsigned char *) - &keycode_r, - sizeof - (keycode_r))) == sizeof(keycode_r)) { + kfifo_get_locked( + &fujitsu_hotkey->fifo, + (unsigned char *) &keycode_r, + sizeof(keycode_r), + &fujitsu_hotkey->fifo_lock)) + == sizeof(keycode_r)) { input_report_key(input, keycode_r, 0); input_sync(input); vdbg_printk(FUJLAPTOP_DBG_TRACE, diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 04625a048e74..1538a0a3c0af 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -300,8 +300,9 @@ static void do_sony_laptop_release_key(struct work_struct *work) { struct sony_laptop_keypress kp; - while (kfifo_get(&sony_laptop_input.fifo, (unsigned char *)&kp, - sizeof(kp)) == sizeof(kp)) { + while (kfifo_get_locked(&sony_laptop_input.fifo, (unsigned char *)&kp, + sizeof(kp), &sony_laptop_input.fifo_lock) + == sizeof(kp)) { msleep(10); 
input_report_key(kp.dev, kp.key, 0); input_sync(kp.dev); @@ -362,8 +363,9 @@ static void sony_laptop_report_input_event(u8 event) /* we emit the scancode so we can always remap the key */ input_event(kp.dev, EV_MSC, MSC_SCAN, event); input_sync(kp.dev); - kfifo_put(&sony_laptop_input.fifo, - (unsigned char *)&kp, sizeof(kp)); + kfifo_put_locked(&sony_laptop_input.fifo, + (unsigned char *)&kp, sizeof(kp), + &sony_laptop_input.fifo_lock); if (!work_pending(&sony_laptop_release_key_work)) queue_work(sony_laptop_input.wq, @@ -386,8 +388,7 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device) /* kfifo */ spin_lock_init(&sony_laptop_input.fifo_lock); error = - kfifo_alloc(&sony_laptop_input.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, - &sony_laptop_input.fifo_lock); + kfifo_alloc(&sony_laptop_input.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL); if (error) { printk(KERN_ERR DRV_PFX "kfifo_alloc failed\n"); goto err_dec_users; @@ -2129,7 +2130,8 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, return ret; while (ret < count && - (kfifo_get(&sonypi_compat.fifo, &c, sizeof(c)) == sizeof(c))) { + (kfifo_get_locked(&sonypi_compat.fifo, &c, sizeof(c), + &sonypi_compat.fifo_lock) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; ret++; @@ -2308,7 +2310,8 @@ static struct miscdevice sonypi_misc_device = { static void sonypi_compat_report_event(u8 event) { - kfifo_put(&sonypi_compat.fifo, (unsigned char *)&event, sizeof(event)); + kfifo_put_locked(&sonypi_compat.fifo, (unsigned char *)&event, + sizeof(event), &sonypi_compat.fifo_lock); kill_fasync(&sonypi_compat.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_compat.fifo_proc_list); } @@ -2319,8 +2322,7 @@ static int sonypi_compat_init(void) spin_lock_init(&sonypi_compat.fifo_lock); error = - kfifo_alloc(&sonypi_compat.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL, - &sonypi_compat.fifo_lock); + kfifo_alloc(&sonypi_compat.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL); if (error) { printk(KERN_ERR 
DRV_PFX "kfifo_alloc failed\n"); return error; diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index cf0aa7e90be9..1bccbc1e588e 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2461,7 +2461,7 @@ iscsi_pool_init(struct iscsi_pool *q, int max, void ***items, int item_size) if (q->pool == NULL) return -ENOMEM; - kfifo_init(&q->queue, (void*)q->pool, max * sizeof(void*), NULL); + kfifo_init(&q->queue, (void*)q->pool, max * sizeof(void*)); for (i = 0; i < max; i++) { q->pool[i] = kzalloc(item_size, GFP_KERNEL); diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index a83ee56a185e..41643c860d26 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -1128,7 +1128,7 @@ int iscsi_tcp_r2tpool_alloc(struct iscsi_session *session) /* R2T xmit queue */ if (kfifo_alloc(&tcp_task->r2tqueue, - session->max_r2t * 4 * sizeof(void*), GFP_KERNEL, NULL)) { + session->max_r2t * 4 * sizeof(void*), GFP_KERNEL)) { iscsi_pool_free(&tcp_task->r2tpool); goto r2t_alloc_fail; } diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index b1b5e51ca8e3..db1b41c55fd3 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -58,8 +58,7 @@ static int srp_iu_pool_alloc(struct srp_queue *q, size_t max, goto free_pool; spin_lock_init(&q->lock); - kfifo_init(&q->queue, (void *) q->pool, max * sizeof(void *), - &q->lock); + kfifo_init(&q->queue, (void *) q->pool, max * sizeof(void *)); for (i = 0, iue = q->items; i < max; i++) { __kfifo_put(&q->queue, (void *) &iue, sizeof(void *)); @@ -164,7 +163,8 @@ struct iu_entry *srp_iu_get(struct srp_target *target) { struct iu_entry *iue = NULL; - kfifo_get(&target->iu_queue.queue, (void *) &iue, sizeof(void *)); + kfifo_get_locked(&target->iu_queue.queue, (void *) &iue, + sizeof(void *), &target->iu_queue.lock); if (!iue) return iue; iue->target = target; @@ -176,7 +176,8 @@ EXPORT_SYMBOL_GPL(srp_iu_get); void srp_iu_put(struct iu_entry *iue) { - 
kfifo_put(&iue->target->iu_queue.queue, (void *) &iue, sizeof(void *)); + kfifo_put_locked(&iue->target->iu_queue.queue, (void *) &iue, + sizeof(void *), &iue->target->iu_queue.lock); } EXPORT_SYMBOL_GPL(srp_iu_put); diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h index 2277428ef5d3..a76da201183b 100644 --- a/drivers/usb/host/fhci.h +++ b/drivers/usb/host/fhci.h @@ -495,7 +495,7 @@ static inline struct usb_hcd *fhci_to_hcd(struct fhci_hcd *fhci) /* fifo of pointers */ static inline int cq_new(struct kfifo *fifo, int size) { - return kfifo_alloc(fifo, size * sizeof(void *), GFP_KERNEL, NULL); + return kfifo_alloc(fifo, size * sizeof(void *), GFP_KERNEL); } static inline void cq_delete(struct kfifo *kfifo) diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index bbe005cefcfb..61eef18218be 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -285,7 +285,7 @@ static int usb_serial_generic_write_start(struct usb_serial_port *port) return 0; data = port->write_urb->transfer_buffer; - count = kfifo_get(port->write_fifo, data, port->bulk_out_size); + count = kfifo_get_locked(port->write_fifo, data, port->bulk_out_size, &port->lock); usb_serial_debug_data(debug, &port->dev, __func__, count, data); /* set up our urb */ @@ -345,7 +345,7 @@ int usb_serial_generic_write(struct tty_struct *tty, return usb_serial_multi_urb_write(tty, port, buf, count); - count = kfifo_put(port->write_fifo, buf, count); + count = kfifo_put_locked(port->write_fifo, buf, count, &port->lock); result = usb_serial_generic_write_start(port); if (result >= 0) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 44b72d47fac2..636a4f23445e 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -939,8 +939,7 @@ int usb_serial_probe(struct usb_interface *interface, dev_err(&interface->dev, "No free urbs available\n"); goto probe_error; } - if (kfifo_alloc(&port->write_fifo, 
PAGE_SIZE, GFP_KERNEL, - &port->lock)) + if (kfifo_alloc(port->write_fifo, PAGE_SIZE, GFP_KERNEL)) goto probe_error; buffer_size = le16_to_cpu(endpoint->wMaxPacketSize); port->bulk_out_size = buffer_size; diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index c3f8d82efd34..e0f5c9d4197d 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -30,13 +30,12 @@ struct kfifo { unsigned int size; /* the size of the allocated buffer */ unsigned int in; /* data is added at offset (in % size) */ unsigned int out; /* data is extracted from off. (out % size) */ - spinlock_t *lock; /* protects concurrent modifications */ }; extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, - unsigned int size, spinlock_t *lock); + unsigned int size); extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock); + gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); extern unsigned int __kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len); @@ -58,58 +57,67 @@ static inline void __kfifo_reset(struct kfifo *fifo) */ static inline void kfifo_reset(struct kfifo *fifo) { - unsigned long flags; - - spin_lock_irqsave(fifo->lock, flags); - __kfifo_reset(fifo); +} + +/** + * __kfifo_len - returns the number of bytes available in the FIFO + * @fifo: the fifo to be used. + */ +static inline unsigned int __kfifo_len(struct kfifo *fifo) +{ + register unsigned int out; - spin_unlock_irqrestore(fifo->lock, flags); + out = fifo->out; + smp_rmb(); + return fifo->in - out; } /** - * kfifo_put - puts some data into the FIFO + * kfifo_put_locked - puts some data into the FIFO using a spinlock for locking * @fifo: the fifo to be used. - * @buffer: the data to be added. - * @len: the length of the data to be added. + * @from: the data to be added. + * @n: the length of the data to be added. + * @lock: pointer to the spinlock to use for locking. 
* - * This function copies at most @len bytes from the @buffer into + * This function copies at most @len bytes from the @from buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. */ -static inline unsigned int kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len) +static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, + const unsigned char *from, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; - spin_lock_irqsave(fifo->lock, flags); + spin_lock_irqsave(lock, flags); - ret = __kfifo_put(fifo, buffer, len); + ret = __kfifo_put(fifo, from, n); - spin_unlock_irqrestore(fifo->lock, flags); + spin_unlock_irqrestore(lock, flags); return ret; } /** - * kfifo_get - gets some data from the FIFO + * kfifo_get_locked - gets some data from the FIFO using a spinlock for locking * @fifo: the fifo to be used. - * @buffer: where the data must be copied. - * @len: the size of the destination buffer. + * @to: where the data must be copied. + * @n: the size of the destination buffer. + * @lock: pointer to the spinlock to use for locking. * * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. + * @to buffer and returns the number of copied bytes. 
*/ -static inline unsigned int kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) +static inline __must_check unsigned int kfifo_get_locked(struct kfifo *fifo, + unsigned char *to, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; - spin_lock_irqsave(fifo->lock, flags); + spin_lock_irqsave(lock, flags); - ret = __kfifo_get(fifo, buffer, len); + ret = __kfifo_get(fifo, to, n); /* * optimization: if the FIFO is empty, set the indices to 0 @@ -118,36 +126,18 @@ static inline unsigned int kfifo_get(struct kfifo *fifo, if (fifo->in == fifo->out) fifo->in = fifo->out = 0; - spin_unlock_irqrestore(fifo->lock, flags); + spin_unlock_irqrestore(lock, flags); return ret; } -/** - * __kfifo_len - returns the number of bytes available in the FIFO, no locking version - * @fifo: the fifo to be used. - */ -static inline unsigned int __kfifo_len(struct kfifo *fifo) -{ - return fifo->in - fifo->out; -} - /** * kfifo_len - returns the number of bytes available in the FIFO * @fifo: the fifo to be used. */ static inline unsigned int kfifo_len(struct kfifo *fifo) { - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(fifo->lock, flags); - - ret = __kfifo_len(fifo); - - spin_unlock_irqrestore(fifo->lock, flags); - - return ret; + return __kfifo_len(fifo); } #endif diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 8da6bb9782bb..4950bdbe3477 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,11 +28,10 @@ #include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, - unsigned int size, spinlock_t *lock) + unsigned int size) { fifo->buffer = buffer; fifo->size = size; - fifo->lock = lock; kfifo_reset(fifo); } @@ -42,16 +41,14 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. 
- * @lock: the lock to be used to protect the fifo buffer * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, - spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); } EXPORT_SYMBOL(kfifo_init); @@ -60,7 +57,6 @@ EXPORT_SYMBOL(kfifo_init); * @fifo: the fifo to assign then new buffer * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer * * This function dynamically allocates a new fifo internal buffer * @@ -68,8 +64,7 @@ EXPORT_SYMBOL(kfifo_init); * The buffer will be release with kfifo_free(). * Return 0 if no error, otherwise the an error code */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, - spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) { unsigned char *buffer; @@ -84,11 +79,11 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0, NULL); + _kfifo_init(fifo, 0, 0); return -ENOMEM; } - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); return 0; } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 6230ceb0823e..c6b50351aa78 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -67,7 +67,7 @@ static void printl(const char *fmt, ...) 
len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); - kfifo_put(&dccpw.fifo, tbuf, len); + kfifo_put_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); wake_up(&dccpw.wait); } @@ -135,7 +135,7 @@ static ssize_t dccpprobe_read(struct file *file, char __user *buf, if (error) goto out_free; - cnt = kfifo_get(&dccpw.fifo, tbuf, len); + cnt = kfifo_get_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; out_free: @@ -156,7 +156,7 @@ static __init int dccpprobe_init(void) init_waitqueue_head(&dccpw.wait); spin_lock_init(&dccpw.lock); - if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL, &dccpw.lock)) + if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL)) return ret; if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops)) goto err0; -- cgit v1.2.3 From e64c026dd09b73faf20707711402fc5ed55a8e70 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: cleanup namespace change name of __kfifo_* functions to kfifo_*, because the prefix __kfifo should be reserved for internal functions only. 
Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/nozomi.c | 12 +++++------ drivers/infiniband/hw/cxgb3/cxio_resource.c | 10 ++++----- drivers/net/wireless/libertas/cmd.c | 4 ++-- drivers/net/wireless/libertas/main.c | 8 ++++---- drivers/platform/x86/sony-laptop.c | 2 +- drivers/scsi/libiscsi.c | 14 ++++++------- drivers/scsi/libiscsi_tcp.c | 20 +++++++++--------- drivers/scsi/libsrp.c | 2 +- drivers/usb/host/fhci.h | 6 +++--- drivers/usb/serial/generic.c | 4 ++-- include/linux/kfifo.h | 32 +++++++---------------------- kernel/kfifo.c | 12 +++++------ net/dccp/probe.c | 2 +- 13 files changed, 55 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index 935b30d80adf..61f5bfe74f38 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -798,7 +798,7 @@ static int send_data(enum port_type index, struct nozomi *dc) struct tty_struct *tty = tty_port_tty_get(&port->port); /* Get data from tty and place in buf for now */ - size = __kfifo_get(&port->fifo_ul, dc->send_buf, + size = kfifo_get(&port->fifo_ul, dc->send_buf, ul_size < SEND_BUF_MAX ? ul_size : SEND_BUF_MAX); if (size == 0) { @@ -988,11 +988,11 @@ static int receive_flow_control(struct nozomi *dc) } else if (old_ctrl.CTS == 0 && ctrl_dl.CTS == 1) { - if (__kfifo_len(&dc->port[port].fifo_ul)) { + if (kfifo_len(&dc->port[port].fifo_ul)) { DBG1("Enable interrupt (0x%04X) on port: %d", enable_ier, port); DBG1("Data in buffer [%d], enable transmit! 
", - __kfifo_len(&dc->port[port].fifo_ul)); + kfifo_len(&dc->port[port].fifo_ul)); enable_transmit_ul(port, dc); } else { DBG1("No data in buffer..."); @@ -1672,7 +1672,7 @@ static int ntty_write(struct tty_struct *tty, const unsigned char *buffer, goto exit; } - rval = __kfifo_put(&port->fifo_ul, (unsigned char *)buffer, count); + rval = kfifo_put(&port->fifo_ul, (unsigned char *)buffer, count); /* notify card */ if (unlikely(dc == NULL)) { @@ -1720,7 +1720,7 @@ static int ntty_write_room(struct tty_struct *tty) if (!port->port.count) goto exit; - room = port->fifo_ul.size - __kfifo_len(&port->fifo_ul); + room = port->fifo_ul.size - kfifo_len(&port->fifo_ul); exit: mutex_unlock(&port->tty_sem); @@ -1877,7 +1877,7 @@ static s32 ntty_chars_in_buffer(struct tty_struct *tty) goto exit_in_buffer; } - rval = __kfifo_len(&port->fifo_ul); + rval = kfifo_len(&port->fifo_ul); exit_in_buffer: return rval; diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 98f24e6d906e..d7d18fb02c93 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -59,7 +59,7 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, return -ENOMEM; for (i = 0; i < skip_low + skip_high; i++) - __kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)); + kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)); if (random) { j = 0; random_bytes = random32(); @@ -71,19 +71,19 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, random_bytes = random32(); } idx = (random_bytes >> (j * 2)) & 0xF; - __kfifo_put(fifo, + kfifo_put(fifo, (unsigned char *) &rarray[idx], sizeof(u32)); rarray[idx] = i; j++; } for (i = 0; i < RANDOM_SIZE; i++) - __kfifo_put(fifo, + kfifo_put(fifo, (unsigned char *) &rarray[i], sizeof(u32)); } else for (i = skip_low; i < nr - skip_high; i++) - __kfifo_put(fifo, (unsigned char *) &i, sizeof(u32)); + kfifo_put(fifo, (unsigned char *) &i, sizeof(u32)); for (i = 0; i < 
skip_low + skip_high; i++) kfifo_get_locked(fifo, (unsigned char *) &entry, @@ -119,7 +119,7 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) for (i = 16; i < T3_MAX_NUM_QP; i++) if (!(i & rdev_p->qpmask)) - __kfifo_put(&rdev_p->rscp->qpid_fifo, + kfifo_put(&rdev_p->rscp->qpid_fifo, (unsigned char *) &i, sizeof(u32)); return 0; } diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index ffed17f4f506..42611bea76a3 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1365,7 +1365,7 @@ static void lbs_send_confirmsleep(struct lbs_private *priv) priv->dnld_sent = DNLD_RES_RECEIVED; /* If nothing to do, go back to sleep (?) */ - if (!__kfifo_len(&priv->event_fifo) && !priv->resp_len[priv->resp_idx]) + if (!kfifo_len(&priv->event_fifo) && !priv->resp_len[priv->resp_idx]) priv->psstate = PS_STATE_SLEEP; spin_unlock_irqrestore(&priv->driver_lock, flags); @@ -1439,7 +1439,7 @@ void lbs_ps_confirm_sleep(struct lbs_private *priv) } /* Pending events or command responses? */ - if (__kfifo_len(&priv->event_fifo) || priv->resp_len[priv->resp_idx]) { + if (kfifo_len(&priv->event_fifo) || priv->resp_len[priv->resp_idx]) { allowed = 0; lbs_deb_host("pending events or command responses\n"); } diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index 2cc7ecd8d123..0622104f0a03 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -459,7 +459,7 @@ static int lbs_thread(void *data) else if (!list_empty(&priv->cmdpendingq) && !(priv->wakeup_dev_required)) shouldsleep = 0; /* We have a command to send */ - else if (__kfifo_len(&priv->event_fifo)) + else if (kfifo_len(&priv->event_fifo)) shouldsleep = 0; /* We have an event to process */ else shouldsleep = 1; /* No command */ @@ -511,9 +511,9 @@ static int lbs_thread(void *data) /* Process hardware events, e.g. 
card removed, link lost */ spin_lock_irq(&priv->driver_lock); - while (__kfifo_len(&priv->event_fifo)) { + while (kfifo_len(&priv->event_fifo)) { u32 event; - __kfifo_get(&priv->event_fifo, (unsigned char *) &event, + kfifo_get(&priv->event_fifo, (unsigned char *) &event, sizeof(event)); spin_unlock_irq(&priv->driver_lock); lbs_process_event(priv, event); @@ -1175,7 +1175,7 @@ void lbs_queue_event(struct lbs_private *priv, u32 event) if (priv->psstate == PS_STATE_SLEEP) priv->psstate = PS_STATE_AWAKE; - __kfifo_put(&priv->event_fifo, (unsigned char *) &event, sizeof(u32)); + kfifo_put(&priv->event_fifo, (unsigned char *) &event, sizeof(u32)); wake_up_interruptible(&priv->waitq); diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 1538a0a3c0af..36e5dc6fc953 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2107,7 +2107,7 @@ static int sonypi_misc_open(struct inode *inode, struct file *file) spin_lock_irqsave(&sonypi_compat.fifo_lock, flags); if (atomic_inc_return(&sonypi_compat.open_count) == 1) - __kfifo_reset(&sonypi_compat.fifo); + kfifo_reset(&sonypi_compat.fifo); spin_unlock_irqrestore(&sonypi_compat.fifo_lock, flags); diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 1bccbc1e588e..5f0c46f43ee1 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -517,7 +517,7 @@ static void iscsi_free_task(struct iscsi_task *task) if (conn->login_task == task) return; - __kfifo_put(&session->cmdpool.queue, (void*)&task, sizeof(void*)); + kfifo_put(&session->cmdpool.queue, (void*)&task, sizeof(void*)); if (sc) { task->sc = NULL; @@ -737,7 +737,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, BUG_ON(conn->c_stage == ISCSI_CONN_INITIAL_STAGE); BUG_ON(conn->c_stage == ISCSI_CONN_STOPPED); - if (!__kfifo_get(&session->cmdpool.queue, + if (!kfifo_get(&session->cmdpool.queue, (void*)&task, sizeof(void*))) return NULL; } @@ -1567,7 +1567,7 @@ 
static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn, { struct iscsi_task *task; - if (!__kfifo_get(&conn->session->cmdpool.queue, + if (!kfifo_get(&conn->session->cmdpool.queue, (void *) &task, sizeof(void *))) return NULL; @@ -2469,7 +2469,7 @@ iscsi_pool_init(struct iscsi_pool *q, int max, void ***items, int item_size) q->max = i; goto enomem; } - __kfifo_put(&q->queue, (void*)&q->pool[i], sizeof(void*)); + kfifo_put(&q->queue, (void*)&q->pool[i], sizeof(void*)); } if (items) { @@ -2819,7 +2819,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, /* allocate login_task used for the login/text sequences */ spin_lock_bh(&session->lock); - if (!__kfifo_get(&session->cmdpool.queue, + if (!kfifo_get(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*))) { spin_unlock_bh(&session->lock); @@ -2839,7 +2839,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, return cls_conn; login_task_data_alloc_fail: - __kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, + kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); login_task_alloc_fail: iscsi_destroy_conn(cls_conn); @@ -2902,7 +2902,7 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) free_pages((unsigned long) conn->data, get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); kfree(conn->persistent_address); - __kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, + kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); if (session->leadconn == conn) session->leadconn = NULL; diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 41643c860d26..c0be926637b1 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -445,15 +445,15 @@ void iscsi_tcp_cleanup_task(struct iscsi_task *task) return; /* flush task's r2t queues */ - while (__kfifo_get(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { - __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + 
while (kfifo_get(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { + kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); ISCSI_DBG_TCP(task->conn, "pending r2t dropped\n"); } r2t = tcp_task->r2t; if (r2t != NULL) { - __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); tcp_task->r2t = NULL; } @@ -541,7 +541,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) return 0; } - rc = __kfifo_get(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); + rc = kfifo_get(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); if (!rc) { iscsi_conn_printk(KERN_ERR, conn, "Could not allocate R2T. " "Target has sent more R2Ts than it " @@ -554,7 +554,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) if (r2t->data_length == 0) { iscsi_conn_printk(KERN_ERR, conn, "invalid R2T with zero data len\n"); - __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -570,7 +570,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) "invalid R2T with data len %u at offset %u " "and total length %d\n", r2t->data_length, r2t->data_offset, scsi_out(task->sc)->length); - __kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -580,7 +580,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) r2t->sent = 0; tcp_task->exp_datasn = r2tsn + 1; - __kfifo_put(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); + kfifo_put(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); conn->r2t_pdus_cnt++; iscsi_requeue_task(task); @@ -951,7 +951,7 @@ int iscsi_tcp_task_init(struct iscsi_task *task) return conn->session->tt->init_pdu(task, 0, task->data_count); } - BUG_ON(__kfifo_len(&tcp_task->r2tqueue)); + 
BUG_ON(kfifo_len(&tcp_task->r2tqueue)); tcp_task->exp_datasn = 0; /* Prepare PDU, optionally w/ immediate data */ @@ -982,7 +982,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) if (r2t->data_length <= r2t->sent) { ISCSI_DBG_TCP(task->conn, " done with r2t %p\n", r2t); - __kfifo_put(&tcp_task->r2tpool.queue, + kfifo_put(&tcp_task->r2tpool.queue, (void *)&tcp_task->r2t, sizeof(void *)); tcp_task->r2t = r2t = NULL; @@ -990,7 +990,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) } if (r2t == NULL) { - __kfifo_get(&tcp_task->r2tqueue, + kfifo_get(&tcp_task->r2tqueue, (void *)&tcp_task->r2t, sizeof(void *)); r2t = tcp_task->r2t; } diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index db1b41c55fd3..975e448cfcb9 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -61,7 +61,7 @@ static int srp_iu_pool_alloc(struct srp_queue *q, size_t max, kfifo_init(&q->queue, (void *) q->pool, max * sizeof(void *)); for (i = 0, iue = q->items; i < max; i++) { - __kfifo_put(&q->queue, (void *) &iue, sizeof(void *)); + kfifo_put(&q->queue, (void *) &iue, sizeof(void *)); iue->sbuf = ring[i]; iue++; } diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h index a76da201183b..96aa787f208f 100644 --- a/drivers/usb/host/fhci.h +++ b/drivers/usb/host/fhci.h @@ -505,19 +505,19 @@ static inline void cq_delete(struct kfifo *kfifo) static inline unsigned int cq_howmany(struct kfifo *kfifo) { - return __kfifo_len(kfifo) / sizeof(void *); + return kfifo_len(kfifo) / sizeof(void *); } static inline int cq_put(struct kfifo *kfifo, void *p) { - return __kfifo_put(kfifo, (void *)&p, sizeof(p)); + return kfifo_put(kfifo, (void *)&p, sizeof(p)); } static inline void *cq_get(struct kfifo *kfifo) { void *p = NULL; - __kfifo_get(kfifo, (void *)&p, sizeof(p)); + kfifo_get(kfifo, (void *)&p, sizeof(p)); return p; } diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 
61eef18218be..d0a2e464cacd 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -276,7 +276,7 @@ static int usb_serial_generic_write_start(struct usb_serial_port *port) if (port->write_urb_busy) start_io = false; else { - start_io = (__kfifo_len(port->write_fifo) != 0); + start_io = (kfifo_len(port->write_fifo) != 0); port->write_urb_busy = start_io; } spin_unlock_irqrestore(&port->lock, flags); @@ -370,7 +370,7 @@ int usb_serial_generic_write_room(struct tty_struct *tty) (serial->type->max_in_flight_urbs - port->urbs_in_flight); } else if (serial->num_bulk_out) - room = port->write_fifo->size - __kfifo_len(port->write_fifo); + room = port->write_fifo->size - kfifo_len(port->write_fifo); spin_unlock_irqrestore(&port->lock, flags); dbg("%s - returns %d", __func__, room); diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index e0f5c9d4197d..a893acda3964 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -37,34 +37,25 @@ extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); -extern unsigned int __kfifo_put(struct kfifo *fifo, +extern unsigned int kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len); -extern unsigned int __kfifo_get(struct kfifo *fifo, +extern unsigned int kfifo_get(struct kfifo *fifo, unsigned char *buffer, unsigned int len); -/** - * __kfifo_reset - removes the entire FIFO contents, no locking version - * @fifo: the fifo to be emptied. - */ -static inline void __kfifo_reset(struct kfifo *fifo) -{ - fifo->in = fifo->out = 0; -} - /** * kfifo_reset - removes the entire FIFO contents * @fifo: the fifo to be emptied. 
*/ static inline void kfifo_reset(struct kfifo *fifo) { - __kfifo_reset(fifo); + fifo->in = fifo->out = 0; } /** - * __kfifo_len - returns the number of bytes available in the FIFO + * kfifo_len - returns the number of used bytes in the FIFO * @fifo: the fifo to be used. */ -static inline unsigned int __kfifo_len(struct kfifo *fifo) +static inline unsigned int kfifo_len(struct kfifo *fifo) { register unsigned int out; @@ -92,7 +83,7 @@ static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, spin_lock_irqsave(lock, flags); - ret = __kfifo_put(fifo, from, n); + ret = kfifo_put(fifo, from, n); spin_unlock_irqrestore(lock, flags); @@ -117,7 +108,7 @@ static inline __must_check unsigned int kfifo_get_locked(struct kfifo *fifo, spin_lock_irqsave(lock, flags); - ret = __kfifo_get(fifo, to, n); + ret = kfifo_get(fifo, to, n); /* * optimization: if the FIFO is empty, set the indices to 0 @@ -131,13 +122,4 @@ static inline __must_check unsigned int kfifo_get_locked(struct kfifo *fifo, return ret; } -/** - * kfifo_len - returns the number of bytes available in the FIFO - * @fifo: the fifo to be used. - */ -static inline unsigned int kfifo_len(struct kfifo *fifo) -{ - return __kfifo_len(fifo); -} - #endif diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 4950bdbe3477..963ffde4af1a 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,7 +100,7 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * __kfifo_put - puts some data into the FIFO, no locking version + * kfifo_put - puts some data into the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: the data to be added. * @len: the length of the data to be added. @@ -112,7 +112,7 @@ EXPORT_SYMBOL(kfifo_free); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int __kfifo_put(struct kfifo *fifo, +unsigned int kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len) { unsigned int l; @@ -144,10 +144,10 @@ unsigned int __kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_put); +EXPORT_SYMBOL(kfifo_put); /** - * __kfifo_get - gets some data from the FIFO, no locking version + * kfifo_get - gets some data from the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: where the data must be copied. * @len: the size of the destination buffer. @@ -158,7 +158,7 @@ EXPORT_SYMBOL(__kfifo_put); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int __kfifo_get(struct kfifo *fifo, +unsigned int kfifo_get(struct kfifo *fifo, unsigned char *buffer, unsigned int len) { unsigned int l; @@ -190,4 +190,4 @@ unsigned int __kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_get); +EXPORT_SYMBOL(kfifo_get); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index c6b50351aa78..9ef36849edd7 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -131,7 +131,7 @@ static ssize_t dccpprobe_read(struct file *file, char __user *buf, return -ENOMEM; error = wait_event_interruptible(dccpw.wait, - __kfifo_len(&dccpw.fifo) != 0); + kfifo_len(&dccpw.fifo) != 0); if (error) goto out_free; -- cgit v1.2.3 From 7acd72eb85f1c7a15e8b5eb554994949241737f1 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: rename kfifo_put... into kfifo_in... and kfifo_get... into kfifo_out... Rename kfifo_put... to kfifo_in... so that old out-of-tree drivers cannot silently misuse the changed API; likewise kfifo_get... -> kfifo_out... Improve the prototypes of kfifo_in and kfifo_out to make the kerneldoc annotations more readable.
Add mini "howto porting to the new API" in kfifo.h Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/nozomi.c | 4 +-- drivers/char/sonypi.c | 8 +++--- drivers/infiniband/hw/cxgb3/cxio_resource.c | 16 ++++++------ drivers/media/video/meye.c | 16 ++++++------ drivers/net/wireless/libertas/main.c | 5 ++-- drivers/platform/x86/fujitsu-laptop.c | 4 +-- drivers/platform/x86/sony-laptop.c | 8 +++--- drivers/scsi/libiscsi.c | 14 +++++------ drivers/scsi/libiscsi_tcp.c | 18 ++++++------- drivers/scsi/libsrp.c | 6 ++--- drivers/usb/host/fhci.h | 4 +-- drivers/usb/serial/generic.c | 4 +-- include/linux/kfifo.h | 39 +++++++++++++++++++++-------- kernel/kfifo.c | 32 +++++++++++------------ net/dccp/probe.c | 4 +-- 15 files changed, 101 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index 61f5bfe74f38..9ef243429014 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -798,7 +798,7 @@ static int send_data(enum port_type index, struct nozomi *dc) struct tty_struct *tty = tty_port_tty_get(&port->port); /* Get data from tty and place in buf for now */ - size = kfifo_get(&port->fifo_ul, dc->send_buf, + size = kfifo_out(&port->fifo_ul, dc->send_buf, ul_size < SEND_BUF_MAX ? 
ul_size : SEND_BUF_MAX); if (size == 0) { @@ -1672,7 +1672,7 @@ static int ntty_write(struct tty_struct *tty, const unsigned char *buffer, goto exit; } - rval = kfifo_put(&port->fifo_ul, (unsigned char *)buffer, count); + rval = kfifo_in(&port->fifo_ul, (unsigned char *)buffer, count); /* notify card */ if (unlikely(dc == NULL)) { diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index dbcb3bd192c7..0798754a607c 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -777,7 +777,7 @@ static void input_keyrelease(struct work_struct *work) { struct sonypi_keypress kp; - while (kfifo_get_locked(&sonypi_device.input_fifo, (unsigned char *)&kp, + while (kfifo_out_locked(&sonypi_device.input_fifo, (unsigned char *)&kp, sizeof(kp), &sonypi_device.input_fifo_lock) == sizeof(kp)) { msleep(10); @@ -828,7 +828,7 @@ static void sonypi_report_input_event(u8 event) if (kp.dev) { input_report_key(kp.dev, kp.key, 1); input_sync(kp.dev); - kfifo_put_locked(&sonypi_device.input_fifo, + kfifo_in_locked(&sonypi_device.input_fifo, (unsigned char *)&kp, sizeof(kp), &sonypi_device.input_fifo_lock); schedule_work(&sonypi_device.input_work); @@ -882,7 +882,7 @@ found: acpi_bus_generate_proc_event(sonypi_acpi_device, 1, event); #endif - kfifo_put_locked(&sonypi_device.fifo, (unsigned char *)&event, + kfifo_in_locked(&sonypi_device.fifo, (unsigned char *)&event, sizeof(event), &sonypi_device.fifo_lock); kill_fasync(&sonypi_device.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_device.fifo_proc_list); @@ -932,7 +932,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, return ret; while (ret < count && - (kfifo_get_locked(&sonypi_device.fifo, &c, sizeof(c), + (kfifo_out_locked(&sonypi_device.fifo, &c, sizeof(c), &sonypi_device.fifo_lock) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index d7d18fb02c93..dcbf2606c438 100644 --- 
a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -59,7 +59,7 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, return -ENOMEM; for (i = 0; i < skip_low + skip_high; i++) - kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)); + kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); if (random) { j = 0; random_bytes = random32(); @@ -71,22 +71,22 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, random_bytes = random32(); } idx = (random_bytes >> (j * 2)) & 0xF; - kfifo_put(fifo, + kfifo_in(fifo, (unsigned char *) &rarray[idx], sizeof(u32)); rarray[idx] = i; j++; } for (i = 0; i < RANDOM_SIZE; i++) - kfifo_put(fifo, + kfifo_in(fifo, (unsigned char *) &rarray[i], sizeof(u32)); } else for (i = skip_low; i < nr - skip_high; i++) - kfifo_put(fifo, (unsigned char *) &i, sizeof(u32)); + kfifo_in(fifo, (unsigned char *) &i, sizeof(u32)); for (i = 0; i < skip_low + skip_high; i++) - kfifo_get_locked(fifo, (unsigned char *) &entry, + kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), fifo_lock); return 0; } @@ -119,7 +119,7 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) for (i = 16; i < T3_MAX_NUM_QP; i++) if (!(i & rdev_p->qpmask)) - kfifo_put(&rdev_p->rscp->qpid_fifo, + kfifo_in(&rdev_p->rscp->qpid_fifo, (unsigned char *) &i, sizeof(u32)); return 0; } @@ -180,7 +180,7 @@ tpt_err: static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock) { u32 entry; - if (kfifo_get_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) + if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) return entry; else return 0; /* fifo emptry */ @@ -190,7 +190,7 @@ static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock, u32 entry) { BUG_ON( - kfifo_put_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) + kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) == 0); } diff --git a/drivers/media/video/meye.c 
b/drivers/media/video/meye.c index 38bcedfd9fec..884a569d60a2 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -800,7 +800,7 @@ again: return IRQ_HANDLED; if (meye.mchip_mode == MCHIP_HIC_MODE_CONT_OUT) { - if (kfifo_get_locked(&meye.grabq, (unsigned char *)&reqnr, + if (kfifo_out_locked(&meye.grabq, (unsigned char *)&reqnr, sizeof(int), &meye.grabq_lock) != sizeof(int)) { mchip_free_frame(); return IRQ_HANDLED; @@ -811,7 +811,7 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put_locked(&meye.doneq, (unsigned char *)&reqnr, + kfifo_in_locked(&meye.doneq, (unsigned char *)&reqnr, sizeof(int), &meye.doneq_lock); wake_up_interruptible(&meye.proc_list); } else { @@ -821,7 +821,7 @@ again: mchip_free_frame(); goto again; } - if (kfifo_get_locked(&meye.grabq, (unsigned char *)&reqnr, + if (kfifo_out_locked(&meye.grabq, (unsigned char *)&reqnr, sizeof(int), &meye.grabq_lock) != sizeof(int)) { mchip_free_frame(); goto again; @@ -832,7 +832,7 @@ again: meye.grab_buffer[reqnr].state = MEYE_BUF_DONE; do_gettimeofday(&meye.grab_buffer[reqnr].timestamp); meye.grab_buffer[reqnr].sequence = sequence++; - kfifo_put_locked(&meye.doneq, (unsigned char *)&reqnr, + kfifo_in_locked(&meye.doneq, (unsigned char *)&reqnr, sizeof(int), &meye.doneq_lock); wake_up_interruptible(&meye.proc_list); } @@ -935,7 +935,7 @@ static int meyeioc_qbuf_capt(int *nb) mchip_cont_compression_start(); meye.grab_buffer[*nb].state = MEYE_BUF_USING; - kfifo_put_locked(&meye.grabq, (unsigned char *)nb, sizeof(int), + kfifo_in_locked(&meye.grabq, (unsigned char *)nb, sizeof(int), &meye.grabq_lock); mutex_unlock(&meye.lock); @@ -968,7 +968,7 @@ static int meyeioc_sync(struct file *file, void *fh, int *i) /* fall through */ case MEYE_BUF_DONE: meye.grab_buffer[*i].state = MEYE_BUF_UNUSED; - kfifo_get_locked(&meye.doneq, (unsigned char *)&unused, + 
kfifo_out_locked(&meye.doneq, (unsigned char *)&unused, sizeof(int), &meye.doneq_lock); } *i = meye.grab_buffer[*i].size; @@ -1456,7 +1456,7 @@ static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) buf->flags |= V4L2_BUF_FLAG_QUEUED; buf->flags &= ~V4L2_BUF_FLAG_DONE; meye.grab_buffer[buf->index].state = MEYE_BUF_USING; - kfifo_put_locked(&meye.grabq, (unsigned char *)&buf->index, + kfifo_in_locked(&meye.grabq, (unsigned char *)&buf->index, sizeof(int), &meye.grabq_lock); mutex_unlock(&meye.lock); @@ -1483,7 +1483,7 @@ static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) return -EINTR; } - if (!kfifo_get_locked(&meye.doneq, (unsigned char *)&reqnr, + if (!kfifo_out_locked(&meye.doneq, (unsigned char *)&reqnr, sizeof(int), &meye.doneq_lock)) { mutex_unlock(&meye.lock); return -EBUSY; diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index 0622104f0a03..2bcfa745524a 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -513,7 +513,8 @@ static int lbs_thread(void *data) spin_lock_irq(&priv->driver_lock); while (kfifo_len(&priv->event_fifo)) { u32 event; - kfifo_get(&priv->event_fifo, (unsigned char *) &event, + + kfifo_out(&priv->event_fifo, (unsigned char *) &event, sizeof(event)); spin_unlock_irq(&priv->driver_lock); lbs_process_event(priv, event); @@ -1175,7 +1176,7 @@ void lbs_queue_event(struct lbs_private *priv, u32 event) if (priv->psstate == PS_STATE_SLEEP) priv->psstate = PS_STATE_AWAKE; - kfifo_put(&priv->event_fifo, (unsigned char *) &event, sizeof(u32)); + kfifo_in(&priv->event_fifo, (unsigned char *) &event, sizeof(u32)); wake_up_interruptible(&priv->waitq); diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 13dc7bedcfce..b66029bd75d0 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -1006,7 +1006,7 @@ static void 
acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) vdbg_printk(FUJLAPTOP_DBG_TRACE, "Push keycode into ringbuffer [%d]\n", keycode); - status = kfifo_put_locked(&fujitsu_hotkey->fifo, + status = kfifo_in_locked(&fujitsu_hotkey->fifo, (unsigned char *)&keycode, sizeof(keycode), &fujitsu_hotkey->fifo_lock); @@ -1020,7 +1020,7 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) } } else if (keycode == 0) { while ((status = - kfifo_get_locked( + kfifo_out_locked( &fujitsu_hotkey->fifo, (unsigned char *) &keycode_r, sizeof(keycode_r), diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 36e5dc6fc953..2896ca4cd9ab 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -300,7 +300,7 @@ static void do_sony_laptop_release_key(struct work_struct *work) { struct sony_laptop_keypress kp; - while (kfifo_get_locked(&sony_laptop_input.fifo, (unsigned char *)&kp, + while (kfifo_out_locked(&sony_laptop_input.fifo, (unsigned char *)&kp, sizeof(kp), &sony_laptop_input.fifo_lock) == sizeof(kp)) { msleep(10); @@ -363,7 +363,7 @@ static void sony_laptop_report_input_event(u8 event) /* we emit the scancode so we can always remap the key */ input_event(kp.dev, EV_MSC, MSC_SCAN, event); input_sync(kp.dev); - kfifo_put_locked(&sony_laptop_input.fifo, + kfifo_in_locked(&sony_laptop_input.fifo, (unsigned char *)&kp, sizeof(kp), &sony_laptop_input.fifo_lock); @@ -2130,7 +2130,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf, return ret; while (ret < count && - (kfifo_get_locked(&sonypi_compat.fifo, &c, sizeof(c), + (kfifo_out_locked(&sonypi_compat.fifo, &c, sizeof(c), &sonypi_compat.fifo_lock) == sizeof(c))) { if (put_user(c, buf++)) return -EFAULT; @@ -2310,7 +2310,7 @@ static struct miscdevice sonypi_misc_device = { static void sonypi_compat_report_event(u8 event) { - kfifo_put_locked(&sonypi_compat.fifo, (unsigned char *)&event, + 
kfifo_in_locked(&sonypi_compat.fifo, (unsigned char *)&event, sizeof(event), &sonypi_compat.fifo_lock); kill_fasync(&sonypi_compat.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_compat.fifo_proc_list); diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 5f0c46f43ee1..c28a712fd4db 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -517,7 +517,7 @@ static void iscsi_free_task(struct iscsi_task *task) if (conn->login_task == task) return; - kfifo_put(&session->cmdpool.queue, (void*)&task, sizeof(void*)); + kfifo_in(&session->cmdpool.queue, (void*)&task, sizeof(void*)); if (sc) { task->sc = NULL; @@ -737,7 +737,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, BUG_ON(conn->c_stage == ISCSI_CONN_INITIAL_STAGE); BUG_ON(conn->c_stage == ISCSI_CONN_STOPPED); - if (!kfifo_get(&session->cmdpool.queue, + if (!kfifo_out(&session->cmdpool.queue, (void*)&task, sizeof(void*))) return NULL; } @@ -1567,7 +1567,7 @@ static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn, { struct iscsi_task *task; - if (!kfifo_get(&conn->session->cmdpool.queue, + if (!kfifo_out(&conn->session->cmdpool.queue, (void *) &task, sizeof(void *))) return NULL; @@ -2469,7 +2469,7 @@ iscsi_pool_init(struct iscsi_pool *q, int max, void ***items, int item_size) q->max = i; goto enomem; } - kfifo_put(&q->queue, (void*)&q->pool[i], sizeof(void*)); + kfifo_in(&q->queue, (void*)&q->pool[i], sizeof(void*)); } if (items) { @@ -2819,7 +2819,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, /* allocate login_task used for the login/text sequences */ spin_lock_bh(&session->lock); - if (!kfifo_get(&session->cmdpool.queue, + if (!kfifo_out(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*))) { spin_unlock_bh(&session->lock); @@ -2839,7 +2839,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, return cls_conn; login_task_data_alloc_fail: - 
kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, + kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); login_task_alloc_fail: iscsi_destroy_conn(cls_conn); @@ -2902,7 +2902,7 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) free_pages((unsigned long) conn->data, get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); kfree(conn->persistent_address); - kfifo_put(&session->cmdpool.queue, (void*)&conn->login_task, + kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); if (session->leadconn == conn) session->leadconn = NULL; diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index c0be926637b1..d51ffeca2ec9 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -445,15 +445,15 @@ void iscsi_tcp_cleanup_task(struct iscsi_task *task) return; /* flush task's r2t queues */ - while (kfifo_get(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { - kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + while (kfifo_out(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) { + kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); ISCSI_DBG_TCP(task->conn, "pending r2t dropped\n"); } r2t = tcp_task->r2t; if (r2t != NULL) { - kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); tcp_task->r2t = NULL; } @@ -541,7 +541,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) return 0; } - rc = kfifo_get(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); + rc = kfifo_out(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); if (!rc) { iscsi_conn_printk(KERN_ERR, conn, "Could not allocate R2T. 
" "Target has sent more R2Ts than it " @@ -554,7 +554,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) if (r2t->data_length == 0) { iscsi_conn_printk(KERN_ERR, conn, "invalid R2T with zero data len\n"); - kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -570,7 +570,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) "invalid R2T with data len %u at offset %u " "and total length %d\n", r2t->data_length, r2t->data_offset, scsi_out(task->sc)->length); - kfifo_put(&tcp_task->r2tpool.queue, (void*)&r2t, + kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*)); return ISCSI_ERR_DATALEN; } @@ -580,7 +580,7 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task) r2t->sent = 0; tcp_task->exp_datasn = r2tsn + 1; - kfifo_put(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); + kfifo_in(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*)); conn->r2t_pdus_cnt++; iscsi_requeue_task(task); @@ -982,7 +982,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) if (r2t->data_length <= r2t->sent) { ISCSI_DBG_TCP(task->conn, " done with r2t %p\n", r2t); - kfifo_put(&tcp_task->r2tpool.queue, + kfifo_in(&tcp_task->r2tpool.queue, (void *)&tcp_task->r2t, sizeof(void *)); tcp_task->r2t = r2t = NULL; @@ -990,7 +990,7 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) } if (r2t == NULL) { - kfifo_get(&tcp_task->r2tqueue, + kfifo_out(&tcp_task->r2tqueue, (void *)&tcp_task->r2t, sizeof(void *)); r2t = tcp_task->r2t; } diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index 975e448cfcb9..8424b8606efb 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -61,7 +61,7 @@ static int srp_iu_pool_alloc(struct srp_queue *q, size_t max, kfifo_init(&q->queue, (void *) q->pool, max * sizeof(void *)); for (i = 0, iue = q->items; i < max; 
i++) { - kfifo_put(&q->queue, (void *) &iue, sizeof(void *)); + kfifo_in(&q->queue, (void *) &iue, sizeof(void *)); iue->sbuf = ring[i]; iue++; } @@ -163,7 +163,7 @@ struct iu_entry *srp_iu_get(struct srp_target *target) { struct iu_entry *iue = NULL; - kfifo_get_locked(&target->iu_queue.queue, (void *) &iue, + kfifo_out_locked(&target->iu_queue.queue, (void *) &iue, sizeof(void *), &target->iu_queue.lock); if (!iue) return iue; @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(srp_iu_get); void srp_iu_put(struct iu_entry *iue) { - kfifo_put_locked(&iue->target->iu_queue.queue, (void *) &iue, + kfifo_in_locked(&iue->target->iu_queue.queue, (void *) &iue, sizeof(void *), &iue->target->iu_queue.lock); } EXPORT_SYMBOL_GPL(srp_iu_put); diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h index 96aa787f208f..72dae1c5ab38 100644 --- a/drivers/usb/host/fhci.h +++ b/drivers/usb/host/fhci.h @@ -510,14 +510,14 @@ static inline unsigned int cq_howmany(struct kfifo *kfifo) static inline int cq_put(struct kfifo *kfifo, void *p) { - return kfifo_put(kfifo, (void *)&p, sizeof(p)); + return kfifo_in(kfifo, (void *)&p, sizeof(p)); } static inline void *cq_get(struct kfifo *kfifo) { void *p = NULL; - kfifo_get(kfifo, (void *)&p, sizeof(p)); + kfifo_out(kfifo, (void *)&p, sizeof(p)); return p; } diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index d0a2e464cacd..b0f1183755c9 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -285,7 +285,7 @@ static int usb_serial_generic_write_start(struct usb_serial_port *port) return 0; data = port->write_urb->transfer_buffer; - count = kfifo_get_locked(port->write_fifo, data, port->bulk_out_size, &port->lock); + count = kfifo_out_locked(port->write_fifo, data, port->bulk_out_size, &port->lock); usb_serial_debug_data(debug, &port->dev, __func__, count, data); /* set up our urb */ @@ -345,7 +345,7 @@ int usb_serial_generic_write(struct tty_struct *tty, return usb_serial_multi_urb_write(tty, port, 
buf, count); - count = kfifo_put_locked(port->write_fifo, buf, count, &port->lock); + count = kfifo_in_locked(port->write_fifo, buf, count, &port->lock); result = usb_serial_generic_write_start(port); if (result >= 0) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index a893acda3964..1b59c4a0e85f 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -19,6 +19,25 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ + +/* + * Howto porting drivers to the new generic fifo API: + * + * - Modify the declaration of the "struct kfifo *" object into a + * in-place "struct kfifo" object + * - Init the in-place object with kfifo_alloc() or kfifo_init() + * Note: The address of the in-place "struct kfifo" object must be + * passed as the first argument to this functions + * - Replace the use of __kfifo_put into kfifo_in and __kfifo_get + * into kfifo_out + * - Replace the use of kfifo_put into kfifo_in_locked and kfifo_get + * into kfifo_out_locked + * Note: the spinlock pointer formerly passed to kfifo_init/kfifo_alloc + * must be passed now to the kfifo_in_locked and kfifo_out_locked + * as the last parameter. 
+ * - All formerly name __kfifo_* functions has been renamed into kfifo_* + */ + #ifndef _LINUX_KFIFO_H #define _LINUX_KFIFO_H @@ -37,10 +56,10 @@ extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); -extern unsigned int kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len); -extern unsigned int kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len); +extern __must_check unsigned int kfifo_in(struct kfifo *fifo, + const unsigned char *from, unsigned int len); +extern __must_check unsigned int kfifo_out(struct kfifo *fifo, + unsigned char *to, unsigned int len); /** * kfifo_reset - removes the entire FIFO contents @@ -65,7 +84,7 @@ static inline unsigned int kfifo_len(struct kfifo *fifo) } /** - * kfifo_put_locked - puts some data into the FIFO using a spinlock for locking + * kfifo_in_locked - puts some data into the FIFO using a spinlock for locking * @fifo: the fifo to be used. * @from: the data to be added. * @n: the length of the data to be added. @@ -75,7 +94,7 @@ static inline unsigned int kfifo_len(struct kfifo *fifo) * the FIFO depending on the free space, and returns the number of * bytes copied. 
*/ -static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, +static inline __must_check unsigned int kfifo_in_locked(struct kfifo *fifo, const unsigned char *from, unsigned int n, spinlock_t *lock) { unsigned long flags; @@ -83,7 +102,7 @@ static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, spin_lock_irqsave(lock, flags); - ret = kfifo_put(fifo, from, n); + ret = kfifo_in(fifo, from, n); spin_unlock_irqrestore(lock, flags); @@ -91,7 +110,7 @@ static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, } /** - * kfifo_get_locked - gets some data from the FIFO using a spinlock for locking + * kfifo_out_locked - gets some data from the FIFO using a spinlock for locking * @fifo: the fifo to be used. * @to: where the data must be copied. * @n: the size of the destination buffer. @@ -100,7 +119,7 @@ static inline __must_check unsigned int kfifo_put_locked(struct kfifo *fifo, * This function copies at most @len bytes from the FIFO into the * @to buffer and returns the number of copied bytes. */ -static inline __must_check unsigned int kfifo_get_locked(struct kfifo *fifo, +static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, unsigned char *to, unsigned int n, spinlock_t *lock) { unsigned long flags; @@ -108,7 +127,7 @@ static inline __must_check unsigned int kfifo_get_locked(struct kfifo *fifo, spin_lock_irqsave(lock, flags); - ret = kfifo_get(fifo, to, n); + ret = kfifo_out(fifo, to, n); /* * optimization: if the FIFO is empty, set the indices to 0 diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 963ffde4af1a..d659442e73f2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,20 +100,20 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * kfifo_put - puts some data into the FIFO, no locking version + * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. - * @buffer: the data to be added. + * @from: the data to be added. 
* @len: the length of the data to be added. * - * This function copies at most @len bytes from the @buffer into + * This function copies at most @len bytes from the @from buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len) +unsigned int kfifo_in(struct kfifo *fifo, + const unsigned char *from, unsigned int len) { unsigned int l; @@ -128,10 +128,10 @@ unsigned int kfifo_put(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); + memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, buffer + l, len - l); + memcpy(fifo->buffer, from + l, len - l); /* * Ensure that we add the bytes to the kfifo -before- @@ -144,22 +144,22 @@ unsigned int kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_put); +EXPORT_SYMBOL(kfifo_in); /** - * kfifo_get - gets some data from the FIFO, no locking version + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. - * @buffer: where the data must be copied. + * @to: where the data must be copied. * @len: the size of the destination buffer. * * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. + * @to buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, + unsigned char *to, unsigned int len) { unsigned int l; @@ -174,10 +174,10 @@ unsigned int kfifo_get(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); /* then get the rest (if any) from the beginning of the buffer */ - memcpy(buffer + l, fifo->buffer, len - l); + memcpy(to + l, fifo->buffer, len - l); /* * Ensure that we remove the bytes from the kfifo -before- @@ -190,4 +190,4 @@ unsigned int kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_get); +EXPORT_SYMBOL(kfifo_out); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 9ef36849edd7..a1362dc8abb0 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -67,7 +67,7 @@ static void printl(const char *fmt, ...) len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); - kfifo_put_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); + kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); wake_up(&dccpw.wait); } @@ -135,7 +135,7 @@ static ssize_t dccpprobe_read(struct file *file, char __user *buf, if (error) goto out_free; - cnt = kfifo_get_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); + cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; out_free: -- cgit v1.2.3 From 9842c38e917636fa7dc6b88aff17a8f1fd7f0cc0 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:29 -0800 Subject: kfifo: fix warn_unused_result Fix the "ignoring return value of '...', declared with attribute warn_unused_result" compiler warning in several users of the new kfifo API. 
It removes the __must_check attribute from kfifo_in() and kfifo_in_locked(), whose return values do not necessarily have to be checked. Fix the allocation bug in the nozomi driver file by moving the kfifo_alloc() call out of the interrupt handler and into the probe function. Fix the kfifo_out() and kfifo_out_locked() users to handle an unexpected end of fifo. Signed-off-by: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/nozomi.c | 31 +++++++++++++++++++++++++---- drivers/infiniband/hw/cxgb3/cxio_resource.c | 5 +++-- drivers/media/video/meye.c | 5 +++-- drivers/net/wireless/libertas/main.c | 6 ++++-- drivers/scsi/libiscsi_tcp.c | 9 +++++++-- drivers/scsi/libsrp.c | 7 +++++-- include/linux/kfifo.h | 4 ++-- 7 files changed, 51 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index 9ef243429014..7d73cd430340 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -685,8 +685,6 @@ static int nozomi_read_config_table(struct nozomi *dc) dump_table(dc); for (i = PORT_MDM; i < MAX_PORT; i++) { - kfifo_alloc(&dc->port[i].fifo_ul, - FIFO_BUFFER_SIZE_UL, GFP_ATOMIC); memset(&dc->port[i].ctrl_dl, 0, sizeof(struct ctrl_dl)); memset(&dc->port[i].ctrl_ul, 0, sizeof(struct ctrl_ul)); } @@ -1433,6 +1431,16 @@ static int __devinit nozomi_card_init(struct pci_dev *pdev, goto err_free_sbuf; } + for (i = PORT_MDM; i < MAX_PORT; i++) { + if (kfifo_alloc(&dc->port[i].fifo_ul, + FIFO_BUFFER_SIZE_UL, GFP_ATOMIC)) { + dev_err(&pdev->dev, + "Could not allocate kfifo buffer\n"); + ret = -ENOMEM; + goto err_free_kfifo; + } + } + spin_lock_init(&dc->spin_mutex); nozomi_setup_private_data(dc); @@ -1445,7 +1453,7 @@ static int __devinit nozomi_card_init(struct pci_dev *pdev, NOZOMI_NAME, dc); if (unlikely(ret)) { dev_err(&pdev->dev, "can't request irq %d\n", pdev->irq); - goto err_free_sbuf; + goto err_free_kfifo; } DBG1("base_addr: %p", dc->base_addr); @@ -1464,13 +1472,28 @@ static int __devinit 
nozomi_card_init(struct pci_dev *pdev, dc->state = NOZOMI_STATE_ENABLED; for (i = 0; i < MAX_PORT; i++) { + struct device *tty_dev; + mutex_init(&dc->port[i].tty_sem); tty_port_init(&dc->port[i].port); - tty_register_device(ntty_driver, dc->index_start + i, + tty_dev = tty_register_device(ntty_driver, dc->index_start + i, &pdev->dev); + + if (IS_ERR(tty_dev)) { + ret = PTR_ERR(tty_dev); + dev_err(&pdev->dev, "Could not allocate tty?\n"); + goto err_free_tty; + } } + return 0; +err_free_tty: + for (i = dc->index_start; i < dc->index_start + MAX_PORT; ++i) + tty_unregister_device(ntty_driver, i); +err_free_kfifo: + for (i = 0; i < MAX_PORT; i++) + kfifo_free(&dc->port[i].fifo_ul); err_free_sbuf: kfree(dc->send_buf); iounmap(dc->base_addr); diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index dcbf2606c438..31f9201b2980 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -86,8 +86,9 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, kfifo_in(fifo, (unsigned char *) &i, sizeof(u32)); for (i = 0; i < skip_low + skip_high; i++) - kfifo_out_locked(fifo, (unsigned char *) &entry, - sizeof(u32), fifo_lock); + if (kfifo_out_locked(fifo, (unsigned char *) &entry, + sizeof(u32), fifo_lock) != sizeof(u32)) + break; return 0; } diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 884a569d60a2..b421858ccf90 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -968,8 +968,9 @@ static int meyeioc_sync(struct file *file, void *fh, int *i) /* fall through */ case MEYE_BUF_DONE: meye.grab_buffer[*i].state = MEYE_BUF_UNUSED; - kfifo_out_locked(&meye.doneq, (unsigned char *)&unused, - sizeof(int), &meye.doneq_lock); + if (kfifo_out_locked(&meye.doneq, (unsigned char *)&unused, + sizeof(int), &meye.doneq_lock) != sizeof(int)) + break; } *i = meye.grab_buffer[*i].size; mutex_unlock(&meye.lock); diff --git 
a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index 2bcfa745524a..c2975c8e2f21 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -514,8 +514,10 @@ static int lbs_thread(void *data) while (kfifo_len(&priv->event_fifo)) { u32 event; - kfifo_out(&priv->event_fifo, (unsigned char *) &event, - sizeof(event)); + if (kfifo_out(&priv->event_fifo, + (unsigned char *) &event, sizeof(event)) != + sizeof(event)) + break; spin_unlock_irq(&priv->driver_lock); lbs_process_event(priv, event); spin_lock_irq(&priv->driver_lock); diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index d51ffeca2ec9..db6856c138fc 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -990,8 +990,13 @@ static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task) } if (r2t == NULL) { - kfifo_out(&tcp_task->r2tqueue, - (void *)&tcp_task->r2t, sizeof(void *)); + if (kfifo_out(&tcp_task->r2tqueue, + (void *)&tcp_task->r2t, sizeof(void *)) != + sizeof(void *)) { + WARN_ONCE(1, "unexpected fifo state"); + r2t = NULL; + } + r2t = tcp_task->r2t; } spin_unlock_bh(&session->lock); diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index 8424b8606efb..ab19b3b4be52 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -163,8 +163,11 @@ struct iu_entry *srp_iu_get(struct srp_target *target) { struct iu_entry *iue = NULL; - kfifo_out_locked(&target->iu_queue.queue, (void *) &iue, - sizeof(void *), &target->iu_queue.lock); + if (kfifo_out_locked(&target->iu_queue.queue, (void *) &iue, + sizeof(void *), &target->iu_queue.lock) != sizeof(void *)) { + WARN_ONCE(1, "unexpected fifo state"); + return NULL; + } if (!iue) return iue; iue->target = target; diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 1b59c4a0e85f..5ed2565c89b6 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -56,7 +56,7 @@ extern void kfifo_init(struct kfifo *fifo, 
unsigned char *buffer, extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); -extern __must_check unsigned int kfifo_in(struct kfifo *fifo, +extern unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, unsigned int len); extern __must_check unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len); @@ -94,7 +94,7 @@ static inline unsigned int kfifo_len(struct kfifo *fifo) * the FIFO depending on the free space, and returns the number of * bytes copied. */ -static inline __must_check unsigned int kfifo_in_locked(struct kfifo *fifo, +static inline unsigned int kfifo_in_locked(struct kfifo *fifo, const unsigned char *from, unsigned int n, spinlock_t *lock) { unsigned long flags; -- cgit v1.2.3 From 37bdfbbfaab47811fcec84dff23c4e8da1a09f9e Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:30 -0800 Subject: kfifo: add DEFINE_KFIFO and friends, add very tiny functions Add DECLARE_KFIFO - macro to declare a kfifo and the associated buffer inside a struct Add INIT_KFIFO - Initialize a kfifo declared by DECLARE_KFIFO Add DEFINE_KFIFO - macro to define and initialize a kfifo as a global or local object Add kfifo_size() - returns the size of the fifo in bytes Add kfifo_is_empty() - returns true if the fifo is empty Add kfifo_is_full() - returns true if the fifo is full Add kfifo_avail() - returns the number of bytes available in the FIFO Do some code cleanup Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 94 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 5ed2565c89b6..dd53eed3e2af 100644 --- a/include/linux/kfifo.h 
+++ b/include/linux/kfifo.h @@ -51,6 +51,60 @@ struct kfifo { unsigned int out; /* data is extracted from off. (out % size) */ }; +/* + * Macros for declaration and initialization of the kfifo datatype + */ + +/* helper macro */ +#define __kfifo_initializer(s, b) \ + (struct kfifo) { \ + .size = s, \ + .in = 0, \ + .out = 0, \ + .buffer = b \ + } + +/** + * DECLARE_KFIFO - macro to declare a kfifo and the associated buffer + * @name: name of the declared kfifo datatype + * @size: size of the fifo buffer + * + * Note: the macro can be used inside struct or union declaration + * Note: the macro creates two objects: + * A kfifo object with the given name and a buffer for the kfifo + * object named name##kfifo_buffer + */ +#define DECLARE_KFIFO(name, size) \ +union { \ + struct kfifo name; \ + unsigned char name##kfifo_buffer[size + sizeof(struct kfifo)]; \ +} + +/** + * INIT_KFIFO - Initialize a kfifo declared by DECLARED_KFIFO + * @name: name of the declared kfifo datatype + * @size: size of the fifo buffer + */ +#define INIT_KFIFO(name) \ + name = __kfifo_initializer(sizeof(name##kfifo_buffer) - \ + sizeof(struct kfifo), name##kfifo_buffer) + +/** + * DEFINE_KFIFO - macro to define and initialize a kfifo + * @name: name of the declared kfifo datatype + * @size: size of the fifo buffer + * + * Note: the macro can be used for global and local kfifo data type variables + * Note: the macro creates two objects: + * A kfifo object with the given name and a buffer for the kfifo + * object named name##kfifo_buffer + */ +#define DEFINE_KFIFO(name, size) \ + unsigned char name##kfifo_buffer[size]; \ + struct kfifo name = __kfifo_initializer(size, name##kfifo_buffer) + +#undef __kfifo_initializer + extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size); extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, @@ -70,6 +124,15 @@ static inline void kfifo_reset(struct kfifo *fifo) fifo->in = fifo->out = 0; } +/** + * kfifo_size - 
returns the size of the fifo in bytes + * @fifo: the fifo to be used. + */ +static inline __must_check unsigned int kfifo_size(struct kfifo *fifo) +{ + return fifo->size; +} + /** * kfifo_len - returns the number of used bytes in the FIFO * @fifo: the fifo to be used. @@ -83,6 +146,33 @@ static inline unsigned int kfifo_len(struct kfifo *fifo) return fifo->in - out; } +/** + * kfifo_is_empty - returns true if the fifo is empty + * @fifo: the fifo to be used. + */ +static inline __must_check int kfifo_is_empty(struct kfifo *fifo) +{ + return fifo->in == fifo->out; +} + +/** + * kfifo_is_full - returns true if the fifo is full + * @fifo: the fifo to be used. + */ +static inline __must_check int kfifo_is_full(struct kfifo *fifo) +{ + return kfifo_len(fifo) == kfifo_size(fifo); +} + +/** + * kfifo_avail - returns the number of bytes available in the FIFO + * @fifo: the fifo to be used. + */ +static inline __must_check unsigned int kfifo_avail(struct kfifo *fifo) +{ + return kfifo_size(fifo) - kfifo_len(fifo); +} + /** * kfifo_in_locked - puts some data into the FIFO using a spinlock for locking * @fifo: the fifo to be used. 
@@ -133,8 +223,8 @@ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, * optimization: if the FIFO is empty, set the indices to 0 * so we don't wrap the next time */ - if (fifo->in == fifo->out) - fifo->in = fifo->out = 0; + if (kfifo_is_empty(fifo)) + kfifo_reset(fifo); spin_unlock_irqrestore(lock, flags); -- cgit v1.2.3 From a121f24accac1600bf5b6fb1e12eeabdfed7cb1a Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:31 -0800 Subject: kfifo: add kfifo_skip, kfifo_from_user and kfifo_to_user Add kfifo_reset_out() for safe lockless discard of the fifo output Add kfifo_skip() to skip a number of output bytes Add kfifo_from_user() to copy user space data into the fifo Add kfifo_to_user() to copy fifo data to user space Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 47 +++++++++++++++++ kernel/kfifo.c | 139 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 170 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index dd53eed3e2af..d3230fb08bc7 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -124,6 +124,16 @@ static inline void kfifo_reset(struct kfifo *fifo) fifo->in = fifo->out = 0; } +/** + * kfifo_reset_out - skip FIFO contents + * @fifo: the fifo to be emptied. + */ +static inline void kfifo_reset_out(struct kfifo *fifo) +{ + smp_mb(); + fifo->out = fifo->in; +} + /** * kfifo_size - returns the size of the fifo in bytes * @fifo: the fifo to be used. 
@@ -231,4 +241,41 @@ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, return ret; } +extern void kfifo_skip(struct kfifo *fifo, unsigned int len); + +extern __must_check unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int n); + +extern __must_check unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int n); + +/** + * __kfifo_add_out internal helper function for updating the out offset + */ +static inline void __kfifo_add_out(struct kfifo *fifo, + unsigned int off) +{ + smp_mb(); + fifo->out += off; +} + +/** + * __kfifo_add_in internal helper function for updating the in offset + */ +static inline void __kfifo_add_in(struct kfifo *fifo, + unsigned int off) +{ + smp_wmb(); + fifo->in += off; +} + +/** + * __kfifo_off internal helper function for calculating the index of a + * given offeset + */ +static inline unsigned int __kfifo_off(struct kfifo *fifo, unsigned int off) +{ + return off & (fifo->size - 1); +} + #endif diff --git a/kernel/kfifo.c b/kernel/kfifo.c index d659442e73f2..2a78425ef67f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -26,6 +26,7 @@ #include #include #include +#include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) @@ -99,6 +100,21 @@ void kfifo_free(struct kfifo *fifo) } EXPORT_SYMBOL(kfifo_free); +/** + * kfifo_skip - skip output data + * @fifo: the fifo to be used. + * @len: number of bytes to skip + */ +void kfifo_skip(struct kfifo *fifo, unsigned int len) +{ + if (len < kfifo_len(fifo)) { + __kfifo_add_out(fifo, len); + return; + } + kfifo_reset_out(fifo); +} +EXPORT_SYMBOL(kfifo_skip); + /** * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. 
@@ -115,6 +131,7 @@ EXPORT_SYMBOL(kfifo_free); unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->size - fifo->in + fifo->out); @@ -126,21 +143,16 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); + off = __kfifo_off(fifo, fifo->in); + /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); + l = min(len, fifo->size - off); + memcpy(fifo->buffer + off, from, l); /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - /* - * Ensure that we add the bytes to the kfifo -before- - * we update the fifo->in index. - */ - - smp_wmb(); - - fifo->in += len; + __kfifo_add_in(fifo, len); return len; } @@ -161,6 +173,7 @@ EXPORT_SYMBOL(kfifo_in); unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->in - fifo->out); @@ -172,22 +185,116 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); + off = __kfifo_off(fifo, fifo->out); + /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + l = min(len, fifo->size - off); + memcpy(to, fifo->buffer + off, l); /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_out); + +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. 
+ * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->size - fifo->in + fifo->out); + /* - * Ensure that we remove the bytes from the kfifo -before- - * we update the fifo->out index. + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. */ smp_mb(); - fifo->out += len; + off = __kfifo_off(fifo, fifo->in); + + /* first put the data starting from fifo->in to buffer end */ + l = min(len, fifo->size - off); + ret = copy_from_user(fifo->buffer + off, from, l); + + if (unlikely(ret)) + return l - ret; + + /* then put the rest (if any) at the beginning of the buffer */ + ret = copy_from_user(fifo->buffer, from + l, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_out); +EXPORT_SYMBOL(kfifo_from_user); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->in - fifo->out); + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. 
+ */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); + + if (unlikely(ret)) + return l - ret; + + /* then get the rest (if any) from the beginning of the buffer */ + ret = copy_to_user(to + l, fifo->buffer, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_to_user); + -- cgit v1.2.3 From 86d4880313603810901f639ccb5c88ff13d4ad3c Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:32 -0800 Subject: kfifo: add record handling functions Add kfifo_in_rec() - puts some record data into the FIFO Add kfifo_out_rec() - gets some record data from the FIFO Add kfifo_from_user_rec() - puts some data from user space into the FIFO Add kfifo_to_user_rec() - gets data from the FIFO and write it to user space Add kfifo_peek_rec() - gets the size of the next FIFO record field Add kfifo_skip_rec() - skip the next fifo out record Add kfifo_avail_rec() - determinate the number of bytes available in a record FIFO Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 330 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kfifo.c | 286 +++++++++++++++++++++++++++++-------------- 2 files changed, 523 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index d3230fb08bc7..486e8ad3bb50 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -278,4 +278,334 @@ static inline unsigned int __kfifo_off(struct kfifo *fifo, unsigned int off) return off & (fifo->size - 1); } +/** + * __kfifo_peek_n internal helper function for determinate the length of + * the next record in the fifo + */ +static inline unsigned int 
__kfifo_peek_n(struct kfifo *fifo, + unsigned int recsize) +{ +#define __KFIFO_GET(fifo, off, shift) \ + ((fifo)->buffer[__kfifo_off((fifo), (fifo)->out+(off))] << (shift)) + + unsigned int l; + + l = __KFIFO_GET(fifo, 0, 0); + + if (--recsize) + l |= __KFIFO_GET(fifo, 1, 8); + + return l; +#undef __KFIFO_GET +} + +/** + * __kfifo_poke_n internal helper function for storing the length of + * the next record into the fifo + */ +static inline void __kfifo_poke_n(struct kfifo *fifo, + unsigned int recsize, unsigned int n) +{ +#define __KFIFO_PUT(fifo, off, val, shift) \ + ( \ + (fifo)->buffer[__kfifo_off((fifo), (fifo)->in+(off))] = \ + (unsigned char)((val) >> (shift)) \ + ) + + __KFIFO_PUT(fifo, 0, n, 0); + + if (--recsize) + __KFIFO_PUT(fifo, 1, n, 8); +#undef __KFIFO_PUT +} + +/** + * __kfifo_in_... internal functions for put date into the fifo + * do not call it directly, use kfifo_in_rec() instead + */ +extern unsigned int __kfifo_in_n(struct kfifo *fifo, + const void *from, unsigned int n, unsigned int recsize); + +extern unsigned int __kfifo_in_generic(struct kfifo *fifo, + const void *from, unsigned int n, unsigned int recsize); + +static inline unsigned int __kfifo_in_rec(struct kfifo *fifo, + const void *from, unsigned int n, unsigned int recsize) +{ + unsigned int ret; + + ret = __kfifo_in_n(fifo, from, n, recsize); + + if (likely(ret == 0)) { + if (recsize) + __kfifo_poke_n(fifo, recsize, n); + __kfifo_add_in(fifo, n + recsize); + } + return ret; +} + +/** + * kfifo_in_rec - puts some record data into the FIFO + * @fifo: the fifo to be used. + * @from: the data to be added. + * @n: the length of the data to be added. + * @recsize: size of record field + * + * This function copies @n bytes from the @from into the FIFO and returns + * the number of bytes which cannot be copied. + * A returned value greater than the @n value means that the record doesn't + * fit into the buffer. 
+ * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +static inline __must_check unsigned int kfifo_in_rec(struct kfifo *fifo, + void *from, unsigned int n, unsigned int recsize) +{ + if (!__builtin_constant_p(recsize)) + return __kfifo_in_generic(fifo, from, n, recsize); + return __kfifo_in_rec(fifo, from, n, recsize); +} + +/** + * __kfifo_out_... internal functions for get date from the fifo + * do not call it directly, use kfifo_out_rec() instead + */ +extern unsigned int __kfifo_out_n(struct kfifo *fifo, + void *to, unsigned int reclen, unsigned int recsize); + +extern unsigned int __kfifo_out_generic(struct kfifo *fifo, + void *to, unsigned int n, + unsigned int recsize, unsigned int *total); + +static inline unsigned int __kfifo_out_rec(struct kfifo *fifo, + void *to, unsigned int n, unsigned int recsize, + unsigned int *total) +{ + unsigned int l; + + if (!recsize) { + l = n; + if (total) + *total = l; + } else { + l = __kfifo_peek_n(fifo, recsize); + if (total) + *total = l; + if (n < l) + return l; + } + + return __kfifo_out_n(fifo, to, l, recsize); +} + +/** + * kfifo_out_rec - gets some record data from the FIFO + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @n: the size of the destination buffer. + * @recsize: size of record field + * @total: pointer where the total number of to copied bytes should stored + * + * This function copies at most @n bytes from the FIFO to @to and returns the + * number of bytes which cannot be copied. + * A returned value greater than the @n value means that the record doesn't + * fit into the @to buffer. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +static inline __must_check unsigned int kfifo_out_rec(struct kfifo *fifo, + void *to, unsigned int n, unsigned int recsize, + unsigned int *total) + +{ + if (!__builtin_constant_p(recsize)) + return __kfifo_out_generic(fifo, to, n, recsize, total); + return __kfifo_out_rec(fifo, to, n, recsize, total); +} + +/** + * __kfifo_from_user_... internal functions for transfer from user space into + * the fifo. do not call it directly, use kfifo_from_user_rec() instead + */ +extern unsigned int __kfifo_from_user_n(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned int recsize); + +extern unsigned int __kfifo_from_user_generic(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned int recsize); + +static inline unsigned int __kfifo_from_user_rec(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned int recsize) +{ + unsigned int ret; + + ret = __kfifo_from_user_n(fifo, from, n, recsize); + + if (likely(ret == 0)) { + if (recsize) + __kfifo_poke_n(fifo, recsize, n); + __kfifo_add_in(fifo, n + recsize); + } + return ret; +} + +/** + * kfifo_from_user_rec - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @n: the length of the data to be added. + * @recsize: size of record field + * + * This function copies @n bytes from the @from into the + * FIFO and returns the number of bytes which cannot be copied. + * + * If the returned value is equal or less the @n value, the copy_from_user() + * functions has failed. Otherwise the record doesn't fit into the buffer. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +static inline __must_check unsigned int kfifo_from_user_rec(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned int recsize) +{ + if (!__builtin_constant_p(recsize)) + return __kfifo_from_user_generic(fifo, from, n, recsize); + return __kfifo_from_user_rec(fifo, from, n, recsize); +} + +/** + * __kfifo_to_user_... internal functions for transfer fifo data into user space + * do not call it directly, use kfifo_to_user_rec() instead + */ +extern unsigned int __kfifo_to_user_n(struct kfifo *fifo, + void __user *to, unsigned int n, unsigned int reclen, + unsigned int recsize); + +extern unsigned int __kfifo_to_user_generic(struct kfifo *fifo, + void __user *to, unsigned int n, unsigned int recsize, + unsigned int *total); + +static inline unsigned int __kfifo_to_user_rec(struct kfifo *fifo, + void __user *to, unsigned int n, + unsigned int recsize, unsigned int *total) +{ + unsigned int l; + + if (!recsize) { + l = n; + if (total) + *total = l; + } else { + l = __kfifo_peek_n(fifo, recsize); + if (total) + *total = l; + if (n < l) + return l; + } + + return __kfifo_to_user_n(fifo, to, n, l, recsize); +} + +/** + * kfifo_to_user_rec - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @n: the size of the destination buffer. + * @recsize: size of record field + * @total: pointer where the total number of to copied bytes should stored + * + * This function copies at most @n bytes from the FIFO to the @to. + * In case of an error, the function returns the number of bytes which cannot + * be copied. + * If the returned value is equal or less the @n value, the copy_to_user() + * functions has failed. Otherwise the record doesn't fit into the @to buffer. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +static inline __must_check unsigned int kfifo_to_user_rec(struct kfifo *fifo, + void __user *to, unsigned int n, unsigned int recsize, + unsigned int *total) +{ + if (!__builtin_constant_p(recsize)) + return __kfifo_to_user_generic(fifo, to, n, recsize, total); + return __kfifo_to_user_rec(fifo, to, n, recsize, total); +} + +/** + * __kfifo_peek_... internal functions for peek into the next fifo record + * do not call it directly, use kfifo_peek_rec() instead + */ +extern unsigned int __kfifo_peek_generic(struct kfifo *fifo, + unsigned int recsize); + +/** + * kfifo_peek_rec - gets the size of the next FIFO record data + * @fifo: the fifo to be used. + * @recsize: size of record field + * + * This function returns the size of the next FIFO record in number of bytes + */ +static inline __must_check unsigned int kfifo_peek_rec(struct kfifo *fifo, + unsigned int recsize) +{ + if (!__builtin_constant_p(recsize)) + return __kfifo_peek_generic(fifo, recsize); + if (!recsize) + return kfifo_len(fifo); + return __kfifo_peek_n(fifo, recsize); +} + +/** + * __kfifo_skip_... internal functions for skip the next fifo record + * do not call it directly, use kfifo_skip_rec() instead + */ +extern void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize); + +static inline void __kfifo_skip_rec(struct kfifo *fifo, + unsigned int recsize) +{ + unsigned int l; + + if (recsize) { + l = __kfifo_peek_n(fifo, recsize); + + if (l + recsize <= kfifo_len(fifo)) { + __kfifo_add_out(fifo, l + recsize); + return; + } + } + kfifo_reset_out(fifo); +} + +/** + * kfifo_skip_rec - skip the next fifo out record + * @fifo: the fifo to be used. 
+ * @recsize: size of record field + * + * This function skips the next FIFO record + */ +static inline void kfifo_skip_rec(struct kfifo *fifo, + unsigned int recsize) +{ + if (!__builtin_constant_p(recsize)) + __kfifo_skip_generic(fifo, recsize); + else + __kfifo_skip_rec(fifo, recsize); +} + +/** + * kfifo_avail_rec - returns the number of bytes available in a record FIFO + * @fifo: the fifo to be used. + * @recsize: size of record field + */ +static inline __must_check unsigned int kfifo_avail_rec(struct kfifo *fifo, + unsigned int recsize) +{ + unsigned int l = kfifo_size(fifo) - kfifo_len(fifo); + + return (l > recsize) ? l - recsize : 0; +} + #endif diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 2a78425ef67f..e92d519f93b1 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -115,27 +115,11 @@ void kfifo_skip(struct kfifo *fifo, unsigned int len) } EXPORT_SYMBOL(kfifo_skip); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_in(struct kfifo *fifo, - const unsigned char *from, unsigned int len) +static inline void __kfifo_in_data(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. 
@@ -143,7 +127,7 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); @@ -151,33 +135,13 @@ unsigned int kfifo_in(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - - __kfifo_add_in(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_in); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_out(struct kfifo *fifo, - unsigned char *to, unsigned int len) +static inline void __kfifo_out_data(struct kfifo *fifo, + void *to, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->in - fifo->out); - /* * Ensure that we sample the fifo->in index -before- we * start removing bytes from the kfifo. @@ -185,7 +149,7 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); - off = __kfifo_off(fifo, fifo->out); + off = __kfifo_off(fifo, fifo->out + off); /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); @@ -193,34 +157,14 @@ unsigned int kfifo_out(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); - - __kfifo_add_out(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_out); -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. 
- * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; int ret; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. @@ -228,29 +172,101 @@ unsigned int kfifo_from_user(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); if (unlikely(ret)) - return l - ret; + return ret + len - l; /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); + return copy_from_user(fifo->buffer, from + l, len - l); +} + +static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off) +{ + unsigned int l; + int ret; + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. 
+ */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out + off); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); if (unlikely(ret)) - return len - ret; + return ret + len - l; - __kfifo_add_in(fifo, len); + /* then get the rest (if any) from the beginning of the buffer */ + return copy_to_user(to + l, fifo->buffer, len - l); +} +unsigned int __kfifo_in_n(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; + + __kfifo_in_data(fifo, from, len, recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_in_n); + +/** + * kfifo_in - puts some data into the FIFO + * @fifo: the fifo to be used. + * @from: the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from buffer into + * the FIFO depending on the free space, and returns the number of + * bytes copied. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, + unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + + __kfifo_in_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_from_user); +EXPORT_SYMBOL(kfifo_in); + +unsigned int __kfifo_in_generic(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_in_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_in_generic); + +unsigned int __kfifo_out_n(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize) +{ + if (kfifo_len(fifo) < len + recsize) + return len; + + __kfifo_out_data(fifo, to, len, recsize); + __kfifo_add_out(fifo, len + recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_out_n); /** - * kfifo_to_user - gets data from the FIFO and write it to user space + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. @@ -261,40 +277,124 @@ EXPORT_SYMBOL(kfifo_from_user); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { - unsigned int off; - unsigned int l; - int ret; + len = min(kfifo_len(fifo), len); - len = min(len, fifo->in - fifo->out); + __kfifo_out_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); - /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. 
- */ + return len; +} +EXPORT_SYMBOL(kfifo_out); - smp_rmb(); +unsigned int __kfifo_out_generic(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_out_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_out_generic); - off = __kfifo_off(fifo, fifo->out); +unsigned int __kfifo_from_user_n(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); + return __kfifo_from_user_data(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_n); - if (unlikely(ret)) - return l - ret; +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + len -= __kfifo_from_user_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); + return len; +} +EXPORT_SYMBOL(kfifo_from_user); - /* then get the rest (if any) from the beginning of the buffer */ - ret = copy_to_user(to + l, fifo->buffer, len - l); +unsigned int __kfifo_from_user_generic(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_from_user_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_generic); - if (unlikely(ret)) - return len - ret; +unsigned int __kfifo_to_user_n(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int reclen, + unsigned int recsize) +{ + unsigned int ret; - __kfifo_add_out(fifo, len); + if (kfifo_len(fifo) < reclen + recsize) + return len; + ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + + if (likely(ret == 0)) + __kfifo_add_out(fifo, reclen + recsize); + + return ret; +} +EXPORT_SYMBOL(__kfifo_to_user_n); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + len = min(kfifo_len(fifo), len); + len -= __kfifo_to_user_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); return len; } EXPORT_SYMBOL(kfifo_to_user); +unsigned int __kfifo_to_user_generic(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_to_user_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_to_user_generic); + +unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +{ + if (recsize == 0) + return kfifo_avail(fifo); + + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_peek_generic); + +void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +{ + __kfifo_skip_rec(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_skip_generic); + -- cgit v1.2.3 From cc3e1bea5d87635c519da657303690f5538bb4eb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 23 Dec 2009 06:52:08 -0500 Subject: ext4, jbd2: Add barriers for file systems with external journals This is a bit complicated because we are trying to optimize when we send barriers to the fs data disk. We could just throw in an extra barrier to the data disk whenever we send a barrier to the journal disk, but that's not always strictly necessary. We only need to send a barrier during a commit when there are data blocks which must be written out due to an inode written in ordered mode, or if fsync() depends on the commit to force data blocks to disk. Finally, before we drop transactions from the beginning of the journal during a checkpoint operation, we need to guarantee that any blocks that were flushed out to the data disk are firmly on the rust platter before we drop the transaction from the journal. Thanks to Oleg Drokin for pointing out this flaw in ext3/ext4. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 16 ++++++++++++++-- fs/jbd2/checkpoint.c | 15 +++++++++++++++ fs/jbd2/commit.c | 19 +++++++++++-------- include/linux/jbd2.h | 1 + 4 files changed, 41 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0b22497d92e1..98bd140aad01 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) return ext4_force_commit(inode->i_sb); commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (jbd2_log_start_commit(journal, commit_tid)) + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); jbd2_log_wait_commit(journal, commit_tid); - else if (journal->j_flags & JBD2_BARRIER) + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index ca0f5eb62b20..886849370950 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* @@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. 
It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6a10238d2c63..1bc74b6f26d2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; jinode->i_flags &= ~JI_COMMIT_RUNNING; wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -708,8 +709,17 @@ start_journal_io: } } - /* Done it all: now write the commit record asynchronously. */ + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_flushed_data_blocks && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); + /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, @@ -720,13 +730,6 @@ start_journal_io: blkdev_issue_flush(journal->j_dev, NULL); } - /* - * This is the right place to wait for data buffers both for ASYNC - * and !ASYNC commit. If commit is ASYNC, we need to wait only after - * the commit block went to disk (which happens above). If commit is - * SYNC, we need to wait for data buffers before we start writing - * commit block, which happens below in such setting. 
- */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f1011f7f3d41..638ce4554c76 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -653,6 +653,7 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; + unsigned int t_flushed_data_blocks:1; /* * For use by the filesystem to store fs-specific data -- cgit v1.2.3 From c459001fa4f71deafb62e00fa70d35f695498965 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 9 Dec 2009 03:05:30 +0300 Subject: ext3: quota macros cleanup [V2] Currently all quota block reservation macros contains hardcoded "2" aka MAXQUOTAS value. This is no good because in some places it is not obvious to understand what does this digit represent. Let's introduce new macro with self descriptive name. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/inode.c | 8 ++++---- fs/ext3/namei.c | 8 ++++---- include/linux/ext3_jbd.h | 7 +++++-- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ad14227f509e..455e6e6e5cb9 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -3146,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? 
- but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -3239,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during vfs_dq_init so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index aad6400c9b77..81f7348b2de3 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2175,7 +2175,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 
2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h index cf82d519be40..d7b5ddca99c2 100644 --- a/include/linux/ext3_jbd.h +++ b/include/linux/ext3_jbd.h @@ -44,13 +44,13 @@ #define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ EXT3_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT3_QUOTA_TRANS_BLOCKS(sb)) + EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ -#define EXT3_DELETE_TRANS_BLOCKS(sb) (2 * EXT3_DATA_TRANS_BLOCKS(sb) + 64) +#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as @@ -86,6 +86,9 @@ #define EXT3_QUOTA_INIT_BLOCKS(sb) 0 #define EXT3_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) int ext3_mark_iloc_dirty(handle_t *handle, -- cgit v1.2.3 From b462707e7ccad058ae151e5c5b06eb5cadcb737f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:12 +0300 Subject: Add unlocked version of inode_add_bytes() function Quota code requires an unlocked version of this function. Of course we can just copy-paste the code, but copy-pasting is always evil. 
Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/stat.c | 10 ++++++++-- include/linux/fs.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/stat.c b/fs/stat.c index 075694e31d8b..c4ecd52c5737 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e3012e0ac06..9147ca88f253 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2297,6 +2297,7 @@ extern const struct inode_operations page_symlink_inode_operations; extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); -- cgit v1.2.3 From fd8fbfc1709822bd94247c5b2ab15a5f5041e103 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 14 Dec 2009 15:21:13 +0300 Subject: quota: decouple fs reserved space from quota reservation Currently inode_reservation is managed by fs itself and this reservation is transferred on dquot_transfer(). 
This means that inode_reservation must always be in sync with dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result in incorrect quota (WARN_ON in dquot_claim_reserved_space() will be triggered). This is not easy because of complex locking order issues, for example http://bugzilla.kernel.org/show_bug.cgi?id=14739 The patch introduces a quota reservation field for each fs-inode (fs specific inode is used in order to prevent bloating generic vfs inode). This reservation is managed by quota code internally similar to i_blocks/i_bytes and may not be always in sync with internal fs reservation. Also perform some code rearrangement: - Unify dquot_alloc_space() and dquot_reserve_space() - Unify dquot_release_reserved_space() and dquot_free_space() - Also this patch adds a missing warning update to release_rsv() dquot_release_reserved_space() must call flush_warnings() as dquot_free_space() does. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/quota/dquot.c | 213 ++++++++++++++++++++++++++++---------------------- include/linux/quota.h | 5 +- 2 files changed, 122 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cd6bb9a33c13..1cb8fa84300f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1318,6 +1318,67 @@ void vfs_dq_drop(struct inode *inode) } EXPORT_SYMBOL(vfs_dq_drop); +/* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. 
+ */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1336,6 +1397,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + 
inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1346,7 +1422,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1357,64 +1434,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - flush_warnings(inode->i_dquot, warntype); - return ret; -} - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; + if (reserve) + goto out_flush_warn; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_flush_warn: + flush_warnings(inode->i_dquot, warntype); out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } + +int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) +{ + return __dquot_alloc_space(inode, number, warn, 0); +} EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - 
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1471,14 +1516,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1490,7 +1535,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) @@ -1502,39 +1547,10 @@ out: } EXPORT_SYMBOL(dquot_claim_space); -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - /* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1543,7 +1559,7 @@ 
int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1558,20 +1574,42 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_unlock; /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); +/* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + /* * This operation can block, but only after everything is updated */ @@ -1609,19 +1647,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) } EXPORT_SYMBOL(dquot_free_inode); -/* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - /* * Transfer the number of inode and blocks from one diskquota to an other. 
* @@ -1665,7 +1690,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { diff --git a/include/linux/quota.h b/include/linux/quota.h index e70e62194243..a6861f117480 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -315,8 +315,9 @@ struct dquot_operations { int (*claim_space) (struct inode *, qsize_t); /* release rsved quota for delayed alloc */ void (*release_rsv) (struct inode *, qsize_t); - /* get reserved quota for delayed alloc */ - qsize_t (*get_reserved_space) (struct inode *); + /* get reserved quota for delayed alloc, value returned is managed by + * quota code only */ + qsize_t *(*get_reserved_space) (struct inode *); }; /* Operations handling requests from userspace */ -- cgit v1.2.3 From b8a052d01669977f224255b0f9f2737018171ddb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 13:00:30 -0600 Subject: ext3: Replace lock/unlock_super() with an explicit lock for the orphan list Use a separate lock to protect the orphan list, so we can stop overloading the use of lock_super(). Port of ext4 commit 3b9d4ed26680771295d904a6b83e88e620780893 by Theodore Ts'o . 
CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/namei.c | 20 +++++++++++--------- fs/ext3/super.c | 1 + include/linux/ext3_fs_sb.h | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 81f7348b2de3..7b0e44f7d66f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? 
*/ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 806b8b780add..97dd3828384c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1928,6 +1928,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index f07f34de2f0e..dd61d83026a0 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -72,6 +72,7 @@ struct ext3_sb_info { struct inode * s_journal_inode; struct journal_s * s_journal; struct list_head s_orphan; + struct mutex s_orphan_lock; unsigned long s_commit_interval; struct block_device *journal_bdev; #ifdef CONFIG_JBD_DEBUG -- cgit v1.2.3 From 96d2a495c25d525873529b736cdb63ad502b101c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 14 Dec 2009 13:01:05 -0600 Subject: ext3: 
Replace lock/unlock_super() with an explicit lock for resizing Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). Port of ext4 commit 32ed5058ce90024efcd811254b4b1de0468099df by Theodore Ts'o . CC: Theodore Ts'o Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/resize.c | 35 ++++++++++++++++++----------------- fs/ext3/super.c | 1 + include/linux/ext3_fs_sb.h | 1 + 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 5f83b6179178..54351ac7cef9 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -324,7 +324,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -662,11 +662,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. 
We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. 
@@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 97dd3828384c..afa2b569da10 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1929,6 +1929,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #endif 
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index dd61d83026a0..258088ab3c6b 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -73,6 +73,7 @@ struct ext3_sb_info { struct journal_s * s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; + struct mutex s_resize_lock; unsigned long s_commit_interval; struct block_device *journal_bdev; #ifdef CONFIG_JBD_DEBUG -- cgit v1.2.3 From 17bd55d037a02b04d9119511cfd1a4b985d20f63 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 23 Dec 2009 07:57:07 -0500 Subject: fs-writeback: Add helper function to start writeback if idle ext4, at least, would like to start pushing on writeback if it starts to get close to ENOSPC when reserving worst-case blocks for delalloc writes. Writing out delalloc data will convert those worst-case predictions into usually smaller actual usage, freeing up space before we hit ENOSPC based on this speculation. Thanks to Jens for the suggestion for the helper function, & the naming help. I've made the helper return status on whether writeback was started even though I don't plan to use it in the ext4 patch; it seems like it would be potentially useful to test this in some cases. Signed-off-by: Eric Sandeen Acked-by: Jan Kara --- fs/fs-writeback.c | 17 +++++++++++++++++ include/linux/writeback.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8e8f19..f6c2155e0026 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1186,6 +1186,23 @@ void writeback_inodes_sb(struct super_block *sb) } EXPORT_SYMBOL(writeback_inodes_sb); +/** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * + * Invoke writeback_inodes_sb if no writeback is currently underway. 
+ * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_if_idle(struct super_block *sb) +{ + if (!writeback_in_progress(sb->s_bdi)) { + writeback_inodes_sb(sb); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_if_idle); + /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c18c008f4bbf..76e8903cd204 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -70,6 +70,7 @@ struct writeback_control { struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); +int writeback_inodes_sb_if_idle(struct super_block *); void sync_inodes_sb(struct super_block *); void writeback_inodes_wbc(struct writeback_control *wbc); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); -- cgit v1.2.3 From 119eecc831a42bd090543568932e440c6831f1bb Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Wed, 23 Dec 2009 09:10:48 +0100 Subject: Fix usb_serial_probe() problem introduced by the recent kfifo changes The USB serial code was a new user of the kfifo API, and it was missed when porting things to the new kfifo API. Please make the write_fifo in place. Here is my patch to fix the regression and full ported version. Signed-off-by: Stefani Seibold Reported-and-tested-by: Rafael J. 
Wysocki Cc: Greg KH Cc: Andrew Morton Cc: Alan Stern Signed-off-by: Linus Torvalds --- drivers/usb/serial/generic.c | 12 ++++++------ drivers/usb/serial/usb-serial.c | 5 ++--- include/linux/usb/serial.h | 3 ++- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index b0f1183755c9..f1ea3a33b6e6 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -276,7 +276,7 @@ static int usb_serial_generic_write_start(struct usb_serial_port *port) if (port->write_urb_busy) start_io = false; else { - start_io = (kfifo_len(port->write_fifo) != 0); + start_io = (kfifo_len(&port->write_fifo) != 0); port->write_urb_busy = start_io; } spin_unlock_irqrestore(&port->lock, flags); @@ -285,7 +285,7 @@ static int usb_serial_generic_write_start(struct usb_serial_port *port) return 0; data = port->write_urb->transfer_buffer; - count = kfifo_out_locked(port->write_fifo, data, port->bulk_out_size, &port->lock); + count = kfifo_out_locked(&port->write_fifo, data, port->bulk_out_size, &port->lock); usb_serial_debug_data(debug, &port->dev, __func__, count, data); /* set up our urb */ @@ -345,7 +345,7 @@ int usb_serial_generic_write(struct tty_struct *tty, return usb_serial_multi_urb_write(tty, port, buf, count); - count = kfifo_in_locked(port->write_fifo, buf, count, &port->lock); + count = kfifo_in_locked(&port->write_fifo, buf, count, &port->lock); result = usb_serial_generic_write_start(port); if (result >= 0) @@ -370,7 +370,7 @@ int usb_serial_generic_write_room(struct tty_struct *tty) (serial->type->max_in_flight_urbs - port->urbs_in_flight); } else if (serial->num_bulk_out) - room = port->write_fifo->size - kfifo_len(port->write_fifo); + room = kfifo_avail(&port->write_fifo); spin_unlock_irqrestore(&port->lock, flags); dbg("%s - returns %d", __func__, room); @@ -391,7 +391,7 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty) chars = 
port->tx_bytes_flight; spin_unlock_irqrestore(&port->lock, flags); } else if (serial->num_bulk_out) - chars = kfifo_len(port->write_fifo); + chars = kfifo_len(&port->write_fifo); dbg("%s - returns %d", __func__, chars); return chars; @@ -507,7 +507,7 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb) if (status) { dbg("%s - nonzero multi-urb write bulk status " "received: %d", __func__, status); - kfifo_reset(port->write_fifo); + kfifo_reset_out(&port->write_fifo); } else usb_serial_generic_write_start(port); } diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 636a4f23445e..33c85f7084f8 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -595,8 +595,7 @@ static void port_release(struct device *dev) usb_free_urb(port->write_urb); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); - if (!IS_ERR(port->write_fifo) && port->write_fifo) - kfifo_free(port->write_fifo); + kfifo_free(&port->write_fifo); kfree(port->bulk_in_buffer); kfree(port->bulk_out_buffer); kfree(port->interrupt_in_buffer); @@ -939,7 +938,7 @@ int usb_serial_probe(struct usb_interface *interface, dev_err(&interface->dev, "No free urbs available\n"); goto probe_error; } - if (kfifo_alloc(port->write_fifo, PAGE_SIZE, GFP_KERNEL)) + if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) goto probe_error; buffer_size = le16_to_cpu(endpoint->wMaxPacketSize); port->bulk_out_size = buffer_size; diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index acf6e457c04b..1819396ed501 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -16,6 +16,7 @@ #include #include #include +#include #define SERIAL_TTY_MAJOR 188 /* Nice legal number now */ #define SERIAL_TTY_MINORS 254 /* loads of devices :) */ @@ -94,7 +95,7 @@ struct usb_serial_port { unsigned char *bulk_out_buffer; int bulk_out_size; struct urb *write_urb; - struct kfifo *write_fifo; + struct kfifo write_fifo; 
int write_urb_busy; __u8 bulk_out_endpointAddress; -- cgit v1.2.3 From 9c717de946ed7f5782e6dffacf2d05859073058c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Dec 2009 09:23:33 -0800 Subject: kfifo: fix Error/broken kernel-doc notation Fix kernel-doc errors and warnings in new header file kfifo.h. Don't use kernel-doc "/**" for internal functions whose comments are not in kernel-doc format. kernel-doc section header names (like "Note:") must be unique per function. Looks like I need to document that. Error(include/linux/kfifo.h:76): duplicate section name 'Note' Warning(include/linux/kfifo.h:88): Excess function parameter 'size' description in 'INIT_KFIFO' Error(include/linux/kfifo.h:101): duplicate section name 'Note' Warning(include/linux/kfifo.h:257): No description found for parameter 'fifo' (many of this last type, from internal functions) Signed-off-by: Randy Dunlap Cc: Stefani Seibold Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 486e8ad3bb50..3d44e9c65a8e 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -69,8 +69,8 @@ struct kfifo { * @name: name of the declared kfifo datatype * @size: size of the fifo buffer * - * Note: the macro can be used inside struct or union declaration - * Note: the macro creates two objects: + * Note1: the macro can be used inside struct or union declaration + * Note2: the macro creates two objects: * A kfifo object with the given name and a buffer for the kfifo * object named name##kfifo_buffer */ @@ -83,7 +83,6 @@ union { \ /** * INIT_KFIFO - Initialize a kfifo declared by DECLARED_KFIFO * @name: name of the declared kfifo datatype - * @size: size of the fifo buffer */ #define INIT_KFIFO(name) \ name = __kfifo_initializer(sizeof(name##kfifo_buffer) - \ @@ -94,8 +93,8 @@ union { \ * @name: name of the declared kfifo 
datatype * @size: size of the fifo buffer * - * Note: the macro can be used for global and local kfifo data type variables - * Note: the macro creates two objects: + * Note1: the macro can be used for global and local kfifo data type variables + * Note2: the macro creates two objects: * A kfifo object with the given name and a buffer for the kfifo * object named name##kfifo_buffer */ @@ -249,7 +248,7 @@ extern __must_check unsigned int kfifo_from_user(struct kfifo *fifo, extern __must_check unsigned int kfifo_to_user(struct kfifo *fifo, void __user *to, unsigned int n); -/** +/* * __kfifo_add_out internal helper function for updating the out offset */ static inline void __kfifo_add_out(struct kfifo *fifo, @@ -259,7 +258,7 @@ static inline void __kfifo_add_out(struct kfifo *fifo, fifo->out += off; } -/** +/* * __kfifo_add_in internal helper function for updating the in offset */ static inline void __kfifo_add_in(struct kfifo *fifo, @@ -269,7 +268,7 @@ static inline void __kfifo_add_in(struct kfifo *fifo, fifo->in += off; } -/** +/* * __kfifo_off internal helper function for calculating the index of a * given offeset */ @@ -278,7 +277,7 @@ static inline unsigned int __kfifo_off(struct kfifo *fifo, unsigned int off) return off & (fifo->size - 1); } -/** +/* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ @@ -299,7 +298,7 @@ static inline unsigned int __kfifo_peek_n(struct kfifo *fifo, #undef __KFIFO_GET } -/** +/* * __kfifo_poke_n internal helper function for storing the length of * the next record into the fifo */ @@ -319,7 +318,7 @@ static inline void __kfifo_poke_n(struct kfifo *fifo, #undef __KFIFO_PUT } -/** +/* * __kfifo_in_... internal functions for put date into the fifo * do not call it directly, use kfifo_in_rec() instead */ @@ -367,7 +366,7 @@ static inline __must_check unsigned int kfifo_in_rec(struct kfifo *fifo, return __kfifo_in_rec(fifo, from, n, recsize); } -/** +/* * __kfifo_out_... 
internal functions for get date from the fifo * do not call it directly, use kfifo_out_rec() instead */ @@ -425,7 +424,7 @@ static inline __must_check unsigned int kfifo_out_rec(struct kfifo *fifo, return __kfifo_out_rec(fifo, to, n, recsize, total); } -/** +/* * __kfifo_from_user_... internal functions for transfer from user space into * the fifo. do not call it directly, use kfifo_from_user_rec() instead */ @@ -474,7 +473,7 @@ static inline __must_check unsigned int kfifo_from_user_rec(struct kfifo *fifo, return __kfifo_from_user_rec(fifo, from, n, recsize); } -/** +/* * __kfifo_to_user_... internal functions for transfer fifo data into user space * do not call it directly, use kfifo_to_user_rec() instead */ @@ -533,7 +532,7 @@ static inline __must_check unsigned int kfifo_to_user_rec(struct kfifo *fifo, return __kfifo_to_user_rec(fifo, to, n, recsize, total); } -/** +/* * __kfifo_peek_... internal functions for peek into the next fifo record * do not call it directly, use kfifo_peek_rec() instead */ @@ -557,7 +556,7 @@ static inline __must_check unsigned int kfifo_peek_rec(struct kfifo *fifo, return __kfifo_peek_n(fifo, recsize); } -/** +/* * __kfifo_skip_... internal functions for skip the next fifo record * do not call it directly, use kfifo_skip_rec() instead */ -- cgit v1.2.3 From 26579ab70aa0e0ea434e6e100279d2f67c094431 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Fri, 18 Dec 2009 15:34:19 +0200 Subject: Driver core: device_attribute parameters can often be const* Most device_attributes are const, and are begging to be put in a ro section. However, the create and remove file interfaces were failing to propagate the const promise which the only functions they call offer. 
Signed-off-by: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/sysfs.txt | 8 ++++---- drivers/base/core.c | 6 ++++-- include/linux/device.h | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index b245d524d568..a4ac2b98c613 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -91,8 +91,8 @@ struct device_attribute { const char *buf, size_t count); }; -int device_create_file(struct device *, struct device_attribute *); -void device_remove_file(struct device *, struct device_attribute *); +int device_create_file(struct device *, const struct device_attribute *); +void device_remove_file(struct device *, const struct device_attribute *); It also defines this helper for defining device attributes: @@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store); Creation/Removal: -int device_create_file(struct device *device, struct device_attribute * attr); -void device_remove_file(struct device * dev, struct device_attribute * attr); +int device_create_file(struct device *dev, const struct device_attribute * attr); +void device_remove_file(struct device *dev, const struct device_attribute * attr); - bus drivers (include/linux/device.h) diff --git a/drivers/base/core.c b/drivers/base/core.c index f1290cbd1350..2fd9e611f8a6 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -446,7 +446,8 @@ struct kset *devices_kset; * @dev: device. * @attr: device attribute descriptor. */ -int device_create_file(struct device *dev, struct device_attribute *attr) +int device_create_file(struct device *dev, + const struct device_attribute *attr) { int error = 0; if (dev) @@ -459,7 +460,8 @@ int device_create_file(struct device *dev, struct device_attribute *attr) * @dev: device. * @attr: device attribute descriptor. 
*/ -void device_remove_file(struct device *dev, struct device_attribute *attr) +void device_remove_file(struct device *dev, + const struct device_attribute *attr) { if (dev) sysfs_remove_file(&dev->kobj, &attr->attr); diff --git a/include/linux/device.h b/include/linux/device.h index 2a73d9bcbc9c..aa5b3e66a147 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -319,9 +319,9 @@ struct device_attribute { struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) extern int __must_check device_create_file(struct device *device, - struct device_attribute *entry); + const struct device_attribute *entry); extern void device_remove_file(struct device *dev, - struct device_attribute *attr); + const struct device_attribute *attr); extern int __must_check device_create_bin_file(struct device *dev, struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, -- cgit v1.2.3 From 66ecb92be9eb579df93add22d19843e7869f168e Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Fri, 18 Dec 2009 15:34:20 +0200 Subject: Driver core: bin_attribute parameters can often be const* Many struct bin_attribute descriptors are purely read-only structures, and there's no need to change them. Therefore make the promise not to, which will let those descriptors be put in a ro section. Signed-off-by: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 6 ++++-- fs/sysfs/bin.c | 6 ++++-- include/linux/device.h | 6 +++--- include/linux/sysfs.h | 9 +++++---- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 2fd9e611f8a6..83afc8b8f27b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -472,7 +472,8 @@ void device_remove_file(struct device *dev, * @dev: device. * @attr: device binary attribute descriptor. 
*/ -int device_create_bin_file(struct device *dev, struct bin_attribute *attr) +int device_create_bin_file(struct device *dev, + const struct bin_attribute *attr) { int error = -EINVAL; if (dev) @@ -486,7 +487,8 @@ EXPORT_SYMBOL_GPL(device_create_bin_file); * @dev: device. * @attr: device binary attribute descriptor. */ -void device_remove_bin_file(struct device *dev, struct bin_attribute *attr) +void device_remove_bin_file(struct device *dev, + const struct bin_attribute *attr) { if (dev) sysfs_remove_bin_file(&dev->kobj, attr); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 60c702bc10ae..a0a500af24a1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @attr: attribute descriptor. */ -int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); @@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * @attr: attribute descriptor. 
*/ -void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff --git a/include/linux/device.h b/include/linux/device.h index aa5b3e66a147..10d74ce93a46 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -319,13 +319,13 @@ struct device_attribute { struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) extern int __must_check device_create_file(struct device *device, - const struct device_attribute *entry); + const struct device_attribute *entry); extern void device_remove_file(struct device *dev, const struct device_attribute *attr); extern int __must_check device_create_bin_file(struct device *dev, - struct bin_attribute *attr); + const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, - struct bin_attribute *attr); + const struct bin_attribute *attr); extern int device_schedule_callback_owner(struct device *dev, void (*func)(struct device *dev), struct module *owner); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 9d68fed50f11..cfa83083a2d4 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -99,8 +99,9 @@ int __must_check sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, - struct bin_attribute *attr); -void sysfs_remove_bin_file(struct kobject *kobj, struct bin_attribute *attr); + const struct bin_attribute *attr); +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); @@ -175,13 +176,13 @@ static inline void sysfs_remove_file(struct kobject *kobj, } static inline int sysfs_create_bin_file(struct kobject *kobj, - 
struct bin_attribute *attr) + const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, - struct bin_attribute *attr) + const struct bin_attribute *attr) { } -- cgit v1.2.3 From 099c2f21d8cf0724b85abb2c589d6276953781b7 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Fri, 18 Dec 2009 15:34:21 +0200 Subject: Driver core: driver_attribute parameters can often be const* Many struct driver_attribute descriptors are purely read-only structures, and there's no need to change them. Therefore make the promise not to, which will let those descriptors be put in a ro section. Signed-off-by: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/driver.txt | 4 ++-- Documentation/filesystems/sysfs.txt | 4 ++-- drivers/base/driver.c | 4 ++-- include/linux/device.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt index 60120fb3b961..d2cd6fb8ba9e 100644 --- a/Documentation/driver-model/driver.txt +++ b/Documentation/driver-model/driver.txt @@ -226,5 +226,5 @@ struct driver_attribute driver_attr_debug; This can then be used to add and remove the attribute from the driver's directory using: -int driver_create_file(struct device_driver *, struct driver_attribute *); -void driver_remove_file(struct device_driver *, struct driver_attribute *); +int driver_create_file(struct device_driver *, const struct driver_attribute *); +void driver_remove_file(struct device_driver *, const struct driver_attribute *); diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index a4ac2b98c613..931c806642c5 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store) Creation/Removal: -int driver_create_file(struct device_driver *, struct driver_attribute *); -void 
driver_remove_file(struct device_driver *, struct driver_attribute *); +int driver_create_file(struct device_driver *, const struct driver_attribute *); +void driver_remove_file(struct device_driver *, const struct driver_attribute *); diff --git a/drivers/base/driver.c b/drivers/base/driver.c index f367885a7646..90c9fff09ead 100644 --- a/drivers/base/driver.c +++ b/drivers/base/driver.c @@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(driver_find_device); * @attr: driver attribute descriptor. */ int driver_create_file(struct device_driver *drv, - struct driver_attribute *attr) + const struct driver_attribute *attr) { int error; if (drv) @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(driver_create_file); * @attr: driver attribute descriptor. */ void driver_remove_file(struct device_driver *drv, - struct driver_attribute *attr) + const struct driver_attribute *attr) { if (drv) sysfs_remove_file(&drv->p->kobj, &attr->attr); diff --git a/include/linux/device.h b/include/linux/device.h index 10d74ce93a46..a62799f2ab00 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -166,9 +166,9 @@ struct driver_attribute driver_attr_##_name = \ __ATTR(_name, _mode, _show, _store) extern int __must_check driver_create_file(struct device_driver *driver, - struct driver_attribute *attr); + const struct driver_attribute *attr); extern void driver_remove_file(struct device_driver *driver, - struct driver_attribute *attr); + const struct driver_attribute *attr); extern int __must_check driver_add_kobj(struct device_driver *drv, struct kobject *kobj, -- cgit v1.2.3 From 29d249ed80c80b479df78e456e6809223c648505 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 18 Dec 2009 09:59:48 -0800 Subject: Staging: dst: remove from the tree DST is dead, no one is using it and upstream has abandoned it, so remove it from the tree because it is not going anywhere. 
Acked-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/staging/Kconfig | 2 - drivers/staging/Makefile | 1 - drivers/staging/dst/Kconfig | 67 --- drivers/staging/dst/Makefile | 3 - drivers/staging/dst/crypto.c | 733 ----------------------------- drivers/staging/dst/dcore.c | 968 -------------------------------------- drivers/staging/dst/export.c | 660 -------------------------- drivers/staging/dst/state.c | 844 --------------------------------- drivers/staging/dst/thread_pool.c | 348 -------------- drivers/staging/dst/trans.c | 337 ------------- include/linux/dst.h | 587 ----------------------- 11 files changed, 4550 deletions(-) delete mode 100644 drivers/staging/dst/Kconfig delete mode 100644 drivers/staging/dst/Makefile delete mode 100644 drivers/staging/dst/crypto.c delete mode 100644 drivers/staging/dst/dcore.c delete mode 100644 drivers/staging/dst/export.c delete mode 100644 drivers/staging/dst/state.c delete mode 100644 drivers/staging/dst/thread_pool.c delete mode 100644 drivers/staging/dst/trans.c delete mode 100644 include/linux/dst.h (limited to 'include/linux') diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index db0de940949e..94eb86319ff3 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -87,8 +87,6 @@ source "drivers/staging/frontier/Kconfig" source "drivers/staging/dream/Kconfig" -source "drivers/staging/dst/Kconfig" - source "drivers/staging/pohmelfs/Kconfig" source "drivers/staging/b3dfg/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 73c6a71155e0..b5e67b889f60 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_RTL8192E) += rtl8192e/ obj-$(CONFIG_INPUT_MIMIO) += mimio/ obj-$(CONFIG_TRANZPORT) += frontier/ obj-$(CONFIG_DREAM) += dream/ -obj-$(CONFIG_DST) += dst/ obj-$(CONFIG_POHMELFS) += pohmelfs/ obj-$(CONFIG_B3DFG) += b3dfg/ obj-$(CONFIG_IDE_PHISON) += phison/ diff --git a/drivers/staging/dst/Kconfig 
b/drivers/staging/dst/Kconfig deleted file mode 100644 index 448d342ac2a2..000000000000 --- a/drivers/staging/dst/Kconfig +++ /dev/null @@ -1,67 +0,0 @@ -config DST - tristate "Distributed storage" - depends on NET && CRYPTO && SYSFS && BLK_DEV - select CONNECTOR - ---help--- - DST is a network block device storage, which can be used to organize - exported storage on the remote nodes into the local block device. - - DST works on top of any network media and protocol; it is just a matter - of configuration utility to understand the correct addresses. The most - common example is TCP over IP, which allows to pass through firewalls and - create remote backup storage in a different datacenter. DST requires - single port to be enabled on the exporting node and outgoing connections - on the local node. - - DST works with in-kernel client and server, which improves performance by - eliminating unneded data copies and by not depending on the version - of the external IO components. It requires userspace configuration utility - though. - - DST uses transaction model, when each store has to be explicitly acked - from the remote node to be considered as successfully written. There - may be lots of in-flight transactions. When remote host does not ack - the transaction it will be resent predefined number of times with specified - timeouts between them. All those parameters are configurable. Transactions - are marked as failed after all resends complete unsuccessfully; having - long enough resend timeout and/or large number of resends allows not to - return error to the higher (FS usually) layer in case of short network - problems or remote node outages. In case of network RAID setup this means - that storage will not degrade until transactions are marked as failed, and - thus will not force checksum recalculation and data rebuild. In case of - connection failure DST will try to reconnect to the remote node automatically. 
- DST sends ping commands at idle time to detect if remote node is alive. - - Because of transactional model it is possible to use zero-copy sending - without worry of data corruption (which in turn could be detected by the - strong checksums though). - - DST may fully encrypt the data channel in case of untrusted channel and implement - strong checksum of the transferred data. It is possible to configure algorithms - and crypto keys; they should match on both sides of the network channel. - Crypto processing does not introduce noticeble performance overhead, since DST - uses configurable pool of threads to perform crypto processing. - - DST utilizes memory pool model of all its transaction allocations (it is the - only additional allocation on the client) and server allocations (bio pools, - while pages are allocated from the slab). - - At startup DST performs a simple negotiation with the export node to determine - access permissions and size of the exported storage. It can be extended if - new parameters should be autonegotiated. - - DST carries block IO flags in the protocol, which allows to transparently implement - barriers and sync/flush operations. Those flags are used in the export node where - IO against the local storage is performed, which means that sync write will be sync - on the remote node too, which in turn improves data integrity and improved resistance - to errors and data corruption during power outages or storage damages. - - Homepage: http://www.ioremap.net/projects/dst - Userspace configuration utility and the latest releases: http://www.ioremap.net/archive/dst/ - -config DST_DEBUG - bool "DST debug" - depends on DST - ---help--- - This option will enable HEAVY debugging of the DST. - Turn it on ONLY if you have to debug some really obscure problem. 
diff --git a/drivers/staging/dst/Makefile b/drivers/staging/dst/Makefile deleted file mode 100644 index 3a8b0cf9643e..000000000000 --- a/drivers/staging/dst/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_DST) += nst.o - -nst-y := dcore.o state.o export.o thread_pool.o crypto.o trans.o diff --git a/drivers/staging/dst/crypto.c b/drivers/staging/dst/crypto.c deleted file mode 100644 index 351295c97a4b..000000000000 --- a/drivers/staging/dst/crypto.c +++ /dev/null @@ -1,733 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Tricky bastard, but IV can be more complex with time... - */ -static inline u64 dst_gen_iv(struct dst_trans *t) -{ - return t->gen; -} - -/* - * Crypto machinery: hash/cipher support for the given crypto controls. 
- */ -static struct crypto_hash *dst_init_hash(struct dst_crypto_ctl *ctl, u8 *key) -{ - int err; - struct crypto_hash *hash; - - hash = crypto_alloc_hash(ctl->hash_algo, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hash)) { - err = PTR_ERR(hash); - dprintk("%s: failed to allocate hash '%s', err: %d.\n", - __func__, ctl->hash_algo, err); - goto err_out_exit; - } - - ctl->crypto_attached_size = crypto_hash_digestsize(hash); - - if (!ctl->hash_keysize) - return hash; - - err = crypto_hash_setkey(hash, key, ctl->hash_keysize); - if (err) { - dprintk("%s: failed to set key for hash '%s', err: %d.\n", - __func__, ctl->hash_algo, err); - goto err_out_free; - } - - return hash; - -err_out_free: - crypto_free_hash(hash); -err_out_exit: - return ERR_PTR(err); -} - -static struct crypto_ablkcipher *dst_init_cipher(struct dst_crypto_ctl *ctl, - u8 *key) -{ - int err = -EINVAL; - struct crypto_ablkcipher *cipher; - - if (!ctl->cipher_keysize) - goto err_out_exit; - - cipher = crypto_alloc_ablkcipher(ctl->cipher_algo, 0, 0); - if (IS_ERR(cipher)) { - err = PTR_ERR(cipher); - dprintk("%s: failed to allocate cipher '%s', err: %d.\n", - __func__, ctl->cipher_algo, err); - goto err_out_exit; - } - - crypto_ablkcipher_clear_flags(cipher, ~0); - - err = crypto_ablkcipher_setkey(cipher, key, ctl->cipher_keysize); - if (err) { - dprintk("%s: failed to set key for cipher '%s', err: %d.\n", - __func__, ctl->cipher_algo, err); - goto err_out_free; - } - - return cipher; - -err_out_free: - crypto_free_ablkcipher(cipher); -err_out_exit: - return ERR_PTR(err); -} - -/* - * Crypto engine has a pool of pages to encrypt data into before sending - * it over the network. This pool is freed/allocated here. 
- */ -static void dst_crypto_pages_free(struct dst_crypto_engine *e) -{ - unsigned int i; - - for (i = 0; i < e->page_num; ++i) - __free_page(e->pages[i]); - kfree(e->pages); -} - -static int dst_crypto_pages_alloc(struct dst_crypto_engine *e, int num) -{ - int i; - - e->pages = kmalloc(num * sizeof(struct page **), GFP_KERNEL); - if (!e->pages) - return -ENOMEM; - - for (i = 0; i < num; ++i) { - e->pages[i] = alloc_page(GFP_KERNEL); - if (!e->pages[i]) - goto err_out_free_pages; - } - - e->page_num = num; - return 0; - -err_out_free_pages: - while (--i >= 0) - __free_page(e->pages[i]); - - kfree(e->pages); - return -ENOMEM; -} - -/* - * Initialize crypto engine for given node. - * Setup cipher/hash, keys, pool of threads and private data. - */ -static int dst_crypto_engine_init(struct dst_crypto_engine *e, - struct dst_node *n) -{ - int err; - struct dst_crypto_ctl *ctl = &n->crypto; - - err = dst_crypto_pages_alloc(e, n->max_pages); - if (err) - goto err_out_exit; - - e->size = PAGE_SIZE; - e->data = kmalloc(e->size, GFP_KERNEL); - if (!e->data) { - err = -ENOMEM; - goto err_out_free_pages; - } - - if (ctl->hash_algo[0]) { - e->hash = dst_init_hash(ctl, n->hash_key); - if (IS_ERR(e->hash)) { - err = PTR_ERR(e->hash); - e->hash = NULL; - goto err_out_free; - } - } - - if (ctl->cipher_algo[0]) { - e->cipher = dst_init_cipher(ctl, n->cipher_key); - if (IS_ERR(e->cipher)) { - err = PTR_ERR(e->cipher); - e->cipher = NULL; - goto err_out_free_hash; - } - } - - return 0; - -err_out_free_hash: - crypto_free_hash(e->hash); -err_out_free: - kfree(e->data); -err_out_free_pages: - dst_crypto_pages_free(e); -err_out_exit: - return err; -} - -static void dst_crypto_engine_exit(struct dst_crypto_engine *e) -{ - if (e->hash) - crypto_free_hash(e->hash); - if (e->cipher) - crypto_free_ablkcipher(e->cipher); - dst_crypto_pages_free(e); - kfree(e->data); -} - -/* - * Waiting for cipher processing to be completed. 
- */ -struct dst_crypto_completion { - struct completion complete; - int error; -}; - -static void dst_crypto_complete(struct crypto_async_request *req, int err) -{ - struct dst_crypto_completion *c = req->data; - - if (err == -EINPROGRESS) - return; - - dprintk("%s: req: %p, err: %d.\n", __func__, req, err); - c->error = err; - complete(&c->complete); -} - -static int dst_crypto_process(struct ablkcipher_request *req, - struct scatterlist *sg_dst, struct scatterlist *sg_src, - void *iv, int enc, unsigned long timeout) -{ - struct dst_crypto_completion c; - int err; - - init_completion(&c.complete); - c.error = -EINPROGRESS; - - ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - dst_crypto_complete, &c); - - ablkcipher_request_set_crypt(req, sg_src, sg_dst, sg_src->length, iv); - - if (enc) - err = crypto_ablkcipher_encrypt(req); - else - err = crypto_ablkcipher_decrypt(req); - - switch (err) { - case -EINPROGRESS: - case -EBUSY: - err = wait_for_completion_interruptible_timeout(&c.complete, - timeout); - if (!err) - err = -ETIMEDOUT; - else - err = c.error; - break; - default: - break; - } - - return err; -} - -/* - * DST uses generic iteration approach for data crypto processing. - * Single block IO request is switched into array of scatterlists, - * which are submitted to the crypto processing iterator. - * - * Input and output iterator initialization are different, since - * in output case we can not encrypt data in-place and need a - * temporary storage, which is then being sent to the remote peer. 
- */ -static int dst_trans_iter_out(struct bio *bio, struct dst_crypto_engine *e, - int (*iterator) (struct dst_crypto_engine *e, - struct scatterlist *dst, - struct scatterlist *src)) -{ - struct bio_vec *bv; - int err, i; - - sg_init_table(e->src, bio->bi_vcnt); - sg_init_table(e->dst, bio->bi_vcnt); - - bio_for_each_segment(bv, bio, i) { - sg_set_page(&e->src[i], bv->bv_page, bv->bv_len, bv->bv_offset); - sg_set_page(&e->dst[i], e->pages[i], bv->bv_len, bv->bv_offset); - - err = iterator(e, &e->dst[i], &e->src[i]); - if (err) - return err; - } - - return 0; -} - -static int dst_trans_iter_in(struct bio *bio, struct dst_crypto_engine *e, - int (*iterator) (struct dst_crypto_engine *e, - struct scatterlist *dst, - struct scatterlist *src)) -{ - struct bio_vec *bv; - int err, i; - - sg_init_table(e->src, bio->bi_vcnt); - sg_init_table(e->dst, bio->bi_vcnt); - - bio_for_each_segment(bv, bio, i) { - sg_set_page(&e->src[i], bv->bv_page, bv->bv_len, bv->bv_offset); - sg_set_page(&e->dst[i], bv->bv_page, bv->bv_len, bv->bv_offset); - - err = iterator(e, &e->dst[i], &e->src[i]); - if (err) - return err; - } - - return 0; -} - -static int dst_crypt_iterator(struct dst_crypto_engine *e, - struct scatterlist *sg_dst, struct scatterlist *sg_src) -{ - struct ablkcipher_request *req = e->data; - u8 iv[32]; - - memset(iv, 0, sizeof(iv)); - - memcpy(iv, &e->iv, sizeof(e->iv)); - - return dst_crypto_process(req, sg_dst, sg_src, iv, e->enc, e->timeout); -} - -static int dst_crypt(struct dst_crypto_engine *e, struct bio *bio) -{ - struct ablkcipher_request *req = e->data; - - memset(req, 0, sizeof(struct ablkcipher_request)); - ablkcipher_request_set_tfm(req, e->cipher); - - if (e->enc) - return dst_trans_iter_out(bio, e, dst_crypt_iterator); - else - return dst_trans_iter_in(bio, e, dst_crypt_iterator); -} - -static int dst_hash_iterator(struct dst_crypto_engine *e, - struct scatterlist *sg_dst, struct scatterlist *sg_src) -{ - return crypto_hash_update(e->data, sg_src, 
sg_src->length); -} - -static int dst_hash(struct dst_crypto_engine *e, struct bio *bio, void *dst) -{ - struct hash_desc *desc = e->data; - int err; - - desc->tfm = e->hash; - desc->flags = 0; - - err = crypto_hash_init(desc); - if (err) - return err; - - err = dst_trans_iter_in(bio, e, dst_hash_iterator); - if (err) - return err; - - err = crypto_hash_final(desc, dst); - if (err) - return err; - - return 0; -} - -/* - * Initialize/cleanup a crypto thread. The only thing it should - * do is to allocate a pool of pages as temporary storage. - * And to setup cipher and/or hash. - */ -static void *dst_crypto_thread_init(void *data) -{ - struct dst_node *n = data; - struct dst_crypto_engine *e; - int err = -ENOMEM; - - e = kzalloc(sizeof(struct dst_crypto_engine), GFP_KERNEL); - if (!e) - goto err_out_exit; - e->src = kcalloc(2 * n->max_pages, sizeof(struct scatterlist), - GFP_KERNEL); - if (!e->src) - goto err_out_free; - - e->dst = e->src + n->max_pages; - - err = dst_crypto_engine_init(e, n); - if (err) - goto err_out_free_all; - - return e; - -err_out_free_all: - kfree(e->src); -err_out_free: - kfree(e); -err_out_exit: - return ERR_PTR(err); -} - -static void dst_crypto_thread_cleanup(void *private) -{ - struct dst_crypto_engine *e = private; - - dst_crypto_engine_exit(e); - kfree(e->src); - kfree(e); -} - -/* - * Initialize crypto engine for given node: store keys, create pool - * of threads, initialize each one. - * - * Each thread has unique ID, but 0 and 1 are reserved for receiving and - * accepting threads (if export node), so IDs could start from 2, but starting - * them from 10 allows easily understand what this thread is for. 
- */ -int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl) -{ - void *key = (ctl + 1); - int err = -ENOMEM, i; - char name[32]; - - if (ctl->hash_keysize) { - n->hash_key = kmalloc(ctl->hash_keysize, GFP_KERNEL); - if (!n->hash_key) - goto err_out_exit; - memcpy(n->hash_key, key, ctl->hash_keysize); - } - - if (ctl->cipher_keysize) { - n->cipher_key = kmalloc(ctl->cipher_keysize, GFP_KERNEL); - if (!n->cipher_key) - goto err_out_free_hash; - memcpy(n->cipher_key, key, ctl->cipher_keysize); - } - memcpy(&n->crypto, ctl, sizeof(struct dst_crypto_ctl)); - - for (i = 0; i < ctl->thread_num; ++i) { - snprintf(name, sizeof(name), "%s-crypto-%d", n->name, i); - /* Unique ids... */ - err = thread_pool_add_worker(n->pool, name, i + 10, - dst_crypto_thread_init, dst_crypto_thread_cleanup, n); - if (err) - goto err_out_free_threads; - } - - return 0; - -err_out_free_threads: - while (--i >= 0) - thread_pool_del_worker_id(n->pool, i+10); - - if (ctl->cipher_keysize) - kfree(n->cipher_key); - ctl->cipher_keysize = 0; -err_out_free_hash: - if (ctl->hash_keysize) - kfree(n->hash_key); - ctl->hash_keysize = 0; -err_out_exit: - return err; -} - -void dst_node_crypto_exit(struct dst_node *n) -{ - struct dst_crypto_ctl *ctl = &n->crypto; - - if (ctl->cipher_algo[0] || ctl->hash_algo[0]) { - kfree(n->hash_key); - kfree(n->cipher_key); - } -} - -/* - * Thrad pool setup callback. Just stores a transaction in private data. 
- */ -static int dst_trans_crypto_setup(void *crypto_engine, void *trans) -{ - struct dst_crypto_engine *e = crypto_engine; - - e->private = trans; - return 0; -} - -#if 0 -static void dst_dump_bio(struct bio *bio) -{ - u8 *p; - struct bio_vec *bv; - int i; - - bio_for_each_segment(bv, bio, i) { - dprintk("%s: %llu/%u: size: %u, offset: %u, data: ", - __func__, bio->bi_sector, bio->bi_size, - bv->bv_len, bv->bv_offset); - - p = kmap(bv->bv_page) + bv->bv_offset; - for (i = 0; i < bv->bv_len; ++i) - printk(KERN_DEBUG "%02x ", p[i]); - kunmap(bv->bv_page); - printk("\n"); - } -} -#endif - -/* - * Encrypt/hash data and send it to the network. - */ -static int dst_crypto_process_sending(struct dst_crypto_engine *e, - struct bio *bio, u8 *hash) -{ - int err; - - if (e->cipher) { - err = dst_crypt(e, bio); - if (err) - goto err_out_exit; - } - - if (e->hash) { - err = dst_hash(e, bio, hash); - if (err) - goto err_out_exit; - -#ifdef CONFIG_DST_DEBUG - { - unsigned int i; - - /* dst_dump_bio(bio); */ - - printk(KERN_DEBUG "%s: bio: %llu/%u, rw: %lu, hash: ", - __func__, (u64)bio->bi_sector, - bio->bi_size, bio_data_dir(bio)); - for (i = 0; i < crypto_hash_digestsize(e->hash); ++i) - printk("%02x ", hash[i]); - printk("\n"); - } -#endif - } - - return 0; - -err_out_exit: - return err; -} - -/* - * Check if received data is valid. Decipher if it is. 
- */ -static int dst_crypto_process_receiving(struct dst_crypto_engine *e, - struct bio *bio, u8 *hash, u8 *recv_hash) -{ - int err; - - if (e->hash) { - int mismatch; - - err = dst_hash(e, bio, hash); - if (err) - goto err_out_exit; - - mismatch = !!memcmp(recv_hash, hash, - crypto_hash_digestsize(e->hash)); -#ifdef CONFIG_DST_DEBUG - /* dst_dump_bio(bio); */ - - printk(KERN_DEBUG "%s: bio: %llu/%u, rw: %lu, hash mismatch: %d", - __func__, (u64)bio->bi_sector, bio->bi_size, - bio_data_dir(bio), mismatch); - if (mismatch) { - unsigned int i; - - printk(", recv/calc: "); - for (i = 0; i < crypto_hash_digestsize(e->hash); ++i) - printk("%02x/%02x ", recv_hash[i], hash[i]); - - } - printk("\n"); -#endif - err = -1; - if (mismatch) - goto err_out_exit; - } - - if (e->cipher) { - err = dst_crypt(e, bio); - if (err) - goto err_out_exit; - } - - return 0; - -err_out_exit: - return err; -} - -/* - * Thread pool callback to encrypt data and send it to the netowork. - */ -static int dst_trans_crypto_action(void *crypto_engine, void *schedule_data) -{ - struct dst_crypto_engine *e = crypto_engine; - struct dst_trans *t = schedule_data; - struct bio *bio = t->bio; - int err; - - dprintk("%s: t: %p, gen: %llu, cipher: %p, hash: %p.\n", - __func__, t, t->gen, e->cipher, e->hash); - - e->enc = t->enc; - e->iv = dst_gen_iv(t); - - if (bio_data_dir(bio) == WRITE) { - err = dst_crypto_process_sending(e, bio, t->cmd.hash); - if (err) - goto err_out_exit; - - if (e->hash) { - t->cmd.csize = crypto_hash_digestsize(e->hash); - t->cmd.size += t->cmd.csize; - } - - return dst_trans_send(t); - } else { - u8 *hash = e->data + e->size/2; - - err = dst_crypto_process_receiving(e, bio, hash, t->cmd.hash); - if (err) - goto err_out_exit; - - dst_trans_remove(t); - dst_trans_put(t); - } - - return 0; - -err_out_exit: - t->error = err; - dst_trans_put(t); - return err; -} - -/* - * Schedule crypto processing for given transaction. 
- */ -int dst_trans_crypto(struct dst_trans *t) -{ - struct dst_node *n = t->n; - int err; - - err = thread_pool_schedule(n->pool, - dst_trans_crypto_setup, dst_trans_crypto_action, - t, MAX_SCHEDULE_TIMEOUT); - if (err) - goto err_out_exit; - - return 0; - -err_out_exit: - dst_trans_put(t); - return err; -} - -/* - * Crypto machinery for the export node. - */ -static int dst_export_crypto_setup(void *crypto_engine, void *bio) -{ - struct dst_crypto_engine *e = crypto_engine; - - e->private = bio; - return 0; -} - -static int dst_export_crypto_action(void *crypto_engine, void *schedule_data) -{ - struct dst_crypto_engine *e = crypto_engine; - struct bio *bio = schedule_data; - struct dst_export_priv *p = bio->bi_private; - int err; - - dprintk("%s: e: %p, data: %p, bio: %llu/%u, dir: %lu.\n", - __func__, e, e->data, (u64)bio->bi_sector, - bio->bi_size, bio_data_dir(bio)); - - e->enc = (bio_data_dir(bio) == READ); - e->iv = p->cmd.id; - - if (bio_data_dir(bio) == WRITE) { - u8 *hash = e->data + e->size/2; - - err = dst_crypto_process_receiving(e, bio, hash, p->cmd.hash); - if (err) - goto err_out_exit; - - generic_make_request(bio); - } else { - err = dst_crypto_process_sending(e, bio, p->cmd.hash); - if (err) - goto err_out_exit; - - if (e->hash) { - p->cmd.csize = crypto_hash_digestsize(e->hash); - p->cmd.size += p->cmd.csize; - } - - err = dst_export_send_bio(bio); - } - return 0; - -err_out_exit: - bio_put(bio); - return err; -} - -int dst_export_crypto(struct dst_node *n, struct bio *bio) -{ - int err; - - err = thread_pool_schedule(n->pool, - dst_export_crypto_setup, dst_export_crypto_action, - bio, MAX_SCHEDULE_TIMEOUT); - if (err) - goto err_out_exit; - - return 0; - -err_out_exit: - bio_put(bio); - return err; -} diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c deleted file mode 100644 index c83ca7e3d048..000000000000 --- a/drivers/staging/dst/dcore.c +++ /dev/null @@ -1,968 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All 
rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -static int dst_major; - -static DEFINE_MUTEX(dst_hash_lock); -static struct list_head *dst_hashtable; -static unsigned int dst_hashtable_size = 128; -module_param(dst_hashtable_size, uint, 0644); - -static char dst_name[] = "Dementianting goldfish"; - -static DEFINE_IDR(dst_index_idr); -static struct cb_id cn_dst_id = { CN_DST_IDX, CN_DST_VAL }; - -/* - * DST sysfs tree for device called 'storage': - * - * /sys/bus/dst/devices/storage/ - * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025 - * /sys/bus/dst/devices/storage/size : 800 - * /sys/bus/dst/devices/storage/name : storage - */ - -static int dst_dev_match(struct device *dev, struct device_driver *drv) -{ - return 1; -} - -static struct bus_type dst_dev_bus_type = { - .name = "dst", - .match = &dst_dev_match, -}; - -static void dst_node_release(struct device *dev) -{ - struct dst_info *info = container_of(dev, struct dst_info, device); - - kfree(info); -} - -static struct device dst_node_dev = { - .bus = &dst_dev_bus_type, - .release = &dst_node_release -}; - -/* - * Setting size of the node after it was changed. 
- */ -static void dst_node_set_size(struct dst_node *n) -{ - struct block_device *bdev; - - set_capacity(n->disk, n->size >> 9); - - bdev = bdget_disk(n->disk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, n->size); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } -} - -/* - * Distributed storage request processing function. - */ -static int dst_request(struct request_queue *q, struct bio *bio) -{ - struct dst_node *n = q->queuedata; - int err = -EIO; - - if (bio_empty_barrier(bio) && !blk_queue_discard(q)) { - /* - * This is a dirty^Wnice hack, but if we complete this - * operation with -EOPNOTSUPP like intended, XFS - * will stuck and freeze the machine. This may be - * not particulary XFS problem though, but it is the - * only FS which sends empty barrier at umount time - * I worked with. - * - * Empty barriers are not allowed anyway, see 51fd77bd9f512 - * for example, although later it was changed to - * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not - * work in this case. - */ - /* err = -EOPNOTSUPP; */ - err = 0; - goto end_io; - } - - bio_get(bio); - - return dst_process_bio(n, bio); - -end_io: - bio_endio(bio, err); - return err; -} - -/* - * Open/close callbacks for appropriate block device. - */ -static int dst_bdev_open(struct block_device *bdev, fmode_t mode) -{ - struct dst_node *n = bdev->bd_disk->private_data; - - dst_node_get(n); - return 0; -} - -static int dst_bdev_release(struct gendisk *disk, fmode_t mode) -{ - struct dst_node *n = disk->private_data; - - dst_node_put(n); - return 0; -} - -static struct block_device_operations dst_blk_ops = { - .open = dst_bdev_open, - .release = dst_bdev_release, - .owner = THIS_MODULE, -}; - -/* - * Block layer binding - disk is created when array is fully configured - * by userspace request. 
- */ -static int dst_node_create_disk(struct dst_node *n) -{ - int err = -ENOMEM; - u32 index = 0; - - n->queue = blk_init_queue(NULL, NULL); - if (!n->queue) - goto err_out_exit; - - n->queue->queuedata = n; - blk_queue_make_request(n->queue, dst_request); - blk_queue_max_phys_segments(n->queue, n->max_pages); - blk_queue_max_hw_segments(n->queue, n->max_pages); - - err = -ENOMEM; - n->disk = alloc_disk(1); - if (!n->disk) - goto err_out_free_queue; - - if (!(n->state->permissions & DST_PERM_WRITE)) { - printk(KERN_INFO "DST node %s attached read-only.\n", n->name); - set_disk_ro(n->disk, 1); - } - - if (!idr_pre_get(&dst_index_idr, GFP_KERNEL)) - goto err_out_put; - - mutex_lock(&dst_hash_lock); - err = idr_get_new(&dst_index_idr, NULL, &index); - mutex_unlock(&dst_hash_lock); - if (err) - goto err_out_put; - - n->disk->major = dst_major; - n->disk->first_minor = index; - n->disk->fops = &dst_blk_ops; - n->disk->queue = n->queue; - n->disk->private_data = n; - snprintf(n->disk->disk_name, sizeof(n->disk->disk_name), - "dst-%s", n->name); - - return 0; - -err_out_put: - put_disk(n->disk); -err_out_free_queue: - blk_cleanup_queue(n->queue); -err_out_exit: - return err; -} - -/* - * Sysfs machinery: show device's size. - */ -static ssize_t dst_show_size(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dst_info *info = container_of(dev, struct dst_info, device); - - return sprintf(buf, "%llu\n", info->size); -} - -/* - * Show local exported device. - */ -static ssize_t dst_show_local(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dst_info *info = container_of(dev, struct dst_info, device); - - return sprintf(buf, "%s\n", info->local); -} - -/* - * Shows type of the remote node - device major/minor number - * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes. 
- */ -static ssize_t dst_show_type(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dst_info *info = container_of(dev, struct dst_info, device); - int family = info->net.addr.sa_family; - - if (family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)&info->net.addr; - return sprintf(buf, "%u.%u.%u.%u:%d\n", - NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); - } else if (family == AF_INET6) { - struct sockaddr_in6 *sin = (struct sockaddr_in6 *) - &info->net.addr; - return sprintf(buf, - "%pi6:%d\n", - &sin->sin6_addr, ntohs(sin->sin6_port)); - } else { - int i, sz = PAGE_SIZE - 2; /* 0 symbol and '\n' below */ - int size, addrlen = info->net.addr.sa_data_len; - unsigned char *a = (unsigned char *)&info->net.addr.sa_data; - char *buf_orig = buf; - - size = snprintf(buf, sz, "family: %d, addrlen: %u, addr: ", - family, addrlen); - sz -= size; - buf += size; - - for (i = 0; i < addrlen; ++i) { - if (sz < 3) - break; - - size = snprintf(buf, sz, "%02x ", a[i]); - sz -= size; - buf += size; - } - buf += sprintf(buf, "\n"); - - return buf - buf_orig; - } - return 0; -} - -static struct device_attribute dst_node_attrs[] = { - __ATTR(size, 0444, dst_show_size, NULL), - __ATTR(type, 0444, dst_show_type, NULL), - __ATTR(local, 0444, dst_show_local, NULL), -}; - -static int dst_create_node_attributes(struct dst_node *n) -{ - int err, i; - - for (i = 0; i < ARRAY_SIZE(dst_node_attrs); ++i) { - err = device_create_file(&n->info->device, - &dst_node_attrs[i]); - if (err) - goto err_out_remove_all; - } - return 0; - -err_out_remove_all: - while (--i >= 0) - device_remove_file(&n->info->device, - &dst_node_attrs[i]); - - return err; -} - -static void dst_remove_node_attributes(struct dst_node *n) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(dst_node_attrs); ++i) - device_remove_file(&n->info->device, - &dst_node_attrs[i]); -} - -/* - * Sysfs cleanup and initialization. - * Shows number of useful parameters. 
- */ -static void dst_node_sysfs_exit(struct dst_node *n) -{ - if (n->info) { - dst_remove_node_attributes(n); - device_unregister(&n->info->device); - n->info = NULL; - } -} - -static int dst_node_sysfs_init(struct dst_node *n) -{ - int err; - - n->info = kzalloc(sizeof(struct dst_info), GFP_KERNEL); - if (!n->info) - return -ENOMEM; - - memcpy(&n->info->device, &dst_node_dev, sizeof(struct device)); - n->info->size = n->size; - - dev_set_name(&n->info->device, "dst-%s", n->name); - err = device_register(&n->info->device); - if (err) { - dprintk(KERN_ERR "Failed to register node '%s', err: %d.\n", - n->name, err); - goto err_out_exit; - } - - dst_create_node_attributes(n); - - return 0; - -err_out_exit: - kfree(n->info); - n->info = NULL; - return err; -} - -/* - * DST node hash tables machinery. - */ -static inline unsigned int dst_hash(char *str, unsigned int size) -{ - return jhash(str, size, 0) % dst_hashtable_size; -} - -static void dst_node_remove(struct dst_node *n) -{ - mutex_lock(&dst_hash_lock); - list_del_init(&n->node_entry); - mutex_unlock(&dst_hash_lock); -} - -static void dst_node_add(struct dst_node *n) -{ - unsigned hash = dst_hash(n->name, sizeof(n->name)); - - mutex_lock(&dst_hash_lock); - list_add_tail(&n->node_entry, &dst_hashtable[hash]); - mutex_unlock(&dst_hash_lock); -} - -/* - * Cleaning node when it is about to be freed. - * There are still users of the socket though, - * so connection cleanup should be protected. 
- */ -static void dst_node_cleanup(struct dst_node *n) -{ - struct dst_state *st = n->state; - - if (!st) - return; - - if (n->queue) { - blk_cleanup_queue(n->queue); - - mutex_lock(&dst_hash_lock); - idr_remove(&dst_index_idr, n->disk->first_minor); - mutex_unlock(&dst_hash_lock); - - put_disk(n->disk); - } - - if (n->bdev) { - sync_blockdev(n->bdev); - close_bdev_exclusive(n->bdev, FMODE_READ|FMODE_WRITE); - } - - dst_state_lock(st); - st->need_exit = 1; - dst_state_exit_connected(st); - dst_state_unlock(st); - - wake_up(&st->thread_wait); - - dst_state_put(st); - n->state = NULL; -} - -/* - * Free security attributes attached to given node. - */ -static void dst_security_exit(struct dst_node *n) -{ - struct dst_secure *s, *tmp; - - list_for_each_entry_safe(s, tmp, &n->security_list, sec_entry) { - list_del(&s->sec_entry); - kfree(s); - } -} - -/* - * Free node when there are no more users. - * Actually node has to be freed on behalf od userspace process, - * since there are number of threads, which are embedded in the - * node, so they can not exit and free node from there, that is - * why there is a wakeup if reference counter is not equal to zero. - */ -void dst_node_put(struct dst_node *n) -{ - if (unlikely(!n)) - return; - - dprintk("%s: n: %p, refcnt: %d.\n", - __func__, n, atomic_read(&n->refcnt)); - - if (atomic_dec_and_test(&n->refcnt)) { - dst_node_remove(n); - n->trans_scan_timeout = 0; - dst_node_cleanup(n); - thread_pool_destroy(n->pool); - dst_node_sysfs_exit(n); - dst_node_crypto_exit(n); - dst_security_exit(n); - dst_node_trans_exit(n); - - kfree(n); - - dprintk("%s: freed n: %p.\n", __func__, n); - } else { - wake_up(&n->wait); - } -} - -/* - * Setting up export device: lookup by the name, get its size - * and setup listening socket, which will accept clients, which - * will submit IO for given storage. 
- */ -static int dst_setup_export(struct dst_node *n, struct dst_ctl *ctl, - struct dst_export_ctl *le) -{ - int err; - - snprintf(n->info->local, sizeof(n->info->local), "%s", le->device); - - n->bdev = open_bdev_exclusive(le->device, FMODE_READ|FMODE_WRITE, NULL); - if (IS_ERR(n->bdev)) - return PTR_ERR(n->bdev); - - if (n->size != 0) - n->size = min_t(loff_t, n->bdev->bd_inode->i_size, n->size); - else - n->size = n->bdev->bd_inode->i_size; - - n->info->size = n->size; - err = dst_node_init_listened(n, le); - if (err) - goto err_out_cleanup; - - return 0; - -err_out_cleanup: - close_bdev_exclusive(n->bdev, FMODE_READ|FMODE_WRITE); - n->bdev = NULL; - - return err; -} - -/* Empty thread pool callbacks for the network processing threads. */ -static inline void *dst_thread_network_init(void *data) -{ - dprintk("%s: data: %p.\n", __func__, data); - return data; -} - -static inline void dst_thread_network_cleanup(void *data) -{ - dprintk("%s: data: %p.\n", __func__, data); -} - -/* - * Allocate DST node and initialize some of its parameters. - */ -static struct dst_node *dst_alloc_node(struct dst_ctl *ctl, - int (*start)(struct dst_node *), - int num) -{ - struct dst_node *n; - int err; - - n = kzalloc(sizeof(struct dst_node), GFP_KERNEL); - if (!n) - return NULL; - - INIT_LIST_HEAD(&n->node_entry); - - INIT_LIST_HEAD(&n->security_list); - mutex_init(&n->security_lock); - - init_waitqueue_head(&n->wait); - - n->trans_scan_timeout = msecs_to_jiffies(ctl->trans_scan_timeout); - if (!n->trans_scan_timeout) - n->trans_scan_timeout = HZ; - - n->trans_max_retries = ctl->trans_max_retries; - if (!n->trans_max_retries) - n->trans_max_retries = 10; - - /* - * Pretty much arbitrary default numbers. - * 32 matches maximum number of pages in bio originated from ext3 (31). 
- */ - n->max_pages = ctl->max_pages; - if (!n->max_pages) - n->max_pages = 32; - - if (n->max_pages > 1024) - n->max_pages = 1024; - - n->start = start; - n->size = ctl->size; - - atomic_set(&n->refcnt, 1); - atomic_long_set(&n->gen, 0); - snprintf(n->name, sizeof(n->name), "%s", ctl->name); - - err = dst_node_sysfs_init(n); - if (err) - goto err_out_free; - - n->pool = thread_pool_create(num, n->name, dst_thread_network_init, - dst_thread_network_cleanup, n); - if (IS_ERR(n->pool)) { - err = PTR_ERR(n->pool); - goto err_out_sysfs_exit; - } - - dprintk("%s: n: %p, name: %s.\n", __func__, n, n->name); - - return n; - -err_out_sysfs_exit: - dst_node_sysfs_exit(n); -err_out_free: - kfree(n); - return NULL; -} - -/* - * Starting a node, connected to the remote server: - * register block device and initialize transaction mechanism. - * In revers order though. - * - * It will autonegotiate some parameters with the remote node - * and update local if needed. - * - * Transaction initialization should be the last thing before - * starting the node, since transaction should include not only - * block IO, but also crypto related data (if any), which are - * initialized separately. - */ -static int dst_start_remote(struct dst_node *n) -{ - int err; - - err = dst_node_trans_init(n, sizeof(struct dst_trans)); - if (err) - return err; - - err = dst_node_create_disk(n); - if (err) - return err; - - dst_node_set_size(n); - add_disk(n->disk); - - dprintk("DST: started remote node '%s', minor: %d.\n", - n->name, n->disk->first_minor); - - return 0; -} - -/* - * Adding remote node and initialize connection. 
- */ -static int dst_add_remote(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - int err; - struct dst_network_ctl *rctl = data; - - if (n) - return -EEXIST; - - if (size != sizeof(struct dst_network_ctl)) - return -EINVAL; - - n = dst_alloc_node(ctl, dst_start_remote, 1); - if (!n) - return -ENOMEM; - - memcpy(&n->info->net, rctl, sizeof(struct dst_network_ctl)); - err = dst_node_init_connected(n, rctl); - if (err) - goto err_out_free; - - dst_node_add(n); - - return 0; - -err_out_free: - dst_node_put(n); - return err; -} - -/* - * Adding export node: initializing block device and listening socket. - */ -static int dst_add_export(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - int err; - struct dst_export_ctl *le = data; - - if (n) - return -EEXIST; - - if (size != sizeof(struct dst_export_ctl)) - return -EINVAL; - - n = dst_alloc_node(ctl, dst_start_export, 2); - if (!n) - return -EINVAL; - - err = dst_setup_export(n, ctl, le); - if (err) - goto err_out_free; - - dst_node_add(n); - - return 0; - -err_out_free: - dst_node_put(n); - return err; -} - -static int dst_node_remove_unload(struct dst_node *n) -{ - printk(KERN_INFO "STOPPED name: '%s', size: %llu.\n", - n->name, n->size); - - if (n->disk) - del_gendisk(n->disk); - - dst_node_remove(n); - dst_node_sysfs_exit(n); - - /* - * This is not a hack. Really. - * Node's reference counter allows to implement fine grained - * node freeing, but since all transactions (which hold node's - * reference counter) are processed in the dedicated thread, - * it is possible that reference will hit zero in that thread, - * so we will not be able to exit thread and cleanup the node. 
- * - * So, we remove disk, so no new activity is possible, and - * wait until all pending transaction are completed (either - * in receiving thread or by timeout in workqueue), in this - * case reference counter will be less or equal to 2 (once set in - * dst_alloc_node() and then in connector message parser; - * or when we force module unloading, and connector message - * parser does not hold a reference, in this case reference - * counter will be equal to 1), - * and subsequent dst_node_put() calls will free the node. - */ - dprintk("%s: going to sleep with %d refcnt.\n", - __func__, atomic_read(&n->refcnt)); - wait_event(n->wait, atomic_read(&n->refcnt) <= 2); - - dst_node_put(n); - return 0; -} - -/* - * Remove node from the hash table. - */ -static int dst_del_node(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - if (!n) - return -ENODEV; - - return dst_node_remove_unload(n); -} - -/* - * Initialize crypto processing for given node. - */ -static int dst_crypto_init(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - struct dst_crypto_ctl *crypto = data; - - if (!n) - return -ENODEV; - - if (size != sizeof(struct dst_crypto_ctl) + crypto->hash_keysize + - crypto->cipher_keysize) - return -EINVAL; - - if (n->trans_cache) - return -EEXIST; - - return dst_node_crypto_init(n, crypto); -} - -/* - * Security attributes for given node. - */ -static int dst_security_init(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - struct dst_secure *s; - - if (!n) - return -ENODEV; - - if (size != sizeof(struct dst_secure_user)) - return -EINVAL; - - s = kmalloc(sizeof(struct dst_secure), GFP_KERNEL); - if (!s) - return -ENOMEM; - - memcpy(&s->sec, data, size); - - mutex_lock(&n->security_lock); - list_add_tail(&s->sec_entry, &n->security_list); - mutex_unlock(&n->security_lock); - - return 0; -} - -/* - * Kill'em all! 
- */ -static int dst_start_node(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size) -{ - int err; - - if (!n) - return -ENODEV; - - if (n->trans_cache) - return 0; - - err = n->start(n); - if (err) - return err; - - printk(KERN_INFO "STARTED name: '%s', size: %llu.\n", n->name, n->size); - return 0; -} - -typedef int (*dst_command_func)(struct dst_node *n, struct dst_ctl *ctl, - void *data, unsigned int size); - -/* - * List of userspace commands. - */ -static dst_command_func dst_commands[] = { - [DST_ADD_REMOTE] = &dst_add_remote, - [DST_ADD_EXPORT] = &dst_add_export, - [DST_DEL_NODE] = &dst_del_node, - [DST_CRYPTO] = &dst_crypto_init, - [DST_SECURITY] = &dst_security_init, - [DST_START] = &dst_start_node, -}; - -/* - * Configuration parser. - */ -static void cn_dst_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) -{ - struct dst_ctl *ctl; - int err; - struct dst_ctl_ack ack; - struct dst_node *n = NULL, *tmp; - unsigned int hash; - - if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } - - if (msg->len < sizeof(struct dst_ctl)) { - err = -EBADMSG; - goto out; - } - - ctl = (struct dst_ctl *)msg->data; - - if (ctl->cmd >= DST_CMD_MAX) { - err = -EINVAL; - goto out; - } - hash = dst_hash(ctl->name, sizeof(ctl->name)); - - mutex_lock(&dst_hash_lock); - list_for_each_entry(tmp, &dst_hashtable[hash], node_entry) { - if (!memcmp(tmp->name, ctl->name, sizeof(tmp->name))) { - n = tmp; - dst_node_get(n); - break; - } - } - mutex_unlock(&dst_hash_lock); - - err = dst_commands[ctl->cmd](n, ctl, msg->data + sizeof(struct dst_ctl), - msg->len - sizeof(struct dst_ctl)); - - dst_node_put(n); -out: - memcpy(&ack.msg, msg, sizeof(struct cn_msg)); - - ack.msg.ack = msg->ack + 1; - ack.msg.len = sizeof(struct dst_ctl_ack) - sizeof(struct cn_msg); - - ack.error = err; - - cn_netlink_send(&ack.msg, 0, GFP_KERNEL); -} - -/* - * Global initialization: sysfs, hash table, block device registration, - * connector and various 
caches. - */ -static int __init dst_sysfs_init(void) -{ - return bus_register(&dst_dev_bus_type); -} - -static void dst_sysfs_exit(void) -{ - bus_unregister(&dst_dev_bus_type); -} - -static int __init dst_hashtable_init(void) -{ - unsigned int i; - - dst_hashtable = kcalloc(dst_hashtable_size, sizeof(struct list_head), - GFP_KERNEL); - if (!dst_hashtable) - return -ENOMEM; - - for (i = 0; i < dst_hashtable_size; ++i) - INIT_LIST_HEAD(&dst_hashtable[i]); - - return 0; -} - -static void dst_hashtable_exit(void) -{ - unsigned int i; - struct dst_node *n, *tmp; - - for (i = 0; i < dst_hashtable_size; ++i) { - list_for_each_entry_safe(n, tmp, &dst_hashtable[i], node_entry) { - dst_node_remove_unload(n); - } - } - - kfree(dst_hashtable); -} - -static int __init dst_sys_init(void) -{ - int err = -ENOMEM; - - err = dst_hashtable_init(); - if (err) - goto err_out_exit; - - err = dst_export_init(); - if (err) - goto err_out_hashtable_exit; - - err = register_blkdev(dst_major, DST_NAME); - if (err < 0) - goto err_out_export_exit; - if (err) - dst_major = err; - - err = dst_sysfs_init(); - if (err) - goto err_out_unregister; - - err = cn_add_callback(&cn_dst_id, "DST", cn_dst_callback); - if (err) - goto err_out_sysfs_exit; - - printk(KERN_INFO "Distributed storage, '%s' release.\n", dst_name); - - return 0; - -err_out_sysfs_exit: - dst_sysfs_exit(); -err_out_unregister: - unregister_blkdev(dst_major, DST_NAME); -err_out_export_exit: - dst_export_exit(); -err_out_hashtable_exit: - dst_hashtable_exit(); -err_out_exit: - return err; -} - -static void __exit dst_sys_exit(void) -{ - cn_del_callback(&cn_dst_id); - unregister_blkdev(dst_major, DST_NAME); - dst_hashtable_exit(); - dst_sysfs_exit(); - dst_export_exit(); -} - -module_init(dst_sys_init); -module_exit(dst_sys_exit); - -MODULE_DESCRIPTION("Distributed storage"); -MODULE_AUTHOR("Evgeniy Polyakov "); -MODULE_LICENSE("GPL"); diff --git a/drivers/staging/dst/export.c b/drivers/staging/dst/export.c deleted file mode 100644 
index c324230e8b60..000000000000 --- a/drivers/staging/dst/export.c +++ /dev/null @@ -1,660 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Export bioset is used for server block IO requests. - */ -static struct bio_set *dst_bio_set; - -int __init dst_export_init(void) -{ - int err = -ENOMEM; - - dst_bio_set = bioset_create(32, sizeof(struct dst_export_priv)); - if (!dst_bio_set) - goto err_out_exit; - - return 0; - -err_out_exit: - return err; -} - -void dst_export_exit(void) -{ - bioset_free(dst_bio_set); -} - -/* - * When client connects and autonegotiates with the server node, - * its permissions are checked in a security attributes and sent - * back. - */ -static unsigned int dst_check_permissions(struct dst_state *main, - struct dst_state *st) -{ - struct dst_node *n = main->node; - struct dst_secure *sentry; - struct dst_secure_user *s; - struct saddr *sa = &st->ctl.addr; - unsigned int perm = 0; - - mutex_lock(&n->security_lock); - list_for_each_entry(sentry, &n->security_list, sec_entry) { - s = &sentry->sec; - - if (s->addr.sa_family != sa->sa_family) - continue; - - if (s->addr.sa_data_len != sa->sa_data_len) - continue; - - /* - * This '2' below is a port field. This may be very wrong to do - * in atalk for example though. 
If there will be any need - * to extent protocol to something else, I can create - * per-family helpers and use them instead of this memcmp. - */ - if (memcmp(s->addr.sa_data + 2, sa->sa_data + 2, - sa->sa_data_len - 2)) - continue; - - perm = s->permissions; - } - mutex_unlock(&n->security_lock); - - return perm; -} - -/* - * Accept new client: allocate appropriate network state and check permissions. - */ -static struct dst_state *dst_accept_client(struct dst_state *st) -{ - unsigned int revents = 0; - unsigned int err_mask = POLLERR | POLLHUP | POLLRDHUP; - unsigned int mask = err_mask | POLLIN; - struct dst_node *n = st->node; - int err = 0; - struct socket *sock = NULL; - struct dst_state *new; - - while (!err && !sock) { - revents = dst_state_poll(st); - - if (!(revents & mask)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait(&st->thread_wait, - &wait, TASK_INTERRUPTIBLE); - if (!n->trans_scan_timeout || st->need_exit) - break; - - revents = dst_state_poll(st); - - if (revents & mask) - break; - - if (signal_pending(current)) - break; - - /* - * Magic HZ? Polling check above is not safe in - * all cases (like socket reset in BH context), - * so it is simpler just to postpone it to the - * process context instead of implementing - * special locking there. 
- */ - schedule_timeout(HZ); - } - finish_wait(&st->thread_wait, &wait); - } - - err = -ECONNRESET; - dst_state_lock(st); - - dprintk("%s: st: %p, revents: %x [err: %d, in: %d].\n", - __func__, st, revents, revents & err_mask, - revents & POLLIN); - - if (revents & err_mask) { - dprintk("%s: revents: %x, socket: %p, err: %d.\n", - __func__, revents, st->socket, err); - err = -ECONNRESET; - } - - if (!n->trans_scan_timeout || st->need_exit) - err = -ENODEV; - - if (st->socket && (revents & POLLIN)) - err = kernel_accept(st->socket, &sock, 0); - - dst_state_unlock(st); - } - - if (err) - goto err_out_exit; - - new = dst_state_alloc(st->node); - if (IS_ERR(new)) { - err = -ENOMEM; - goto err_out_release; - } - new->socket = sock; - - new->ctl.addr.sa_data_len = sizeof(struct sockaddr); - err = kernel_getpeername(sock, (struct sockaddr *)&new->ctl.addr, - (int *)&new->ctl.addr.sa_data_len); - if (err) - goto err_out_put; - - new->permissions = dst_check_permissions(st, new); - if (new->permissions == 0) { - err = -EPERM; - dst_dump_addr(sock, (struct sockaddr *)&new->ctl.addr, - "Client is not allowed to connect"); - goto err_out_put; - } - - err = dst_poll_init(new); - if (err) - goto err_out_put; - - dst_dump_addr(sock, (struct sockaddr *)&new->ctl.addr, - "Connected client"); - - return new; - -err_out_put: - dst_state_put(new); -err_out_release: - sock_release(sock); -err_out_exit: - return ERR_PTR(err); -} - -/* - * Each server's block request sometime finishes. - * Usually it happens in hard irq context of the appropriate controller, - * so to play good with all cases we just queue BIO into the queue - * and wake up processing thread, which gets completed request and - * send (encrypting if needed) it back to the client (if it was a read - * request), or sends back reply that writing successfully completed. 
- */ -static int dst_export_process_request_queue(struct dst_state *st) -{ - unsigned long flags; - struct dst_export_priv *p = NULL; - struct bio *bio; - int err = 0; - - while (!list_empty(&st->request_list)) { - spin_lock_irqsave(&st->request_lock, flags); - if (!list_empty(&st->request_list)) { - p = list_first_entry(&st->request_list, - struct dst_export_priv, request_entry); - list_del(&p->request_entry); - } - spin_unlock_irqrestore(&st->request_lock, flags); - - if (!p) - break; - - bio = p->bio; - - if (dst_need_crypto(st->node) && (bio_data_dir(bio) == READ)) - err = dst_export_crypto(st->node, bio); - else - err = dst_export_send_bio(bio); - - if (err) - break; - } - - return err; -} - -/* - * Cleanup export state. - * It has to wait until all requests are finished, - * and then free them all. - */ -static void dst_state_cleanup_export(struct dst_state *st) -{ - struct dst_export_priv *p; - unsigned long flags; - - /* - * This loop waits for all pending bios to be completed and freed. - */ - while (atomic_read(&st->refcnt) > 1) { - dprintk("%s: st: %p, refcnt: %d, list_empty: %d.\n", - __func__, st, atomic_read(&st->refcnt), - list_empty(&st->request_list)); - wait_event_timeout(st->thread_wait, - (atomic_read(&st->refcnt) == 1) || - !list_empty(&st->request_list), - HZ/2); - - while (!list_empty(&st->request_list)) { - p = NULL; - spin_lock_irqsave(&st->request_lock, flags); - if (!list_empty(&st->request_list)) { - p = list_first_entry(&st->request_list, - struct dst_export_priv, request_entry); - list_del(&p->request_entry); - } - spin_unlock_irqrestore(&st->request_lock, flags); - - if (p) - bio_put(p->bio); - - dprintk("%s: st: %p, refcnt: %d, list_empty: %d, p: " - "%p.\n", __func__, st, atomic_read(&st->refcnt), - list_empty(&st->request_list), p); - } - } - - dst_state_put(st); -} - -/* - * Client accepting thread. - * Not only accepts new connection, but also schedules receiving thread - * and performs request completion described above. 
- */ -static int dst_accept(void *init_data, void *schedule_data) -{ - struct dst_state *main_st = schedule_data; - struct dst_node *n = init_data; - struct dst_state *st; - int err; - - while (n->trans_scan_timeout && !main_st->need_exit) { - dprintk("%s: main_st: %p, n: %p.\n", __func__, main_st, n); - st = dst_accept_client(main_st); - if (IS_ERR(st)) - continue; - - err = dst_state_schedule_receiver(st); - if (!err) { - while (n->trans_scan_timeout) { - err = wait_event_interruptible_timeout(st->thread_wait, - !list_empty(&st->request_list) || - !n->trans_scan_timeout || - st->need_exit, - HZ); - - if (!n->trans_scan_timeout || st->need_exit) - break; - - if (list_empty(&st->request_list)) - continue; - - err = dst_export_process_request_queue(st); - if (err) - break; - } - - st->need_exit = 1; - wake_up(&st->thread_wait); - } - - dst_state_cleanup_export(st); - } - - dprintk("%s: freeing listening socket st: %p.\n", __func__, main_st); - - dst_state_lock(main_st); - dst_poll_exit(main_st); - dst_state_socket_release(main_st); - dst_state_unlock(main_st); - dst_state_put(main_st); - dprintk("%s: freed listening socket st: %p.\n", __func__, main_st); - - return 0; -} - -int dst_start_export(struct dst_node *n) -{ - if (list_empty(&n->security_list)) { - printk(KERN_ERR "You are trying to export node '%s' " - "without security attributes.\nNo clients will " - "be allowed to connect. Exiting.\n", n->name); - return -EINVAL; - } - return dst_node_trans_init(n, sizeof(struct dst_export_priv)); -} - -/* - * Initialize listening state and schedule accepting thread. 
- */ -int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le) -{ - struct dst_state *st; - int err = -ENOMEM; - struct dst_network_ctl *ctl = &le->ctl; - - memcpy(&n->info->net, ctl, sizeof(struct dst_network_ctl)); - - st = dst_state_alloc(n); - if (IS_ERR(st)) { - err = PTR_ERR(st); - goto err_out_exit; - } - memcpy(&st->ctl, ctl, sizeof(struct dst_network_ctl)); - - err = dst_state_socket_create(st); - if (err) - goto err_out_put; - - st->socket->sk->sk_reuse = 1; - - err = kernel_bind(st->socket, (struct sockaddr *)&ctl->addr, - ctl->addr.sa_data_len); - if (err) - goto err_out_socket_release; - - err = kernel_listen(st->socket, 1024); - if (err) - goto err_out_socket_release; - n->state = st; - - err = dst_poll_init(st); - if (err) - goto err_out_socket_release; - - dst_state_get(st); - - err = thread_pool_schedule(n->pool, dst_thread_setup, - dst_accept, st, MAX_SCHEDULE_TIMEOUT); - if (err) - goto err_out_poll_exit; - - return 0; - -err_out_poll_exit: - dst_poll_exit(st); -err_out_socket_release: - dst_state_socket_release(st); -err_out_put: - dst_state_put(st); -err_out_exit: - n->state = NULL; - return err; -} - -/* - * Free bio and related private data. - * Also drop a reference counter for appropriate state, - * which waits when there are no more block IOs in-flight. - */ -static void dst_bio_destructor(struct bio *bio) -{ - struct bio_vec *bv; - struct dst_export_priv *priv = bio->bi_private; - int i; - - bio_for_each_segment(bv, bio, i) { - if (!bv->bv_page) - break; - - __free_page(bv->bv_page); - } - - if (priv) - dst_state_put(priv->state); - bio_free(bio, dst_bio_set); -} - -/* - * Block IO completion. Queue request to be sent back to - * the client (or just confirmation). 
- */ -static void dst_bio_end_io(struct bio *bio, int err) -{ - struct dst_export_priv *p = bio->bi_private; - struct dst_state *st = p->state; - unsigned long flags; - - spin_lock_irqsave(&st->request_lock, flags); - list_add_tail(&p->request_entry, &st->request_list); - spin_unlock_irqrestore(&st->request_lock, flags); - - wake_up(&st->thread_wait); -} - -/* - * Allocate read request for the server. - */ -static int dst_export_read_request(struct bio *bio, unsigned int total_size) -{ - unsigned int size; - struct page *page; - int err; - - while (total_size) { - err = -ENOMEM; - page = alloc_page(GFP_KERNEL); - if (!page) - goto err_out_exit; - - size = min_t(unsigned int, PAGE_SIZE, total_size); - - err = bio_add_page(bio, page, size, 0); - dprintk("%s: bio: %llu/%u, size: %u, err: %d.\n", - __func__, (u64)bio->bi_sector, bio->bi_size, - size, err); - if (err <= 0) - goto err_out_free_page; - - total_size -= size; - } - - return 0; - -err_out_free_page: - __free_page(page); -err_out_exit: - return err; -} - -/* - * Allocate write request for the server. - * Should not only get pages, but also read data from the network. - */ -static int dst_export_write_request(struct dst_state *st, - struct bio *bio, unsigned int total_size) -{ - unsigned int size; - struct page *page; - void *data; - int err; - - while (total_size) { - err = -ENOMEM; - page = alloc_page(GFP_KERNEL); - if (!page) - goto err_out_exit; - - data = kmap(page); - if (!data) - goto err_out_free_page; - - size = min_t(unsigned int, PAGE_SIZE, total_size); - - err = dst_data_recv(st, data, size); - if (err) - goto err_out_unmap_page; - - err = bio_add_page(bio, page, size, 0); - if (err <= 0) - goto err_out_unmap_page; - - kunmap(page); - - total_size -= size; - } - - return 0; - -err_out_unmap_page: - kunmap(page); -err_out_free_page: - __free_page(page); -err_out_exit: - return err; -} - -/* - * Groovy, we've gotten an IO request from the client. 
- * Allocate BIO from the bioset, private data from the mempool - * and lots of pages for IO. - */ -int dst_process_io(struct dst_state *st) -{ - struct dst_node *n = st->node; - struct dst_cmd *cmd = st->data; - struct bio *bio; - struct dst_export_priv *priv; - int err = -ENOMEM; - - if (unlikely(!n->bdev)) { - err = -EINVAL; - goto err_out_exit; - } - - bio = bio_alloc_bioset(GFP_KERNEL, - PAGE_ALIGN(cmd->size) >> PAGE_SHIFT, - dst_bio_set); - if (!bio) - goto err_out_exit; - - priv = (struct dst_export_priv *)(((void *)bio) - - sizeof (struct dst_export_priv)); - - priv->state = dst_state_get(st); - priv->bio = bio; - - bio->bi_private = priv; - bio->bi_end_io = dst_bio_end_io; - bio->bi_destructor = dst_bio_destructor; - bio->bi_bdev = n->bdev; - - /* - * Server side is only interested in two low bits: - * uptodate (set by itself actually) and rw block - */ - bio->bi_flags |= cmd->flags & 3; - - bio->bi_rw = cmd->rw; - bio->bi_size = 0; - bio->bi_sector = cmd->sector; - - dst_bio_to_cmd(bio, &priv->cmd, DST_IO_RESPONSE, cmd->id); - - priv->cmd.flags = 0; - priv->cmd.size = cmd->size; - - if (bio_data_dir(bio) == WRITE) { - err = dst_recv_cdata(st, priv->cmd.hash); - if (err) - goto err_out_free; - - err = dst_export_write_request(st, bio, cmd->size); - if (err) - goto err_out_free; - - if (dst_need_crypto(n)) - return dst_export_crypto(n, bio); - } else { - err = dst_export_read_request(bio, cmd->size); - if (err) - goto err_out_free; - } - - dprintk("%s: bio: %llu/%u, rw: %lu, dir: %lu, flags: %lx, phys: %d.\n", - __func__, (u64)bio->bi_sector, bio->bi_size, - bio->bi_rw, bio_data_dir(bio), - bio->bi_flags, bio->bi_phys_segments); - - generic_make_request(bio); - - return 0; - -err_out_free: - bio_put(bio); -err_out_exit: - return err; -} - -/* - * Ok, block IO is ready, let's send it back to the client... 
- */ -int dst_export_send_bio(struct bio *bio) -{ - struct dst_export_priv *p = bio->bi_private; - struct dst_state *st = p->state; - struct dst_cmd *cmd = &p->cmd; - int err; - - dprintk("%s: id: %llu, bio: %llu/%u, csize: %u, flags: %lu, rw: %lu.\n", - __func__, cmd->id, (u64)bio->bi_sector, bio->bi_size, - cmd->csize, bio->bi_flags, bio->bi_rw); - - dst_convert_cmd(cmd); - - dst_state_lock(st); - if (!st->socket) { - err = -ECONNRESET; - goto err_out_unlock; - } - - if (bio_data_dir(bio) == WRITE) { - /* ... or just confirmation that writing has completed. */ - cmd->size = cmd->csize = 0; - err = dst_data_send_header(st->socket, cmd, - sizeof(struct dst_cmd), 0); - if (err) - goto err_out_unlock; - } else { - err = dst_send_bio(st, cmd, bio); - if (err) - goto err_out_unlock; - } - - dst_state_unlock(st); - - bio_put(bio); - return 0; - -err_out_unlock: - dst_state_unlock(st); - - bio_put(bio); - return err; -} diff --git a/drivers/staging/dst/state.c b/drivers/staging/dst/state.c deleted file mode 100644 index 02a05e6c48c3..000000000000 --- a/drivers/staging/dst/state.c +++ /dev/null @@ -1,844 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Polling machinery. 
- */ - -struct dst_poll_helper { - poll_table pt; - struct dst_state *st; -}; - -static int dst_queue_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct dst_state *st = container_of(wait, struct dst_state, wait); - - wake_up(&st->thread_wait); - return 1; -} - -static void dst_queue_func(struct file *file, wait_queue_head_t *whead, - poll_table *pt) -{ - struct dst_state *st = container_of(pt, struct dst_poll_helper, pt)->st; - - st->whead = whead; - init_waitqueue_func_entry(&st->wait, dst_queue_wake); - add_wait_queue(whead, &st->wait); -} - -void dst_poll_exit(struct dst_state *st) -{ - if (st->whead) { - remove_wait_queue(st->whead, &st->wait); - st->whead = NULL; - } -} - -int dst_poll_init(struct dst_state *st) -{ - struct dst_poll_helper ph; - - ph.st = st; - init_poll_funcptr(&ph.pt, &dst_queue_func); - - st->socket->ops->poll(NULL, st->socket, &ph.pt); - return 0; -} - -/* - * Header receiving function - may block. - */ -static int dst_data_recv_header(struct socket *sock, - void *data, unsigned int size, int block) -{ - struct msghdr msg; - struct kvec iov; - int err; - - iov.iov_base = data; - iov.iov_len = size; - - msg.msg_iov = (struct iovec *)&iov; - msg.msg_iovlen = 1; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = (block) ? MSG_WAITALL : MSG_DONTWAIT; - - err = kernel_recvmsg(sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - if (err != size) - return -1; - - return 0; -} - -/* - * Header sending function - may block. - */ -int dst_data_send_header(struct socket *sock, - void *data, unsigned int size, int more) -{ - struct msghdr msg; - struct kvec iov; - int err; - - iov.iov_base = data; - iov.iov_len = size; - - msg.msg_iov = (struct iovec *)&iov; - msg.msg_iovlen = 1; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = MSG_WAITALL | (more ? 
MSG_MORE : 0); - - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - if (err != size) { - dprintk("%s: size: %u, more: %d, err: %d.\n", - __func__, size, more, err); - return -1; - } - - return 0; -} - -/* - * Block autoconfiguration: request size of the storage and permissions. - */ -static int dst_request_remote_config(struct dst_state *st) -{ - struct dst_node *n = st->node; - int err = -EINVAL; - struct dst_cmd *cmd = st->data; - - memset(cmd, 0, sizeof(struct dst_cmd)); - cmd->cmd = DST_CFG; - - dst_convert_cmd(cmd); - - err = dst_data_send_header(st->socket, cmd, sizeof(struct dst_cmd), 0); - if (err) - goto out; - - err = dst_data_recv_header(st->socket, cmd, sizeof(struct dst_cmd), 1); - if (err) - goto out; - - dst_convert_cmd(cmd); - - if (cmd->cmd != DST_CFG) { - err = -EINVAL; - dprintk("%s: checking result: cmd: %d, size reported: %llu.\n", - __func__, cmd->cmd, cmd->sector); - goto out; - } - - if (n->size != 0) - n->size = min_t(loff_t, n->size, cmd->sector); - else - n->size = cmd->sector; - - n->info->size = n->size; - st->permissions = cmd->rw; - -out: - dprintk("%s: n: %p, err: %d, size: %llu, permission: %x.\n", - __func__, n, err, n->size, st->permissions); - return err; -} - -/* - * Socket machinery. 
- */ - -#define DST_DEFAULT_TIMEO 20000 - -int dst_state_socket_create(struct dst_state *st) -{ - int err; - struct socket *sock; - struct dst_network_ctl *ctl = &st->ctl; - - err = sock_create(ctl->addr.sa_family, ctl->type, ctl->proto, &sock); - if (err < 0) - return err; - - sock->sk->sk_sndtimeo = sock->sk->sk_rcvtimeo = - msecs_to_jiffies(DST_DEFAULT_TIMEO); - sock->sk->sk_allocation = GFP_NOIO; - - st->socket = st->read_socket = sock; - return 0; -} - -void dst_state_socket_release(struct dst_state *st) -{ - dprintk("%s: st: %p, socket: %p, n: %p.\n", - __func__, st, st->socket, st->node); - if (st->socket) { - sock_release(st->socket); - st->socket = NULL; - st->read_socket = NULL; - } -} - -void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str) -{ - if (sk->ops->family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)sa; - printk(KERN_INFO "%s %u.%u.%u.%u:%d.\n", str, - NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); - } else if (sk->ops->family == AF_INET6) { - struct sockaddr_in6 *sin = (struct sockaddr_in6 *)sa; - printk(KERN_INFO "%s %pi6:%d", - str, &sin->sin6_addr, ntohs(sin->sin6_port)); - } -} - -void dst_state_exit_connected(struct dst_state *st) -{ - if (st->socket) { - dst_poll_exit(st); - st->socket->ops->shutdown(st->socket, 2); - - dst_dump_addr(st->socket, (struct sockaddr *)&st->ctl.addr, - "Disconnected peer"); - dst_state_socket_release(st); - } -} - -static int dst_state_init_connected(struct dst_state *st) -{ - int err; - struct dst_network_ctl *ctl = &st->ctl; - - err = dst_state_socket_create(st); - if (err) - goto err_out_exit; - - err = kernel_connect(st->socket, (struct sockaddr *)&st->ctl.addr, - st->ctl.addr.sa_data_len, 0); - if (err) - goto err_out_release; - - err = dst_poll_init(st); - if (err) - goto err_out_release; - - dst_dump_addr(st->socket, (struct sockaddr *)&ctl->addr, - "Connected to peer"); - - return 0; - -err_out_release: - dst_state_socket_release(st); -err_out_exit: - return 
err; -} - -/* - * State reset is used to reconnect to the remote peer. - * May fail, but who cares, we will try again later. - */ -static inline void dst_state_reset_nolock(struct dst_state *st) -{ - dst_state_exit_connected(st); - dst_state_init_connected(st); -} - -static inline void dst_state_reset(struct dst_state *st) -{ - dst_state_lock(st); - dst_state_reset_nolock(st); - dst_state_unlock(st); -} - -/* - * Basic network sending/receiving functions. - * Blocked mode is used. - */ -static int dst_data_recv_raw(struct dst_state *st, void *buf, u64 size) -{ - struct msghdr msg; - struct kvec iov; - int err; - - BUG_ON(!size); - - iov.iov_base = buf; - iov.iov_len = size; - - msg.msg_iov = (struct iovec *)&iov; - msg.msg_iovlen = 1; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = MSG_DONTWAIT; - - err = kernel_recvmsg(st->socket, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - if (err <= 0) { - dprintk("%s: failed to recv data: size: %llu, err: %d.\n", - __func__, size, err); - if (err == 0) - err = -ECONNRESET; - - dst_state_exit_connected(st); - } - - return err; -} - -/* - * Ping command to early detect failed nodes. - */ -static int dst_send_ping(struct dst_state *st) -{ - struct dst_cmd *cmd = st->data; - int err = -ECONNRESET; - - dst_state_lock(st); - if (st->socket) { - memset(cmd, 0, sizeof(struct dst_cmd)); - - cmd->cmd = __cpu_to_be32(DST_PING); - - err = dst_data_send_header(st->socket, cmd, - sizeof(struct dst_cmd), 0); - } - dprintk("%s: st: %p, socket: %p, err: %d.\n", __func__, - st, st->socket, err); - dst_state_unlock(st); - - return err; -} - -/* - * Receiving function, which should either return error or read - * whole block request. If there was no traffic for a one second, - * send a ping, since remote node may die. 
- */ -int dst_data_recv(struct dst_state *st, void *data, unsigned int size) -{ - unsigned int revents = 0; - unsigned int err_mask = POLLERR | POLLHUP | POLLRDHUP; - unsigned int mask = err_mask | POLLIN; - struct dst_node *n = st->node; - int err = 0; - - while (size && !err) { - revents = dst_state_poll(st); - - if (!(revents & mask)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait(&st->thread_wait, &wait, - TASK_INTERRUPTIBLE); - if (!n->trans_scan_timeout || st->need_exit) - break; - - revents = dst_state_poll(st); - - if (revents & mask) - break; - - if (signal_pending(current)) - break; - - if (!schedule_timeout(HZ)) { - err = dst_send_ping(st); - if (err) - return err; - } - - continue; - } - finish_wait(&st->thread_wait, &wait); - } - - err = -ECONNRESET; - dst_state_lock(st); - - if (st->socket && (st->read_socket == st->socket) && - (revents & POLLIN)) { - err = dst_data_recv_raw(st, data, size); - if (err > 0) { - data += err; - size -= err; - err = 0; - } - } - - if (revents & err_mask || !st->socket) { - dprintk("%s: revents: %x, socket: %p, size: %u, " - "err: %d.\n", __func__, revents, - st->socket, size, err); - err = -ECONNRESET; - } - - dst_state_unlock(st); - - if (!n->trans_scan_timeout) - err = -ENODEV; - } - - return err; -} - -/* - * Send block autoconf reply. - */ -static int dst_process_cfg(struct dst_state *st) -{ - struct dst_node *n = st->node; - struct dst_cmd *cmd = st->data; - int err; - - cmd->sector = n->size; - cmd->rw = st->permissions; - - dst_convert_cmd(cmd); - - dst_state_lock(st); - err = dst_data_send_header(st->socket, cmd, sizeof(struct dst_cmd), 0); - dst_state_unlock(st); - - return err; -} - -/* - * Receive block IO from the network. 
- */ -static int dst_recv_bio(struct dst_state *st, struct bio *bio, - unsigned int total_size) -{ - struct bio_vec *bv; - int i, err; - void *data; - unsigned int sz; - - bio_for_each_segment(bv, bio, i) { - sz = min(total_size, bv->bv_len); - - dprintk("%s: bio: %llu/%u, total: %u, len: %u, sz: %u, " - "off: %u.\n", __func__, (u64)bio->bi_sector, - bio->bi_size, total_size, bv->bv_len, sz, - bv->bv_offset); - - data = kmap(bv->bv_page) + bv->bv_offset; - err = dst_data_recv(st, data, sz); - kunmap(bv->bv_page); - - bv->bv_len = sz; - - if (err) - return err; - - total_size -= sz; - if (total_size == 0) - break; - } - - return 0; -} - -/* - * Our block IO has just completed and arrived: get it. - */ -static int dst_process_io_response(struct dst_state *st) -{ - struct dst_node *n = st->node; - struct dst_cmd *cmd = st->data; - struct dst_trans *t; - int err = 0; - struct bio *bio; - - mutex_lock(&n->trans_lock); - t = dst_trans_search(n, cmd->id); - mutex_unlock(&n->trans_lock); - - if (!t) - goto err_out_exit; - - bio = t->bio; - - dprintk("%s: bio: %llu/%u, cmd_size: %u, csize: %u, dir: %lu.\n", - __func__, (u64)bio->bi_sector, bio->bi_size, cmd->size, - cmd->csize, bio_data_dir(bio)); - - if (bio_data_dir(bio) == READ) { - if (bio->bi_size != cmd->size - cmd->csize) - goto err_out_exit; - - if (dst_need_crypto(n)) { - err = dst_recv_cdata(st, t->cmd.hash); - if (err) - goto err_out_exit; - } - - err = dst_recv_bio(st, t->bio, bio->bi_size); - if (err) - goto err_out_exit; - - if (dst_need_crypto(n)) - return dst_trans_crypto(t); - } else { - err = -EBADMSG; - if (cmd->size || cmd->csize) - goto err_out_exit; - } - - dst_trans_remove(t); - dst_trans_put(t); - - return 0; - -err_out_exit: - return err; -} - -/* - * Receive crypto data. 
- */ -int dst_recv_cdata(struct dst_state *st, void *cdata) -{ - struct dst_cmd *cmd = st->data; - struct dst_node *n = st->node; - struct dst_crypto_ctl *c = &n->crypto; - int err; - - if (cmd->csize != c->crypto_attached_size) { - dprintk("%s: cmd: cmd: %u, sector: %llu, size: %u, " - "csize: %u != digest size %u.\n", - __func__, cmd->cmd, cmd->sector, cmd->size, - cmd->csize, c->crypto_attached_size); - err = -EINVAL; - goto err_out_exit; - } - - err = dst_data_recv(st, cdata, cmd->csize); - if (err) - goto err_out_exit; - - cmd->size -= cmd->csize; - return 0; - -err_out_exit: - return err; -} - -/* - * Receive the command and start its processing. - */ -static int dst_recv_processing(struct dst_state *st) -{ - int err = -EINTR; - struct dst_cmd *cmd = st->data; - - /* - * If socket will be reset after this statement, then - * dst_data_recv() will just fail and loop will - * start again, so it can be done without any locks. - * - * st->read_socket is needed to prevents state machine - * breaking between this data reading and subsequent one - * in protocol specific functions during connection reset. - * In case of reset we have to read next command and do - * not expect data for old command to magically appear in - * new connection. - */ - st->read_socket = st->socket; - err = dst_data_recv(st, cmd, sizeof(struct dst_cmd)); - if (err) - goto out_exit; - - dst_convert_cmd(cmd); - - dprintk("%s: cmd: %u, size: %u, csize: %u, id: %llu, " - "sector: %llu, flags: %llx, rw: %llx.\n", - __func__, cmd->cmd, cmd->size, - cmd->csize, cmd->id, cmd->sector, - cmd->flags, cmd->rw); - - /* - * This should catch protocol breakage and random garbage - * instead of commands. 
- */ - if (unlikely(cmd->csize > st->size - sizeof(struct dst_cmd))) { - err = -EBADMSG; - goto out_exit; - } - - err = -EPROTO; - switch (cmd->cmd) { - case DST_IO_RESPONSE: - err = dst_process_io_response(st); - break; - case DST_IO: - err = dst_process_io(st); - break; - case DST_CFG: - err = dst_process_cfg(st); - break; - case DST_PING: - err = 0; - break; - default: - break; - } - -out_exit: - return err; -} - -/* - * Receiving thread. For the client node we should try to reconnect, - * for accepted client we just drop the state and expect it to reconnect. - */ -static int dst_recv(void *init_data, void *schedule_data) -{ - struct dst_state *st = schedule_data; - struct dst_node *n = init_data; - int err = 0; - - dprintk("%s: start st: %p, n: %p, scan: %lu, need_exit: %d.\n", - __func__, st, n, n->trans_scan_timeout, st->need_exit); - - while (n->trans_scan_timeout && !st->need_exit) { - err = dst_recv_processing(st); - if (err < 0) { - if (!st->ctl.type) - break; - - if (!n->trans_scan_timeout || st->need_exit) - break; - - dst_state_reset(st); - msleep(1000); - } - } - - st->need_exit = 1; - wake_up(&st->thread_wait); - - dprintk("%s: freeing receiving socket st: %p.\n", __func__, st); - dst_state_lock(st); - dst_state_exit_connected(st); - dst_state_unlock(st); - dst_state_put(st); - - dprintk("%s: freed receiving socket st: %p.\n", __func__, st); - - return err; -} - -/* - * Network state dies here and borns couple of lines below. - * This object is the main network state processing engine: - * sending, receiving, reconnections, all network related - * tasks are handled on behalf of the state. 
- */ -static void dst_state_free(struct dst_state *st) -{ - dprintk("%s: st: %p.\n", __func__, st); - if (st->cleanup) - st->cleanup(st); - kfree(st->data); - kfree(st); -} - -struct dst_state *dst_state_alloc(struct dst_node *n) -{ - struct dst_state *st; - int err = -ENOMEM; - - st = kzalloc(sizeof(struct dst_state), GFP_KERNEL); - if (!st) - goto err_out_exit; - - st->node = n; - st->need_exit = 0; - - st->size = PAGE_SIZE; - st->data = kmalloc(st->size, GFP_KERNEL); - if (!st->data) - goto err_out_free; - - spin_lock_init(&st->request_lock); - INIT_LIST_HEAD(&st->request_list); - - mutex_init(&st->state_lock); - init_waitqueue_head(&st->thread_wait); - - /* - * One for processing thread, another one for node itself. - */ - atomic_set(&st->refcnt, 2); - - dprintk("%s: st: %p, n: %p.\n", __func__, st, st->node); - - return st; - -err_out_free: - kfree(st); -err_out_exit: - return ERR_PTR(err); -} - -int dst_state_schedule_receiver(struct dst_state *st) -{ - return thread_pool_schedule_private(st->node->pool, dst_thread_setup, - dst_recv, st, MAX_SCHEDULE_TIMEOUT, st->node); -} - -/* - * Initialize client's connection to the remote peer: allocate state, - * connect and perform block IO autoconfiguration. 
- */ -int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r) -{ - struct dst_state *st; - int err = -ENOMEM; - - st = dst_state_alloc(n); - if (IS_ERR(st)) { - err = PTR_ERR(st); - goto err_out_exit; - } - memcpy(&st->ctl, r, sizeof(struct dst_network_ctl)); - - err = dst_state_init_connected(st); - if (err) - goto err_out_free_data; - - err = dst_request_remote_config(st); - if (err) - goto err_out_exit_connected; - n->state = st; - - err = dst_state_schedule_receiver(st); - if (err) - goto err_out_exit_connected; - - return 0; - -err_out_exit_connected: - dst_state_exit_connected(st); -err_out_free_data: - dst_state_free(st); -err_out_exit: - n->state = NULL; - return err; -} - -void dst_state_put(struct dst_state *st) -{ - dprintk("%s: st: %p, refcnt: %d.\n", - __func__, st, atomic_read(&st->refcnt)); - if (atomic_dec_and_test(&st->refcnt)) - dst_state_free(st); -} - -/* - * Send block IO to the network one by one using zero-copy ->sendpage(). - */ -int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio) -{ - struct bio_vec *bv; - struct dst_crypto_ctl *c = &st->node->crypto; - int err, i = 0; - int flags = MSG_WAITALL; - - err = dst_data_send_header(st->socket, cmd, - sizeof(struct dst_cmd) + c->crypto_attached_size, bio->bi_vcnt); - if (err) - goto err_out_exit; - - bio_for_each_segment(bv, bio, i) { - if (i < bio->bi_vcnt - 1) - flags |= MSG_MORE; - - err = kernel_sendpage(st->socket, bv->bv_page, bv->bv_offset, - bv->bv_len, flags); - if (err <= 0) - goto err_out_exit; - } - - return 0; - -err_out_exit: - dprintk("%s: %d/%d, flags: %x, err: %d.\n", - __func__, i, bio->bi_vcnt, flags, err); - return err; -} - -/* - * Send transaction to the remote peer. 
- */ -int dst_trans_send(struct dst_trans *t) -{ - int err; - struct dst_state *st = t->n->state; - struct bio *bio = t->bio; - - dst_convert_cmd(&t->cmd); - - dst_state_lock(st); - if (!st->socket) { - err = dst_state_init_connected(st); - if (err) - goto err_out_unlock; - } - - if (bio_data_dir(bio) == WRITE) { - err = dst_send_bio(st, &t->cmd, t->bio); - } else { - err = dst_data_send_header(st->socket, &t->cmd, - sizeof(struct dst_cmd), 0); - } - if (err) - goto err_out_reset; - - dst_state_unlock(st); - return 0; - -err_out_reset: - dst_state_reset_nolock(st); -err_out_unlock: - dst_state_unlock(st); - - return err; -} diff --git a/drivers/staging/dst/thread_pool.c b/drivers/staging/dst/thread_pool.c deleted file mode 100644 index 29a82b2602f3..000000000000 --- a/drivers/staging/dst/thread_pool.c +++ /dev/null @@ -1,348 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include - -/* - * Thread pool abstraction allows to schedule a work to be performed - * on behalf of kernel thread. One does not operate with threads itself, - * instead user provides setup and cleanup callbacks for thread pool itself, - * and action and cleanup callbacks for each submitted work. - * - * Each worker has private data initialized at creation time and data, - * provided by user at scheduling time. 
- * - * When action is being performed, thread can not be used by other users, - * instead they will sleep until there is free thread to pick their work. - */ -struct thread_pool_worker { - struct list_head worker_entry; - - struct task_struct *thread; - - struct thread_pool *pool; - - int error; - int has_data; - int need_exit; - unsigned int id; - - wait_queue_head_t wait; - - void *private; - void *schedule_data; - - int (*action)(void *private, void *schedule_data); - void (*cleanup)(void *private); -}; - -static void thread_pool_exit_worker(struct thread_pool_worker *w) -{ - kthread_stop(w->thread); - - w->cleanup(w->private); - kfree(w); -} - -/* - * Called to mark thread as ready and allow users to schedule new work. - */ -static void thread_pool_worker_make_ready(struct thread_pool_worker *w) -{ - struct thread_pool *p = w->pool; - - mutex_lock(&p->thread_lock); - - if (!w->need_exit) { - list_move_tail(&w->worker_entry, &p->ready_list); - w->has_data = 0; - mutex_unlock(&p->thread_lock); - - wake_up(&p->wait); - } else { - p->thread_num--; - list_del(&w->worker_entry); - mutex_unlock(&p->thread_lock); - - thread_pool_exit_worker(w); - } -} - -/* - * Thread action loop: waits until there is new work. - */ -static int thread_pool_worker_func(void *data) -{ - struct thread_pool_worker *w = data; - - while (!kthread_should_stop()) { - wait_event_interruptible(w->wait, - kthread_should_stop() || w->has_data); - - if (kthread_should_stop()) - break; - - if (!w->has_data) - continue; - - w->action(w->private, w->schedule_data); - thread_pool_worker_make_ready(w); - } - - return 0; -} - -/* - * Remove single worker without specifying which one. 
- */ -void thread_pool_del_worker(struct thread_pool *p) -{ - struct thread_pool_worker *w = NULL; - - while (!w && p->thread_num) { - wait_event(p->wait, !list_empty(&p->ready_list) || - !p->thread_num); - - dprintk("%s: locking list_empty: %d, thread_num: %d.\n", - __func__, list_empty(&p->ready_list), - p->thread_num); - - mutex_lock(&p->thread_lock); - if (!list_empty(&p->ready_list)) { - w = list_first_entry(&p->ready_list, - struct thread_pool_worker, - worker_entry); - - dprintk("%s: deleting w: %p, thread_num: %d, " - "list: %p [%p.%p].\n", __func__, - w, p->thread_num, &p->ready_list, - p->ready_list.prev, p->ready_list.next); - - p->thread_num--; - list_del(&w->worker_entry); - } - mutex_unlock(&p->thread_lock); - } - - if (w) - thread_pool_exit_worker(w); - dprintk("%s: deleted w: %p, thread_num: %d.\n", - __func__, w, p->thread_num); -} - -/* - * Remove a worker with given ID. - */ -void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id) -{ - struct thread_pool_worker *w; - int found = 0; - - mutex_lock(&p->thread_lock); - list_for_each_entry(w, &p->ready_list, worker_entry) { - if (w->id == id) { - found = 1; - p->thread_num--; - list_del(&w->worker_entry); - break; - } - } - - if (!found) { - list_for_each_entry(w, &p->active_list, worker_entry) { - if (w->id == id) { - w->need_exit = 1; - break; - } - } - } - mutex_unlock(&p->thread_lock); - - if (found) - thread_pool_exit_worker(w); -} - -/* - * Add new worker thread with given parameters. - * If initialization callback fails, return error. 
- */ -int thread_pool_add_worker(struct thread_pool *p, - char *name, - unsigned int id, - void *(*init)(void *private), - void (*cleanup)(void *private), - void *private) -{ - struct thread_pool_worker *w; - int err = -ENOMEM; - - w = kzalloc(sizeof(struct thread_pool_worker), GFP_KERNEL); - if (!w) - goto err_out_exit; - - w->pool = p; - init_waitqueue_head(&w->wait); - w->cleanup = cleanup; - w->id = id; - - w->thread = kthread_run(thread_pool_worker_func, w, "%s", name); - if (IS_ERR(w->thread)) { - err = PTR_ERR(w->thread); - goto err_out_free; - } - - w->private = init(private); - if (IS_ERR(w->private)) { - err = PTR_ERR(w->private); - goto err_out_stop_thread; - } - - mutex_lock(&p->thread_lock); - list_add_tail(&w->worker_entry, &p->ready_list); - p->thread_num++; - mutex_unlock(&p->thread_lock); - - return 0; - -err_out_stop_thread: - kthread_stop(w->thread); -err_out_free: - kfree(w); -err_out_exit: - return err; -} - -/* - * Destroy the whole pool. - */ -void thread_pool_destroy(struct thread_pool *p) -{ - while (p->thread_num) { - dprintk("%s: num: %d.\n", __func__, p->thread_num); - thread_pool_del_worker(p); - } - - kfree(p); -} - -/* - * Create a pool with given number of threads. - * They will have sequential IDs started from zero. 
- */ -struct thread_pool *thread_pool_create(int num, char *name, - void *(*init)(void *private), - void (*cleanup)(void *private), - void *private) -{ - struct thread_pool_worker *w, *tmp; - struct thread_pool *p; - int err = -ENOMEM; - int i; - - p = kzalloc(sizeof(struct thread_pool), GFP_KERNEL); - if (!p) - goto err_out_exit; - - init_waitqueue_head(&p->wait); - mutex_init(&p->thread_lock); - INIT_LIST_HEAD(&p->ready_list); - INIT_LIST_HEAD(&p->active_list); - p->thread_num = 0; - - for (i = 0; i < num; ++i) { - err = thread_pool_add_worker(p, name, i, init, - cleanup, private); - if (err) - goto err_out_free_all; - } - - return p; - -err_out_free_all: - list_for_each_entry_safe(w, tmp, &p->ready_list, worker_entry) { - list_del(&w->worker_entry); - thread_pool_exit_worker(w); - } - kfree(p); -err_out_exit: - return ERR_PTR(err); -} - -/* - * Schedule execution of the action on a given thread, - * provided ID pointer has to match previously stored - * private data. - */ -int thread_pool_schedule_private(struct thread_pool *p, - int (*setup)(void *private, void *data), - int (*action)(void *private, void *data), - void *data, long timeout, void *id) -{ - struct thread_pool_worker *w, *tmp, *worker = NULL; - int err = 0; - - while (!worker && !err) { - timeout = wait_event_interruptible_timeout(p->wait, - !list_empty(&p->ready_list), - timeout); - - if (!timeout) { - err = -ETIMEDOUT; - break; - } - - worker = NULL; - mutex_lock(&p->thread_lock); - list_for_each_entry_safe(w, tmp, &p->ready_list, worker_entry) { - if (id && id != w->private) - continue; - - worker = w; - - list_move_tail(&w->worker_entry, &p->active_list); - - err = setup(w->private, data); - if (!err) { - w->schedule_data = data; - w->action = action; - w->has_data = 1; - wake_up(&w->wait); - } else { - list_move_tail(&w->worker_entry, - &p->ready_list); - } - - break; - } - mutex_unlock(&p->thread_lock); - } - - return err; -} - -/* - * Schedule execution on arbitrary thread from the pool. 
- */ -int thread_pool_schedule(struct thread_pool *p, - int (*setup)(void *private, void *data), - int (*action)(void *private, void *data), - void *data, long timeout) -{ - return thread_pool_schedule_private(p, setup, - action, data, timeout, NULL); -} diff --git a/drivers/staging/dst/trans.c b/drivers/staging/dst/trans.c deleted file mode 100644 index 1c36a6bc31d5..000000000000 --- a/drivers/staging/dst/trans.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include - -/* - * Transaction memory pool size. - */ -static int dst_mempool_num = 32; -module_param(dst_mempool_num, int, 0644); - -/* - * Transaction tree management. - */ -static inline int dst_trans_cmp(dst_gen_t gen, dst_gen_t new) -{ - if (gen < new) - return 1; - if (gen > new) - return -1; - return 0; -} - -struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen) -{ - struct rb_root *root = &node->trans_root; - struct rb_node *n = root->rb_node; - struct dst_trans *t, *ret = NULL; - int cmp; - - while (n) { - t = rb_entry(n, struct dst_trans, trans_entry); - - cmp = dst_trans_cmp(t->gen, gen); - if (cmp < 0) - n = n->rb_left; - else if (cmp > 0) - n = n->rb_right; - else { - ret = t; - break; - } - } - - dprintk("%s: %s transaction: id: %llu.\n", __func__, - (ret) ? 
"found" : "not found", gen); - - return ret; -} - -static int dst_trans_insert(struct dst_trans *new) -{ - struct rb_root *root = &new->n->trans_root; - struct rb_node **n = &root->rb_node, *parent = NULL; - struct dst_trans *ret = NULL, *t; - int cmp; - - while (*n) { - parent = *n; - - t = rb_entry(parent, struct dst_trans, trans_entry); - - cmp = dst_trans_cmp(t->gen, new->gen); - if (cmp < 0) - n = &parent->rb_left; - else if (cmp > 0) - n = &parent->rb_right; - else { - ret = t; - break; - } - } - - new->send_time = jiffies; - if (ret) { - printk(KERN_DEBUG "%s: exist: old: gen: %llu, bio: %llu/%u, " - "send_time: %lu, new: gen: %llu, bio: %llu/%u, " - "send_time: %lu.\n", __func__, - ret->gen, (u64)ret->bio->bi_sector, - ret->bio->bi_size, ret->send_time, - new->gen, (u64)new->bio->bi_sector, - new->bio->bi_size, new->send_time); - return -EEXIST; - } - - rb_link_node(&new->trans_entry, parent, n); - rb_insert_color(&new->trans_entry, root); - - dprintk("%s: inserted: gen: %llu, bio: %llu/%u, send_time: %lu.\n", - __func__, new->gen, (u64)new->bio->bi_sector, - new->bio->bi_size, new->send_time); - - return 0; -} - -int dst_trans_remove_nolock(struct dst_trans *t) -{ - struct dst_node *n = t->n; - - if (t->trans_entry.rb_parent_color) { - rb_erase(&t->trans_entry, &n->trans_root); - t->trans_entry.rb_parent_color = 0; - } - return 0; -} - -int dst_trans_remove(struct dst_trans *t) -{ - int ret; - struct dst_node *n = t->n; - - mutex_lock(&n->trans_lock); - ret = dst_trans_remove_nolock(t); - mutex_unlock(&n->trans_lock); - - return ret; -} - -/* - * When transaction is completed and there are no more users, - * we complete appriate block IO request with given error status. 
- */ -void dst_trans_put(struct dst_trans *t) -{ - if (atomic_dec_and_test(&t->refcnt)) { - struct bio *bio = t->bio; - - dprintk("%s: completed t: %p, gen: %llu, bio: %p.\n", - __func__, t, t->gen, bio); - - bio_endio(bio, t->error); - bio_put(bio); - - dst_node_put(t->n); - mempool_free(t, t->n->trans_pool); - } -} - -/* - * Process given block IO request: allocate transaction, insert it into the tree - * and send/schedule crypto processing. - */ -int dst_process_bio(struct dst_node *n, struct bio *bio) -{ - struct dst_trans *t; - int err = -ENOMEM; - - t = mempool_alloc(n->trans_pool, GFP_NOFS); - if (!t) - goto err_out_exit; - - t->n = dst_node_get(n); - t->bio = bio; - t->error = 0; - t->retries = 0; - atomic_set(&t->refcnt, 1); - t->gen = atomic_long_inc_return(&n->gen); - - t->enc = bio_data_dir(bio); - dst_bio_to_cmd(bio, &t->cmd, DST_IO, t->gen); - - mutex_lock(&n->trans_lock); - err = dst_trans_insert(t); - mutex_unlock(&n->trans_lock); - if (err) - goto err_out_free; - - dprintk("%s: gen: %llu, bio: %llu/%u, dir/enc: %d, need_crypto: %d.\n", - __func__, t->gen, (u64)bio->bi_sector, - bio->bi_size, t->enc, dst_need_crypto(n)); - - if (dst_need_crypto(n) && t->enc) - dst_trans_crypto(t); - else - dst_trans_send(t); - - return 0; - -err_out_free: - dst_node_put(n); - mempool_free(t, n->trans_pool); -err_out_exit: - bio_endio(bio, err); - bio_put(bio); - return err; -} - -/* - * Scan for timeout/stale transactions. - * Each transaction is being resent multiple times before error completion. 
- */ -static void dst_trans_scan(struct work_struct *work) -{ - struct dst_node *n = container_of(work, struct dst_node, - trans_work.work); - struct rb_node *rb_node; - struct dst_trans *t; - unsigned long timeout = n->trans_scan_timeout; - int num = 10 * n->trans_max_retries; - - mutex_lock(&n->trans_lock); - - for (rb_node = rb_first(&n->trans_root); rb_node; ) { - t = rb_entry(rb_node, struct dst_trans, trans_entry); - - if (timeout && time_after(t->send_time + timeout, jiffies) - && t->retries == 0) - break; -#if 0 - dprintk("%s: t: %p, gen: %llu, n: %s, retries: %u, max: %u.\n", - __func__, t, t->gen, n->name, - t->retries, n->trans_max_retries); -#endif - if (--num == 0) - break; - - dst_trans_get(t); - - rb_node = rb_next(rb_node); - - if (timeout && (++t->retries < n->trans_max_retries)) { - dst_trans_send(t); - } else { - t->error = -ETIMEDOUT; - dst_trans_remove_nolock(t); - dst_trans_put(t); - } - - dst_trans_put(t); - } - - mutex_unlock(&n->trans_lock); - - /* - * If no timeout specified then system is in the middle of exiting - * process, so no need to reschedule scanning process again. - */ - if (timeout) { - if (!num) - timeout = HZ; - schedule_delayed_work(&n->trans_work, timeout); - } -} - -/* - * Flush all transactions and mark them as timed out. - * Destroy transaction pools. 
- */ -void dst_node_trans_exit(struct dst_node *n) -{ - struct dst_trans *t; - struct rb_node *rb_node; - - if (!n->trans_cache) - return; - - dprintk("%s: n: %p, cancelling the work.\n", __func__, n); - cancel_delayed_work_sync(&n->trans_work); - flush_scheduled_work(); - dprintk("%s: n: %p, work has been cancelled.\n", __func__, n); - - for (rb_node = rb_first(&n->trans_root); rb_node; ) { - t = rb_entry(rb_node, struct dst_trans, trans_entry); - - dprintk("%s: t: %p, gen: %llu, n: %s.\n", - __func__, t, t->gen, n->name); - - rb_node = rb_next(rb_node); - - t->error = -ETIMEDOUT; - dst_trans_remove_nolock(t); - dst_trans_put(t); - } - - mempool_destroy(n->trans_pool); - kmem_cache_destroy(n->trans_cache); -} - -/* - * Initialize transaction storage for given node. - * Transaction stores not only control information, - * but also network command and crypto data (if needed) - * to reduce number of allocations. Thus transaction size - * differs from node to node. - */ -int dst_node_trans_init(struct dst_node *n, unsigned int size) -{ - /* - * We need this, since node with given name can be dropped from the - * hash table, but be still alive, so subsequent creation of the node - * with the same name may collide with existing cache name. 
- */ - - snprintf(n->cache_name, sizeof(n->cache_name), "%s-%p", n->name, n); - - n->trans_cache = kmem_cache_create(n->cache_name, - size + n->crypto.crypto_attached_size, - 0, 0, NULL); - if (!n->trans_cache) - goto err_out_exit; - - n->trans_pool = mempool_create_slab_pool(dst_mempool_num, - n->trans_cache); - if (!n->trans_pool) - goto err_out_cache_destroy; - - mutex_init(&n->trans_lock); - n->trans_root = RB_ROOT; - - INIT_DELAYED_WORK(&n->trans_work, dst_trans_scan); - schedule_delayed_work(&n->trans_work, n->trans_scan_timeout); - - dprintk("%s: n: %p, size: %u, crypto: %u.\n", - __func__, n, size, n->crypto.crypto_attached_size); - - return 0; - -err_out_cache_destroy: - kmem_cache_destroy(n->trans_cache); -err_out_exit: - return -ENOMEM; -} diff --git a/include/linux/dst.h b/include/linux/dst.h deleted file mode 100644 index e26fed84b1aa..000000000000 --- a/include/linux/dst.h +++ /dev/null @@ -1,587 +0,0 @@ -/* - * 2007+ Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#ifndef __DST_H -#define __DST_H - -#include -#include - -#define DST_NAMELEN 32 -#define DST_NAME "dst" - -enum { - /* Remove node with given id from storage */ - DST_DEL_NODE = 0, - /* Add remote node with given id to the storage */ - DST_ADD_REMOTE, - /* Add local node with given id to the storage to be exported and used by remote peers */ - DST_ADD_EXPORT, - /* Crypto initialization command (hash/cipher used to protect the connection) */ - DST_CRYPTO, - /* Security attributes for given connection (permissions for example) */ - DST_SECURITY, - /* Register given node in the block layer subsystem */ - DST_START, - DST_CMD_MAX -}; - -struct dst_ctl -{ - /* Storage name */ - char name[DST_NAMELEN]; - /* Command flags */ - __u32 flags; - /* Command itself (see above) */ - __u32 cmd; - /* Maximum number of pages per single request in this device */ - __u32 max_pages; - /* Stale/error transaction scanning timeout in milliseconds */ - __u32 trans_scan_timeout; - /* Maximum number of retry sends before completing transaction as broken */ - __u32 trans_max_retries; - /* Storage size */ - __u64 size; -}; - -/* Reply command carries completion status */ -struct dst_ctl_ack -{ - struct cn_msg msg; - int error; - int unused[3]; -}; - -/* - * Unfortunaltely socket address structure is not exported to userspace - * and is redefined there. - */ -#define SADDR_MAX_DATA 128 - -struct saddr { - /* address family, AF_xxx */ - unsigned short sa_family; - /* 14 bytes of protocol address */ - char sa_data[SADDR_MAX_DATA]; - /* Number of bytes used in sa_data */ - unsigned short sa_data_len; -}; - -/* Address structure */ -struct dst_network_ctl -{ - /* Socket type: datagram, stream...*/ - unsigned int type; - /* Let me guess, is it a Jupiter diameter? */ - unsigned int proto; - /* Peer's address */ - struct saddr addr; -}; - -struct dst_crypto_ctl -{ - /* Cipher and hash names */ - char cipher_algo[DST_NAMELEN]; - char hash_algo[DST_NAMELEN]; - - /* Key sizes. 
Can be zero for digest for example */ - unsigned int cipher_keysize, hash_keysize; - /* Alignment. Calculated by the DST itself. */ - unsigned int crypto_attached_size; - /* Number of threads to perform crypto operations */ - int thread_num; -}; - -/* Export security attributes have this bits checked in when client connects */ -#define DST_PERM_READ (1<<0) -#define DST_PERM_WRITE (1<<1) - -/* - * Right now it is simple model, where each remote address - * is assigned to set of permissions it is allowed to perform. - * In real world block device does not know anything but - * reading and writing, so it should be more than enough. - */ -struct dst_secure_user -{ - unsigned int permissions; - struct saddr addr; -}; - -/* - * Export control command: device to export and network address to accept - * clients to work with given device - */ -struct dst_export_ctl -{ - char device[DST_NAMELEN]; - struct dst_network_ctl ctl; -}; - -enum { - DST_CFG = 1, /* Request remote configuration */ - DST_IO, /* IO command */ - DST_IO_RESPONSE, /* IO response */ - DST_PING, /* Keepalive message */ - DST_NCMD_MAX, -}; - -struct dst_cmd -{ - /* Network command itself, see above */ - __u32 cmd; - /* - * Size of the attached data - * (in most cases, for READ command it means how many bytes were requested) - */ - __u32 size; - /* Crypto size: number of attached bytes with digest/hmac */ - __u32 csize; - /* Here we can carry secret data */ - __u32 reserved; - /* Read/write bits, see how they are encoded in bio structure */ - __u64 rw; - /* BIO flags */ - __u64 flags; - /* Unique command id (like transaction ID) */ - __u64 id; - /* Sector to start IO from */ - __u64 sector; - /* Hash data is placed after this header */ - __u8 hash[0]; -}; - -/* - * Convert command to/from network byte order. - * We do not use hton*() functions, since there is - * no 64-bit implementation. 
- */ -static inline void dst_convert_cmd(struct dst_cmd *c) -{ - c->cmd = __cpu_to_be32(c->cmd); - c->csize = __cpu_to_be32(c->csize); - c->size = __cpu_to_be32(c->size); - c->sector = __cpu_to_be64(c->sector); - c->id = __cpu_to_be64(c->id); - c->flags = __cpu_to_be64(c->flags); - c->rw = __cpu_to_be64(c->rw); -} - -/* Transaction id */ -typedef __u64 dst_gen_t; - -#ifdef __KERNEL__ - -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_DST_DEBUG -#define dprintk(f, a...) printk(KERN_NOTICE f, ##a) -#else -static inline void __attribute__ ((format (printf, 1, 2))) - dprintk(const char *fmt, ...) {} -#endif - -struct dst_node; - -struct dst_trans -{ - /* DST node we are working with */ - struct dst_node *n; - - /* Entry inside transaction tree */ - struct rb_node trans_entry; - - /* Merlin kills this transaction when this memory cell equals zero */ - atomic_t refcnt; - - /* How this transaction should be processed by crypto engine */ - short enc; - /* How many times this transaction was resent */ - short retries; - /* Completion status */ - int error; - - /* When did we send it to the remote peer */ - long send_time; - - /* My name is... 
- * Well, computers does not speak, they have unique id instead */ - dst_gen_t gen; - - /* Block IO we are working with */ - struct bio *bio; - - /* Network command for above block IO request */ - struct dst_cmd cmd; -}; - -struct dst_crypto_engine -{ - /* What should we do with all block requests */ - struct crypto_hash *hash; - struct crypto_ablkcipher *cipher; - - /* Pool of pages used to encrypt data into before sending */ - int page_num; - struct page **pages; - - /* What to do with current request */ - int enc; - /* Who we are and where do we go */ - struct scatterlist *src, *dst; - - /* Maximum timeout waiting for encryption to be completed */ - long timeout; - /* IV is a 64-bit sequential counter */ - u64 iv; - - /* Secret data */ - void *private; - - /* Cached temporary data lives here */ - int size; - void *data; -}; - -struct dst_state -{ - /* The main state protection */ - struct mutex state_lock; - - /* Polling machinery for sockets */ - wait_queue_t wait; - wait_queue_head_t *whead; - /* Most of events are being waited here */ - wait_queue_head_t thread_wait; - - /* Who owns this? */ - struct dst_node *node; - - /* Network address for this state */ - struct dst_network_ctl ctl; - - /* Permissions to work with: read-only or rw connection */ - u32 permissions; - - /* Called when we need to clean private data */ - void (* cleanup)(struct dst_state *st); - - /* Used by the server: BIO completion queues BIOs here */ - struct list_head request_list; - spinlock_t request_lock; - - /* Guess what? No, it is not number of planets */ - atomic_t refcnt; - - /* This flags is set when connection should be dropped */ - int need_exit; - - /* - * Socket to work with. 
Second pointer is used for - * lockless check if socket was changed before performing - * next action (like working with cached polling result) - */ - struct socket *socket, *read_socket; - - /* Cached preallocated data */ - void *data; - unsigned int size; - - /* Currently processed command */ - struct dst_cmd cmd; -}; - -struct dst_info -{ - /* Device size */ - u64 size; - - /* Local device name for export devices */ - char local[DST_NAMELEN]; - - /* Network setup */ - struct dst_network_ctl net; - - /* Sysfs bits use this */ - struct device device; -}; - -struct dst_node -{ - struct list_head node_entry; - - /* Hi, my name is stored here */ - char name[DST_NAMELEN]; - /* My cache name is stored here */ - char cache_name[DST_NAMELEN]; - - /* Block device attached to given node. - * Only valid for exporting nodes */ - struct block_device *bdev; - /* Network state machine for given peer */ - struct dst_state *state; - - /* Block IO machinery */ - struct request_queue *queue; - struct gendisk *disk; - - /* Number of threads in processing pool */ - int thread_num; - /* Maximum number of pages in single IO */ - int max_pages; - - /* I'm that big in bytes */ - loff_t size; - - /* Exported to userspace node information */ - struct dst_info *info; - - /* - * Security attribute list. - * Used only by exporting node currently. - */ - struct list_head security_list; - struct mutex security_lock; - - /* - * When this unerflows below zero, university collapses. - * But this will not happen, since node will be freed, - * when reference counter reaches zero. - */ - atomic_t refcnt; - - /* How precisely should I be started? */ - int (*start)(struct dst_node *); - - /* Crypto capabilities */ - struct dst_crypto_ctl crypto; - u8 *hash_key; - u8 *cipher_key; - - /* Pool of processing thread */ - struct thread_pool *pool; - - /* Transaction IDs live here */ - atomic_long_t gen; - - /* - * How frequently and how many times transaction - * tree should be scanned to drop stale objects. 
- */ - long trans_scan_timeout; - int trans_max_retries; - - /* Small gnomes live here */ - struct rb_root trans_root; - struct mutex trans_lock; - - /* - * Transaction cache/memory pool. - * It is big enough to contain not only transaction - * itself, but additional crypto data (digest/hmac). - */ - struct kmem_cache *trans_cache; - mempool_t *trans_pool; - - /* This entity scans transaction tree */ - struct delayed_work trans_work; - - wait_queue_head_t wait; -}; - -/* Kernel representation of the security attribute */ -struct dst_secure -{ - struct list_head sec_entry; - struct dst_secure_user sec; -}; - -int dst_process_bio(struct dst_node *n, struct bio *bio); - -int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r); -int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le); - -static inline struct dst_state *dst_state_get(struct dst_state *st) -{ - BUG_ON(atomic_read(&st->refcnt) == 0); - atomic_inc(&st->refcnt); - return st; -} - -void dst_state_put(struct dst_state *st); - -struct dst_state *dst_state_alloc(struct dst_node *n); -int dst_state_socket_create(struct dst_state *st); -void dst_state_socket_release(struct dst_state *st); - -void dst_state_exit_connected(struct dst_state *st); - -int dst_state_schedule_receiver(struct dst_state *st); - -void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str); - -static inline void dst_state_lock(struct dst_state *st) -{ - mutex_lock(&st->state_lock); -} - -static inline void dst_state_unlock(struct dst_state *st) -{ - mutex_unlock(&st->state_lock); -} - -void dst_poll_exit(struct dst_state *st); -int dst_poll_init(struct dst_state *st); - -static inline unsigned int dst_state_poll(struct dst_state *st) -{ - unsigned int revents = POLLHUP | POLLERR; - - dst_state_lock(st); - if (st->socket) - revents = st->socket->ops->poll(NULL, st->socket, NULL); - dst_state_unlock(st); - - return revents; -} - -static inline int dst_thread_setup(void *private, void *data) -{ - 
return 0; -} - -void dst_node_put(struct dst_node *n); - -static inline struct dst_node *dst_node_get(struct dst_node *n) -{ - atomic_inc(&n->refcnt); - return n; -} - -int dst_data_recv(struct dst_state *st, void *data, unsigned int size); -int dst_recv_cdata(struct dst_state *st, void *cdata); -int dst_data_send_header(struct socket *sock, - void *data, unsigned int size, int more); - -int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio); - -int dst_process_io(struct dst_state *st); -int dst_export_crypto(struct dst_node *n, struct bio *bio); -int dst_export_send_bio(struct bio *bio); -int dst_start_export(struct dst_node *n); - -int __init dst_export_init(void); -void dst_export_exit(void); - -/* Private structure for export block IO requests */ -struct dst_export_priv -{ - struct list_head request_entry; - struct dst_state *state; - struct bio *bio; - struct dst_cmd cmd; -}; - -static inline void dst_trans_get(struct dst_trans *t) -{ - atomic_inc(&t->refcnt); -} - -struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen); -int dst_trans_remove(struct dst_trans *t); -int dst_trans_remove_nolock(struct dst_trans *t); -void dst_trans_put(struct dst_trans *t); - -/* - * Convert bio into network command. 
- */ -static inline void dst_bio_to_cmd(struct bio *bio, struct dst_cmd *cmd, - u32 command, u64 id) -{ - cmd->cmd = command; - cmd->flags = (bio->bi_flags << BIO_POOL_BITS) >> BIO_POOL_BITS; - cmd->rw = bio->bi_rw; - cmd->size = bio->bi_size; - cmd->csize = 0; - cmd->id = id; - cmd->sector = bio->bi_sector; -}; - -int dst_trans_send(struct dst_trans *t); -int dst_trans_crypto(struct dst_trans *t); - -int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl); -void dst_node_crypto_exit(struct dst_node *n); - -static inline int dst_need_crypto(struct dst_node *n) -{ - struct dst_crypto_ctl *c = &n->crypto; - /* - * Logical OR is appropriate here, but boolean one produces - * more optimal code, so it is used instead. - */ - return (c->hash_algo[0] | c->cipher_algo[0]); -} - -int dst_node_trans_init(struct dst_node *n, unsigned int size); -void dst_node_trans_exit(struct dst_node *n); - -/* - * Pool of threads. - * Ready list contains threads currently free to be used, - * active one contains threads with some work scheduled for them. - * Caller can wait in given queue when thread is ready. 
- */ -struct thread_pool -{ - int thread_num; - struct mutex thread_lock; - struct list_head ready_list, active_list; - - wait_queue_head_t wait; -}; - -void thread_pool_del_worker(struct thread_pool *p); -void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id); -int thread_pool_add_worker(struct thread_pool *p, - char *name, - unsigned int id, - void *(* init)(void *data), - void (* cleanup)(void *data), - void *data); - -void thread_pool_destroy(struct thread_pool *p); -struct thread_pool *thread_pool_create(int num, char *name, - void *(* init)(void *data), - void (* cleanup)(void *data), - void *data); - -int thread_pool_schedule(struct thread_pool *p, - int (* setup)(void *stored_private, void *setup_data), - int (* action)(void *stored_private, void *setup_data), - void *setup_data, long timeout); -int thread_pool_schedule_private(struct thread_pool *p, - int (* setup)(void *private, void *data), - int (* action)(void *private, void *data), - void *data, long timeout, void *id); - -#endif /* __KERNEL__ */ -#endif /* __DST_H */ -- cgit v1.2.3 From 31d12926e37291970dd4f6e9940df3897766a81d Mon Sep 17 00:00:00 2001 From: laurent chavey Date: Tue, 15 Dec 2009 11:15:28 +0000 Subject: net: Add rtnetlink init_rcvwnd to set the TCP initial receive window Add rtnetlink init_rcvwnd to set the TCP initial receive window size advertised by passive and active TCP connections. The current Linux TCP implementation limits the advertised TCP initial receive window to the one prescribed by slow start. For short lived TCP connections used for transaction type of traffic (i.e. http requests), bounding the advertised TCP initial receive window results in increased latency to complete the transaction. Support for setting initial congestion window is already supported using rtnetlink init_cwnd, but the feature is useless without the ability to set a larger TCP initial receive window. 
The rtnetlink init_rcvwnd allows increasing the TCP initial receive window, allowing TCP connections to advertise a larger TCP receive window than the one bounded by slow start. Signed-off-by: Laurent Chavey Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 ++ include/net/dst.h | 2 -- include/net/tcp.h | 3 ++- net/ipv4/syncookies.c | 3 ++- net/ipv4/tcp_output.c | 17 +++++++++++++---- net/ipv6/syncookies.c | 3 ++- 6 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 05330fc5b436..9590364fe8b5 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -362,6 +362,8 @@ enum { #define RTAX_FEATURES RTAX_FEATURES RTAX_RTO_MIN, #define RTAX_RTO_MIN RTAX_RTO_MIN + RTAX_INITRWND, +#define RTAX_INITRWND RTAX_INITRWND __RTAX_MAX }; diff --git a/include/net/dst.h b/include/net/dst.h index 39c4a5963e12..ce078cda6b74 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -83,8 +83,6 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; -#else long __pad_to_align_refcnt[1]; #endif /* diff --git a/include/net/tcp.h b/include/net/tcp.h index 185e22baecb1..788c99f98597 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -965,7 +965,8 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) /* Determine a window scaling and initial window to offer.
*/ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale); + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd); static inline int tcp_win_from_space(int space) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 66fd80ef2473..5c24db4a3c91 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -358,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->u.dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 12b2af36eab8..4a1605d3f909 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -183,7 +183,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale) + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 
0 : __space); @@ -232,7 +233,13 @@ void tcp_select_initial_window(int __space, __u32 mss, init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd * mss) + /* when initializing use the value from init_rcv_wnd + * rather than the default from above + */ + if (init_rcv_wnd && + (*rcv_wnd > init_rcv_wnd * mss)) + *rcv_wnd = init_rcv_wnd * mss; + else if (*rcv_wnd > init_cwnd * mss) *rcv_wnd = init_cwnd * mss; } @@ -2417,7 +2424,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; } @@ -2544,7 +2552,8 @@ static void tcp_connect_init(struct sock *sk) &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7208a06576c6..34d1f0690d7e 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -269,7 +269,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; -- cgit v1.2.3 From 28f6aeea3f12d37bd258b2c0d5ba891bff4ec479 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 25 Dec 2009 17:30:22 -0800 Subject: net: restore ip source validation when using policy routing and the skb mark: there are cases where a back path validation requires us to use a different routing table for src ip validation than the one used for mapping ingress dst ip. 
One such case is transparent proxying where we pretend to be the destination system and therefore the local table is used for incoming packets but possibly a main table would be used on outbound. Make the default behavior allow the above; users who need the symmetry can turn it on via the sysctl src_valid_mark. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + include/linux/sysctl.h | 1 + net/ipv4/devinet.c | 1 + net/ipv4/fib_frontend.c | 2 ++ 4 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 699e85c01a4d..b2304929434e 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -81,6 +81,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) +#define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_ACCEPT_LOCAL(in_dev) IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 877ba039e6a4..bd27fbc9db62 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -482,6 +482,7 @@ enum NET_IPV4_CONF_ARP_ACCEPT=21, NET_IPV4_CONF_ARP_NOTIFY=22, NET_IPV4_CONF_ACCEPT_LOCAL=23, + NET_IPV4_CONF_SRC_VMARK=24, __NET_IPV4_CONF_MAX }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5cdbc102a418..040c4f05b653 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1397,6 +1397,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP,
"proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3323168ee52d..82dbf711d6d0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -252,6 +252,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); + if (mark && !IN_DEV_SRC_VMARK(in_dev)) + fl.mark = 0; } rcu_read_unlock(); -- cgit v1.2.3 From e5cd6fe391aa8c93560bb7ffdfe334cf4d0a02e4 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sat, 26 Dec 2009 11:51:00 +0000 Subject: llc: add support for LLC_OPT_PKTINFO Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- include/linux/llc.h | 7 +++++++ include/net/llc_conn.h | 1 + net/llc/af_llc.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/llc.h b/include/linux/llc.h index 7733585603f1..ad7074ba81af 100644 --- a/include/linux/llc.h +++ b/include/linux/llc.h @@ -36,6 +36,7 @@ enum llc_sockopts { LLC_OPT_BUSY_TMR_EXP, /* busy state expire time (secs). */ LLC_OPT_TX_WIN, /* tx window size. */ LLC_OPT_RX_WIN, /* rx window size. */ + LLC_OPT_PKTINFO, /* ancillary packet information. */ LLC_OPT_MAX }; @@ -70,6 +71,12 @@ enum llc_sockopts { #define LLC_SAP_RM 0xD4 /* Resource Management */ #define LLC_SAP_GLOBAL 0xFF /* Global SAP. */ +struct llc_pktinfo { + int lpi_ifindex; + unsigned char lpi_sap; + unsigned char lpi_mac[IFHWADDRLEN]; +}; + #ifdef __KERNEL__ #define LLC_SAP_DYN_START 0xC0 #define LLC_SAP_DYN_STOP 0xDE diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index e2374e34989f..fe982fd94c4a 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -76,6 +76,7 @@ struct llc_sock { u32 rx_pdu_hdr; /* used for saving header of last pdu received and caused sending FRMR. 
Used for resending FRMR */ + u32 cmsg_flags; }; static inline struct llc_sock *llc_sk(const struct sock *sk) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 3a66546cad06..ac691fe08076 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -47,6 +47,10 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #define dprintk(args...) #endif +/* Maybe we'll add some more in the future. */ +#define LLC_CMSG_PKTINFO 1 + + /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. @@ -591,6 +595,20 @@ static int llc_wait_data(struct sock *sk, long timeo) return rc; } +static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(skb->sk); + + if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { + struct llc_pktinfo info; + + info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; + llc_pdu_decode_dsap(skb, &info.lpi_sap); + llc_pdu_decode_da(skb, info.lpi_mac); + put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); + } +} + /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. 
@@ -812,6 +830,8 @@ copy_uaddr: memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } + if (llc_sk(sk)->cmsg_flags) + llc_cmsg_rcv(msg, skb); goto out; } @@ -1030,6 +1050,12 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, goto out; llc->rw = opt; break; + case LLC_OPT_PKTINFO: + if (opt) + llc->cmsg_flags |= LLC_CMSG_PKTINFO; + else + llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; + break; default: rc = -ENOPROTOOPT; goto out; @@ -1083,6 +1109,9 @@ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; + case LLC_OPT_PKTINFO: + val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; + break; default: rc = -ENOPROTOOPT; goto out; -- cgit v1.2.3 From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Dec 2009 11:51:52 +0100 Subject: perf events: Remove arg from perf sched hooks Since we only ever schedule the local cpu, there is no need to pass the cpu number to the perf sched hooks. This micro-optimizes things a bit. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 12 ++++++------ kernel/perf_event.c | 27 ++++++++++++++------------- kernel/sched.c | 6 +++--- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c66b34f75eea..a494e7501292 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,10 +746,10 @@ extern int perf_max_events; extern const struct pmu *hw_perf_event_init(struct perf_event *event); -extern void perf_event_task_sched_in(struct task_struct *task, int cpu); +extern void perf_event_task_sched_in(struct task_struct *task); extern void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu); -extern void perf_event_task_tick(struct task_struct *task, int cpu); + struct task_struct *next); +extern void perf_event_task_tick(struct task_struct *task); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); @@ -870,12 +870,12 @@ extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); #else static inline void -perf_event_task_sched_in(struct task_struct *task, int cpu) { } +perf_event_task_sched_in(struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) { } + struct task_struct *next) { } static inline void -perf_event_task_tick(struct task_struct *task, int cpu) { } +perf_event_task_tick(struct task_struct *task) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void 
perf_event_free_task(struct task_struct *task) { } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..099bd662daa6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) static void __perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { + int cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; @@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. 
*/ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); } #define MAX_INTERRUPTS (~0ULL) @@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); @@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx); if (ctx) - perf_event_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr); } /* @@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 18cceeecce35..d6527ac0f6e7 100644 --- a/kernel/sched.c +++ 
b/kernel/sched.c @@ -2752,7 +2752,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5266,7 +5266,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5480,7 +5480,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:35 +0800 Subject: perf events: Remove CONFIG_EVENT_PROFILE Quoted from Ingo: | This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE - | it's an unnecessary Kconfig complication. If both PERF_EVENTS and | EVENT_TRACING is enabled we should expose generic tracepoints. | | Nor is it limited to event 'profiling', so it has become a misnomer as | well. 
Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/syscalls.h | 4 ++-- include/trace/ftrace.h | 12 ++++++------ include/trace/syscall.h | 4 ++-- init/Kconfig | 13 ------------- kernel/perf_event.c | 4 ++-- kernel/trace/Makefile | 4 +++- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 14 +++++++------- kernel/trace/trace_syscalls.c | 5 ++--- 11 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2233c98d80df..0a09e758c7d3 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -188,7 +188,7 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS struct perf_event; extern int ftrace_profile_enable(int event_id); extern void ftrace_profile_disable(int event_id); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a494e7501292..9a1d276db754 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -658,7 +658,7 @@ struct perf_event { perf_overflow_handler_t overflow_handler; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING struct event_filter *filter; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 65793e90d6f6..b7c7fcf7790b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -99,7 +99,7 @@ struct perf_event_attr; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) 
__SC_TEST(t6); __SC_TEST5(__VA_ARGS__) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #define TRACE_SYS_ENTER_PROFILE_INIT(sname) \ .profile_enable = prof_sysenter_enable, \ @@ -113,7 +113,7 @@ struct perf_event_attr; #define TRACE_SYS_ENTER_PROFILE_INIT(sname) #define TRACE_SYS_EXIT_PROFILE(sname) #define TRACE_SYS_EXIT_PROFILE_INIT(sname) -#endif +#endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_FTRACE_SYSCALLS #define __SC_STR_ADECL1(t, a) #a diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 73523151a731..2fdd36df41f6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -498,7 +498,7 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* * Generate the functions needed for tracepoint perf_event support. @@ -541,7 +541,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#endif +#endif /* CONFIG_PERF_EVENTS */ /* * Stage 4 of the trace events. 
@@ -626,7 +626,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * */ -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #define _TRACE_PROFILE_INIT(call) \ .profile_enable = ftrace_profile_enable_##call, \ @@ -634,7 +634,7 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #else #define _TRACE_PROFILE_INIT(call) -#endif +#endif /* CONFIG_PERF_EVENTS */ #undef __entry #define __entry entry @@ -834,7 +834,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * } */ -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS #undef __perf_addr #define __perf_addr(a) __addr = (a) @@ -926,7 +926,7 @@ static void ftrace_profile_##call(proto) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ #undef _TRACE_PROFILE_INIT diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 961fda3556bb..3d463dcef298 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -49,12 +49,12 @@ ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif -#ifdef CONFIG_EVENT_PROFILE + +#ifdef CONFIG_PERF_EVENTS int prof_sysenter_enable(struct ftrace_event_call *call); void prof_sysenter_disable(struct ftrace_event_call *call); int prof_sysexit_enable(struct ftrace_event_call *call); void prof_sysexit_disable(struct ftrace_event_call *call); - #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..06dab27c18d9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -966,19 +966,6 @@ config PERF_EVENTS Say Y if unsure. 
-config EVENT_PROFILE - bool "Tracepoint profiling sources" - depends on PERF_EVENTS && EVENT_TRACING - default y - help - Allow the use of tracepoints as software performance events. - - When this is enabled, you can create perf events based on - tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID - found in debugfs://tracing/events/*/*/id. (The -e/--events - option to the perf tool can parse and interpret symbolic - tracepoints, in the subsystem:tracepoint_name format.) - config PERF_COUNTERS bool "Kernel performance counters (old config option)" depends on HAVE_PERF_EVENTS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 099bd662daa6..5b987b4a98a8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) @@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec77..d00c6fe23f54 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -51,7 +51,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..74563d7e102e 100644 --- 
a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1360,7 +1360,7 @@ out_unlock: return err; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { @@ -1428,5 +1428,5 @@ out_unlock: return err; } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 375f81a568dc..75d75dec226a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1249,7 +1249,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes int kprobe_profile_func(struct kprobe *kp, @@ -1407,7 +1407,7 @@ static void probe_profile_disable(struct ftrace_event_call *call) disable_kprobe(&tp->rp.kp); } } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ static __kprobes @@ -1417,10 +1417,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kprobe_profile_func(kp, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1431,10 +1431,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kretprobe_profile_func(ri, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1463,7 +1463,7 @@ static int register_probe_event(struct trace_probe *tp) call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS call->profile_enable 
= probe_profile_enable; call->profile_disable = probe_profile_disable; #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..f694f66d75b0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void) } core_initcall(init_ftrace_syscalls); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); @@ -626,6 +626,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -#endif - +#endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From d894837f23f491aa7ed167aae767fc07cfe6e6e6 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Wed, 23 Dec 2009 11:08:18 +0100 Subject: sched: might_sleep(): Make file parameter const char * Fixes a warning when building with g++: warning: deprecated conversion from string constant to 'char*' And the file parameter use is constant, so mark it as such. 
Signed-off-by: Simon Kagstrom Cc: peterz@infradead.org LKML-Reference: <20091223110818.442d848e@marrow.netinsight.se> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 5 +++-- kernel/sched.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3fc9f5aab5f8..785d7d1099d4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -124,7 +124,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line, int preempt_offset); + void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -138,7 +138,8 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line, int preempt_offset) { } + static inline void __might_sleep(const char *file, int line, + int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..64298a52eaa6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9694,7 +9694,7 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -void __might_sleep(char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- cgit v1.2.3 From 8e664fb3fd2b04e3ac5fad7f046000ba54e0e275 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Dec 2009 13:15:38 +0100 Subject: mac80211: split up and insert custom IEs correctly Currently, we insert all user-specified IEs before the HT IE for association, and after the HT IE for probe requests. 
For association, that's correct only if the user-specified IEs are RSN only, incorrect in all other cases including WPA. Change this to split apart the user-specified IEs in two places for association: before the HT IE (e.g. RSN), after the HT IE (generally empty right now I think?) and after WMM (all other vendor-specific IEs). For probes, split the IEs in different places to be correct according to the spec. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 39 ++++++++++--- net/mac80211/ieee80211_i.h | 4 ++ net/mac80211/util.c | 134 ++++++++++++++++++++++++++++++++++++++------- net/mac80211/work.c | 43 ++++++++++++--- 4 files changed, 184 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d62edc7df3ae..aeea282bd2fe 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1085,12 +1085,12 @@ enum ieee80211_eid { WLAN_EID_TIM = 5, WLAN_EID_IBSS_PARAMS = 6, WLAN_EID_CHALLENGE = 16, - /* 802.11d */ + WLAN_EID_COUNTRY = 7, WLAN_EID_HP_PARAMS = 8, WLAN_EID_HP_TABLE = 9, WLAN_EID_REQUEST = 10, - /* 802.11e */ + WLAN_EID_QBSS_LOAD = 11, WLAN_EID_EDCA_PARAM_SET = 12, WLAN_EID_TSPEC = 13, @@ -1113,7 +1113,7 @@ enum ieee80211_eid { WLAN_EID_PREP = 69, WLAN_EID_PERR = 70, WLAN_EID_RANN = 49, /* compatible with FreeBSD */ - /* 802.11h */ + WLAN_EID_PWR_CONSTRAINT = 32, WLAN_EID_PWR_CAPABILITY = 33, WLAN_EID_TPC_REQUEST = 34, @@ -1124,20 +1124,41 @@ enum ieee80211_eid { WLAN_EID_MEASURE_REPORT = 39, WLAN_EID_QUIET = 40, WLAN_EID_IBSS_DFS = 41, - /* 802.11g */ + WLAN_EID_ERP_INFO = 42, WLAN_EID_EXT_SUPP_RATES = 50, - /* 802.11n */ + WLAN_EID_HT_CAPABILITY = 45, WLAN_EID_HT_INFORMATION = 61, - /* 802.11i */ + WLAN_EID_RSN = 48, - WLAN_EID_TIMEOUT_INTERVAL = 56, - WLAN_EID_MMIE = 76 /* 802.11w */, + WLAN_EID_MMIE = 76, WLAN_EID_WPA = 221, WLAN_EID_GENERIC = 221, WLAN_EID_VENDOR_SPECIFIC = 221, - WLAN_EID_QOS_PARAMETER = 222 + 
WLAN_EID_QOS_PARAMETER = 222, + + WLAN_EID_AP_CHAN_REPORT = 51, + WLAN_EID_NEIGHBOR_REPORT = 52, + WLAN_EID_RCPI = 53, + WLAN_EID_BSS_AVG_ACCESS_DELAY = 63, + WLAN_EID_ANTENNA_INFO = 64, + WLAN_EID_RSNI = 65, + WLAN_EID_MEASUREMENT_PILOT_TX_INFO = 66, + WLAN_EID_BSS_AVAILABLE_CAPACITY = 67, + WLAN_EID_BSS_AC_ACCESS_DELAY = 68, + WLAN_EID_RRM_ENABLED_CAPABILITIES = 70, + WLAN_EID_MULTIPLE_BSSID = 71, + + WLAN_EID_MOBILITY_DOMAIN = 54, + WLAN_EID_FAST_BSS_TRANSITION = 55, + WLAN_EID_TIMEOUT_INTERVAL = 56, + WLAN_EID_RIC_DATA = 57, + WLAN_EID_RIC_DESCRIPTOR = 75, + + WLAN_EID_DSE_REGISTERED_LOCATION = 58, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES = 59, + WLAN_EID_EXT_CHANSWITCH_ANN = 60, }; /* Action category code */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 97b6076b492e..6ea4ffbf84d8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1148,6 +1148,10 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, void ieee80211_recalc_smps(struct ieee80211_local *local, struct ieee80211_sub_if_data *forsdata); +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset); +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); + /* internal work items */ void ieee80211_work_init(struct ieee80211_local *local); void ieee80211_add_work(struct ieee80211_work *wk); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5ffe9e831b66..1fdb80ff9241 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -881,30 +881,66 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, enum ieee80211_band band) { struct ieee80211_supported_band *sband; - u8 *pos, *supp_rates_len, *esupp_rates_len = NULL; - int i; + u8 *pos; + size_t offset = 0, noffset; + int supp_rates_len, i; sband = local->hw.wiphy->bands[band]; pos = buffer; + supp_rates_len = min_t(int, sband->n_bitrates, 8); + *pos++ = WLAN_EID_SUPP_RATES; - supp_rates_len = pos; 
- *pos++ = 0; - - for (i = 0; i < sband->n_bitrates; i++) { - struct ieee80211_rate *rate = &sband->bitrates[i]; - - if (esupp_rates_len) { - *esupp_rates_len += 1; - } else if (*supp_rates_len == 8) { - *pos++ = WLAN_EID_EXT_SUPP_RATES; - esupp_rates_len = pos; - *pos++ = 1; - } else - *supp_rates_len += 1; + *pos++ = supp_rates_len; - *pos++ = rate->bitrate / 5; + for (i = 0; i < supp_rates_len; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + + /* insert "request information" if in custom IEs */ + if (ie && ie_len) { + static const u8 before_extrates[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_extrates, + ARRAY_SIZE(before_extrates), + offset); + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + offset = noffset; + } + + if (sband->n_bitrates > i) { + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = sband->n_bitrates - i; + + for (; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + } + + /* insert custom IEs that go before HT */ + if (ie && ie_len) { + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_DS_PARAMS, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + offset = noffset; } if (sband->ht_cap.ht_supported) { @@ -936,9 +972,11 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, * that calculates local->scan_ies_len. 
*/ - if (ie) { - memcpy(pos, ie, ie_len); - pos += ie_len; + /* add any remaining custom IEs */ + if (ie && ie_len) { + noffset = ie_len; + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; } return pos - buffer; @@ -1252,3 +1290,59 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, /* changed flag is auto-detected for this */ ieee80211_hw_config(local, 0); } + +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +{ + int i; + + for (i = 0; i < n_ids; i++) + if (ids[i] == id) + return true; + return false; +} + +/** + * ieee80211_ie_split - split an IE buffer according to ordering + * + * @ies: the IE buffer + * @ielen: the length of the IE buffer + * @ids: an array with element IDs that are allowed before + * the split + * @n_ids: the size of the element ID array + * @offset: offset where to start splitting in the buffer + * + * This function splits an IE buffer by updating the @offset + * variable to point to the location where the buffer should be + * split. + * + * It assumes that the given IE buffer is well-formed, this + * has to be guaranteed by the caller! + * + * It also assumes that the IEs in the buffer are ordered + * correctly, if not the result of using this function will not + * be ordered correctly either, i.e. it does no reordering. + * + * The function returns the offset where the next part of the + * buffer starts, which may be @ielen if the entire (remainder) + * of the buffer should be used. 
+ */ +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) + pos += 2 + ies[pos + 1]; + + return pos; +} + +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) + pos += 2 + ies[pos + 1]; + + return pos; +} diff --git a/net/mac80211/work.c b/net/mac80211/work.c index c03c22d5bca3..affdd10b67ad 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -204,6 +204,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; const u8 *ies; + size_t offset = 0, noffset; int i, len, count, rates_len, supp_rates_len; u16 capab; struct ieee80211_supported_band *sband; @@ -337,14 +338,26 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } } - /* - * XXX: These IEs could contain (vendor-specified) - * IEs that belong after HT -- the buffer may - * need to be split up. 
- */ + /* if present, add any custom IEs that go before HT */ if (wk->ie_len && wk->ie) { - pos = skb_put(skb, wk->ie_len); - memcpy(pos, wk->ie, wk->ie_len); + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_PWR_CAPABILITY, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_RRM_ENABLED_CAPABILITIES, + WLAN_EID_MOBILITY_DOMAIN, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(wk->ie, wk->ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; } if (wk->assoc.use_11n && wk->assoc.wmm_used && @@ -352,6 +365,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, ieee80211_add_ht_ie(skb, wk->assoc.ht_information_ie, sband, wk->chan, wk->assoc.smps); + /* if present, add any custom non-vendor IEs that go after HT */ + if (wk->ie_len && wk->ie) { + noffset = ieee80211_ie_split_vendor(wk->ie, wk->ie_len, + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; + } + if (wk->assoc.wmm_used && local->hw.queues >= 4) { pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; @@ -365,6 +387,13 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, *pos++ = 0; } + /* add any remaining custom (i.e. 
vendor specific here) IEs */ + if (wk->ie_len && wk->ie) { + noffset = wk->ie_len; + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + } + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } -- cgit v1.2.3 From 9588bbd5529461a3dacd435bf239c84c3508f569 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 23 Dec 2009 13:15:41 +0100 Subject: cfg80211: add remain-on-channel command Add new commands for requesting the driver to remain awake on a specified channel for the specified amount of time (and another command to cancel such an operation). This can be used to implement userspace-controlled off-channel operations, like Public Action frame exchange on another channel than the operation channel. The off-channel operation should behave similarly to scan, i.e. the local station (if associated) moves into power save mode to request the AP to buffer frames for it and then moves to the other channel to allow the off-channel operation to be completed. The duration parameter can be used to request enough time to receive a response from the target station. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 36 ++++++++ include/net/cfg80211.h | 47 +++++++++++ net/wireless/chan.c | 41 +++++---- net/wireless/core.h | 3 + net/wireless/mlme.c | 27 ++++++ net/wireless/nl80211.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++- net/wireless/nl80211.h | 11 +++ 7 files changed, 368 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index da8ea2e19273..2bfbe88837ef 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -270,6 +270,31 @@ * @NL80211_CMD_SET_WIPHY_NETNS: Set a wiphy's netns. Note that all devices * associated with this wiphy must be down and will follow. 
* + * @NL80211_CMD_REMAIN_ON_CHANNEL: Request to remain awake on the specified + * channel for the specified amount of time. This can be used to do + * off-channel operations like transmit a Public Action frame and wait for + * a response while being associated to an AP on another channel. + * %NL80211_ATTR_WIPHY or %NL80211_ATTR_IFINDEX is used to specify which + * radio is used. %NL80211_ATTR_WIPHY_FREQ is used to specify the + * frequency for the operation and %NL80211_ATTR_WIPHY_CHANNEL_TYPE may be + * optionally used to specify additional channel parameters. + * %NL80211_ATTR_DURATION is used to specify the duration in milliseconds + * to remain on the channel. This command is also used as an event to + * notify when the requested duration starts (it may take a while for the + * driver to schedule this time due to other concurrent needs for the + * radio). + * When called, this operation returns a cookie (%NL80211_ATTR_COOKIE) + * that will be included with any events pertaining to this request; + * the cookie is also used to cancel the request. + * @NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL: This command can be used to cancel a + * pending remain-on-channel duration if the desired operation has been + * completed prior to expiration of the originally requested duration. + * %NL80211_ATTR_WIPHY or %NL80211_ATTR_IFINDEX is used to specify the + * radio. The %NL80211_ATTR_COOKIE attribute must be given as well to + * uniquely identify the request. + * This command is also used as an event to notify when a requested + * remain-on-channel duration has expired. 
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -353,6 +378,9 @@ enum nl80211_commands { NL80211_CMD_DEL_PMKSA, NL80211_CMD_FLUSH_PMKSA, + NL80211_CMD_REMAIN_ON_CHANNEL, + NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -606,6 +634,10 @@ enum nl80211_commands { * @NL80211_ATTR_MAX_NUM_PMKIDS: maximum number of PMKIDs a firmware can * cache, a wiphy attribute. * + * @NL80211_ATTR_DURATION: Duration of an operation in milliseconds, u32. + * + * @NL80211_ATTR_COOKIE: Generic 64-bit cookie to identify objects. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -743,6 +775,10 @@ enum nl80211_attrs { NL80211_ATTR_PMKID, NL80211_ATTR_MAX_NUM_PMKIDS, + NL80211_ATTR_DURATION, + + NL80211_ATTR_COOKIE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 542a477a94da..b66beb052054 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -988,6 +988,15 @@ struct cfg80211_pmksa { * * @dump_survey: get site survey information. * + * @remain_on_channel: Request the driver to remain awake on the specified + * channel for the specified duration to complete an off-channel + * operation (e.g., public action frame exchange). When the driver is + * ready on the requested channel, it must indicate this with an event + * notification by calling cfg80211_ready_on_channel(). + * @cancel_remain_on_channel: Cancel an on-going remain-on-channel operation. + * This allows the operation to be terminated prior to timeout based on + * the duration value. + * * @testmode_cmd: run a test mode command * * @set_pmksa: Cache a PMKID for a BSSID. 
This is mostly useful for fullmac @@ -1123,6 +1132,16 @@ struct cfg80211_ops { struct cfg80211_pmksa *pmksa); int (*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev); + int (*remain_on_channel)(struct wiphy *wiphy, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, + u64 *cookie); + int (*cancel_remain_on_channel)(struct wiphy *wiphy, + struct net_device *dev, + u64 cookie); + /* some temporary stuff to finish wext */ int (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout); @@ -2147,5 +2166,33 @@ void cfg80211_roamed(struct net_device *dev, const u8 *bssid, void cfg80211_disconnected(struct net_device *dev, u16 reason, u8 *ie, size_t ie_len, gfp_t gfp); +/** + * cfg80211_ready_on_channel - notification of remain_on_channel start + * @dev: network device + * @cookie: the request cookie + * @chan: The current channel (from remain_on_channel request) + * @channel_type: Channel type + * @duration: Duration in milliseconds that the driver intends to remain on the + * channel + * @gfp: allocation flags + */ +void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp); + +/** + * cfg80211_remain_on_channel_expired - remain_on_channel duration expired + * @dev: network device + * @cookie: the request cookie + * @chan: The current channel (from remain_on_channel request) + * @channel_type: Channel type + * @gfp: allocation flags + */ +void cfg80211_remain_on_channel_expired(struct net_device *dev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + gfp_t gfp); #endif /* __NET_CFG80211_H */ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index a46ac6c9b365..bf1737fc9a7e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -41,44 +41,57 @@ rdev_fixed_channel(struct
cfg80211_registered_device *rdev, return result; } -int rdev_set_freq(struct cfg80211_registered_device *rdev, - struct wireless_dev *for_wdev, +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, int freq, enum nl80211_channel_type channel_type) { struct ieee80211_channel *chan; struct ieee80211_sta_ht_cap *ht_cap; - int result; - - if (rdev_fixed_channel(rdev, for_wdev)) - return -EBUSY; - - if (!rdev->ops->set_channel) - return -EOPNOTSUPP; chan = ieee80211_get_channel(&rdev->wiphy, freq); /* Primary channel not allowed */ if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) - return -EINVAL; + return NULL; if (channel_type == NL80211_CHAN_HT40MINUS && chan->flags & IEEE80211_CHAN_NO_HT40MINUS) - return -EINVAL; + return NULL; else if (channel_type == NL80211_CHAN_HT40PLUS && chan->flags & IEEE80211_CHAN_NO_HT40PLUS) - return -EINVAL; + return NULL; ht_cap = &rdev->wiphy.bands[chan->band]->ht_cap; if (channel_type != NL80211_CHAN_NO_HT) { if (!ht_cap->ht_supported) - return -EINVAL; + return NULL; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT) - return -EINVAL; + return NULL; } + return chan; +} + +int rdev_set_freq(struct cfg80211_registered_device *rdev, + struct wireless_dev *for_wdev, + int freq, enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *chan; + int result; + + if (rdev_fixed_channel(rdev, for_wdev)) + return -EBUSY; + + if (!rdev->ops->set_channel) + return -EOPNOTSUPP; + + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (!chan) + return -EINVAL; + result = rdev->ops->set_channel(&rdev->wiphy, chan, channel_type); if (result) return result; diff --git a/net/wireless/core.h b/net/wireless/core.h index 35b712127143..30ec95f05b52 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -374,6 +374,9 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); struct ieee80211_channel * 
rdev_fixed_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *for_wdev); +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, + int freq, enum nl80211_channel_type channel_type); int rdev_set_freq(struct cfg80211_registered_device *rdev, struct wireless_dev *for_wdev, int freq, enum nl80211_channel_type channel_type); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index acaeaa784d68..11f6469b3f98 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -680,3 +680,30 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, } } } + +void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel(rdev, dev, cookie, chan, channel_type, + duration, gfp); +} +EXPORT_SYMBOL(cfg80211_ready_on_channel); + +void cfg80211_remain_on_channel_expired(struct net_device *dev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel_cancel(rdev, dev, cookie, chan, + channel_type, gfp); +} +EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 60f854377f90..ff857f10cb85 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -141,6 +141,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, [NL80211_ATTR_PMKID] = { .type = NLA_BINARY, .len = WLAN_PMKID_LEN }, + [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, + [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, }; /* policy for the attributes */ @@ -569,6 
+571,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(set_pmksa, SET_PMKSA); CMD(del_pmksa, DEL_PMKSA); CMD(flush_pmksa, FLUSH_PMKSA); + CMD(remain_on_channel, REMAIN_ON_CHANNEL); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -4283,6 +4286,143 @@ static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) } +static int nl80211_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + struct ieee80211_channel *chan; + struct sk_buff *msg; + void *hdr; + u64 cookie; + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + u32 freq, duration; + int err; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_DURATION]) + return -EINVAL; + + duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]); + + /* + * We should be on that channel for at least one jiffie, + * and more than 5 seconds seems excessive. 
+ */ + if (!duration || !msecs_to_jiffies(duration) || duration > 5000) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->remain_on_channel) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + channel_type = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + /* reject only unknown channel types; valid ones fall through */ + if (channel_type != NL80211_CHAN_NO_HT && + channel_type != NL80211_CHAN_HT20 && + channel_type != NL80211_CHAN_HT40PLUS && + channel_type != NL80211_CHAN_HT40MINUS) { + err = -EINVAL; + goto out; + } + } + + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (chan == NULL) { + err = -EINVAL; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_REMAIN_ON_CHANNEL); + + if (IS_ERR(hdr)) { + err = PTR_ERR(hdr); + goto free_msg; + } + + err = rdev->ops->remain_on_channel(&rdev->wiphy, dev, chan, + channel_type, duration, &cookie); + + if (err) + goto free_msg; + + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + genlmsg_end(msg, hdr); + err = genlmsg_reply(msg, info); + goto out; + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + out: + cfg80211_unlock_rdev(rdev); + dev_put(dev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + +static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + u64 cookie; + int err; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->cancel_remain_on_channel) { + err = -EOPNOTSUPP; + goto out; + } + + if
(!netif_running(dev)) { + err = -ENETDOWN; + goto out; + } + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); + + out: + cfg80211_unlock_rdev(rdev); + dev_put(dev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -4545,8 +4685,20 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, - + { + .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, + .doit = nl80211_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + .doit = nl80211_cancel_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; + static struct genl_multicast_group nl80211_mlme_mcgrp = { .name = "mlme", }; @@ -5134,6 +5286,70 @@ nla_put_failure: nlmsg_free(msg); } +static void nl80211_send_remain_on_chan_event( + int cmd, struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE, channel_type); + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL) + NLA_PUT_U32(msg, NL80211_ATTR_DURATION, duration); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, 
hdr); + nlmsg_free(msg); +} + +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, duration, gfp); +} + +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, 0, gfp); +} + /* initialisation/exit functions */ int nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 44cc2a76a1b0..a5e2de419b7a 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -59,4 +59,15 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp); +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp); +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp); + #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3 From 81744ee44ab2845c16ffd7d6f762f7b4a49a4750 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 29 Dec 2009 08:35:35 +0100 Subject: block: Fix incorrect alignment offset reporting and update documentation queue_sector_alignment_offset returned the wrong value which caused partitions to report an incorrect alignment_offset. 
Since offset alignment calculation is needed in several places it has been split into a separate helper function. The topology stacking function has been updated accordingly. Furthermore, comments have been added to clarify how the stacking function works. Signed-off-by: Martin K. Petersen Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-settings.c | 44 +++++++++++++++++++++++++++++++++----------- include/linux/blkdev.h | 11 +++++++++-- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index e14fcbcedbfa..d52d4adc440b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -505,20 +505,30 @@ static unsigned int lcm(unsigned int a, unsigned int b) /** * blk_stack_limits - adjust queue_limits for stacked devices - * @t: the stacking driver limits (top) - * @b: the underlying queue limits (bottom) + * @t: the stacking driver limits (top device) + * @b: the underlying queue limits (bottom, component device) * @offset: offset to beginning of data within component device * * Description: - * Merges two queue_limit structs. Returns 0 if alignment didn't - * change. Returns -1 if adding the bottom device caused - * misalignment. + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + * + * Returns 0 if the top and bottom queue_limits are compatible. The + * top device's block sizes and alignment offsets may be adjusted to + * ensure alignment with the bottom device.
If no compatible sizes + * and alignments exist, -1 is returned and the resulting top + * queue_limits will have the misaligned flag set to indicate that + * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom, granularity; + unsigned int top, bottom; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -536,15 +546,18 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - granularity = max(b->physical_block_size, b->io_min); - alignment = b->alignment_offset - (offset & (granularity - 1)); + alignment = queue_limit_alignment_offset(b, offset); + /* Bottom device has different alignment. Check that it is + * compatible with the current top alignment. + */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; - bottom = granularity + alignment; + bottom = max(b->physical_block_size, b->io_min) + alignment; + /* Verify that top and bottom intervals line up */ if (max(top, bottom) & (min(top, bottom) - 1)) t->misaligned = 1; } @@ -561,32 +574,39 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; + /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; } + /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; } + /* Optimal I/O a multiple of the physical block size? 
*/ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; } + /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) & (max(t->physical_block_size, t->io_min) - 1); + /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; /* Discard alignment and granularity */ if (b->discard_granularity) { + unsigned int granularity = b->discard_granularity; + offset &= granularity - 1; - alignment = b->discard_alignment - - (offset & (b->discard_granularity - 1)); + alignment = (granularity + b->discard_alignment - offset) + & (granularity - 1); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -598,6 +618,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_misaligned = 1; } + t->max_discard_sectors = min_not_zero(t->max_discard_sectors, + b->max_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm(t->discard_alignment, alignment) & diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 784a919aa0d0..59b832be3044 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1116,11 +1116,18 @@ static inline int queue_alignment_offset(struct request_queue *q) return q->limits.alignment_offset; } +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + + offset &= granularity - 1; + return (granularity + lim->alignment_offset - offset) & (granularity - 1); +} + static inline int queue_sector_alignment_offset(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.alignment_offset) - & (q->limits.io_min - 1); + return queue_limit_alignment_offset(&q->limits, sector << 9); } static inline int bdev_alignment_offset(struct block_device *bdev) -- cgit 
v1.2.3 From db5d247ae811f49185a71e703b65acad845e4b18 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 24 Dec 2009 12:05:58 +0100 Subject: firewire: fix use of multiple AV/C devices, allow multiple FCP listeners Control of more than one AV/C device at once --- e.g. camcorders, tape decks, audio devices, TV tuners --- failed or worked only unreliably, depending on driver implementation. This affected kernelspace and userspace drivers alike and was caused by firewire-core's inability to accept multiple registrations of FCP listeners. The fix allows multiple address handlers to be registered for the FCP command and response registers. When a request for these registers is received, all handlers are invoked, and the Firewire response is generated by the core and not by any handler. The cdev API does not change, i.e., userspace is still expected to send a response for FCP requests; this response is silently ignored. Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter (changelog, rebased, whitespace) --- drivers/firewire/core-cdev.c | 26 ++++--- drivers/firewire/core-transaction.c | 118 ++++++++++++++++++++++++++------ drivers/media/dvb/firewire/firedtv-fw.c | 12 +--- include/linux/firewire-cdev.h | 3 + include/linux/firewire.h | 4 +- 5 files changed, 119 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 231e6ee5ba43..2cb22d160f6e 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -601,8 +601,9 @@ static void release_request(struct client *client, struct inbound_transaction_resource *r = container_of(resource, struct inbound_transaction_resource, resource); - fw_send_response(client->device->card, r->request, - RCODE_CONFLICT_ERROR); + if (r->request) + fw_send_response(client->device->card, r->request, + RCODE_CONFLICT_ERROR); kfree(r); } @@ -645,7 +646,8 @@ static void handle_request(struct fw_card *card, struct fw_request *request, 
failed: kfree(r); kfree(e); - fw_send_response(card, request, RCODE_CONFLICT_ERROR); + if (request) + fw_send_response(card, request, RCODE_CONFLICT_ERROR); } static void release_address_handler(struct client *client, @@ -715,15 +717,17 @@ static int ioctl_send_response(struct client *client, void *buffer) r = container_of(resource, struct inbound_transaction_resource, resource); - if (request->length < r->length) - r->length = request->length; - - if (copy_from_user(r->data, u64_to_uptr(request->data), r->length)) { - ret = -EFAULT; - goto out; + if (r->request) { + if (request->length < r->length) + r->length = request->length; + if (copy_from_user(r->data, u64_to_uptr(request->data), + r->length)) { + ret = -EFAULT; + goto out; + } + fw_send_response(client->device->card, r->request, + request->rcode); } - - fw_send_response(client->device->card, r->request, request->rcode); out: kfree(r); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 842739df23e2..495849eb13cc 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -432,14 +432,20 @@ static struct fw_address_handler *lookup_overlapping_address_handler( return NULL; } +static bool is_enclosing_handler(struct fw_address_handler *handler, + unsigned long long offset, size_t length) +{ + return handler->offset <= offset && + offset + length <= handler->offset + handler->length; +} + static struct fw_address_handler *lookup_enclosing_address_handler( struct list_head *list, unsigned long long offset, size_t length) { struct fw_address_handler *handler; list_for_each_entry(handler, list, link) { - if (handler->offset <= offset && - offset + length <= handler->offset + handler->length) + if (is_enclosing_handler(handler, offset, length)) return handler; } @@ -465,6 +471,12 @@ const struct fw_address_region fw_unit_space_region = { .start = 0xfffff0000900ULL, .end = 0x1000000000000ULL, }; #endif /* 0 */ +static bool 
is_in_fcp_region(u64 offset, size_t length) +{ + return offset >= (CSR_REGISTER_BASE | CSR_FCP_COMMAND) && + offset + length <= (CSR_REGISTER_BASE | CSR_FCP_END); +} + /** * fw_core_add_address_handler - register for incoming requests * @handler: callback @@ -477,8 +489,11 @@ const struct fw_address_region fw_unit_space_region = * give the details of the particular request. * * Return value: 0 on success, non-zero otherwise. + * * The start offset of the handler's address region is determined by * fw_core_add_address_handler() and is returned in handler->offset. + * + * Address allocations are exclusive, except for the FCP registers. */ int fw_core_add_address_handler(struct fw_address_handler *handler, const struct fw_address_region *region) @@ -498,10 +513,12 @@ int fw_core_add_address_handler(struct fw_address_handler *handler, handler->offset = region->start; while (handler->offset + handler->length <= region->end) { - other = - lookup_overlapping_address_handler(&address_handler_list, - handler->offset, - handler->length); + if (is_in_fcp_region(handler->offset, handler->length)) + other = NULL; + else + other = lookup_overlapping_address_handler + (&address_handler_list, + handler->offset, handler->length); if (other != NULL) { handler->offset += other->length; } else { @@ -668,6 +685,9 @@ static struct fw_request *allocate_request(struct fw_packet *p) void fw_send_response(struct fw_card *card, struct fw_request *request, int rcode) { + if (WARN_ONCE(!request, "invalid for FCP address handlers")) + return; + /* unified transaction or broadcast transaction: don't respond */ if (request->ack != ACK_PENDING || HEADER_DESTINATION_IS_BROADCAST(request->request_header[0])) { @@ -686,26 +706,15 @@ void fw_send_response(struct fw_card *card, } EXPORT_SYMBOL(fw_send_response); -void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) +static void handle_exclusive_region_request(struct fw_card *card, + struct fw_packet *p, + struct fw_request *request, + 
unsigned long long offset) { struct fw_address_handler *handler; - struct fw_request *request; - unsigned long long offset; unsigned long flags; int tcode, destination, source; - if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE) - return; - - request = allocate_request(p); - if (request == NULL) { - /* FIXME: send statically allocated busy packet. */ - return; - } - - offset = - ((unsigned long long) - HEADER_GET_OFFSET_HIGH(p->header[1]) << 32) | p->header[2]; tcode = HEADER_GET_TCODE(p->header[0]); destination = HEADER_GET_DESTINATION(p->header[0]); source = HEADER_GET_SOURCE(p->header[1]); @@ -732,6 +741,73 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) request->data, request->length, handler->callback_data); } + +static void handle_fcp_region_request(struct fw_card *card, + struct fw_packet *p, + struct fw_request *request, + unsigned long long offset) +{ + struct fw_address_handler *handler; + unsigned long flags; + int tcode, destination, source; + + if ((offset != (CSR_REGISTER_BASE | CSR_FCP_COMMAND) && + offset != (CSR_REGISTER_BASE | CSR_FCP_RESPONSE)) || + request->length > 0x200) { + fw_send_response(card, request, RCODE_ADDRESS_ERROR); + + return; + } + + tcode = HEADER_GET_TCODE(p->header[0]); + destination = HEADER_GET_DESTINATION(p->header[0]); + source = HEADER_GET_SOURCE(p->header[1]); + + if (tcode != TCODE_WRITE_QUADLET_REQUEST && + tcode != TCODE_WRITE_BLOCK_REQUEST) { + fw_send_response(card, request, RCODE_TYPE_ERROR); + + return; + } + + spin_lock_irqsave(&address_handler_lock, flags); + list_for_each_entry(handler, &address_handler_list, link) { + if (is_enclosing_handler(handler, offset, request->length)) + handler->address_callback(card, NULL, tcode, + destination, source, + p->generation, p->speed, + offset, request->data, + request->length, + handler->callback_data); + } + spin_unlock_irqrestore(&address_handler_lock, flags); + + fw_send_response(card, request, RCODE_COMPLETE); +} + +void 
fw_core_handle_request(struct fw_card *card, struct fw_packet *p) +{ + struct fw_request *request; + unsigned long long offset; + + if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE) + return; + + request = allocate_request(p); + if (request == NULL) { + /* FIXME: send statically allocated busy packet. */ + return; + } + + offset = ((u64)HEADER_GET_OFFSET_HIGH(p->header[1]) << 32) | + p->header[2]; + + if (!is_in_fcp_region(offset, request->length)) + handle_exclusive_region_request(card, p, request, offset); + else + handle_fcp_region_request(card, p, request, offset); + +} EXPORT_SYMBOL(fw_core_handle_request); void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c index fe44789ab037..6223bf01efe9 100644 --- a/drivers/media/dvb/firewire/firedtv-fw.c +++ b/drivers/media/dvb/firewire/firedtv-fw.c @@ -202,14 +202,8 @@ static void handle_fcp(struct fw_card *card, struct fw_request *request, unsigned long flags; int su; - if ((tcode != TCODE_WRITE_QUADLET_REQUEST && - tcode != TCODE_WRITE_BLOCK_REQUEST) || - offset != CSR_REGISTER_BASE + CSR_FCP_RESPONSE || - length == 0 || - (((u8 *)payload)[0] & 0xf0) != 0) { - fw_send_response(card, request, RCODE_TYPE_ERROR); + if (length < 2 || (((u8 *)payload)[0] & 0xf0) != 0) return; - } su = ((u8 *)payload)[1] & 0x7; @@ -230,10 +224,8 @@ static void handle_fcp(struct fw_card *card, struct fw_request *request, } spin_unlock_irqrestore(&node_list_lock, flags); - if (fdtv) { + if (fdtv) avc_recv(fdtv, payload, length); - fw_send_response(card, request, RCODE_COMPLETE); - } } static struct fw_address_handler fcp_handler = { diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index c6b3ca3af6df..1f716d9f714b 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -340,6 +340,9 @@ struct fw_cdev_send_response { * The @closure field is passed back to userspace in the 
response event. * The @handle field is an out parameter, returning a handle to the allocated * range to be used for later deallocation of the range. + * + * The address range is allocated on all local nodes. The address allocation + * is exclusive except for the FCP command and response registers. */ struct fw_cdev_allocate { __u64 offset; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 9416a461b696..a0e67150a729 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -248,8 +248,8 @@ typedef void (*fw_transaction_callback_t)(struct fw_card *card, int rcode, void *data, size_t length, void *callback_data); /* - * Important note: The callback must guarantee that either fw_send_response() - * or kfree() is called on the @request. + * Important note: Except for the FCP registers, the callback must guarantee + * that either fw_send_response() or kfree() is called on the @request. */ typedef void (*fw_address_callback_t)(struct fw_card *card, struct fw_request *request, -- cgit v1.2.3 From 1f8fef7b3388b5a976e80839679b5bae581a1091 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 24 Dec 2009 11:59:57 +0100 Subject: firewire: add fw_csr_string() helper function The core (sysfs attributes), the firedtv driver, and possible future drivers all read strings from some configuration ROM directory. Factor out the generic code from show_text_leaf() into a new helper function, modified slightly to handle arbitrary buffer sizes. 
Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 110 ++++++++++++++++++++++---------- drivers/media/dvb/firewire/firedtv-fw.c | 39 ++--------- include/linux/firewire.h | 2 + 3 files changed, 84 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 9d0dfcbe2c1c..a39e4344cd58 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -59,6 +59,67 @@ int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value) } EXPORT_SYMBOL(fw_csr_iterator_next); +static u32 *search_leaf(u32 *directory, int search_key) +{ + struct fw_csr_iterator ci; + int last_key = 0, key, value; + + fw_csr_iterator_init(&ci, directory); + while (fw_csr_iterator_next(&ci, &key, &value)) { + if (last_key == search_key && + key == (CSR_DESCRIPTOR | CSR_LEAF)) + return ci.p - 1 + value; + last_key = key; + } + return NULL; +} + +static int textual_leaf_to_string(u32 *block, char *buf, size_t size) +{ + unsigned int quadlets, length; + + if (!size || !buf) + return -EINVAL; + + quadlets = min(block[0] >> 16, 256u); + if (quadlets < 2) + return -ENODATA; + + if (block[1] != 0 || block[2] != 0) + /* unknown language/character set */ + return -ENODATA; + + block += 3; + quadlets -= 2; + for (length = 0; length < quadlets * 4 && length + 1 < size; length++) { + char c = block[length / 4] >> (24 - 8 * (length % 4)); + if (c == '\0') + break; + buf[length] = c; + } + buf[length] = '\0'; + return length; +} + +/** + * fw_csr_string - reads a string from the configuration ROM + * @directory: device or unit directory; + * fw_device->config_rom+5 or fw_unit->directory + * @key: the key of the preceding directory entry + * @buf: where to put the string + * @size: size of @buf, in bytes + * + * Returns string length (>= 0) or error code (< 0). 
+ */ +int fw_csr_string(u32 *directory, int key, char *buf, size_t size) +{ + u32 *leaf = search_leaf(directory, key); + if (!leaf) + return -ENOENT; + return textual_leaf_to_string(leaf, buf, size); +} +EXPORT_SYMBOL(fw_csr_string); + static bool is_fw_unit(struct device *dev); static int match_unit_directory(u32 *directory, u32 match_flags, @@ -226,10 +287,10 @@ static ssize_t show_text_leaf(struct device *dev, { struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); - struct fw_csr_iterator ci; - u32 *dir, *block = NULL, *p, *end; - int length, key, value, last_key = 0, ret = -ENOENT; - char *b; + u32 *dir; + size_t bufsize; + char dummy_buf[2]; + int ret; down_read(&fw_device_rwsem); @@ -238,40 +299,23 @@ static ssize_t show_text_leaf(struct device *dev, else dir = fw_device(dev)->config_rom + 5; - fw_csr_iterator_init(&ci, dir); - while (fw_csr_iterator_next(&ci, &key, &value)) { - if (attr->key == last_key && - key == (CSR_DESCRIPTOR | CSR_LEAF)) - block = ci.p - 1 + value; - last_key = key; + if (buf) { + bufsize = PAGE_SIZE - 1; + } else { + buf = dummy_buf; + bufsize = 1; } - if (block == NULL) - goto out; - - length = min(block[0] >> 16, 256U); - if (length < 3) - goto out; - - if (block[1] != 0 || block[2] != 0) - /* Unknown encoding. */ - goto out; + ret = fw_csr_string(dir, attr->key, buf, bufsize); - if (buf == NULL) { - ret = length * 4; - goto out; + if (ret >= 0) { + /* Strip trailing whitespace and add newline. */ + while (ret > 0 && isspace(buf[ret - 1])) + ret--; + strcpy(buf + ret, "\n"); + ret++; } - b = buf; - end = &block[length + 1]; - for (p = &block[3]; p < end; p++, b += 4) - * (u32 *) b = (__force u32) __cpu_to_be32(*p); - - /* Strip trailing whitespace and add newline. 
*/ - while (b--, (isspace(*b) || *b == '\0') && b > buf); - strcpy(b + 1, "\n"); - ret = b + 2 - buf; - out: up_read(&fw_device_rwsem); return ret; diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c index 6223bf01efe9..4253b7ab0097 100644 --- a/drivers/media/dvb/firewire/firedtv-fw.c +++ b/drivers/media/dvb/firewire/firedtv-fw.c @@ -239,47 +239,18 @@ static const struct fw_address_region fcp_region = { }; /* Adjust the template string if models with longer names appear. */ -#define MAX_MODEL_NAME_LEN ((int)DIV_ROUND_UP(sizeof("FireDTV ????"), 4)) - -static size_t model_name(u32 *directory, __be32 *buffer) -{ - struct fw_csr_iterator ci; - int i, length, key, value, last_key = 0; - u32 *block = NULL; - - fw_csr_iterator_init(&ci, directory); - while (fw_csr_iterator_next(&ci, &key, &value)) { - if (last_key == CSR_MODEL && - key == (CSR_DESCRIPTOR | CSR_LEAF)) - block = ci.p - 1 + value; - last_key = key; - } - - if (block == NULL) - return 0; - - length = min((int)(block[0] >> 16) - 2, MAX_MODEL_NAME_LEN); - if (length <= 0) - return 0; - - /* fast-forward to text string */ - block += 3; - - for (i = 0; i < length; i++) - buffer[i] = cpu_to_be32(block[i]); - - return length * 4; -} +#define MAX_MODEL_NAME_LEN sizeof("FireDTV ????") static int node_probe(struct device *dev) { struct firedtv *fdtv; - __be32 name[MAX_MODEL_NAME_LEN]; + char name[MAX_MODEL_NAME_LEN]; int name_len, err; - name_len = model_name(fw_unit(dev)->directory, name); + name_len = fw_csr_string(fw_unit(dev)->directory, CSR_MODEL, + name, sizeof(name)); - fdtv = fdtv_alloc(dev, &backend, (char *)name, name_len); + fdtv = fdtv_alloc(dev, &backend, name, name_len >= 0 ? 
name_len : 0); if (!fdtv) return -ENOMEM; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index a0e67150a729..5246869d8083 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -72,6 +72,8 @@ struct fw_csr_iterator { void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); +int fw_csr_string(u32 *directory, int key, char *buf, size_t size); + extern struct bus_type fw_bus_type; struct fw_card_driver; -- cgit v1.2.3 From 3c2c58cb33b3b15a2c4871babeec8fe1456e1db6 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 26 Dec 2009 01:43:21 +0100 Subject: firewire: core: fw_csr_string addendum Whitespace and comment changes, and a different way to say i + 1 < end. Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 26 ++++++++++++++++---------- include/linux/firewire.h | 1 - 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index a39e4344cd58..5d5c6a689837 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -69,19 +69,22 @@ static u32 *search_leaf(u32 *directory, int search_key) if (last_key == search_key && key == (CSR_DESCRIPTOR | CSR_LEAF)) return ci.p - 1 + value; + last_key = key; } + return NULL; } static int textual_leaf_to_string(u32 *block, char *buf, size_t size) { - unsigned int quadlets, length; + unsigned int quadlets, i; + char c; if (!size || !buf) return -EINVAL; - quadlets = min(block[0] >> 16, 256u); + quadlets = min(block[0] >> 16, 256U); if (quadlets < 2) return -ENODATA; @@ -91,31 +94,34 @@ static int textual_leaf_to_string(u32 *block, char *buf, size_t size) block += 3; quadlets -= 2; - for (length = 0; length < quadlets * 4 && length + 1 < size; length++) { - char c = block[length / 4] >> (24 - 8 * (length % 4)); + for (i = 0; i < quadlets * 4 && i < size - 1; i++) { + c = block[i
/ 4] >> (24 - 8 * (i % 4)); if (c == '\0') break; - buf[length] = c; + buf[i] = c; } - buf[length] = '\0'; - return length; + buf[i] = '\0'; + + return i; } /** * fw_csr_string - reads a string from the configuration ROM - * @directory: device or unit directory; - * fw_device->config_rom+5 or fw_unit->directory + * @directory: e.g. root directory or unit directory * @key: the key of the preceding directory entry * @buf: where to put the string * @size: size of @buf, in bytes * - * Returns string length (>= 0) or error code (< 0). + * The string is taken from a minimal ASCII text descriptor leaf after + * the immediate entry with @key. The string is zero-terminated. + * Returns strlen(buf) or a negative error code. */ int fw_csr_string(u32 *directory, int key, char *buf, size_t size) { u32 *leaf = search_leaf(directory, key); if (!leaf) return -ENOENT; + return textual_leaf_to_string(leaf, buf, size); } EXPORT_SYMBOL(fw_csr_string); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5246869d8083..df680216e7b6 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -71,7 +71,6 @@ struct fw_csr_iterator { void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); - int fw_csr_string(u32 *directory, int key, char *buf, size_t size); extern struct bus_type fw_bus_type; -- cgit v1.2.3 From 13b302d0a217580c0129b0641b0ca8b592e437b0 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 26 Dec 2009 01:44:10 +0100 Subject: firewire: qualify config ROM cache pointers as const pointers Several config ROM related functions only peek at the ROM cache; mark their arguments as const pointers. Ditto fw_device.config_rom and fw_unit.directory, as the memory behind them is meant to be write-once. 
Signed-off-by: Stefan Richter --- drivers/firewire/core-device.c | 21 +++++++++++---------- drivers/firewire/sbp2.c | 5 +++-- include/linux/firewire.h | 12 ++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 5d5c6a689837..eecd52dc8e98 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -43,7 +43,7 @@ #include "core.h" -void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 * p) +void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p) { ci->p = p + 1; ci->end = ci->p + (p[0] >> 16); @@ -59,7 +59,7 @@ int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value) } EXPORT_SYMBOL(fw_csr_iterator_next); -static u32 *search_leaf(u32 *directory, int search_key) +static const u32 *search_leaf(const u32 *directory, int search_key) { struct fw_csr_iterator ci; int last_key = 0, key, value; @@ -76,7 +76,7 @@ static u32 *search_leaf(u32 *directory, int search_key) return NULL; } -static int textual_leaf_to_string(u32 *block, char *buf, size_t size) +static int textual_leaf_to_string(const u32 *block, char *buf, size_t size) { unsigned int quadlets, i; char c; @@ -116,9 +116,9 @@ static int textual_leaf_to_string(u32 *block, char *buf, size_t size) * the immediate entry with @key. The string is zero-terminated. * Returns strlen(buf) or a negative error code. 
*/ -int fw_csr_string(u32 *directory, int key, char *buf, size_t size) +int fw_csr_string(const u32 *directory, int key, char *buf, size_t size) { - u32 *leaf = search_leaf(directory, key); + const u32 *leaf = search_leaf(directory, key); if (!leaf) return -ENOENT; @@ -128,7 +128,7 @@ EXPORT_SYMBOL(fw_csr_string); static bool is_fw_unit(struct device *dev); -static int match_unit_directory(u32 *directory, u32 match_flags, +static int match_unit_directory(const u32 *directory, u32 match_flags, const struct ieee1394_device_id *id) { struct fw_csr_iterator ci; @@ -262,7 +262,7 @@ static ssize_t show_immediate(struct device *dev, struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); struct fw_csr_iterator ci; - u32 *dir; + const u32 *dir; int key, value, ret = -ENOENT; down_read(&fw_device_rwsem); @@ -293,7 +293,7 @@ static ssize_t show_text_leaf(struct device *dev, { struct config_rom_attribute *attr = container_of(dattr, struct config_rom_attribute, attr); - u32 *dir; + const u32 *dir; size_t bufsize; char dummy_buf[2]; int ret; @@ -421,7 +421,7 @@ static ssize_t guid_show(struct device *dev, return ret; } -static int units_sprintf(char *buf, u32 *directory) +static int units_sprintf(char *buf, const u32 *directory) { struct fw_csr_iterator ci; int key, value; @@ -503,7 +503,8 @@ static int read_rom(struct fw_device *device, */ static int read_bus_info_block(struct fw_device *device, int generation) { - u32 *rom, *stack, *old_rom, *new_rom; + const u32 *old_rom, *new_rom; + u32 *rom, *stack; u32 sp, key; int i, end, length, ret = -1; diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index d485cdd8cbac..7e33b0b1704c 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -1014,7 +1014,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) return 0; } -static int sbp2_scan_logical_unit_dir(struct sbp2_target *tgt, u32 *directory) +static int sbp2_scan_logical_unit_dir(struct 
sbp2_target *tgt, + const u32 *directory) { struct fw_csr_iterator ci; int key, value; @@ -1027,7 +1028,7 @@ static int sbp2_scan_logical_unit_dir(struct sbp2_target *tgt, u32 *directory) return 0; } -static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory, +static int sbp2_scan_unit_dir(struct sbp2_target *tgt, const u32 *directory, u32 *model, u32 *firmware_revision) { struct fw_csr_iterator ci; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index df680216e7b6..4bd94bf5e739 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -65,13 +65,13 @@ #define CSR_DIRECTORY_ID 0x20 struct fw_csr_iterator { - u32 *p; - u32 *end; + const u32 *p; + const u32 *end; }; -void fw_csr_iterator_init(struct fw_csr_iterator *ci, u32 *p); +void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p); int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); -int fw_csr_string(u32 *directory, int key, char *buf, size_t size); +int fw_csr_string(const u32 *directory, int key, char *buf, size_t size); extern struct bus_type fw_bus_type; @@ -163,7 +163,7 @@ struct fw_device { struct mutex client_list_mutex; struct list_head client_list; - u32 *config_rom; + const u32 *config_rom; size_t config_rom_length; int config_rom_retries; unsigned is_local:1; @@ -205,7 +205,7 @@ int fw_device_enable_phys_dma(struct fw_device *device); */ struct fw_unit { struct device device; - u32 *directory; + const u32 *directory; struct fw_attribute_group attribute_group; }; -- cgit v1.2.3 From 9bd3f98821a83041e77ee25158b80b535d02d7b4 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 30 Dec 2009 08:41:07 +0100 Subject: block: blk_rq_err_sectors cleanup blk_rq_err_sectors() seems useless, get rid of it. 
Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 59b832be3044..9b98173a8184 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -845,7 +845,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment - * blk_rq_err_sectors() : sectors left till the next error boundary */ static inline sector_t blk_rq_pos(const struct request *rq) { @@ -874,11 +873,6 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } -static inline unsigned int blk_rq_err_sectors(const struct request *rq) -{ - return blk_rq_err_bytes(rq) >> 9; -} - /* * Request issue related functions. */ -- cgit v1.2.3 From e96dc9674cb597de4fee757ed005c8465072d13f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:26 +0800 Subject: tracing/syscalls: Fix typo in SYSCALL_DEFINE0 The struct syscall_metadata variable name in SYSCALL_DEFINE0 should be __syscall_meta__##sname instead of __syscall_meta_##sname to match the name that is in SYSCALL_DEFINE1/2/3/4/5/6. This error causes event_enter_##sname->data to point to the wrong location, which causes syscalls which are defined by SYSCALL_DEFINE0() not to be traced. 
Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D2E.1010807@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 65793e90d6f6..207466a49f3d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -195,7 +195,7 @@ struct perf_event_attr; static const struct syscall_metadata __used \ __attribute__((__aligned__(4))) \ __attribute__((section("__syscalls_metadata"))) \ - __syscall_meta_##sname = { \ + __syscall_meta__##sname = { \ .name = "sys_"#sname, \ .nb_args = 0, \ .enter_event = &event_enter__##sname, \ -- cgit v1.2.3 From ed656d8deccc5669afa33387568e7ec6f14e3e94 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sat, 26 Dec 2009 17:58:11 +0100 Subject: kfifo: Fix typo in comment It's DECLARE_KFIFO, not DECLARED_KFIFO. Signed-off-by: Rolf Eike Beer Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 3d44e9c65a8e..7c6b32a1421c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -81,7 +81,7 @@ union { \ } /** - * INIT_KFIFO - Initialize a kfifo declared by DECLARED_KFIFO + * INIT_KFIFO - Initialize a kfifo declared by DECLARE_KFIFO * @name: name of the declared kfifo datatype */ #define INIT_KFIFO(name) \ -- cgit v1.2.3 From d7f0eea9e431e1b8b0742a74db1a9490730b2a25 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Dec 2009 15:36:42 +0800 Subject: ACPI: introduce kernel parameter acpi_sleep=sci_force_enable Introduce kernel parameter acpi_sleep=sci_force_enable. Some laptops require SCI_EN to be set directly on resume, or else they hang somewhere in the resume code path.
We already have a blacklist for these laptops but we still need this option, especially when debugging some suspend/resume problems, in case there are systems that need this workaround and are not yet in the blacklist. Signed-off-by: Zhang Rui Acked-by: Rafael J. Wysocki Signed-off-by: Len Brown --- Documentation/kernel-parameters.txt | 5 ++++- arch/x86/kernel/acpi/sleep.c | 2 ++ drivers/acpi/sleep.c | 29 +++++++++++++++++------------ include/linux/acpi.h | 1 + 4 files changed, 24 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ba4d9dff113..736d45602886 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -240,7 +240,7 @@ and is between 256 and 4096 characters. It is defined in the file acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, s4_nonvs } + old_ordering, s4_nonvs, sci_force_enable } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep @@ -253,6 +253,9 @@ and is between 256 and 4096 characters. It is defined in the file of _PTS is used by default). s4_nonvs prevents the kernel from saving/restoring the ACPI NVS memory during hibernation. + sci_force_enable causes the kernel to set SCI_EN directly + on resume from S1/S3 (which is against the ACPI spec, + but some broken systems don't work without it). acpi_use_timer_override [HW,ACPI] Use timer override. 
For some broken Nvidia NF5 boards diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f9961034e557 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 5f2c379ab7bf..79d33d908b5a 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -80,6 +80,23 @@ static int acpi_sleep_prepare(u32 acpi_state) #ifdef CONFIG_ACPI_SLEEP static u32 acpi_target_sleep_state = ACPI_STATE_S0; +/* + * According to the ACPI specification the BIOS should make sure that ACPI is + * enabled and SCI_EN bit is set on wake-up from S1 - S3 sleep states. Still, + * some BIOSes don't do that and therefore we use acpi_enable() to enable ACPI + * on such systems during resume. Unfortunately that doesn't help in + * particularly pathological cases in which SCI_EN has to be set directly on + * resume, although the specification states very clearly that this flag is + * owned by the hardware. The set_sci_en_on_resume variable will be set in such + * cases. + */ +static bool set_sci_en_on_resume; + +void __init acpi_set_sci_en_on_resume(void) +{ + set_sci_en_on_resume = true; +} + /* * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the * user to request that behavior by using the 'acpi_old_suspend_ordering' @@ -170,18 +187,6 @@ static void acpi_pm_end(void) #endif /* CONFIG_ACPI_SLEEP */ #ifdef CONFIG_SUSPEND -/* - * According to the ACPI specification the BIOS should make sure that ACPI is - * enabled and SCI_EN bit is set on wake-up from S1 - S3 sleep states. 
Still, - * some BIOSes don't do that and therefore we use acpi_enable() to enable ACPI - * on such systems during resume. Unfortunately that doesn't help in - * particularly pathological cases in which SCI_EN has to be set directly on - * resume, although the specification states very clearly that this flag is - * owned by the hardware. The set_sci_en_on_resume variable will be set in such - * cases. - */ -static bool set_sci_en_on_resume; - extern void do_suspend_lowlevel(void); static u32 acpi_suspend_states[] = { diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ce945d4845fc..36924255c0d5 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -251,6 +251,7 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); +void __init acpi_set_sci_en_on_resume(void); #endif /* CONFIG_PM_SLEEP */ struct acpi_osc_context { -- cgit v1.2.3 From 2f5cb43406d0b29b96248f5328a14a6f6abf8ae6 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 30 Dec 2009 08:23:30 +0000 Subject: phylib: Properly reinitialize PHYs after hibernation Since hibernation assumes power loss, we should fully reinitialize PHYs (including platform fixups), as if PHYs were just attached. This patch factors phy_init_hw() out of phy_attach_direct(), then converts mdio_bus to dev_pm_ops and adds an appropriate restore() callback. Signed-off-by: Anton Vorontsov Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio_bus.c | 50 +++++++++++++++++++++++++++++++++++++------- drivers/net/phy/phy_device.c | 30 +++++++++++++------------- include/linux/phy.h | 1 + 3 files changed, 59 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 49252d390903..e17b70291bbc 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -264,6 +264,8 @@ static int mdio_bus_match(struct device *dev, struct device_driver *drv) (phydev->phy_id & phydrv->phy_id_mask)); } +#ifdef CONFIG_PM + static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { struct device_driver *drv = phydev->dev.driver; @@ -295,10 +297,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) return true; } -/* Suspend and resume. Copied from platform_suspend and - * platform_resume - */ -static int mdio_bus_suspend(struct device * dev, pm_message_t state) +static int mdio_bus_suspend(struct device *dev) { struct phy_driver *phydrv = to_phy_driver(dev->driver); struct phy_device *phydev = to_phy_device(dev); @@ -318,7 +317,7 @@ static int mdio_bus_suspend(struct device * dev, pm_message_t state) return phydrv->suspend(phydev); } -static int mdio_bus_resume(struct device * dev) +static int mdio_bus_resume(struct device *dev) { struct phy_driver *phydrv = to_phy_driver(dev->driver); struct phy_device *phydev = to_phy_device(dev); @@ -338,11 +337,48 @@ no_resume: return 0; } +static int mdio_bus_restore(struct device *dev) +{ + struct phy_device *phydev = to_phy_device(dev); + struct net_device *netdev = phydev->attached_dev; + int ret; + + if (!netdev) + return 0; + + ret = phy_init_hw(phydev); + if (ret < 0) + return ret; + + /* The PHY needs to renegotiate. 
*/ + phydev->link = 0; + phydev->state = PHY_UP; + + phy_start_machine(phydev, NULL); + + return 0; +} + +static struct dev_pm_ops mdio_bus_pm_ops = { + .suspend = mdio_bus_suspend, + .resume = mdio_bus_resume, + .freeze = mdio_bus_suspend, + .thaw = mdio_bus_resume, + .restore = mdio_bus_restore, +}; + +#define MDIO_BUS_PM_OPS (&mdio_bus_pm_ops) + +#else + +#define MDIO_BUS_PM_OPS NULL + +#endif /* CONFIG_PM */ + struct bus_type mdio_bus_type = { .name = "mdio_bus", .match = mdio_bus_match, - .suspend = mdio_bus_suspend, - .resume = mdio_bus_resume, + .pm = MDIO_BUS_PM_OPS, }; EXPORT_SYMBOL(mdio_bus_type); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b10fedd82143..8212b2b93422 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -378,6 +378,20 @@ void phy_disconnect(struct phy_device *phydev) } EXPORT_SYMBOL(phy_disconnect); +int phy_init_hw(struct phy_device *phydev) +{ + int ret; + + if (!phydev->drv || !phydev->drv->config_init) + return 0; + + ret = phy_scan_fixups(phydev); + if (ret < 0) + return ret; + + return phydev->drv->config_init(phydev); +} + /** * phy_attach_direct - attach a network device to a given PHY device pointer * @dev: network device to attach @@ -425,21 +439,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, /* Do initial configuration here, now that * we have certain key parameters * (dev_flags and interface) */ - if (phydev->drv->config_init) { - int err; - - err = phy_scan_fixups(phydev); - - if (err < 0) - return err; - - err = phydev->drv->config_init(phydev); - - if (err < 0) - return err; - } - - return 0; + return phy_init_hw(phydev); } EXPORT_SYMBOL(phy_attach_direct); diff --git a/include/linux/phy.h b/include/linux/phy.h index b1368b8f6572..7968defd2fa7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -447,6 +447,7 @@ struct phy_device* get_phy_device(struct mii_bus *bus, int addr); int phy_device_register(struct phy_device *phy); 
int phy_clear_interrupt(struct phy_device *phydev); int phy_config_interrupt(struct phy_device *phydev, u32 interrupts); +int phy_init_hw(struct phy_device *phydev); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); struct phy_device * phy_attach(struct net_device *dev, -- cgit v1.2.3 From 0f4bd46ec252887f44f1f065b41867cac8f70dfb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 22 Dec 2009 03:15:43 +0000 Subject: kmsg_dump: Dump on crash_kexec as well crash_kexec gets called before kmsg_dump(KMSG_DUMP_OOPS) if panic_on_oops is set, so the kernel log buffer is not stored for this case. This patch adds a KMSG_DUMP_KEXEC dump type which gets called when crash_kexec() is invoked. To avoid getting double dumps, the old KMSG_DUMP_PANIC is moved below crash_kexec(). The mtdoops driver is modified to handle KMSG_DUMP_KEXEC in the same way as a panic. Signed-off-by: KOSAKI Motohiro Acked-by: Simon Kagstrom Signed-off-by: David Woodhouse --- drivers/mtd/mtdoops.c | 2 +- include/linux/kmsg_dump.h | 1 + kernel/kexec.c | 4 ++++ kernel/panic.c | 3 ++- kernel/printk.c | 1 + 5 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index a714ec482761..92e12df0917f 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -322,7 +322,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); /* Panics must be written immediately */ - if (reason == KMSG_DUMP_PANIC) { + if (reason != KMSG_DUMP_OOPS) { if (!cxt->mtd->panic_write) printk(KERN_ERR "mtdoops: Cannot write from panic without panic_write\n"); else diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index e32aa268efac..24b44145a886 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -17,6 +17,7 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, + KMSG_DUMP_KEXEC, }; /** diff 
--git a/kernel/kexec.c b/kernel/kexec.c index 433e9fcc1fc5..ae217488fef8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs) if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); diff --git a/kernel/panic.c b/kernel/panic.c index 5827f7b97254..c787333282b8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif - kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); + kmsg_dump(KMSG_DUMP_PANIC); + /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic diff --git a/kernel/printk.c b/kernel/printk.c index 1ded8e7dd19b..2c9dc0b03a5e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static const char const *kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", + [KMSG_DUMP_KEXEC] = "kexec", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) -- cgit v1.2.3 From 0719d3434747889b314a1e8add776418c4148bcf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 30 Dec 2009 00:39:22 +0100 Subject: reiserfs: Fix reiserfs lock <-> i_xattr_sem dependency inversion i_xattr_sem depends on the reiserfs lock. But after we grab i_xattr_sem, we may relax/relock the reiserfs lock while waiting on a freezed filesystem, creating a dependency inversion between the two locks. 
In order to avoid the i_xattr_sem -> reiserfs lock dependency, let's create a reiserfs_down_read_safe() that acts like reiserfs_mutex_lock_safe(): relax the reiserfs lock while grabbing another lock to avoid undesired dependencies induced by the heavyweight reiserfs lock. This fixes the following warning: [ 990.005931] ======================================================= [ 990.012373] [ INFO: possible circular locking dependency detected ] [ 990.013233] 2.6.33-rc1 #1 [ 990.013233] ------------------------------------------------------- [ 990.013233] dbench/1891 is trying to acquire lock: [ 990.013233] (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x35/0x50 [ 990.013233] [ 990.013233] but task is already holding lock: [ 990.013233] (&REISERFS_I(inode)->i_xattr_sem){+.+.+.}, at: [] reiserfs_xattr_set_handle+0x8a/0x470 [ 990.013233] [ 990.013233] which lock already depends on the new lock. [ 990.013233] [ 990.013233] [ 990.013233] the existing dependency chain (in reverse order) is: [ 990.013233] [ 990.013233] -> #1 (&REISERFS_I(inode)->i_xattr_sem){+.+.+.}: [ 990.013233] [] __lock_acquire+0xf9c/0x1560 [ 990.013233] [] lock_acquire+0x8f/0xb0 [ 990.013233] [] down_write+0x44/0x80 [ 990.013233] [] reiserfs_xattr_set_handle+0x8a/0x470 [ 990.013233] [] reiserfs_xattr_set+0xb0/0x150 [ 990.013233] [] user_set+0x8a/0x90 [ 990.013233] [] reiserfs_setxattr+0xaa/0xb0 [ 990.013233] [] __vfs_setxattr_noperm+0x36/0xa0 [ 990.013233] [] vfs_setxattr+0xbc/0xc0 [ 990.013233] [] setxattr+0xc0/0x150 [ 990.013233] [] sys_fsetxattr+0x8d/0xa0 [ 990.013233] [] system_call_fastpath+0x16/0x1b [ 990.013233] [ 990.013233] -> #0 (&REISERFS_SB(s)->lock){+.+.+.}: [ 990.013233] [] __lock_acquire+0x12d0/0x1560 [ 990.013233] [] lock_acquire+0x8f/0xb0 [ 990.013233] [] __mutex_lock_common+0x47/0x3b0 [ 990.013233] [] mutex_lock_nested+0x3e/0x50 [ 990.013233] [] reiserfs_write_lock+0x35/0x50 [ 990.013233] [] reiserfs_prepare_write+0x45/0x180 [ 990.013233] []
reiserfs_xattr_set_handle+0x2a6/0x470 [ 990.013233] [] reiserfs_xattr_set+0xb0/0x150 [ 990.013233] [] user_set+0x8a/0x90 [ 990.013233] [] reiserfs_setxattr+0xaa/0xb0 [ 990.013233] [] __vfs_setxattr_noperm+0x36/0xa0 [ 990.013233] [] vfs_setxattr+0xbc/0xc0 [ 990.013233] [] setxattr+0xc0/0x150 [ 990.013233] [] sys_fsetxattr+0x8d/0xa0 [ 990.013233] [] system_call_fastpath+0x16/0x1b [ 990.013233] [ 990.013233] other info that might help us debug this: [ 990.013233] [ 990.013233] 2 locks held by dbench/1891: [ 990.013233] #0: (&sb->s_type->i_mutex_key#12){+.+.+.}, at: [] vfs_setxattr+0x78/0xc0 [ 990.013233] #1: (&REISERFS_I(inode)->i_xattr_sem){+.+.+.}, at: [] reiserfs_xattr_set_handle+0x8a/0x470 [ 990.013233] [ 990.013233] stack backtrace: [ 990.013233] Pid: 1891, comm: dbench Not tainted 2.6.33-rc1 #1 [ 990.013233] Call Trace: [ 990.013233] [] print_circular_bug+0xe9/0xf0 [ 990.013233] [] __lock_acquire+0x12d0/0x1560 [ 990.013233] [] ? reiserfs_xattr_set_handle+0x8a/0x470 [ 990.013233] [] lock_acquire+0x8f/0xb0 [ 990.013233] [] ? reiserfs_write_lock+0x35/0x50 [ 990.013233] [] ? reiserfs_xattr_set_handle+0x8a/0x470 [ 990.013233] [] __mutex_lock_common+0x47/0x3b0 [ 990.013233] [] ? reiserfs_write_lock+0x35/0x50 [ 990.013233] [] ? reiserfs_write_lock+0x35/0x50 [ 990.013233] [] ? mark_held_locks+0x72/0xa0 [ 990.013233] [] ? __mutex_unlock_slowpath+0xbd/0x140 [ 990.013233] [] ? trace_hardirqs_on_caller+0x14d/0x1a0 [ 990.013233] [] mutex_lock_nested+0x3e/0x50 [ 990.013233] [] reiserfs_write_lock+0x35/0x50 [ 990.013233] [] reiserfs_prepare_write+0x45/0x180 [ 990.013233] [] reiserfs_xattr_set_handle+0x2a6/0x470 [ 990.013233] [] reiserfs_xattr_set+0xb0/0x150 [ 990.013233] [] ? __mutex_lock_common+0x284/0x3b0 [ 990.013233] [] user_set+0x8a/0x90 [ 990.013233] [] reiserfs_setxattr+0xaa/0xb0 [ 990.013233] [] __vfs_setxattr_noperm+0x36/0xa0 [ 990.013233] [] vfs_setxattr+0xbc/0xc0 [ 990.013233] [] setxattr+0xc0/0x150 [ 990.013233] [] ? sched_clock_cpu+0xb8/0x100 [ 990.013233] [] ? 
trace_hardirqs_off+0xd/0x10 [ 990.013233] [] ? cpu_clock+0x43/0x50 [ 990.013233] [] ? fget+0xb0/0x110 [ 990.013233] [] ? fget+0x0/0x110 [ 990.013233] [] ? sysret_check+0x27/0x62 [ 990.013233] [] sys_fsetxattr+0x8d/0xa0 [ 990.013233] [] system_call_fastpath+0x16/0x1b Reported-and-tested-by: Christian Kujau Signed-off-by: Frederic Weisbecker Cc: Alexander Beregalov Cc: Chris Mason Cc: Ingo Molnar --- fs/reiserfs/xattr.c | 2 +- include/linux/reiserfs_fs.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8891cd88a3f4..a0e2e7acdc75 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -484,7 +484,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (IS_ERR(dentry)) return PTR_ERR(dentry); - down_write(&REISERFS_I(inode)->i_xattr_sem); + reiserfs_down_read_safe(&REISERFS_I(inode)->i_xattr_sem, inode->i_sb); xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 4351b49e2b1e..35d3f459b0ac 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -106,6 +106,14 @@ reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, reiserfs_write_lock(s); } +static inline void +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) +{ + reiserfs_write_unlock(s); + down_read(sem); + reiserfs_write_lock(s); +} + /* * When we schedule, we usually want to also release the write lock, * according to the previous bkl based locking scheme of reiserfs. 
-- cgit v1.2.3 From c4a62ca362258d98f42efb282cfbf9b61caffdbe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 30 Dec 2009 03:20:19 +0100 Subject: reiserfs: Warn on lock relax if taken recursively When we relax the reiserfs lock to avoid creating unwanted dependencies against others locks while grabbing these, we want to ensure it has not been taken recursively, otherwise the lock won't be really relaxed. Only its depth will be decreased. The unwanted dependency would then actually happen. To prevent from that, add a reiserfs_lock_check_recursive() call in the places that need it. Signed-off-by: Frederic Weisbecker Cc: Alexander Beregalov Cc: Chris Mason Cc: Ingo Molnar --- fs/reiserfs/lock.c | 9 +++++++++ include/linux/reiserfs_fs.h | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index ee2cfc0fd8a7..b87aa2c1afc1 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) reiserfs_panic(sb, "%s called without kernel lock held %d", caller); } + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 35d3f459b0ac..793bf8351ab8 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -62,6 +62,12 @@ void reiserfs_write_unlock(struct super_block *s); int reiserfs_write_lock_once(struct super_block *s); void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *s); +#else +static inline void reiserfs_lock_check_recursive(struct super_block *s) { } +#endif + /* * Several mutexes depend on the write lock. 
* However sometimes we want to relax the write lock while we hold @@ -92,6 +98,7 @@ void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); static inline void reiserfs_mutex_lock_safe(struct mutex *m, struct super_block *s) { + reiserfs_lock_check_recursive(s); reiserfs_write_unlock(s); mutex_lock(m); reiserfs_write_lock(s); @@ -101,6 +108,7 @@ static inline void reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, struct super_block *s) { + reiserfs_lock_check_recursive(s); reiserfs_write_unlock(s); mutex_lock_nested(m, subclass); reiserfs_write_lock(s); @@ -109,6 +117,7 @@ reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, static inline void reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) { + reiserfs_lock_check_recursive(s); reiserfs_write_unlock(s); down_read(sem); reiserfs_write_lock(s); -- cgit v1.2.3 From c1c5523dd1517250cac8b15a4acbc237c24a67d4 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 23 Dec 2009 01:27:48 +0000 Subject: can/netlink: add CAN_CTRLMODE_ONE_SHOT This patch adds the flag CAN_CTRLMODE_ONE_SHOT. It is used as mask or flag in the "struct can_ctrlmode". It allows userspace via netlink to set a CAN controller into the special "one-shot" mode. In this mode, if supported by the CAN controller, it tries only once to deliver a CAN frame and aborts it if an error (e.g.: arbitration lost) happens. Signed-off-by: Marc Kleine-Budde Acked-by: Wolfgang Grandegger Signed-off-by: David S. 
Miller --- include/linux/can/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/can/netlink.h b/include/linux/can/netlink.h index 9ecbb7871c0e..c818335fbb13 100644 --- a/include/linux/can/netlink.h +++ b/include/linux/can/netlink.h @@ -80,6 +80,7 @@ struct can_ctrlmode { #define CAN_CTRLMODE_LOOPBACK 0x1 /* Loopback mode */ #define CAN_CTRLMODE_LISTENONLY 0x2 /* Listen-only mode */ #define CAN_CTRLMODE_3_SAMPLES 0x4 /* Triple sampling mode */ +#define CAN_CTRLMODE_ONE_SHOT 0x8 /* One-Shot mode */ /* * CAN device statistics -- cgit v1.2.3 From 96d07d211739fd2450ac54e81d00fa40fcd4b1bd Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 Nov 2009 14:16:33 +0100 Subject: resource: move kernel function inside __KERNEL__ It is an internal function. Move it inside __KERNEL__ ifdef, along with task_struct declaration. Then we get: --- /usr/include/linux/resource.h 2009-09-14 15:09:29.000000000 +0200 +++ usr/include/linux/resource.h 2010-01-04 11:30:54.000000000 +0100 @@ -3,8 +3,6 @@ #include -struct task_struct; - /* * Resource control/accounting header file for linux */ @@ -70,6 +68,5 @@ */ #include -int getrusage(struct task_struct *p, int who, struct rusage *ru); #endif *********** include/linux/Kbuild is untouched, since unifdef is run even on headers-y nowadays. 
Signed-off-by: Jiri Slaby --- include/linux/resource.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resource.h b/include/linux/resource.h index 40fc7e626082..f1e914eefeab 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -3,8 +3,6 @@ #include -struct task_struct; - /* * Resource control/accounting header file for linux */ @@ -70,6 +68,12 @@ struct rlimit { */ #include +#ifdef __KERNEL__ + +struct task_struct; + int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +#endif /* __KERNEL__ */ + #endif -- cgit v1.2.3 From 3e10e716abf3c71bdb5d86b8f507f9e72236c9cd Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 19 Nov 2009 17:16:37 +0100 Subject: resource: add helpers for fetching rlimits We want to be sure that compiler fetches the limit variable only once, so add helpers for fetching current and maximal resource limits which do that. Add them to sched.h (instead of resource.h) due to circular dependency sched.h->resource.h->task_struct Alternative would be to create a separate res_access.h or similar. 
Signed-off-by: Jiri Slaby Cc: James Morris Cc: Heiko Carstens Cc: Andrew Morton Cc: Ingo Molnar --- include/linux/sched.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f842db03ce..8d4991be9d53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2601,6 +2601,28 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ +static inline unsigned long task_rlimit(const struct task_struct *tsk, + unsigned int limit) +{ + return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *tsk, + unsigned int limit) +{ + return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From cf2f765f1896064e34c6f0f2ef896ff058dd5c06 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 4 Jan 2010 12:20:56 +0100 Subject: HID: handle joysticks with large number of buttons Current HID code doesn't properly handle HID joysticks which have larger number of buttons than what fits into current range reserved for BTN_JOYSTICK. One such joystick reported to not work properly is Saitek X52 Pro Flight System. We can't extend the range to fit more buttons in, because of backwards compatibility reasons. Therefore this patch introduces a new BTN_TRIGGER_HAPPY range, and uses these to map the buttons which are over BTN_JOYSTICK limit. 
Acked-by: Dmitry Torokhov [for the input.h part] Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 7 ++++++- include/linux/input.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 5862b0f3b55d..dad7aae9c975 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -198,7 +198,12 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel switch (field->application) { case HID_GD_MOUSE: case HID_GD_POINTER: code += 0x110; break; - case HID_GD_JOYSTICK: code += 0x120; break; + case HID_GD_JOYSTICK: + if (code <= 0xf) + code += BTN_JOYSTICK; + else + code += BTN_TRIGGER_HAPPY; + break; case HID_GD_GAMEPAD: code += 0x130; break; default: switch (field->physical) { diff --git a/include/linux/input.h b/include/linux/input.h index 7be8a6537b57..97f98ca9b040 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -597,6 +597,48 @@ struct input_absinfo { #define KEY_CAMERA_FOCUS 0x210 +#define BTN_TRIGGER_HAPPY 0x2c0 +#define BTN_TRIGGER_HAPPY1 0x2c0 +#define BTN_TRIGGER_HAPPY2 0x2c1 +#define BTN_TRIGGER_HAPPY3 0x2c2 +#define BTN_TRIGGER_HAPPY4 0x2c3 +#define BTN_TRIGGER_HAPPY5 0x2c4 +#define BTN_TRIGGER_HAPPY6 0x2c5 +#define BTN_TRIGGER_HAPPY7 0x2c6 +#define BTN_TRIGGER_HAPPY8 0x2c7 +#define BTN_TRIGGER_HAPPY9 0x2c8 +#define BTN_TRIGGER_HAPPY10 0x2c9 +#define BTN_TRIGGER_HAPPY11 0x2ca +#define BTN_TRIGGER_HAPPY12 0x2cb +#define BTN_TRIGGER_HAPPY13 0x2cc +#define BTN_TRIGGER_HAPPY14 0x2cd +#define BTN_TRIGGER_HAPPY15 0x2ce +#define BTN_TRIGGER_HAPPY16 0x2cf +#define BTN_TRIGGER_HAPPY17 0x2d0 +#define BTN_TRIGGER_HAPPY18 0x2d1 +#define BTN_TRIGGER_HAPPY19 0x2d2 +#define BTN_TRIGGER_HAPPY20 0x2d3 +#define BTN_TRIGGER_HAPPY21 0x2d4 +#define BTN_TRIGGER_HAPPY22 0x2d5 +#define BTN_TRIGGER_HAPPY23 0x2d6 +#define BTN_TRIGGER_HAPPY24 0x2d7 +#define BTN_TRIGGER_HAPPY25 0x2d8 +#define 
BTN_TRIGGER_HAPPY26 0x2d9 +#define BTN_TRIGGER_HAPPY27 0x2da +#define BTN_TRIGGER_HAPPY28 0x2db +#define BTN_TRIGGER_HAPPY29 0x2dc +#define BTN_TRIGGER_HAPPY30 0x2dd +#define BTN_TRIGGER_HAPPY31 0x2de +#define BTN_TRIGGER_HAPPY32 0x2df +#define BTN_TRIGGER_HAPPY33 0x2e0 +#define BTN_TRIGGER_HAPPY34 0x2e1 +#define BTN_TRIGGER_HAPPY35 0x2e2 +#define BTN_TRIGGER_HAPPY36 0x2e3 +#define BTN_TRIGGER_HAPPY37 0x2e4 +#define BTN_TRIGGER_HAPPY38 0x2e5 +#define BTN_TRIGGER_HAPPY39 0x2e6 +#define BTN_TRIGGER_HAPPY40 0x2e7 + /* We avoid low common keys in module aliases so they don't get huge. */ #define KEY_MIN_INTERESTING KEY_MUTE #define KEY_MAX 0x2ff -- cgit v1.2.3 From 1ae861e652b5457e7fa98ccbc55abea1e207916e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 31 Dec 2009 12:15:54 +0100 Subject: PCI/PM: Use per-device D3 delays It turns out that some PCI devices require extra delays when changing power state from D3 to D0 (and the other way around). Although this is against the PCI specification, we can handle it quite easily by allowing drivers to define arbitrary D3 delays for devices known to require extra time for switching power states. Introduce additional field d3_delay in struct pci_dev and use it to store the value of the device's D0->D3 delay, in miliseconds. Make the PCI PM core code use the per-device d3_delay unless pci_pm_d3_delay is greater (in which case the latter is used). [This also allows the driver to specify d3_delay shorter than the 10 ms required by the PCI standard if the device is known to be able to handle that.] Make the sky2 driver set d3_delay to 150 for devices handled by it. Fixes http://bugzilla.kernel.org/show_bug.cgi?id=14730 which is a listed regression from 2.6.30. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Jesse Barnes --- drivers/net/sky2.c | 1 + drivers/pci/pci.c | 19 +++++++++++++++---- include/linux/pci.h | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 1c01b96c9611..2d28d58200d0 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -4684,6 +4684,7 @@ static int __devinit sky2_probe(struct pci_dev *pdev, INIT_WORK(&hw->restart_work, sky2_restart); pci_set_drvdata(pdev, hw); + pdev->d3_delay = 150; return 0; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0906599ebfde..315fea47e784 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -29,7 +29,17 @@ const char *pci_power_names[] = { }; EXPORT_SYMBOL_GPL(pci_power_names); -unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; +unsigned int pci_pm_d3_delay; + +static void pci_dev_d3_sleep(struct pci_dev *dev) +{ + unsigned int delay = dev->d3_delay; + + if (delay < pci_pm_d3_delay) + delay = pci_pm_d3_delay; + + msleep(delay); +} #ifdef CONFIG_PCI_DOMAINS int pci_domains_supported = 1; @@ -522,7 +532,7 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) /* Mandatory power management transition delays */ /* see PCI PM 1.1 5.6.1 table 18 */ if (state == PCI_D3hot || dev->current_state == PCI_D3hot) - msleep(pci_pm_d3_delay); + pci_dev_d3_sleep(dev); else if (state == PCI_D2 || dev->current_state == PCI_D2) udelay(PCI_PM_D2_DELAY); @@ -1409,6 +1419,7 @@ void pci_pm_init(struct pci_dev *dev) } dev->pm_cap = pm; + dev->d3_delay = PCI_PM_D3_WAIT; dev->d1_support = false; dev->d2_support = false; @@ -2247,12 +2258,12 @@ static int pci_pm_reset(struct pci_dev *dev, int probe) csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D3hot; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); - msleep(pci_pm_d3_delay); + pci_dev_d3_sleep(dev); csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D0; pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); - msleep(pci_pm_d3_delay); + 
pci_dev_d3_sleep(dev); return 0; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 5da0690d9cee..174e5392e51e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -243,6 +243,7 @@ struct pci_dev { unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* Only allow D0 and D3 */ unsigned int wakeup_prepared:1; + unsigned int d3_delay; /* D3->D0 transition time in ms */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state. */ -- cgit v1.2.3 From e1783a240f491fb233f04edc042e16b18a7a79ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: module: Use this_cpu_xx to dynamically allocate counters Use cpu ops to deal with the per cpu data instead of a local_t. Reduces memory requirements, cache footprint and decreases cycle counts. The this_cpu_xx operations are also used for !SMP mode. Otherwise we could not drop the use of __module_ref_addr() which would make per cpu data handling complicated. this_cpu_xx operations have their own fallback for !SMP. V8-V9: - Leave include asm/module.h since ringbuffer.c depends on it. Nothing else does though. Another patch will deal with that. - Remove spurious free. Signed-off-by: Christoph Lameter Acked-by: Rusty Russell Signed-off-by: Tejun Heo --- include/linux/module.h | 36 ++++++++++++++---------------------- kernel/module.c | 29 +++++++++++++++-------------- 2 files changed, 29 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 6cb1a3cab5d3..2302f09ea2d9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -363,11 +364,9 @@ struct module /* Destruction function. 
*/ void (*exit)(void); -#ifdef CONFIG_SMP - char *refptr; -#else - local_t ref; -#endif + struct module_ref { + int count; + } *refptr; #endif #ifdef CONFIG_CONSTRUCTORS @@ -454,25 +453,16 @@ void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) void symbol_put_addr(void *addr); -static inline local_t *__module_ref_addr(struct module *mod, int cpu) -{ -#ifdef CONFIG_SMP - return (local_t *) (mod->refptr + per_cpu_offset(cpu)); -#else - return &mod->ref; -#endif -} - /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ static inline void __module_get(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_inc(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_inc(module->refptr->count); trace_module_get(module, _THIS_IP_, - local_read(__module_ref_addr(module, cpu))); - put_cpu(); + __this_cpu_read(module->refptr->count)); + preempt_enable(); } } @@ -481,15 +471,17 @@ static inline int try_module_get(struct module *module) int ret = 1; if (module) { - unsigned int cpu = get_cpu(); + preempt_disable(); + if (likely(module_is_live(module))) { - local_inc(__module_ref_addr(module, cpu)); + __this_cpu_inc(module->refptr->count); trace_module_get(module, _THIS_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); } else ret = 0; - put_cpu(); + + preempt_enable(); } return ret; } diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..9bf228052ec5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->modules_which_use_me); for_each_possible_cpu(cpu) - local_set(__module_ref_addr(mod, cpu), 0); + per_cpu_ptr(mod->refptr, cpu)->count = 0; + /* Hold reference count during initialization. 
*/ - local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); + __this_cpu_write(mod->refptr->count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) int cpu; for_each_possible_cpu(cpu) - total += local_read(__module_ref_addr(mod, cpu)); + total += per_cpu_ptr(mod->refptr, cpu)->count; return total; } EXPORT_SYMBOL(module_refcount); @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { void module_put(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_dec(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_dec(module->refptr->count); + trace_module_put(module, _RET_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); - put_cpu(); + preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -1394,9 +1396,9 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +#if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) - percpu_modfree(mod->refptr); + free_percpu(mod->refptr); #endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2159,9 +2161,8 @@ static noinline struct module *load_module(void __user *umod, mod = (void *)sechdrs[modindex].sh_addr; kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); +#if defined(CONFIG_MODULE_UNLOAD) + mod->refptr = alloc_percpu(struct module_ref); if (!mod->refptr) { err = -ENOMEM; goto free_init; @@ -2393,8 +2394,8 @@ static noinline struct module *load_module(void __user *umod, kobject_put(&mod->mkobj.kobj); free_unload: 
module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); +#if defined(CONFIG_MODULE_UNLOAD) + free_percpu(mod->refptr); free_init: #endif module_free(mod, mod->module_init); -- cgit v1.2.3 From 79615760f380ec86cd58204744e774c33fab9211 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: local_t: Move local.h include to ringbuffer.c and ring_buffer_benchmark.c ringbuffer*.c are the last users of local.h. Remove the include from modules.h and add it to ringbuffer files. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/module.h | 1 - kernel/trace/ring_buffer.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 2302f09ea2d9..7e74ae0051cc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..eb6c8988c31a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -20,6 +20,7 @@ #include #include +#include #include "trace.h" /* diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include #include #include +#include struct rb_page { u64 ts; -- cgit v1.2.3 From 99dcc3e5a94ed491fbef402831d8c0bbb267f995 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:51 +0900 Subject: this_cpu: Page allocator conversion Use the per cpu allocator functionality to avoid per cpu arrays in struct zone. 
This drastically reduces the size of struct zone for systems with large numbers of processors and allows placement of critical variables of struct zone in one cacheline even on very large systems. Another effect is that the pagesets of one processor are placed near one another. If multiple pagesets from different zones fit into one cacheline then additional cacheline fetches can be avoided on the hot paths when allocating memory from multiple zones. Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs are reduced and we can drop the zone_pcp macro. Hotplug handling is also simplified since cpu alloc can bring up and shut down cpu areas for a specific cpu as a whole. So there is no need to allocate or free individual pagesets. V7-V8: - Explain chicken egg dilemma with percpu allocator. V4-V5: - Fix up cases where per_cpu_ptr is called before irq disable - Integrate the bootstrap logic that was separate before. tj: Build failure in pageset_cpuup_callback() due to missing ret variable fixed.
Reviewed-by: Mel Gorman Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/mm.h | 4 - include/linux/mmzone.h | 12 +-- mm/page_alloc.c | 202 +++++++++++++++++-------------------------------- mm/vmstat.c | 14 ++-- 4 files changed, 81 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2265f28eb47a..554fa395aac9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,11 +1079,7 @@ extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; -#ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -#else -static inline void setup_per_cpu_pageset(void) {} -#endif extern void zone_pcp_update(struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c2542..7874201a3556 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -184,13 +184,7 @@ struct per_cpu_pageset { s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif -} ____cacheline_aligned_in_smp; - -#ifdef CONFIG_NUMA -#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) -#else -#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) -#endif +}; #endif /* !__GENERATING_BOUNDS.H */ @@ -306,10 +300,8 @@ struct zone { */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; - struct per_cpu_pageset *pageset[NR_CPUS]; -#else - struct per_cpu_pageset pageset[NR_CPUS]; #endif + struct per_cpu_pageset *pageset; /* * free areas of different sizes */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e9f5cc5fb59..6849e870de54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1008,10 +1008,10 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); free_pcppages_bulk(zone, pcp->count, pcp); 
pcp->count = 0; local_irq_restore(flags); @@ -1095,7 +1095,6 @@ static void free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); @@ -1118,6 +1117,7 @@ static void free_hot_cold_page(struct page *page, int cold) migratetype = MIGRATE_MOVABLE; } + pcp = &this_cpu_ptr(zone->pageset)->pcp; if (cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); else @@ -1130,7 +1130,6 @@ static void free_hot_cold_page(struct page *page, int cold) out: local_irq_restore(flags); - put_cpu(); } void free_hot_page(struct page *page) @@ -1180,17 +1179,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; again: - cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; - list = &pcp->lists[migratetype]; local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1231,7 +1228,6 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1240,7 +1236,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -2179,7 +2174,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2744,10 +2739,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. 
One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); + /* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; + int cpu; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -2758,6 +2772,23 @@ static int __build_all_zonelists(void *dummy) build_zonelists(pgdat); build_zonelist_cache(pgdat); } + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + return 0; } @@ -3095,121 +3126,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. 
The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. - */ -static struct per_cpu_pageset boot_pageset[NR_CPUS]; - /* - * Dynamically allocate memory for the - * per cpu pageset array in struct zone. + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + * Boot pagesets will no longer be used by this processor + * after setup_per_cpu_pageset(). 
*/ -static int __cpuinit process_zones(int cpu) +void __init setup_per_cpu_pageset(void) { - struct zone *zone, *dzone; - int node = cpu_to_node(cpu); - - node_set_state(node, N_CPU); /* this node has a cpu */ + struct zone *zone; + int cpu; for_each_populated_zone(zone) { - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, node); - if (!zone_pcp(zone, cpu)) - goto bad; - - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); - } - - return 0; -bad: - for_each_zone(dzone) { - if (!populated_zone(dzone)) - continue; - if (dzone == zone) - break; - kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = &boot_pageset[cpu]; - } - return -ENOMEM; -} + zone->pageset = alloc_percpu(struct per_cpu_pageset); -static inline void free_zone_pagesets(int cpu) -{ - struct zone *zone; - - for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - /* Free per_cpu_pageset if it is slab allocated */ - if (pset != &boot_pageset[cpu]) - kfree(pset); - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - } -} + setup_pageset(pcp, zone_batchsize(zone)); -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - int ret = NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_zone_pagesets(cpu); - break; - default: - break; + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); + } } - return ret; } -static struct notifier_block __cpuinitdata pageset_notifier = - { &pageset_cpuup_callback, 
NULL, 0 }; - -void __init setup_per_cpu_pageset(void) -{ - int err; - - /* Initialize per_cpu_pageset for cpu 0. - * A cpuup callback will do this for every cpu - * as it comes online - */ - err = process_zones(smp_processor_id()); - BUG_ON(err); - register_cpu_notifier(&pageset_notifier); -} - -#endif - static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { @@ -3263,7 +3206,7 @@ static int __zone_pcp_update(void *data) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; local_irq_save(flags); @@ -3281,21 +3224,17 @@ void zone_pcp_update(struct zone *zone) static __meminit void zone_pcp_init(struct zone *zone) { - int cpu; - unsigned long batch = zone_batchsize(zone); + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->pageset = &boot_pageset; - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. 
Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -4809,10 +4748,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fbab67ba..1ba0bb7ad043 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) threshold = calculate_threshold(zone); for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; } } @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); + s8 *p = pcp->vm_stat_diff + item; long x; @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); void 
__dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" -- cgit v1.2.3 From 59b015133cd0034f5904a76969d73476380aac46 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Jan 2010 17:56:02 -0800 Subject: Input: serio - fix potential deadlock when unbinding drivers sysfs_remove_group() waits for sysfs attributes to be removed, therefore we do not need to worry about driver-specific attributes being accessed after driver has been detached from the device. In fact, attempts to take serio->drv_mutex in attribute methods may lead to the following deadlock: sysfs_read_file() fill_read_buffer() sysfs_get_active_two() psmouse_attr_show_helper() serio_pin_driver() serio_disconnect_driver() mutex_lock(&serio->drv_mutex); <--------> mutex_lock(&serio_drv_mutex); psmouse_disconnect() sysfs_remove_group(... psmouse_attr_group); .... sysfs_deactivate(); wait_for_completion(); Fix this by removing calls to serio_[un]pin_driver() and functions themselves and using driver-private mutexes to serialize access to attribute's set() methods that may change device state. Signed-off-by: Eric W. 
Biederman Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/atkbd.c | 59 ++++++++++++++++---------------------- drivers/input/mouse/psmouse-base.c | 32 ++------------------- include/linux/serio.h | 19 ------------ 3 files changed, 28 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index 1f5e2ce327d6..1cf32a7814d0 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -225,8 +225,10 @@ struct atkbd { struct delayed_work event_work; unsigned long event_jiffies; - struct mutex event_mutex; unsigned long event_mask; + + /* Serializes reconnect(), attr->set() and event work */ + struct mutex mutex; }; /* @@ -577,7 +579,7 @@ static void atkbd_event_work(struct work_struct *work) { struct atkbd *atkbd = container_of(work, struct atkbd, event_work.work); - mutex_lock(&atkbd->event_mutex); + mutex_lock(&atkbd->mutex); if (!atkbd->enabled) { /* @@ -596,7 +598,7 @@ static void atkbd_event_work(struct work_struct *work) atkbd_set_repeat_rate(atkbd); } - mutex_unlock(&atkbd->event_mutex); + mutex_unlock(&atkbd->mutex); } /* @@ -612,7 +614,7 @@ static void atkbd_schedule_event_work(struct atkbd *atkbd, int event_bit) atkbd->event_jiffies = jiffies; set_bit(event_bit, &atkbd->event_mask); - wmb(); + mb(); schedule_delayed_work(&atkbd->event_work, delay); } @@ -849,12 +851,13 @@ static void atkbd_disconnect(struct serio *serio) { struct atkbd *atkbd = serio_get_drvdata(serio); + sysfs_remove_group(&serio->dev.kobj, &atkbd_attribute_group); + atkbd_disable(atkbd); /* make sure we don't have a command in flight */ cancel_delayed_work_sync(&atkbd->event_work); - sysfs_remove_group(&serio->dev.kobj, &atkbd_attribute_group); input_unregister_device(atkbd->dev); serio_close(serio); serio_set_drvdata(serio, NULL); @@ -1087,7 +1090,7 @@ static int atkbd_connect(struct serio *serio, struct serio_driver *drv) atkbd->dev = dev; ps2_init(&atkbd->ps2dev, serio); 
INIT_DELAYED_WORK(&atkbd->event_work, atkbd_event_work); - mutex_init(&atkbd->event_mutex); + mutex_init(&atkbd->mutex); switch (serio->id.type) { @@ -1160,19 +1163,23 @@ static int atkbd_reconnect(struct serio *serio) { struct atkbd *atkbd = serio_get_drvdata(serio); struct serio_driver *drv = serio->drv; + int retval = -1; if (!atkbd || !drv) { printk(KERN_DEBUG "atkbd: reconnect request, but serio is disconnected, ignoring...\n"); return -1; } + mutex_lock(&atkbd->mutex); + atkbd_disable(atkbd); if (atkbd->write) { if (atkbd_probe(atkbd)) - return -1; + goto out; + if (atkbd->set != atkbd_select_set(atkbd, atkbd->set, atkbd->extra)) - return -1; + goto out; atkbd_activate(atkbd); @@ -1190,8 +1197,11 @@ static int atkbd_reconnect(struct serio *serio) } atkbd_enable(atkbd); + retval = 0; - return 0; + out: + mutex_unlock(&atkbd->mutex); + return retval; } static struct serio_device_id atkbd_serio_ids[] = { @@ -1235,47 +1245,28 @@ static ssize_t atkbd_attr_show_helper(struct device *dev, char *buf, ssize_t (*handler)(struct atkbd *, char *)) { struct serio *serio = to_serio_port(dev); - int retval; - - retval = serio_pin_driver(serio); - if (retval) - return retval; - - if (serio->drv != &atkbd_drv) { - retval = -ENODEV; - goto out; - } - - retval = handler((struct atkbd *)serio_get_drvdata(serio), buf); + struct atkbd *atkbd = serio_get_drvdata(serio); -out: - serio_unpin_driver(serio); - return retval; + return handler(atkbd, buf); } static ssize_t atkbd_attr_set_helper(struct device *dev, const char *buf, size_t count, ssize_t (*handler)(struct atkbd *, const char *, size_t)) { struct serio *serio = to_serio_port(dev); - struct atkbd *atkbd; + struct atkbd *atkbd = serio_get_drvdata(serio); int retval; - retval = serio_pin_driver(serio); + retval = mutex_lock_interruptible(&atkbd->mutex); if (retval) return retval; - if (serio->drv != &atkbd_drv) { - retval = -ENODEV; - goto out; - } - - atkbd = serio_get_drvdata(serio); atkbd_disable(atkbd); retval = 
handler(atkbd, buf, count); atkbd_enable(atkbd); -out: - serio_unpin_driver(serio); + mutex_unlock(&atkbd->mutex); + return retval; } diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index 401ac6b6edd4..d59e18b24ede 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -1450,24 +1450,10 @@ ssize_t psmouse_attr_show_helper(struct device *dev, struct device_attribute *de struct serio *serio = to_serio_port(dev); struct psmouse_attribute *attr = to_psmouse_attr(devattr); struct psmouse *psmouse; - int retval; - - retval = serio_pin_driver(serio); - if (retval) - return retval; - - if (serio->drv != &psmouse_drv) { - retval = -ENODEV; - goto out; - } psmouse = serio_get_drvdata(serio); - retval = attr->show(psmouse, attr->data, buf); - -out: - serio_unpin_driver(serio); - return retval; + return attr->show(psmouse, attr->data, buf); } ssize_t psmouse_attr_set_helper(struct device *dev, struct device_attribute *devattr, @@ -1478,18 +1464,9 @@ ssize_t psmouse_attr_set_helper(struct device *dev, struct device_attribute *dev struct psmouse *psmouse, *parent = NULL; int retval; - retval = serio_pin_driver(serio); - if (retval) - return retval; - - if (serio->drv != &psmouse_drv) { - retval = -ENODEV; - goto out_unpin; - } - retval = mutex_lock_interruptible(&psmouse_mutex); if (retval) - goto out_unpin; + goto out; psmouse = serio_get_drvdata(serio); @@ -1519,8 +1496,7 @@ ssize_t psmouse_attr_set_helper(struct device *dev, struct device_attribute *dev out_unlock: mutex_unlock(&psmouse_mutex); - out_unpin: - serio_unpin_driver(serio); + out: return retval; } @@ -1582,9 +1558,7 @@ static ssize_t psmouse_attr_set_protocol(struct psmouse *psmouse, void *data, co } mutex_unlock(&psmouse_mutex); - serio_unpin_driver(serio); serio_unregister_child_port(serio); - serio_pin_driver_uninterruptible(serio); mutex_lock(&psmouse_mutex); if (serio->drv != &psmouse_drv) { diff --git a/include/linux/serio.h 
b/include/linux/serio.h index e2f3044d4a4a..813d26c247ec 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -136,25 +136,6 @@ static inline void serio_continue_rx(struct serio *serio) spin_unlock_irq(&serio->lock); } -/* - * Use the following functions to pin serio's driver in process context - */ -static inline int serio_pin_driver(struct serio *serio) -{ - return mutex_lock_interruptible(&serio->drv_mutex); -} - -static inline void serio_pin_driver_uninterruptible(struct serio *serio) -{ - mutex_lock(&serio->drv_mutex); -} - -static inline void serio_unpin_driver(struct serio *serio) -{ - mutex_unlock(&serio->drv_mutex); -} - - #endif /* -- cgit v1.2.3 From ddf1ffbd40c92ff1e58c45fa96d309788f7beb60 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 5 Jan 2010 17:56:04 -0800 Subject: Input: serio - let device core tell us if device was registered No need to keep track of it by ourselves. Signed-off-by: Dmitry Torokhov --- drivers/input/serio/serio.c | 8 ++------ include/linux/serio.h | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index 0a278f9f1d3a..d89880450f77 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -577,8 +577,6 @@ static void serio_add_port(struct serio *serio) printk(KERN_ERR "serio: device_add() failed for %s (%s), error: %d\n", serio->phys, serio->name, error); - else - serio->registered = true; } /* @@ -605,10 +603,8 @@ static void serio_destroy_port(struct serio *serio) serio->parent = NULL; } - if (serio->registered) { + if (device_is_registered(&serio->dev)) device_del(&serio->dev); - serio->registered = false; - } list_del_init(&serio->node); serio_remove_pending_events(serio); @@ -995,7 +991,7 @@ irqreturn_t serio_interrupt(struct serio *serio, if (likely(serio->drv)) { ret = serio->drv->interrupt(serio, data, dfl); - } else if (!dfl && serio->registered) { + } else if (!dfl && 
device_is_registered(&serio->dev)) { serio_rescan(serio); ret = IRQ_HANDLED; } diff --git a/include/linux/serio.h b/include/linux/serio.h index e2f3044d4a4a..d0fb702059cd 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -30,7 +30,6 @@ struct serio { char phys[32]; bool manual_bind; - bool registered; /* port has been fully registered with driver core */ struct serio_device_id id; -- cgit v1.2.3 From 361b7b5b032338361ea88412f1fc45479fdd5859 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 5 Jan 2010 17:56:03 -0800 Subject: Input: gameport - let device core tell us if device was registered No need to keep track of it by ourselves. Signed-off-by: Dmitry Torokhov --- drivers/input/gameport/gameport.c | 6 +----- include/linux/gameport.h | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index ac11be08585e..f9e5f8e1690b 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -561,8 +561,6 @@ static void gameport_add_port(struct gameport *gameport) printk(KERN_ERR "gameport: device_add() failed for %s (%s), error: %d\n", gameport->phys, gameport->name, error); - else - gameport->registered = 1; } /* @@ -584,10 +582,8 @@ static void gameport_destroy_port(struct gameport *gameport) gameport->parent = NULL; } - if (gameport->registered) { + if (device_is_registered(&gameport->dev)) device_del(&gameport->dev); - gameport->registered = 0; - } list_del_init(&gameport->node); diff --git a/include/linux/gameport.h b/include/linux/gameport.h index 1bc08541c2b9..48e68da097f6 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -46,7 +46,6 @@ struct gameport { struct mutex drv_mutex; /* protects serio->drv so attributes can pin driver */ struct device dev; - unsigned int registered; /* port has been fully registered with driver core */ struct list_head node; }; -- cgit v1.2.3 From 
16295bec6398a3eedc9377e1af6ff4c71b98c300 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 6 Jan 2010 19:47:10 +1100 Subject: padata: Generic parallelization/serialization interface This patch introduces an interface to process data objects in parallel. The parallelized objects return after serialization in the same order as they were before the parallelization. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- include/linux/padata.h | 88 +++++++ init/Kconfig | 4 + kernel/Makefile | 1 + kernel/padata.c | 690 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 783 insertions(+) create mode 100644 include/linux/padata.h create mode 100644 kernel/padata.c (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h new file mode 100644 index 000000000000..51611da9c498 --- /dev/null +++ b/include/linux/padata.h @@ -0,0 +1,88 @@ +/* + * padata.h - header for the padata parallelization interface + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#ifndef PADATA_H +#define PADATA_H + +#include +#include +#include + +struct padata_priv { + struct list_head list; + struct parallel_data *pd; + int cb_cpu; + int seq_nr; + int info; + void (*parallel)(struct padata_priv *padata); + void (*serial)(struct padata_priv *padata); +}; + +struct padata_list { + struct list_head list; + spinlock_t lock; +}; + +struct padata_queue { + struct padata_list parallel; + struct padata_list reorder; + struct padata_list serial; + struct work_struct pwork; + struct work_struct swork; + struct parallel_data *pd; + atomic_t num_obj; + int cpu_index; +}; + +struct parallel_data { + struct padata_instance *pinst; + struct padata_queue *queue; + atomic_t seq_nr; + atomic_t reorder_objects; + atomic_t refcnt; + unsigned int max_seq_nr; + cpumask_var_t cpumask; + spinlock_t lock; +}; + +struct padata_instance { + struct notifier_block cpu_notifier; + struct workqueue_struct *wq; + struct parallel_data *pd; + cpumask_var_t cpumask; + struct mutex lock; + u8 flags; +#define PADATA_INIT 1 +#define PADATA_RESET 2 +}; + +extern struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq); +extern void padata_free(struct padata_instance *pinst); +extern int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu); +extern void padata_do_serial(struct padata_priv *padata); +extern int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask); +extern int padata_add_cpu(struct padata_instance *pinst, int cpu); +extern int padata_remove_cpu(struct padata_instance *pinst, int cpu); +extern void padata_start(struct padata_instance *pinst); +extern void padata_stop(struct padata_instance *pinst); +#endif diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..9fd23bcc1709 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1252,4 +1252,8 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool +config PADATA + depends on SMP + bool + source 
"kernel/Kconfig.locks" diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..6aebdeb2aa34 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..6f9bcb8313d6 --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,690 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_OBJ_NUM 10000 * NR_CPUS + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, pwork); + pd = queue->pd; + pinst = pd->pinst; + + spin_lock(&queue->parallel.lock); + list_replace_init(&queue->parallel.list, &local_list); + spin_unlock(&queue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/* + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the cpumask of padata. + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. 
+ */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = 0; + if (!(pinst->flags & PADATA_INIT)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = -EINVAL; + if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) + goto out; + + err = -EINPROGRESS; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->queue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->pwork); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus, empty, calc_seq_nr; + int seq_nr, next_nr, overrun, next_overrun; + struct padata_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + empty = 0; + next_nr = -1; + next_overrun = 0; + next_queue = NULL; + + num_cpus = cpumask_weight(pd->cpumask); + + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + reorder = &queue->reorder; + + /* + * Calculate the seq_nr of the object that should be + * next in this queue. 
+ */ + overrun = 0; + calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) + + queue->cpu_index; + + if (unlikely(calc_seq_nr > pd->max_seq_nr)) { + calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; + overrun = 1; + } + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + seq_nr = padata->seq_nr; + BUG_ON(calc_seq_nr != seq_nr); + } else { + seq_nr = calc_seq_nr; + empty++; + } + + if (next_nr < 0 || seq_nr < next_nr + || (next_overrun && !overrun)) { + next_nr = seq_nr; + next_overrun = overrun; + next_queue = queue; + } + } + + padata = NULL; + + if (empty == num_cpus) + goto out; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + if (unlikely(next_overrun)) { + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + atomic_set(&queue->num_obj, 0); + } + } + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + atomic_inc(&next_queue->num_obj); + + goto out; + } + + if (next_nr % num_cpus == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_queue *queue; + struct padata_instance *pinst = pd->pinst; + +try_again: + if (!spin_trylock_bh(&pd->lock)) + goto out; + + while (1) { + padata = padata_get_next(pd); + + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + if (PTR_ERR(padata) == -ENODATA) { + spin_unlock_bh(&pd->lock); + goto out; + } + + queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + + spin_lock(&queue->serial.lock); + list_add_tail(&padata->list, &queue->serial.list); + spin_unlock(&queue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + } + + spin_unlock_bh(&pd->lock); + + if 
(atomic_read(&pd->reorder_objects)) + goto try_again; + +out: + return; +} + +static void padata_serial_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, swork); + pd = queue->pd; + + spin_lock(&queue->serial.lock); + list_replace_init(&queue->serial.list, &local_list); + spin_unlock(&queue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/* + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. + */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_queue *queue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + queue = per_cpu_ptr(pd->queue, cpu); + + spin_lock(&queue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &queue->reorder.list); + spin_unlock(&queue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + int cpu, cpu_index, num_cpus; + struct padata_queue *queue; + struct parallel_data *pd; + + cpu_index = 0; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->queue = alloc_percpu(struct padata_queue); + if (!pd->queue) + goto err_free_pd; + + if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) + goto err_free_queue; + + for_each_possible_cpu(cpu) { + queue = per_cpu_ptr(pd->queue, cpu); + + queue->pd = pd; + + if (cpumask_test_cpu(cpu, cpumask) + && 
cpumask_test_cpu(cpu, cpu_active_mask)) { + queue->cpu_index = cpu_index; + cpu_index++; + } else + queue->cpu_index = -1; + + INIT_LIST_HEAD(&queue->reorder.list); + INIT_LIST_HEAD(&queue->parallel.list); + INIT_LIST_HEAD(&queue->serial.list); + spin_lock_init(&queue->reorder.lock); + spin_lock_init(&queue->parallel.lock); + spin_lock_init(&queue->serial.lock); + + INIT_WORK(&queue->pwork, padata_parallel_worker); + INIT_WORK(&queue->swork, padata_serial_worker); + atomic_set(&queue->num_obj, 0); + } + + cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + + num_cpus = cpumask_weight(pd->cpumask); + pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_queue: + free_percpu(pd->queue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask); + free_percpu(pd->queue); + kfree(pd); +} + +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + while (atomic_read(&pd_old->refcnt) != 0) + yield(); + + flush_workqueue(pinst->wq); + + padata_free_pd(pd_old); + + pinst->flags &= ~PADATA_RESET; +} + +/* + * padata_set_cpumask - set the cpumask that padata should use + * + * @pinst: padata instance + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask) +{ + struct parallel_data *pd; + int err = 0; + + might_sleep(); + + mutex_lock(&pinst->lock); + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) { + err = -ENOMEM; + goto out; + } + + cpumask_copy(pinst->cpumask, cpumask); + + padata_replace(pinst, pd); + +out: + mutex_unlock(&pinst->lock); + + return err; +} 
+EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_add_cpu - add a cpu to the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to add + */ +int padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_set_cpu(cpu, pinst->cpumask); + err = __padata_add_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_remove_cpu - remove a cpu from the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to remove + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_clear_cpu(cpu, pinst->cpumask); + err = __padata_remove_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/* + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +void padata_start(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags |= PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_start); + +/* + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags &= ~PADATA_INIT; + mutex_unlock(&pinst->lock); +} 
+EXPORT_SYMBOL(padata_stop);
+
+/*
+ * padata_cpu_callback - cpu hotplug notifier of a padata instance
+ *
+ * @nfb: notifier block embedded in the padata instance (cpu_notifier)
+ * @action: hotplug event
+ * @hcpu: number of the affected cpu, passed as a pointer value
+ *
+ * Events for cpus that are not set in pinst->cpumask are ignored.
+ * For all other cpus the add/remove helper is called with pinst->lock
+ * held. A failing helper on CPU_ONLINE or CPU_DOWN_PREPARE is reported
+ * as NOTIFY_BAD; every other path returns NOTIFY_OK.
+ */
+static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	int err;
+	struct padata_instance *pinst;
+	int cpu = (unsigned long)hcpu;
+
+	pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		err = __padata_add_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		if (err)
+			return NOTIFY_BAD;
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		err = __padata_remove_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		if (err)
+			return NOTIFY_BAD;
+		break;
+
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		__padata_remove_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		/*
+		 * Stop here. Without this break the case fell through
+		 * into CPU_DOWN_FAILED and called __padata_add_cpu()
+		 * right after the removal.
+		 */
+		break;
+
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		__padata_add_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * padata_alloc - allocate and initialize a padata instance
+ *
+ * @cpumask: cpumask that padata uses for parallelization
+ * @wq: workqueue to use for the allocated padata instance
+ *
+ * Returns the new instance, or NULL if an allocation or the
+ * registration of the cpu hotplug notifier failed. The instance is
+ * created with flags == 0, so PADATA_INIT is not set until
+ * padata_start() is called.
+ */
+struct padata_instance *padata_alloc(const struct cpumask *cpumask,
+				     struct workqueue_struct *wq)
+{
+	int err;
+	struct padata_instance *pinst;
+	struct parallel_data *pd;
+
+	pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
+	if (!pinst)
+		goto err;
+
+	pd = padata_alloc_pd(pinst, cpumask);
+	if (!pd)
+		goto err_free_inst;
+
+	rcu_assign_pointer(pinst->pd, pd);
+
+	pinst->wq = wq;
+
+	cpumask_copy(pinst->cpumask, cpumask);
+
+	pinst->flags = 0;
+
+	/*
+	 * The hotplug callback takes pinst->lock, so the mutex has to be
+	 * initialized before the notifier is registered.
+	 */
+	mutex_init(&pinst->lock);
+
+	pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+	pinst->cpu_notifier.priority = 0;
+	err = register_hotcpu_notifier(&pinst->cpu_notifier);
+	if (err)
+		goto err_free_pd;
+
+	return pinst;
+
+err_free_pd:
+	padata_free_pd(pd);
+err_free_inst:
+	kfree(pinst);
+err:
+	return NULL;
+}
+EXPORT_SYMBOL(padata_alloc);
+
+/*
+ * padata_free - free a padata instance
+ *
+ * @pinst: padata instance to free
+ *
+ * Stops the instance, then waits for an RCU grace period and for the
+ * reference count of the current parallel_data to drop to zero before
+ * the hotplug notifier is unregistered and the memory is freed.
+ */
+void padata_free(struct padata_instance *pinst)
+{
+	padata_stop(pinst);
+
+	synchronize_rcu();
+
+	/* Objects still in flight hold a reference on pd, wait for them. */
+	while (atomic_read(&pinst->pd->refcnt) != 0)
+		yield();
+
+	unregister_hotcpu_notifier(&pinst->cpu_notifier);
+	padata_free_pd(pinst->pd);
+	kfree(pinst);
+}
+EXPORT_SYMBOL(padata_free);
--
cgit v1.2.3

From 509e760cd91c831983097ae174cb6c0b8c6c8e6b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Tue, 15 Dec 2009 15:39:42 +0800
Subject: tracing: Add print_fmt field

This is part of a patch set that removes the show_format method in the
ftrace event macros. The print_fmt field is added to hold the string
that shows the print_fmt in the event format files. This patch only adds
the field but it is currently not used. Later patches will use this
field to enable us to remove the show_format field and function.
Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3E.2000704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + include/trace/ftrace.h | 28 +++++++++++++++++++++++++++- kernel/trace/trace_export.c | 7 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2233c98d80df..bd23d8e52f02 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -121,6 +121,7 @@ struct ftrace_event_call { int (*regfunc)(struct ftrace_event_call *); void (*unregfunc)(struct ftrace_event_call *); int id; + const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); int (*show_format)(struct ftrace_event_call *, struct trace_seq *); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6fe03e902ca..3351d85c83a3 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -722,8 +722,20 @@ static struct trace_event ftrace_event_type_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __entry +#define __entry REC + +#undef __print_flags +#undef __print_symbolic +#undef __get_dynamic_array +#undef __get_str + +#undef TP_printk +#define TP_printk(fmt, args...) 
"\"" fmt "\", " __stringify(args) + #undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static const char print_fmt_##call[] = print; #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ @@ -737,6 +749,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##template, \ .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -745,6 +758,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ \ +static const char print_fmt_##call[] = print; \ + \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ @@ -754,6 +769,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##call, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -837,6 +853,16 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #ifdef CONFIG_EVENT_PROFILE +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_str +#define __get_str(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) __addr = (a) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9978a4f40090..95d14b640a66 100644 --- a/kernel/trace/trace_export.c +++ 
b/kernel/trace/trace_export.c @@ -203,6 +203,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -218,6 +221,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -228,6 +234,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ + .print_fmt = print, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -- cgit v1.2.3 From c7ef3a9004201bca90626db246a19dadd2c29c9b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Dec 2009 21:13:59 -0500 Subject: tracing: Have syscall tracing call its own init function In the clean up of having all events call one specific function, the syscall event init was changed to call this helper function. With the new print_fmt updates, the syscalls need to do special initializations. This patch converts the syscall events to call its own init function again. 
Cc: Lai Jiangshan Cc: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 4 ++-- kernel/trace/trace_syscalls.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 207466a49f3d..ed353d274a77 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -143,7 +143,7 @@ struct perf_event_attr; .name = "sys_enter"#sname, \ .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ @@ -165,7 +165,7 @@ struct perf_event_attr; .name = "sys_exit"#sname, \ .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1352b0a36fac..a78e86349ecb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -450,14 +450,14 @@ int init_syscall_trace(struct ftrace_event_call *call) if (set_syscall_print_fmt(call) < 0) return -ENOMEM; - id = register_ftrace_event(call->event); - if (!id) { + id = trace_event_raw_init(call); + + if (id < 0) { free_syscall_print_fmt(call); - return -ENODEV; + return id; } - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + + return id; } int __init init_ftrace_syscalls(void) -- cgit v1.2.3 From 0fa0edaf32b9a78b9854f1da98d4511a501089b0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:57 +0800 Subject: tracing: Remove show_format and related macros from TRACE_EVENT The previous patches added the use of print_fmt string and changes the trace_define_field() function to also create 
the fields and format output for the event format files. text data bss dec hex filename 5857201 1355780 9336808 16549789 fc879d vmlinux 5884589 1351684 9337896 16574169 fce6d9 vmlinux-orig The above shows the size of the vmlinux after this patch set compared to the vmlinux-orig which is before the patch set. This saves us 27k on text, 1k on bss and adds just 4k of data. The total savings of 24k in size. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D4D.40604@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 - include/linux/syscalls.h | 2 - include/trace/ftrace.h | 133 ++---------------------------------------- include/trace/syscall.h | 4 -- kernel/trace/trace_events.c | 12 ---- kernel/trace/trace_export.c | 73 ----------------------- kernel/trace/trace_kprobe.c | 78 ------------------------- kernel/trace/trace_syscalls.c | 66 --------------------- 8 files changed, 6 insertions(+), 364 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bd23d8e52f02..84a5629adfd8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -123,8 +123,6 @@ struct ftrace_event_call { int id; const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); - int (*show_format)(struct ftrace_event_call *, - struct trace_seq *); int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ed353d274a77..7b219696ad24 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -144,7 +144,6 @@ struct perf_event_attr; .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ @@ -166,7 +165,6 @@ struct perf_event_attr; 
.system = "syscalls", \ .event = &exit_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3351d85c83a3..df65b99880b1 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,130 +130,6 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_ext -#define __field_ext(type, item, filter_type) __field(type, item) - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), \ - __data_loc_##item), \ - (unsigned int)sizeof(field.__data_loc_##item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __string 
-#define __string(item, src) __dynamic_array(char, item, -1) - -#undef __entry -#define __entry REC - -#undef __print_symbolic -#undef __get_dynamic_array -#undef __get_str - -#undef TP_printk -#define TP_printk(fmt, args...) "\"%s\", %s\n", fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_setup_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - return ret; \ -} \ - \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##call(unused, s); \ - if (!ret) \ - return ret; \ - \ - ret = trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, name, proto, args) - -#undef DEFINE_EVENT_PRINT -#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##template(unused, s); \ - if (!ret) \ - return ret; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - /* * Stage 3 of the trace events. * @@ -622,7 +498,6 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * .raw_init = trace_event_raw_init, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, - * .show_format = ftrace_format_, * } * */ @@ -657,6 +532,12 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #define __assign_str(dst, src) \ strcpy(__get_str(dst), src); +#undef TP_fast_assign +#define TP_fast_assign(args...) 
args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ \ @@ -750,7 +631,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##template, \ - .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } @@ -770,7 +650,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##call, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 961fda3556bb..8cd410254456 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -34,10 +34,6 @@ struct syscall_metadata { extern unsigned long arch_syscall_addr(int nr); extern int init_syscall_trace(struct ftrace_event_call *call); -extern int syscall_enter_format(struct ftrace_event_call *call, - struct trace_seq *s); -extern int syscall_exit_format(struct ftrace_event_call *call, - struct trace_seq *s); extern int syscall_enter_define_fields(struct ftrace_event_call *call); extern int syscall_exit_define_fields(struct ftrace_event_call *call); extern int reg_event_syscall_enter(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 250ec865d5f5..c2a3077b7353 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -520,14 +520,6 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? 
__bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -965,10 +957,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 95d14b640a66..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, 
item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -235,7 +163,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ .print_fmt = print, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 147491dccead..c99029916c76 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,82 +1174,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) -{ - int i; - - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; - - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; - - return trace_seq_puts(s, "\n"); -} - -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = 
trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) - -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); -} - -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; @@ -1504,12 +1428,10 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = 
kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a78e86349ecb..49cea70fbf6d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -143,54 +143,6 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); - } - - trace_seq_puts(s, "\nprint fmt: \""); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? 
"" : ", "); - if (!ret) - return 0; - } - trace_seq_putc(s, '"'); - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } - - return trace_seq_putc(s, '\n'); -} - static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { @@ -252,24 +204,6 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int ret; - struct syscall_trace_exit trace; - - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) - return 0; - - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); -} - int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; -- cgit v1.2.3 From 8558e3943df1c51c3377cb4e8a52ea484d6f357d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jan 2010 16:11:06 -0500 Subject: x86, ACPI: delete acpi_boot_table_init() return value cleanup only. setup_arch(), doesn't care care if ACPI initialization succeeded or failed, so delete acpi_boot_table_init()'s return value. Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 22 ++++++---------------- include/linux/acpi.h | 6 +++--- 2 files changed, 9 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb1035cd9a6a..036d28adf59d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1529,16 +1529,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... 
- * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1540,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1555,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 36924255c0d5..b926afe8c03e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -80,7 +80,7 @@ char * __acpi_map_table (unsigned long phys_addr, unsigned long size); void __acpi_unmap_table(char *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); -int acpi_boot_table_init (void); +void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); @@ -321,9 +321,9 @@ static inline int acpi_boot_init(void) return 0; } -static inline int acpi_boot_table_init(void) +static inline void acpi_boot_table_init(void) { - return 0; + return; } static inline int acpi_mps_check(void) -- cgit v1.2.3 From cfe79c00a2f4f687eed8b7534d1d3d3d35540c29 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 6 Jan 2010 17:23:23 +0000 Subject: NOMMU: Avoiding duplicate icache flushes of shared maps When working with FDPIC, there are many shared mappings of 
read-only code regions between applications (the C library, applet packages like busybox, etc.), but the current do_mmap_pgoff() function will issue an icache flush whenever a VMA is added to an MM instead of only doing it when the map is initially created. The flush can instead be done when a region is first mmapped PROT_EXEC. Note that we may not rely on the first mapping of a region being executable - it's possible for it to be PROT_READ only, so we have to remember whether we've flushed the region or not, and then flush the entire region when a bit of it is made executable. However, this also affects the brk area. That will no longer be executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but for NOMMU mode kernels, when it increases the brk allocation, making sys_brk() flush the extra from the icache should suffice. The brk area probably isn't used by NOMMU programs since the brk area can only use up the leavings from the stack allocation, where the stack allocation is larger than requested. Signed-off-by: David Howells Signed-off-by: Mike Frysinger Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ mm/nommu.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 84a524afb3dc..84d020bed083 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -123,6 +123,8 @@ struct vm_region { struct file *vm_file; /* the backing file or NULL */ atomic_t vm_usage; /* region usage count */ + bool vm_icache_flushed : 1; /* true if the icache has been flushed for + * this region */ }; /* diff --git a/mm/nommu.c b/mm/nommu.c index 6f9248f89bde..a8d17521624a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. 
*/ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; -- cgit v1.2.3 From 65324144b50bc7022cc9b6ca8f4a536a957019e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jan 2010 05:50:47 +0000 Subject: net: RFC3069, private VLAN proxy arp support This is to be used together with switch technologies, like RFC3069, where the individual ports are not allowed to communicate with each other, but they are allowed to talk to the upstream router. As described in RFC 3069, it is possible to allow these hosts to communicate through the upstream router by proxy_arp'ing. This patch basically allows proxy arp replies back to the same interface (from which the ARP request/solicitation was received). Tunable per device via proc "proxy_arp_pvlan": /proc/sys/net/ipv4/conf/*/proxy_arp_pvlan This switch technology is known by different vendor names: - In RFC 3069 it is called VLAN Aggregation. - Cisco and Allied Telesyn call it Private VLAN. - Hewlett-Packard call it Source-Port filtering or port-isolation. - Ericsson call it MAC-Forced Forwarding (RFC Draft). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 19 +++++++++++++ include/linux/inetdevice.h | 1 + include/linux/sysctl.h | 1 + net/ipv4/arp.c | 52 +++++++++++++++++++++++++++++++--- net/ipv4/devinet.c | 1 + net/ipv4/route.c | 7 ++++- 6 files changed, 76 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 006b39dec87d..c532884f4fec 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -692,6 +692,25 @@ proxy_arp - BOOLEAN conf/{all,interface}/proxy_arp is set to TRUE, it will be disabled otherwise +proxy_arp_pvlan - BOOLEAN + Private VLAN proxy arp. + Basically allow proxy arp replies back to the same interface + (from which the ARP request/solicitation was received). + + This is done to support (ethernet) switch features, like RFC + 3069, where the individual ports are NOT allowed to + communicate with each other, but they are allowed to talk to + the upstream router. As described in RFC 3069, it is possible + to allow these hosts to communicate through the upstream + router by proxy_arp'ing. Don't need to be used together with + proxy_arp. + + This technology is known by different names: + In RFC 3069 it is called VLAN Aggregation. + Cisco and Allied Telesyn call it Private VLAN. + Hewlett-Packard call it Source-Port filtering or port-isolation. + Ericsson call it MAC-Forced Forwarding (RFC Draft). + shared_media - BOOLEAN Send(router) or accept(host) RFC1620 shared media redirects. Overrides ip_secure_redirects. 
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 699e85c01a4d..9a8c57467d3d 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -88,6 +88,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) +#define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_CONF_GET(in_dev, PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 877ba039e6a4..24ff7e3a0d59 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -482,6 +482,7 @@ enum NET_IPV4_CONF_ARP_ACCEPT=21, NET_IPV4_CONF_ARP_NOTIFY=22, NET_IPV4_CONF_ACCEPT_LOCAL=23, + NET_IPV4_CONF_PROXY_ARP_PVLAN=24, __NET_IPV4_CONF_MAX }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c95cd93acf29..078709233bc4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -70,6 +70,7 @@ * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 
*/ #include @@ -524,12 +525,15 @@ int arp_bind_neighbour(struct dst_entry *dst) /* * Check if we can use proxy ARP for this path */ - -static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; + if (rt->u.dst.dev == dev) + return 0; + if (!IN_DEV_PROXY_ARP(in_dev)) return 0; @@ -547,6 +551,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) return (omi != imi && omi != -1); } +/* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->u.dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + /* * Interface to link layer: send routine and receive handler. 
*/ @@ -833,8 +874,11 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) + { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5cdbc102a418..0715f4cac391 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1407,6 +1407,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e446496f564f..1cc339441e7d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1988,8 +1988,13 @@ static int __mkroute_input(struct sk_buff *skb, if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. 
*/ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } -- cgit v1.2.3 From 6144a85a0e018c19bc4b24f7eb6c1f3f7431813d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 7 Jan 2010 11:58:36 -0600 Subject: maccess,probe_kernel: Allow arch specific override probe_kernel_(read|write) Some archs such as blackfin, would like to have an arch specific probe_kernel_read() and probe_kernel_write() implementation which can fall back to the generic implementation if no special operations are needed. CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: Jason Wessel Signed-off-by: Mike Frysinger --- include/linux/uaccess.h | 4 +++- mm/maccess.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6b58367d145e..d512d98dfb7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -94,6 +94,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, * happens, handle that and return -EFAULT. */ extern long probe_kernel_read(void *dst, void *src, size_t size); +extern long __probe_kernel_read(void *dst, void *src, size_t size); /* * probe_kernel_write(): safely attempt to write to a location @@ -104,6 +105,7 @@ extern long probe_kernel_read(void *dst, void *src, size_t size); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -extern long probe_kernel_write(void *dst, void *src, size_t size); +extern long notrace probe_kernel_write(void *dst, void *src, size_t size); +extern long notrace __probe_kernel_write(void *dst, void *src, size_t size); #endif /* __LINUX_UACCESS_H__ */ diff --git a/mm/maccess.c b/mm/maccess.c index 9073695ff25f..4e348dbaecd7 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. 
If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); -- cgit v1.2.3 From b11e1eca7ed9c0b5dab21a62c11acc711d9bdda0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Jan 2010 11:58:37 -0600 Subject: kgdb: Fix kernel-doc format error in kgdb.h linux-next-20081022//include/linux/kgdb.h:308): duplicate section name 'Description' and fix typos in that file's kernel-doc comments. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Jason Wessel --- include/linux/kgdb.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6adcc297e354..19ec41a183f5 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -29,8 +29,7 @@ struct pt_regs; * * On some architectures it is required to skip a breakpoint * exception when it occurs after a breakpoint has been removed. - * This can be implemented in the architecture specific portion of - * for kgdb. + * This can be implemented in the architecture specific portion of kgdb. 
*/ extern int kgdb_skipexception(int exception, struct pt_regs *regs); @@ -65,7 +64,7 @@ struct uart_port; /** * kgdb_breakpoint - compiled in breakpoint * - * This will be impelmented a static inline per architecture. This + * This will be implemented as a static inline per architecture. This * function is called by the kgdb core to execute an architecture * specific trap to cause kgdb to enter the exception processing. * @@ -190,7 +189,7 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code, * @flags: Current IRQ state * * On SMP systems, we need to get the attention of the other CPUs - * and get them be in a known state. This should do what is needed + * and get them into a known state. This should do what is needed * to get the other CPUs to call kgdb_wait(). Note that on some arches, * the NMI approach is not used for rounding up all the CPUs. For example, * in case of MIPS, smp_call_function() is used to roundup CPUs. In -- cgit v1.2.3 From 3c9732c06879d85f2fdf7ec69198c1d78da42a98 Mon Sep 17 00:00:00 2001 From: Giuseppe CAVALLARO Date: Wed, 6 Jan 2010 23:07:13 +0000 Subject: stmmac: add the new Header file for stmmac platform data Signed-off-by: Giuseppe Cavallaro Signed-off-by: David S. Miller --- include/linux/stmmac.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 include/linux/stmmac.h (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h new file mode 100644 index 000000000000..32bfd1a8a48d --- /dev/null +++ b/include/linux/stmmac.h @@ -0,0 +1,53 @@ +/******************************************************************************* + + Header file for stmmac platform data + + Copyright (C) 2009 STMicroelectronics Ltd + + This program is free software; you can redistribute it and/or modify it + under the terms and conditions of the GNU General Public License, + version 2, as published by the Free Software Foundation. 
+ + This program is distributed in the hope it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + + The full GNU General Public License is included in this distribution in + the file called "COPYING". + + Author: Giuseppe Cavallaro +*******************************************************************************/ + +#ifndef __STMMAC_PLATFORM_DATA +#define __STMMAC_PLATFORM_DATA + +/* platfrom data for platfrom device structure's platfrom_data field */ + +/* Private data for the STM on-board ethernet driver */ +struct plat_stmmacenet_data { + int bus_id; + int pbl; + int has_gmac; + void (*fix_mac_speed)(void *priv, unsigned int speed); + void (*bus_setup)(unsigned long ioaddr); +#ifdef CONFIG_STM_DRIVERS + struct stm_pad_config *pad_config; +#endif + void *bsp_priv; +}; + +struct plat_stmmacphy_data { + int bus_id; + int phy_addr; + unsigned int phy_mask; + int interface; + int (*phy_reset)(void *priv); + void *priv; +}; +#endif + -- cgit v1.2.3 From 0ed731859e24cd6e3ec058cf2b49b2a0df80e86b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 6 Jan 2010 09:23:54 +0900 Subject: LSM: Update comment on security_sock_rcv_skb It is not permitted to do sleeping operation inside security_sock_rcv_skb(). 
Signed-off-by: Tetsuo Handa Acked-by: Serge Hallyn -- Signed-off-by: James Morris --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 466cbadbd1ef..3696ca345745 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -978,6 +978,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check permissions on incoming network packets. This hook is distinct * from Netfilter's IP input hooks since it is the first time that the * incoming sk_buff @skb has been associated with a particular socket, @sk. + * Must not sleep inside this hook because some callers hold spinlocks. * @sk contains the sock (not socket) associated with the incoming sk_buff. * @skb contains the incoming network data. * @socket_getpeersec_stream: -- cgit v1.2.3 From dd3d145d49c5816b79acc6761ebbd842bc50b0ee Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:48 -0500 Subject: block: Fix discard alignment calculation and printing Discard alignment reporting for partitions was incorrect. Update to match the algorithm used elsewhere. The alignment can be negative (misaligned). Fix format string accordingly. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/blkdev.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index b11a4ad7d571..d13ba76a169c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -867,7 +867,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9b98173a8184..a41bcc8e140f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1148,8 +1148,11 @@ static inline int queue_discard_alignment(struct request_queue *q) static inline int queue_sector_discard_alignment(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.discard_alignment) - & (q->limits.discard_granularity - 1); + struct queue_limits *lim = &q->limits; + unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); + + return (lim->discard_granularity + lim->discard_alignment - alignment) + & (lim->discard_granularity - 1); } static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -- cgit v1.2.3 From 17be8c245054b9c7786545af3ba3ca4e54cd4ad9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:49 -0500 Subject: block: bdev_stack_limits wrapper DM does not want to know about partition offsets. Add a partition-aware wrapper that DM can use when stacking block devices. Signed-off-by: Martin K. 
Petersen Acked-by: Mike Snitzer Reviewed-by: Alasdair G Kergon Signed-off-by: Jens Axboe --- block/blk-settings.c | 22 ++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 127f82551855..5eeb9e0d256e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -639,6 +639,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * bdev_stack_limits - adjust queue limits for stacked drivers + * @t: the stacking driver limits (top device) + * @bdev: the component block_device (bottom) + * @start: first data sector within component device + * + * Description: + * Merges queue limits for a top device and a block_device. Returns + * 0 if alignment didn't change. Returns -1 if adding the bottom + * device caused misalignment. + */ +int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t start) +{ + struct request_queue *bq = bdev_get_queue(bdev); + + start += get_start_sect(bdev); + + return blk_stack_limits(t, &bq->limits, start << 9); +} +EXPORT_SYMBOL(bdev_stack_limits); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a41bcc8e140f..5c8018977efa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -938,6 +938,8 @@ extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); -- cgit 
v1.2.3 From ce289321b7dc1eb108e3df0dec872b7429ef49f7 Mon Sep 17 00:00:00 2001 From: Kirill Afonshin Date: Fri, 8 Jan 2010 22:09:59 +0300 Subject: block: removed unused as_io_context It isn't used anymore, since AS was deleted. Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 ----- include/linux/iocontext.h | 27 --------------------------- 2 files changed, 32 deletions(-) (limited to 'include/linux') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cbdabb0dd6d7..98e6bf61b0ac 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -39,8 +39,6 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); cfq_dtor(ioc); rcu_read_unlock(); @@ -76,8 +74,6 @@ void exit_io_context(struct task_struct *task) task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); cfq_exit(ioc); } @@ -97,7 +93,6 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index a63235996309..78ef023227d4 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -4,32 +4,6 @@ #include #include -/* - * This is the per-process anticipatory I/O scheduler state. 
- */ -struct as_io_context { - spinlock_t lock; - - void (*dtor)(struct as_io_context *aic); /* destructor */ - void (*exit)(struct as_io_context *aic); /* called on task exit */ - - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; -}; - struct cfq_queue; struct cfq_io_context { void *key; @@ -78,7 +52,6 @@ struct io_context { unsigned long last_waited; /* Time last woken after wait for request */ int nr_batch_requests; /* Number of requests left in the batch */ - struct as_io_context *aic; struct radix_tree_root radix_root; struct hlist_head cic_list; void *ioc_data; -- cgit v1.2.3 From e03a72e13648ac6277bf2bab6b8324d51f89c0fa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:51 -0500 Subject: block: Stop using byte offsets All callers of the stacking functions use 512-byte sector units rather than byte offsets. Simplify the code so the stacking functions take sectors when specifying data offsets. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 +++++++++----------------- fs/partitions/check.c | 7 ++++--- include/linux/blkdev.h | 17 +++++------------ 3 files changed, 18 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..78549c723783 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -507,7 +507,7 @@ static unsigned int lcm(unsigned int a, unsigned int b) * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +525,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -548,7 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. 
@@ -611,11 +610,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +652,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +663,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. 
*/ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +672,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 64bc8998ac9a..e8865c11777f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); - p->discard_alignment = queue_sector_discard_alignment(disk->queue, - start); + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c8018977efa..ffb13ad35716 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1112,18 +1112,13 @@ static inline int queue_alignment_offset(struct request_queue *q) return q->limits.alignment_offset; } -static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = (sector << 9) & (granularity - 1); - offset &= granularity - 1; - return (granularity + lim->alignment_offset - offset) & (granularity - 1); -} - -static inline int queue_sector_alignment_offset(struct request_queue *q, - sector_t sector) 
-{ - return queue_limit_alignment_offset(&q->limits, sector << 9); + return (granularity + lim->alignment_offset - alignment) + & (granularity - 1); } static inline int bdev_alignment_offset(struct block_device *bdev) @@ -1147,10 +1142,8 @@ static inline int queue_discard_alignment(struct request_queue *q) return q->limits.discard_alignment; } -static inline int queue_sector_discard_alignment(struct request_queue *q, - sector_t sector) +static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { - struct queue_limits *lim = &q->limits; unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); return (lim->discard_granularity + lim->discard_alignment - alignment) -- cgit v1.2.3 From 7af92f8754b87bc78cbfd447d5f4096b25c46682 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Jan 2010 15:45:55 -0800 Subject: genhd: overlapping variable definition This fixes the sparse warning: fs/ext4/super.c:2390:40: warning: symbol 'i' shadows an earlier one fs/ext4/super.c:2368:22: originally declared here Using 'i' in a macro is dubious practice. 
Signed-off-by: Stephen Hemminger Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c6c0c41af35f..9717081c75ad 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -256,9 +256,9 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, #define part_stat_read(part, field) \ ({ \ typeof((part)->dkstats->field) res = 0; \ - int i; \ - for_each_possible_cpu(i) \ - res += per_cpu_ptr((part)->dkstats, i)->field; \ + unsigned int _cpu; \ + for_each_possible_cpu(_cpu) \ + res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ res; \ }) -- cgit v1.2.3 From 4b529401c5089cf33f7165607cbc2fde43357bfb Mon Sep 17 00:00:00 2001 From: Andreas Fenkart Date: Fri, 8 Jan 2010 14:42:31 -0800 Subject: mm: make totalhigh_pages unsigned long Makes it consistent with the extern declaration, used when CONFIG_HIGHMEM is set. Removes redundant casts in printout messages. Signed-off-by: Andreas Fenkart Acked-by: Russell King Cc: Ralf Baechle Cc: David Howells Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/mn10300/mm/init.c | 3 +-- arch/score/mm/init.c | 2 +- arch/x86/mm/init_32.c | 3 +-- include/linux/highmem.h | 2 +- 7 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 52c40d155672..a04ffbbbe253 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -616,7 +616,7 @@ void __init mem_init(void) "%dK data, %dK init, %luK highmem)\n", nr_free_pages() << (PAGE_SHIFT-10), codesize >> 10, datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))); + totalhigh_pages << (PAGE_SHIFT-10)); if (PAGE_SIZE >= 16384 && num_physpages <= 128) { extern int sysctl_overcommit_memory; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 9e8d00389eef..1651942f7feb 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -424,7 +424,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))); + totalhigh_pages << (PAGE_SHIFT-10)); } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index f61c164d1e67..bc1297109cc5 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -505,5 +505,5 @@ void __init mem_init(void) (num_physpages - tmp) << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))); + totalhigh_pages << (PAGE_SHIFT-10)); } diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index ec1420562dc7..dd27a9a35152 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -118,8 +118,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10, - (unsigned 
long) (totalhigh_pages << (PAGE_SHIFT - 10)) - ); + totalhigh_pages << (PAGE_SHIFT - 10)); } /* diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 8c15b2c85d5a..dfaf458d6702 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -106,7 +106,7 @@ void __init mem_init(void) ram << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))); + totalhigh_pages << (PAGE_SHIFT-10)); } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c973f8e2a6cf..9a0c258a86be 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -892,8 +892,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); + totalhigh_pages << (PAGE_SHIFT-10)); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 211ff4497269..ab2cc20e21a5 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -46,7 +46,7 @@ void kmap_flush_unused(void); static inline unsigned int nr_free_highpages(void) { return 0; } -#define totalhigh_pages 0 +#define totalhigh_pages 0UL #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) -- cgit v1.2.3 From e992cd9b72a18122bd5c958715623057f110793f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 8 Jan 2010 14:42:35 -0800 Subject: kmemcheck: make bitfield annotations truly no-ops when disabled It turns out that even zero-sized struct members (int foo[0];) will affect the struct layout, causing us in particular to lose 4 bytes in struct sock. This patch fixes the regression in CONFIG_KMEMCHECK=n case. 
Reported-by: Eric Dumazet Signed-off-by: Vegard Nossum Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemcheck.h | 110 ++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index e880d4cf9e22..08d7dc4ddf40 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -36,6 +36,56 @@ int kmemcheck_hide_addr(unsigned long address); bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); +/* + * Bitfield annotations + * + * How to use: If you have a struct using bitfields, for example + * + * struct a { + * int x:8, y:8; + * }; + * + * then this should be rewritten as + * + * struct a { + * kmemcheck_bitfield_begin(flags); + * int x:8, y:8; + * kmemcheck_bitfield_end(flags); + * }; + * + * Now the "flags_begin" and "flags_end" members may be used to refer to the + * beginning and end, respectively, of the bitfield (and things like + * &x.flags_begin is allowed). 
As soon as the struct is allocated, the bit- + * fields should be annotated: + * + * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); + * kmemcheck_annotate_bitfield(a, flags); + */ +#define kmemcheck_bitfield_begin(name) \ + int name##_begin[0]; + +#define kmemcheck_bitfield_end(name) \ + int name##_end[0]; + +#define kmemcheck_annotate_bitfield(ptr, name) \ + do { \ + int _n; \ + \ + if (!ptr) \ + break; \ + \ + _n = (long) &((ptr)->name##_end) \ + - (long) &((ptr)->name##_begin); \ + MAYBE_BUILD_BUG_ON(_n < 0); \ + \ + kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ + } while (0) + +#define kmemcheck_annotate_variable(var) \ + do { \ + kmemcheck_mark_initialized(&(var), sizeof(var)); \ + } while (0) \ + #else #define kmemcheck_enabled 0 @@ -106,60 +156,16 @@ static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) return true; } -#endif /* CONFIG_KMEMCHECK */ - -/* - * Bitfield annotations - * - * How to use: If you have a struct using bitfields, for example - * - * struct a { - * int x:8, y:8; - * }; - * - * then this should be rewritten as - * - * struct a { - * kmemcheck_bitfield_begin(flags); - * int x:8, y:8; - * kmemcheck_bitfield_end(flags); - * }; - * - * Now the "flags_begin" and "flags_end" members may be used to refer to the - * beginning and end, respectively, of the bitfield (and things like - * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- - * fields should be annotated: - * - * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); - * kmemcheck_annotate_bitfield(a, flags); - * - * Note: We provide the same definitions for both kmemcheck and non- - * kmemcheck kernels. This makes it harder to introduce accidental errors. It - * is also allowed to pass NULL pointers to kmemcheck_annotate_bitfield(). 
- */ -#define kmemcheck_bitfield_begin(name) \ - int name##_begin[0]; - -#define kmemcheck_bitfield_end(name) \ - int name##_end[0]; +#define kmemcheck_bitfield_begin(name) +#define kmemcheck_bitfield_end(name) +#define kmemcheck_annotate_bitfield(ptr, name) \ + do { \ + } while (0) -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - int _n; \ - \ - if (!ptr) \ - break; \ - \ - _n = (long) &((ptr)->name##_end) \ - - (long) &((ptr)->name##_begin); \ - MAYBE_BUILD_BUG_ON(_n < 0); \ - \ - kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ +#define kmemcheck_annotate_variable(var) \ + do { \ } while (0) -#define kmemcheck_annotate_variable(var) \ - do { \ - kmemcheck_mark_initialized(&(var), sizeof(var)); \ - } while (0) \ +#endif /* CONFIG_KMEMCHECK */ #endif /* LINUX_KMEMCHECK_H */ -- cgit v1.2.3 From 7dd65feb6c603e13eba501c34c662259ab38e70e Mon Sep 17 00:00:00 2001 From: Albin Tonnerre Date: Fri, 8 Jan 2010 14:42:42 -0800 Subject: lib: add support for LZO-compressed kernels This patch series adds generic support for creating and extracting LZO-compressed kernel images, as well as support for using such images on the x86 and ARM architectures, and support for creating and using LZO-compressed initrd and initramfs images. Russell King said: : Testing on a Cortex A9 model: : - lzo decompressor is 65% of the time gzip takes to decompress a kernel : - lzo kernel is 9% larger than a gzip kernel : : which I'm happy to say confirms your figures when comparing the two. : : However, when comparing your new gzip code to the old gzip code: : - new is 99% of the size of the old code : - new takes 42% of the time to decompress than the old code : : What this means is that for a proper comparison, the results get even better: : - lzo is 7.5% larger than the old gzip'd kernel image : - lzo takes 28% of the time that the old gzip code took : : So the expense seems definitely worth the effort. 
The only reason I : can think of ever using gzip would be if you needed the additional : compression (eg, because you have limited flash to store the image.) : : I would argue that the default for ARM should therefore be LZO. This patch: The lzo compressor is worse than gzip at compression, but faster at extraction. Here are some figures for an ARM board I'm working on: Uncompressed size: 3.24Mo gzip 1.61Mo 0.72s lzo 1.75Mo 0.48s So for a compression ratio that is still relatively close to gzip, it's much faster to extract, at least in that case. This part contains: - Makefile routine to support lzo compression - Fixes to the existing lzo compressor so that it can be used in compressed kernels - wrapper around the existing lzo1x_decompress, as it only extracts one block at a time, while we need to extract a whole file here - config dialog for kernel compression [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: cleanup] Signed-off-by: Albin Tonnerre Tested-by: Wu Zhangjin Acked-by: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Tested-by: Russell King Acked-by: Russell King Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/unlzo.h | 10 ++ init/Kconfig | 18 +++- lib/decompress_unlzo.c | 209 +++++++++++++++++++++++++++++++++++++++ lib/lzo/lzo1x_decompress.c | 9 +- scripts/Makefile.lib | 5 + 5 files changed, 244 insertions(+), 7 deletions(-) create mode 100644 include/linux/decompress/unlzo.h create mode 100644 lib/decompress_unlzo.c (limited to 'include/linux') diff --git a/include/linux/decompress/unlzo.h b/include/linux/decompress/unlzo.h new file mode 100644 index 000000000000..987229752519 --- /dev/null +++ b/include/linux/decompress/unlzo.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZO_H +#define DECOMPRESS_UNLZO_H + +int unlzo(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..d95ca7cd5d45 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -115,10 +115,13 @@ config HAVE_KERNEL_BZIP2 config HAVE_KERNEL_LZMA bool +config HAVE_KERNEL_LZO + bool + choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_LZO help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -141,9 +144,8 @@ config KERNEL_GZIP bool "Gzip" depends on HAVE_KERNEL_GZIP help - The old and tried gzip compression. Its compression ratio is - the poorest among the 3 choices; however its speed (both - compression and decompression) is the fastest. + The old and tried gzip compression. It provides a good balance + between compression ratio and decompression speed. 
config KERNEL_BZIP2 bool "Bzip2" @@ -164,6 +166,14 @@ config KERNEL_LZMA two. Compression is slowest. The kernel size is about 33% smaller with LZMA in comparison to gzip. +config KERNEL_LZO + bool "LZO" + depends on HAVE_KERNEL_LZO + help + Its compression ratio is the poorest among the 4. The kernel + size is about about 10% bigger than gzip; however its speed + (both compression and decompression) is the fastest. + endchoice config SWAP diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c new file mode 100644 index 000000000000..db521f45626e --- /dev/null +++ b/lib/decompress_unlzo.c @@ -0,0 +1,209 @@ +/* + * LZO decompressor for the Linux kernel. Code borrowed from the lzo + * implementation by Markus Franz Xaver Johannes Oberhumer. + * + * Linux kernel adaptation: + * Copyright (C) 2009 + * Albin Tonnerre, Free Electrons + * + * Original code: + * Copyright (C) 1996-2005 Markus Franz Xaver Johannes Oberhumer + * All Rights Reserved. + * + * lzop and the LZO library are free software; you can redistribute them + * and/or modify them under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. + * If not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Markus F.X.J. 
Oberhumer + * + * http://www.oberhumer.com/opensource/lzop/ + */ + +#ifdef STATIC +#include "lzo/lzo1x_decompress.c" +#else +#include +#include +#endif + +#include +#include +#include + +#include +#include + +static const unsigned char lzop_magic[] = { + 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a }; + +#define LZO_BLOCK_SIZE (256*1024l) +#define HEADER_HAS_FILTER 0x00000800L + +STATIC inline int INIT parse_header(u8 *input, u8 *skip) +{ + int l; + u8 *parse = input; + u8 level = 0; + u16 version; + + /* read magic: 9 first bits */ + for (l = 0; l < 9; l++) { + if (*parse++ != lzop_magic[l]) + return 0; + } + /* get version (2bytes), skip library version (2), + * 'need to be extracted' version (2) and + * method (1) */ + version = get_unaligned_be16(parse); + parse += 7; + if (version >= 0x0940) + level = *parse++; + if (get_unaligned_be32(parse) & HEADER_HAS_FILTER) + parse += 8; /* flags + filter info */ + else + parse += 4; /* flags */ + + /* skip mode and mtime_low */ + parse += 8; + if (version >= 0x0940) + parse += 4; /* skip mtime_high */ + + l = *parse++; + /* don't care about the file name, and skip checksum */ + parse += l + 4; + + *skip = parse - input; + return 1; +} + +STATIC inline int INIT unlzo(u8 *input, int in_len, + int (*fill) (void *, unsigned int), + int (*flush) (void *, unsigned int), + u8 *output, int *posp, + void (*error_fn) (char *x)) +{ + u8 skip = 0, r = 0; + u32 src_len, dst_len; + size_t tmp; + u8 *in_buf, *in_buf_save, *out_buf; + int obytes_processed = 0; + + set_error_fn(error_fn); + + if (output) { + out_buf = output; + } else if (!flush) { + error("NULL output pointer and no flush function provided"); + goto exit; + } else { + out_buf = malloc(LZO_BLOCK_SIZE); + if (!out_buf) { + error("Could not allocate output buffer"); + goto exit; + } + } + + if (input && fill) { + error("Both input pointer and fill function provided, don't know what to do"); + goto exit_1; + } else if (input) { + in_buf = input; + } else if (!fill 
|| !posp) { + error("NULL input pointer and missing position pointer or fill function"); + goto exit_1; + } else { + in_buf = malloc(lzo1x_worst_compress(LZO_BLOCK_SIZE)); + if (!in_buf) { + error("Could not allocate input buffer"); + goto exit_1; + } + } + in_buf_save = in_buf; + + if (posp) + *posp = 0; + + if (fill) + fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); + + if (!parse_header(input, &skip)) { + error("invalid header"); + goto exit_2; + } + in_buf += skip; + + if (posp) + *posp = skip; + + for (;;) { + /* read uncompressed block size */ + dst_len = get_unaligned_be32(in_buf); + in_buf += 4; + + /* exit if last block */ + if (dst_len == 0) { + if (posp) + *posp += 4; + break; + } + + if (dst_len > LZO_BLOCK_SIZE) { + error("dest len longer than block size"); + goto exit_2; + } + + /* read compressed block size, and skip block checksum info */ + src_len = get_unaligned_be32(in_buf); + in_buf += 8; + + if (src_len <= 0 || src_len > dst_len) { + error("file corrupted"); + goto exit_2; + } + + /* decompress */ + tmp = dst_len; + r = lzo1x_decompress_safe((u8 *) in_buf, src_len, + out_buf, &tmp); + + if (r != LZO_E_OK || dst_len != tmp) { + error("Compressed data violation"); + goto exit_2; + } + + obytes_processed += dst_len; + if (flush) + flush(out_buf, dst_len); + if (output) + out_buf += dst_len; + if (posp) + *posp += src_len + 12; + if (fill) { + in_buf = in_buf_save; + fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); + } else + in_buf += src_len; + } + +exit_2: + if (!input) + free(in_buf); +exit_1: + if (!output) + free(out_buf); +exit: + return obytes_processed; +} + +#define decompress unlzo diff --git a/lib/lzo/lzo1x_decompress.c b/lib/lzo/lzo1x_decompress.c index 5dc6b29c1575..f2fd09850223 100644 --- a/lib/lzo/lzo1x_decompress.c +++ b/lib/lzo/lzo1x_decompress.c @@ -11,11 +11,13 @@ * Richard Purdie */ +#ifndef STATIC #include #include -#include -#include +#endif + #include +#include #include "lzodefs.h" #define HAVE_IP(x, ip_end, ip) 
((size_t)(ip_end - ip) < (x)) @@ -244,9 +246,10 @@ lookbehind_overrun: *out_len = op - out; return LZO_E_LOOKBEHIND_OVERRUN; } - +#ifndef STATIC EXPORT_SYMBOL_GPL(lzo1x_decompress_safe); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZO1X Decompressor"); +#endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cd815ac2a50b..0fe48cd91ffa 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -235,3 +235,8 @@ quiet_cmd_lzma = LZMA $@ cmd_lzma = (cat $(filter-out FORCE,$^) | \ lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) + +quiet_cmd_lzo = LZO $@ +cmd_lzo = (cat $(filter-out FORCE,$^) | \ + lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) -- cgit v1.2.3 From 80884094e34456887ecdbd107d40e72c4a40f9c9 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 8 Jan 2010 14:43:08 -0800 Subject: gpio: adp5588-gpio: new driver for ADP5588 GPIO expanders Signed-off-by: Michael Hennerich Signed-off-by: Mike Frysinger Cc: Jean Delvare Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/Kconfig | 9 ++ drivers/gpio/Makefile | 1 + drivers/gpio/adp5588-gpio.c | 266 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c/adp5588.h | 12 ++ 4 files changed, 288 insertions(+) create mode 100644 drivers/gpio/adp5588-gpio.c (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index a019b49ecc9b..1f1d88ae68d6 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -172,6 +172,15 @@ config GPIO_ADP5520 To compile this driver as a module, choose M here: the module will be called adp5520-gpio. +config GPIO_ADP5588 + tristate "ADP5588 I2C GPIO expander" + depends on I2C + help + This option enables support for 18 GPIOs found + on Analog Devices ADP5588 GPIO Expanders. + To compile this driver as a module, choose M here: the module will be + called adp5588-gpio. 
+ comment "PCI GPIO expanders:" config GPIO_CS5535 diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 52fe4cf734c7..48687238edb1 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -5,6 +5,7 @@ ccflags-$(CONFIG_DEBUG_GPIO) += -DDEBUG obj-$(CONFIG_GPIOLIB) += gpiolib.o obj-$(CONFIG_GPIO_ADP5520) += adp5520-gpio.o +obj-$(CONFIG_GPIO_ADP5588) += adp5588-gpio.o obj-$(CONFIG_GPIO_LANGWELL) += langwell_gpio.o obj-$(CONFIG_GPIO_MAX7301) += max7301.o obj-$(CONFIG_GPIO_MAX732X) += max732x.o diff --git a/drivers/gpio/adp5588-gpio.c b/drivers/gpio/adp5588-gpio.c new file mode 100644 index 000000000000..afc097a16b33 --- /dev/null +++ b/drivers/gpio/adp5588-gpio.c @@ -0,0 +1,266 @@ +/* + * GPIO Chip driver for Analog Devices + * ADP5588 I/O Expander and QWERTY Keypad Controller + * + * Copyright 2009 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#include +#include +#include +#include +#include + +#include + +#define DRV_NAME "adp5588-gpio" +#define MAXGPIO 18 +#define ADP_BANK(offs) ((offs) >> 3) +#define ADP_BIT(offs) (1u << ((offs) & 0x7)) + +struct adp5588_gpio { + struct i2c_client *client; + struct gpio_chip gpio_chip; + struct mutex lock; /* protect cached dir, dat_out */ + unsigned gpio_start; + uint8_t dat_out[3]; + uint8_t dir[3]; +}; + +static int adp5588_gpio_read(struct i2c_client *client, u8 reg) +{ + int ret = i2c_smbus_read_byte_data(client, reg); + + if (ret < 0) + dev_err(&client->dev, "Read Error\n"); + + return ret; +} + +static int adp5588_gpio_write(struct i2c_client *client, u8 reg, u8 val) +{ + int ret = i2c_smbus_write_byte_data(client, reg, val); + + if (ret < 0) + dev_err(&client->dev, "Write Error\n"); + + return ret; +} + +static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off) +{ + struct adp5588_gpio *dev = + container_of(chip, struct adp5588_gpio, gpio_chip); + + return !!(adp5588_gpio_read(dev->client, GPIO_DAT_STAT1 + ADP_BANK(off)) + & ADP_BIT(off)); +} + +static void 
adp5588_gpio_set_value(struct gpio_chip *chip, + unsigned off, int val) +{ + unsigned bank, bit; + struct adp5588_gpio *dev = + container_of(chip, struct adp5588_gpio, gpio_chip); + + bank = ADP_BANK(off); + bit = ADP_BIT(off); + + mutex_lock(&dev->lock); + if (val) + dev->dat_out[bank] |= bit; + else + dev->dat_out[bank] &= ~bit; + + adp5588_gpio_write(dev->client, GPIO_DAT_OUT1 + bank, + dev->dat_out[bank]); + mutex_unlock(&dev->lock); +} + +static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off) +{ + int ret; + unsigned bank; + struct adp5588_gpio *dev = + container_of(chip, struct adp5588_gpio, gpio_chip); + + bank = ADP_BANK(off); + + mutex_lock(&dev->lock); + dev->dir[bank] &= ~ADP_BIT(off); + ret = adp5588_gpio_write(dev->client, GPIO_DIR1 + bank, dev->dir[bank]); + mutex_unlock(&dev->lock); + + return ret; +} + +static int adp5588_gpio_direction_output(struct gpio_chip *chip, + unsigned off, int val) +{ + int ret; + unsigned bank, bit; + struct adp5588_gpio *dev = + container_of(chip, struct adp5588_gpio, gpio_chip); + + bank = ADP_BANK(off); + bit = ADP_BIT(off); + + mutex_lock(&dev->lock); + dev->dir[bank] |= bit; + + if (val) + dev->dat_out[bank] |= bit; + else + dev->dat_out[bank] &= ~bit; + + ret = adp5588_gpio_write(dev->client, GPIO_DAT_OUT1 + bank, + dev->dat_out[bank]); + ret |= adp5588_gpio_write(dev->client, GPIO_DIR1 + bank, + dev->dir[bank]); + mutex_unlock(&dev->lock); + + return ret; +} + +static int __devinit adp5588_gpio_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct adp5588_gpio_platform_data *pdata = client->dev.platform_data; + struct adp5588_gpio *dev; + struct gpio_chip *gc; + int ret, i, revid; + + if (pdata == NULL) { + dev_err(&client->dev, "missing platform data\n"); + return -ENODEV; + } + + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_BYTE_DATA)) { + dev_err(&client->dev, "SMBUS Byte Data not Supported\n"); + return -EIO; + } + + dev = kzalloc(sizeof(*dev), 
GFP_KERNEL); + if (dev == NULL) { + dev_err(&client->dev, "failed to alloc memory\n"); + return -ENOMEM; + } + + dev->client = client; + + gc = &dev->gpio_chip; + gc->direction_input = adp5588_gpio_direction_input; + gc->direction_output = adp5588_gpio_direction_output; + gc->get = adp5588_gpio_get_value; + gc->set = adp5588_gpio_set_value; + gc->can_sleep = 1; + + gc->base = pdata->gpio_start; + gc->ngpio = MAXGPIO; + gc->label = client->name; + gc->owner = THIS_MODULE; + + mutex_init(&dev->lock); + + + ret = adp5588_gpio_read(dev->client, DEV_ID); + if (ret < 0) + goto err; + + revid = ret & ADP5588_DEVICE_ID_MASK; + + for (i = 0, ret = 0; i <= ADP_BANK(MAXGPIO); i++) { + dev->dat_out[i] = adp5588_gpio_read(client, GPIO_DAT_OUT1 + i); + dev->dir[i] = adp5588_gpio_read(client, GPIO_DIR1 + i); + ret |= adp5588_gpio_write(client, KP_GPIO1 + i, 0); + ret |= adp5588_gpio_write(client, GPIO_PULL1 + i, + (pdata->pullup_dis_mask >> (8 * i)) & 0xFF); + + if (ret) + goto err; + } + + ret = gpiochip_add(&dev->gpio_chip); + if (ret) + goto err; + + dev_info(&client->dev, "gpios %d..%d on a %s Rev. 
%d\n", + gc->base, gc->base + gc->ngpio - 1, + client->name, revid); + + if (pdata->setup) { + ret = pdata->setup(client, gc->base, gc->ngpio, pdata->context); + if (ret < 0) + dev_warn(&client->dev, "setup failed, %d\n", ret); + } + + i2c_set_clientdata(client, dev); + return 0; + +err: + kfree(dev); + return ret; +} + +static int __devexit adp5588_gpio_remove(struct i2c_client *client) +{ + struct adp5588_gpio_platform_data *pdata = client->dev.platform_data; + struct adp5588_gpio *dev = i2c_get_clientdata(client); + int ret; + + if (pdata->teardown) { + ret = pdata->teardown(client, + dev->gpio_chip.base, dev->gpio_chip.ngpio, + pdata->context); + if (ret < 0) { + dev_err(&client->dev, "teardown failed %d\n", ret); + return ret; + } + } + + ret = gpiochip_remove(&dev->gpio_chip); + if (ret) { + dev_err(&client->dev, "gpiochip_remove failed %d\n", ret); + return ret; + } + + kfree(dev); + return 0; +} + +static const struct i2c_device_id adp5588_gpio_id[] = { + {DRV_NAME, 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, adp5588_gpio_id); + +static struct i2c_driver adp5588_gpio_driver = { + .driver = { + .name = DRV_NAME, + }, + .probe = adp5588_gpio_probe, + .remove = __devexit_p(adp5588_gpio_remove), + .id_table = adp5588_gpio_id, +}; + +static int __init adp5588_gpio_init(void) +{ + return i2c_add_driver(&adp5588_gpio_driver); +} + +module_init(adp5588_gpio_init); + +static void __exit adp5588_gpio_exit(void) +{ + i2c_del_driver(&adp5588_gpio_driver); +} + +module_exit(adp5588_gpio_exit); + +MODULE_AUTHOR("Michael Hennerich "); +MODULE_DESCRIPTION("GPIO ADP5588 Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c/adp5588.h b/include/linux/i2c/adp5588.h index fc5db826b48e..02c9af374741 100644 --- a/include/linux/i2c/adp5588.h +++ b/include/linux/i2c/adp5588.h @@ -89,4 +89,16 @@ struct adp5588_kpad_platform_data { unsigned short unlock_key2; /* Unlock Key 2 */ }; +struct adp5588_gpio_platform_data { + unsigned gpio_start; /* GPIO Chip base # */ + unsigned 
pullup_dis_mask; /* Pull-Up Disable Mask */ + int (*setup)(struct i2c_client *client, + int gpio, unsigned ngpio, + void *context); + int (*teardown)(struct i2c_client *client, + int gpio, unsigned ngpio, + void *context); + void *context; +}; + #endif -- cgit v1.2.3 From a29815a333c6c6e677294bbe5958e771d0aad3fd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 16:28:09 +0200 Subject: core, x86: make LIST_POISON less deadly The list macros use LIST_POISON1 and LIST_POISON2 as non-dereferenceable pointers in order to trap erroneous use of freed list_heads. Unfortunately userspace can arrange for those pointers to actually be dereferenceable, potentially turning an oops into an exploit. To avoid this, allow architectures (currently x86_64 only) to override the default values for these pointers with truly non-dereferenceable values. This is easy on x86_64 as the virtual address space is large and contains areas that cannot be mapped. Other 64-bit architectures will likely find similar unmapped ranges. 
[ingo: switch to 0xdead000000000000 as the unmapped area] [ingo: add comments, cleanup] [jaswinder: eliminate sparse warnings] Acked-by: Linus Torvalds Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar Signed-off-by: Avi Kivity Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 5 +++++ include/linux/poison.h | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6bf1f1ac478c..cbcbfdee3ee0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1247,6 +1247,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE diff --git a/include/linux/poison.h b/include/linux/poison.h index 7fc194aef8c2..2110a81c5e2a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -2,13 +2,25 @@ #define _LINUX_POISON_H /********** include/linux/list.h **********/ + +/* + * Architectures might want to move the poison pointer offset + * into some well-recognized area such as 0xdead000000000000, + * that is also not mappable by user-space exploits: + */ +#ifdef CONFIG_ILLEGAL_POINTER_VALUE +# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL) +#else +# define POISON_POINTER_DELTA 0 +#endif + /* * These are non-NULL pointers that will result in page faults * under normal circumstances, used to verify that nobody uses * non-initialized list entries. 
*/ -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) +#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) /********** include/linux/timer.h **********/ /* -- cgit v1.2.3 From d218d11133d888f9745802146a50255a4781d37a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2010 16:28:01 -0800 Subject: tcp: Generalized TTL Security Mechanism This patch adds the kernel portions needed to implement RFC 5082 Generalized TTL Security Mechanism (GTSM). It is a lightweight security measure against forged packets causing DoS attacks (for BGP). This is already implemented the same way in BSD kernels. For the necessary Quagga patch http://www.gossamer-threads.com/lists/quagga/dev/17389 Description from Cisco http://www.cisco.com/en/US/docs/ios/12_3t/12_3t7/feature/guide/gt_btsh.html It does add one byte to each socket structure; I did a little rearrangement to reuse a hole (on 64 bit), but it does grow the structure on 32 bit. This should be documented on the ip(4) man page and the Glibc in.h file also needs an update. IPV6_MINHOPLIMIT should also be added (although BSD doesn't support that). Only TCP is supported, but could also be added to UDP, DCCP, SCTP if desired. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/in.h | 2 ++ include/net/inet_sock.h | 4 +++- net/ipv4/ip_sockglue.c | 14 +++++++++++++- net/ipv4/tcp_ipv4.c | 3 +++ 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/in.h b/include/linux/in.h index b615649db129..583c76f9c30f 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -84,6 +84,8 @@ struct in_addr { #define IP_ORIGDSTADDR 20 #define IP_RECVORIGDSTADDR IP_ORIGDSTADDR +#define IP_MINTTL 21 + /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ #define IP_PMTUDISC_WANT 1 /* Use per route hints */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index bd4c53f75ac0..83fd34437cf1 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -122,10 +122,12 @@ struct inet_sock { __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; - struct ip_options *opt; __be16 inet_sport; __u16 inet_id; + + struct ip_options *opt; __u8 tos; + __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 recverr:1, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cafad9baff03..644dc43a55de 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -451,7 +451,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<transparent = !!val; break; + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + default: err = -ENOPROTOOPT; break; @@ -1198,6 +1207,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet->transparent; break; + case IP_MINTTL: + val = inet->min_ttl; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65b8ebfd078a..382f667238ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,9 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; + if (iph->ttl < inet_sk(sk)->min_ttl) + goto 
discard_and_relse; + process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; -- cgit v1.2.3 From a393db6f10ef2d4f28257234cfc730e744dfb6a4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 22 Dec 2009 13:35:52 +0100 Subject: drbd: Allow online resizing of DRBD devices while peer not reachable (needs to be explicitly forced) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 5 ++--- drivers/block/drbd/drbd_nl.c | 17 +++++++++++------ drivers/block/drbd/drbd_receiver.c | 4 ++-- include/linux/drbd_nl.h | 1 + 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 79d8e22c4d0d..2bf3a6ef3684 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1371,10 +1371,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); -extern sector_t drbd_new_dev_size(struct drbd_conf *, - struct drbd_backing_dev *); +extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local); +extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 3313901a4861..1292e0620663 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -510,7 +510,7 @@ void 
drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) +enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size; @@ -541,7 +541,7 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_ho /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); - size = drbd_new_dev_size(mdev, mdev->ldev); + size = drbd_new_dev_size(mdev, mdev->ldev, force); if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { @@ -596,7 +596,7 @@ out: } sector_t -drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. */ sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ @@ -606,6 +606,11 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) m_size = drbd_get_max_capacity(bdev); + if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) { + dev_warn(DEV, "Resize while not connected was forced by the user!\n"); + p_size = m_size; + } + if (p_size && m_size) { size = min_t(sector_t, p_size, m_size); } else { @@ -965,7 +970,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp /* Prevent shrinking of consistent devices ! 
*/ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && - drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { + drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); retcode = ERR_DISK_TO_SMALL; goto force_diskless_dec; @@ -1052,7 +1057,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) set_bit(USE_DEGR_WFC_T, &mdev->flags); - dd = drbd_determin_dev_size(mdev); + dd = drbd_determin_dev_size(mdev, 0); if (dd == dev_size_error) { retcode = ERR_NOMEM_BITMAP; goto force_diskless_dec; @@ -1504,7 +1509,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; - dd = drbd_determin_dev_size(mdev); + dd = drbd_determin_dev_size(mdev, rs.resize_force); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e3716fadc6a5..f22a5283128a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2870,7 +2870,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) /* Never shrink a device with usable data during connect. But allow online shrinking if we are connected. 
*/ - if (drbd_new_dev_size(mdev, mdev->ldev) < + if (drbd_new_dev_size(mdev, mdev->ldev, 0) < drbd_get_capacity(mdev->this_bdev) && mdev->state.disk >= D_OUTDATED && mdev->state.conn < C_CONNECTED) { @@ -2885,7 +2885,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) #undef min_not_zero if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev); + dd = drbd_determin_dev_size(mdev, 0); put_ldev(mdev); if (dd == dev_size_error) return FALSE; diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index db5721ad50d1..a4d82f895994 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -69,6 +69,7 @@ NL_PACKET(disconnect, 6, ) NL_PACKET(resize, 7, NL_INT64( 29, T_MAY_IGNORE, resize_size) + NL_BIT( 68, T_MAY_IGNORE, resize_force) ) NL_PACKET(syncer_conf, 8, -- cgit v1.2.3 From 2ebccd71a71e6078920bc65b40f120e72b71c2b6 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 12 Jan 2010 10:09:07 +0100 Subject: drbd: The kernel code is now equivalent to out of tree release 8.3.7 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index e84f4733cb55..78962272338a 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.6" +#define REL_VERSION "8.3.7" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 91 -- cgit v1.2.3 From 3ccd4c6167d3b39d52631767ebbf8b5677c5855d Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 12 Jan 2010 02:00:46 -0800 Subject: can: Unify dropping of invalid tx skbs and netdev stats To prevent the CAN drivers from operating on invalid socket buffers, the skbs are now checked and silently dropped at the xmit-function consistently. Also the netdev stats are consistently using the CAN data length code (dlc) for [rx|tx]_bytes now. 
Signed-off-by: Oliver Hartkopp Acked-by: Wolfgang Grandegger Signed-off-by: David S. Miller --- drivers/net/can/at91_can.c | 3 +++ drivers/net/can/bfin_can.c | 3 +++ drivers/net/can/mcp251x.c | 6 +----- drivers/net/can/mscan/mscan.c | 5 +---- drivers/net/can/sja1000/sja1000.c | 3 +++ drivers/net/can/ti_hecc.c | 4 +++- drivers/net/can/usb/ems_usb.c | 3 +++ drivers/net/can/vcan.c | 12 +++++++++--- include/linux/can/dev.h | 15 +++++++++++++++ 9 files changed, 41 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 166cc7e579c0..f7287497ba6e 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -342,6 +342,9 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int mb, prio; u32 reg_mid, reg_mcr; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + mb = get_tx_next_mb(priv); prio = get_tx_next_prio(priv); diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c index 0ec1524523cc..7e1926e79e98 100644 --- a/drivers/net/can/bfin_can.c +++ b/drivers/net/can/bfin_can.c @@ -318,6 +318,9 @@ static int bfin_can_start_xmit(struct sk_buff *skb, struct net_device *dev) u16 val; int i; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + netif_stop_queue(dev); /* fill id */ diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 1a72ca066a17..afa2fa45fed9 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -494,12 +494,8 @@ static netdev_tx_t mcp251x_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (skb->len != sizeof(struct can_frame)) { - dev_err(&spi->dev, "dropping packet - bad length\n"); - dev_kfree_skb(skb); - net->stats.tx_dropped++; + if (can_dropped_invalid_skb(net, skb)) return NETDEV_TX_OK; - } netif_stop_queue(net); priv->tx_skb = skb; diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 
500d18918bd5..40827c128b65 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -204,11 +204,8 @@ static netdev_tx_t mscan_start_xmit(struct sk_buff *skb, struct net_device *dev) int i, rtr, buf_id; u32 can_id; - if (skb->len != sizeof(*frame) || frame->can_dlc > 8) { - kfree_skb(skb); - dev->stats.tx_dropped++; + if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; - } out_8(&regs->cantier, 0); diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 542a4f7255b4..345304d779b9 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -249,6 +249,9 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb, uint8_t dreg; int i; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + netif_stop_queue(dev); fi = dlc = cf->can_dlc; diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 5c993c2da528..7d370e32a7a8 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -477,6 +477,9 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) u32 mbxno, mbx_mask, data; unsigned long flags; + if (can_dropped_invalid_skb(ndev, skb)) + return NETDEV_TX_OK; + mbxno = get_tx_head_mb(priv); mbx_mask = BIT(mbxno); spin_lock_irqsave(&priv->mbx_lock, flags); @@ -491,7 +494,6 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) spin_unlock_irqrestore(&priv->mbx_lock, flags); /* Prepare mailbox for transmission */ - data = min_t(u8, cf->can_dlc, 8); if (cf->can_id & CAN_RTR_FLAG) /* Remote transmission request */ data |= HECC_CANMCF_RTR; data |= get_tx_head_prio(priv) << 8; diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index efbb05c71bf4..ddb17e256656 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -767,6 +767,9 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne size_t size = CPC_HEADER_SIZE + 
CPC_MSG_HEADER_LEN + sizeof(struct cpc_can_msg); + if (can_dropped_invalid_skb(netdev, skb)) + return NETDEV_TX_OK; + /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 80ac56313981..d124d837ae58 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -47,6 +47,7 @@ #include #include #include +#include #include static __initdata const char banner[] = @@ -70,10 +71,11 @@ MODULE_PARM_DESC(echo, "Echo sent frames (for testing). Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { + struct can_frame *cf = (struct can_frame *)skb->data; struct net_device_stats *stats = &dev->stats; stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += cf->can_dlc; skb->protocol = htons(ETH_P_CAN); skb->pkt_type = PACKET_BROADCAST; @@ -85,11 +87,15 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { + struct can_frame *cf = (struct can_frame *)skb->data; struct net_device_stats *stats = &dev->stats; int loop; + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + stats->tx_packets++; - stats->tx_bytes += skb->len; + stats->tx_bytes += cf->can_dlc; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; @@ -103,7 +109,7 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) * CAN core already did the echo for us */ stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += cf->can_dlc; } kfree_skb(skb); return NETDEV_TX_OK; diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 3db7767d2a17..7e7c98a3e908 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -60,6 +60,21 @@ struct can_priv { */ #define get_can_dlc(i) (min_t(__u8, (i), 8)) +/* Drop a given socketbuffer if it does not contain 
a valid CAN frame. */ +static inline int can_dropped_invalid_skb(struct net_device *dev, + struct sk_buff *skb) +{ + const struct can_frame *cf = (struct can_frame *)skb->data; + + if (unlikely(skb->len != sizeof(*cf) || cf->can_dlc > 8)) { + kfree_skb(skb); + dev->stats.tx_dropped++; + return 1; + } + + return 0; +} + struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max); void free_candev(struct net_device *dev); -- cgit v1.2.3 From 81077e82c3f591578625805dd6464a27a9ff56ec Mon Sep 17 00:00:00 2001 From: Lukáš Turek <8an@praha12.net> Date: Mon, 21 Dec 2009 22:50:47 +0100 Subject: nl80211: Add new WIPHY attribute COVERAGE_CLASS The new attribute NL80211_ATTR_WIPHY_COVERAGE_CLASS sets IEEE 802.11 Coverage Class, which depends on maximum distance of nodes in a wireless network. It's required for long distance links (more than a few hundred meters). The attribute is now ignored by two non-mac80211 drivers, rndis and iwmc3200wifi, together with WIPHY_PARAM_RETRY_SHORT and WIPHY_PARAM_RETRY_LONG. If it turns out to be a problem, we could split set_wiphy_params callback or add new capability bits. Signed-off-by: Lukas Turek <8an@praha12.net> Signed-off-by: John W. 
Linville --- include/linux/nl80211.h | 4 ++++ include/net/cfg80211.h | 2 ++ net/wireless/core.c | 1 + net/wireless/nl80211.c | 15 +++++++++++++++ 4 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 2bfbe88837ef..d4c556de7170 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -430,6 +430,8 @@ enum nl80211_commands { * @NL80211_ATTR_WIPHY_RTS_THRESHOLD: RTS threshold (TX frames with length * larger than or equal to this use RTS/CTS handshake); allowed range: * 0..65536, disable with (u32)-1; dot11RTSThreshold; u32 + * @NL80211_ATTR_WIPHY_COVERAGE_CLASS: Coverage Class as defined by IEEE 802.11 + * section 7.3.2.9; dot11CoverageClass; u8 * * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on * @NL80211_ATTR_IFNAME: network interface name @@ -779,6 +781,8 @@ enum nl80211_attrs { NL80211_ATTR_COOKIE, + NL80211_ATTR_WIPHY_COVERAGE_CLASS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index add79930f47d..a8d5d04314b9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -837,6 +837,7 @@ enum wiphy_params_flags { WIPHY_PARAM_RETRY_LONG = 1 << 1, WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, + WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, }; /** @@ -1236,6 +1237,7 @@ struct wiphy { u8 retry_long; u32 frag_threshold; u32 rts_threshold; + u8 coverage_class; char fw_version[ETHTOOL_BUSINFO_LEN]; u32 hw_version; diff --git a/net/wireless/core.c b/net/wireless/core.c index c2a2c563d21a..0a545bb6ed05 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -402,6 +402,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) rdev->wiphy.retry_long = 4; rdev->wiphy.frag_threshold = (u32) -1; rdev->wiphy.rts_threshold = (u32) -1; + rdev->wiphy.coverage_class = 0; return &rdev->wiphy; } diff --git 
a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e3bee3cecdfa..c09fbcd278fb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -69,6 +69,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -444,6 +445,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, dev->wiphy.frag_threshold); NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, dev->wiphy.rts_threshold); + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, + dev->wiphy.coverage_class); NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, dev->wiphy.max_scan_ssids); @@ -684,6 +687,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 changed; u8 retry_short = 0, retry_long = 0; u32 frag_threshold = 0, rts_threshold = 0; + u8 coverage_class = 0; rtnl_lock(); @@ -806,9 +810,16 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) changed |= WIPHY_PARAM_RTS_THRESHOLD; } + if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + coverage_class = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); + changed |= WIPHY_PARAM_COVERAGE_CLASS; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; + u8 old_coverage_class; if (!rdev->ops->set_wiphy_params) { result = -EOPNOTSUPP; @@ -819,6 +830,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) old_retry_long = rdev->wiphy.retry_long; old_frag_threshold = rdev->wiphy.frag_threshold; old_rts_threshold = rdev->wiphy.rts_threshold; + old_coverage_class = rdev->wiphy.coverage_class; if (changed & WIPHY_PARAM_RETRY_SHORT) rdev->wiphy.retry_short = 
retry_short; @@ -828,6 +840,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.frag_threshold = frag_threshold; if (changed & WIPHY_PARAM_RTS_THRESHOLD) rdev->wiphy.rts_threshold = rts_threshold; + if (changed & WIPHY_PARAM_COVERAGE_CLASS) + rdev->wiphy.coverage_class = coverage_class; result = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); if (result) { @@ -835,6 +849,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.retry_long = old_retry_long; rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; + rdev->wiphy.coverage_class = old_coverage_class; } } -- cgit v1.2.3 From 13ae75b103e07304a34ab40c9136e9f53e06475c Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 29 Dec 2009 12:59:45 +0200 Subject: nl80211: New command for setting TX rate mask for rate control Add a new NL80211_CMD_SET_TX_BITRATE_MASK command and related attributes to provide support for setting TX rate mask for rate control. This uses the existing cfg80211 set_bitrate_mask operation that was previously used only with WEXT compat code (SIOCSIWRATE). The nl80211 command allows more generic configuration of allowed rates as a mask instead of fixed/max rate. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 44 +++++++++++++++++++ include/net/cfg80211.h | 4 +- net/wireless/nl80211.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index d4c556de7170..7a1c8c145b22 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -295,6 +295,10 @@ * This command is also used as an event to notify when a requested * remain-on-channel duration has expired. * + * @NL80211_CMD_SET_TX_BITRATE_MASK: Set the mask of rates to be used in TX + * rate selection. 
%NL80211_ATTR_IFINDEX is used to specify the interface + * and @NL80211_ATTR_TX_RATES the set of allowed rates. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -381,6 +385,8 @@ enum nl80211_commands { NL80211_CMD_REMAIN_ON_CHANNEL, NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + NL80211_CMD_SET_TX_BITRATE_MASK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -640,6 +646,13 @@ enum nl80211_commands { * * @NL80211_ATTR_COOKIE: Generic 64-bit cookie to identify objects. * + * @NL80211_ATTR_TX_RATES: Nested set of attributes + * (enum nl80211_tx_rate_attributes) describing TX rates per band. The + * enum nl80211_band value is used as the index (nla_type() of the nested + * data. If a band is not included, it will be configured to allow all + * rates based on negotiated supported rates information. This attribute + * is used with %NL80211_CMD_SET_TX_BITRATE_MASK. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -783,6 +796,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_COVERAGE_CLASS, + NL80211_ATTR_TX_RATES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -1482,4 +1497,33 @@ enum nl80211_key_attributes { NL80211_KEY_MAX = __NL80211_KEY_AFTER_LAST - 1 }; +/** + * enum nl80211_tx_rate_attributes - TX rate set attributes + * @__NL80211_TXRATE_INVALID: invalid + * @NL80211_TXRATE_LEGACY: Legacy (non-MCS) rates allowed for TX rate selection + * in an array of rates as defined in IEEE 802.11 7.3.2.2 (u8 values with + * 1 = 500 kbps) but without the IE length restriction (at most + * %NL80211_MAX_SUPP_RATES in a single array). 
+ * @__NL80211_TXRATE_AFTER_LAST: internal + * @NL80211_TXRATE_MAX: highest TX rate attribute + */ +enum nl80211_tx_rate_attributes { + __NL80211_TXRATE_INVALID, + NL80211_TXRATE_LEGACY, + + /* keep last */ + __NL80211_TXRATE_AFTER_LAST, + NL80211_TXRATE_MAX = __NL80211_TXRATE_AFTER_LAST - 1 +}; + +/** + * enum nl80211_band - Frequency band + * @NL80211_BAND_2GHZ - 2.4 GHz ISM band + * @NL80211_BAND_5GHZ - around 5 GHz band (4.9 - 5.7 GHz) + */ +enum nl80211_band { + NL80211_BAND_2GHZ, + NL80211_BAND_5GHZ, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 22e062afb5a1..0d734413b5fb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -39,8 +39,8 @@ * @IEEE80211_BAND_5GHZ: around 5GHz band (4.9-5.7) */ enum ieee80211_band { - IEEE80211_BAND_2GHZ, - IEEE80211_BAND_5GHZ, + IEEE80211_BAND_2GHZ = NL80211_BAND_2GHZ, + IEEE80211_BAND_5GHZ = NL80211_BAND_5GHZ, /* keep last */ IEEE80211_NUM_BANDS diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c09fbcd278fb..b804062e0179 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -144,6 +144,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { .len = WLAN_PMKID_LEN }, [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, + [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, }; /* policy for the attributes */ @@ -575,6 +576,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(del_pmksa, DEL_PMKSA); CMD(flush_pmksa, FLUSH_PMKSA); CMD(remain_on_channel, REMAIN_ON_CHANNEL); + CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -4438,6 +4440,109 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, return err; } +static u32 rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len) +{ + u8 i; + u32 mask 
= 0; + + for (i = 0; i < rates_len; i++) { + int rate = (rates[i] & 0x7f) * 5; + int ridx; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = + &sband->bitrates[ridx]; + if (rate == srate->bitrate) { + mask |= 1 << ridx; + break; + } + } + if (ridx == sband->n_bitrates) + return 0; /* rate not found */ + } + + return mask; +} + +static struct nla_policy +nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] __read_mostly = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, +}; + +static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[NL80211_TXRATE_MAX + 1]; + struct cfg80211_registered_device *rdev; + struct cfg80211_bitrate_mask mask; + int err, rem, i; + struct net_device *dev; + struct nlattr *tx_rates; + struct ieee80211_supported_band *sband; + + if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + if (!rdev->ops->set_bitrate_mask) { + err = -EOPNOTSUPP; + goto unlock; + } + + memset(&mask, 0, sizeof(mask)); + /* Default to all rates enabled */ + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = rdev->wiphy.bands[i]; + mask.control[i].legacy = + sband ? (1 << sband->n_bitrates) - 1 : 0; + } + + /* + * The nested attribute uses enum nl80211_band as the index. This maps + * directly to the enum ieee80211_band values used in cfg80211. 
+ */ + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) + { + enum ieee80211_band band = nla_type(tx_rates); + if (band < 0 || band >= IEEE80211_NUM_BANDS) { + err = -EINVAL; + goto unlock; + } + sband = rdev->wiphy.bands[band]; + if (sband == NULL) { + err = -EINVAL; + goto unlock; + } + nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (tb[NL80211_TXRATE_LEGACY]) { + mask.control[band].legacy = rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_LEGACY]), + nla_len(tb[NL80211_TXRATE_LEGACY])); + if (mask.control[band].legacy == 0) { + err = -EINVAL; + goto unlock; + } + } + } + + err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); + + unlock: + dev_put(dev); + cfg80211_unlock_rdev(rdev); + unlock_rtnl: + rtnl_unlock(); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -4712,6 +4817,12 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, + .doit = nl80211_set_tx_bitrate_mask, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { -- cgit v1.2.3 From 7044cc565b45a898c140fb185174a66f2d68a163 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 5 Jan 2010 20:16:19 +0200 Subject: mac80211: add functions to create PS Poll and Nullfunc templates Some hardware, for example wl1251 and wl1271, handle the transmission of power save related frames in hardware, but the driver is responsible for creating the templates. It's better to create the templates in mac80211, that way all drivers can benefit from this. Add two new functions, ieee80211_pspoll_get() and ieee80211_nullfunc_get() which drivers need to call to get the frame. Drivers are also responsible for updating the templates after each association. 
Also new struct ieee80211_hdr_3addr is added to ieee80211.h to make it easy to calculate length of the Nullfunc frame. Signed-off-by: Kalle Valo Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 9 ++++++ include/net/mac80211.h | 30 ++++++++++++++++++ net/mac80211/tx.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index aeea282bd2fe..602c0692c3fc 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -130,6 +130,15 @@ struct ieee80211_hdr { u8 addr4[6]; } __attribute__ ((packed)); +struct ieee80211_hdr_3addr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[6]; + u8 addr2[6]; + u8 addr3[6]; + __le16 seq_ctrl; +} __attribute__ ((packed)); + /** * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set * @fc: frame control bytes in little-endian byteorder diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7e5af6d90b93..75f46e26ad60 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1874,6 +1874,36 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, return ieee80211_beacon_get_tim(hw, vif, NULL, NULL); } +/** + * ieee80211_pspoll_get - retrieve a PS Poll template + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Creates a PS Poll a template which can, for example, uploaded to + * hardware. The template must be updated after association so that correct + * AID, BSSID and MAC address is used. + * + * Note: Caller (or hardware) is responsible for setting the + * &IEEE80211_FCTL_PM bit. + */ +struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); + +/** + * ieee80211_nullfunc_get - retrieve a nullfunc template + * @hw: pointer obtained from ieee80211_alloc_hw(). 
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Creates a Nullfunc template which can, for example, uploaded to + * hardware. The template must be updated after association so that correct + * BSSID and address is used. + * + * Note: Caller (or hardware) is responsible for setting the + * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. + */ +struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); + /** * ieee80211_rts_get - RTS frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d3a44812f8bf..055b45b146d9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2200,6 +2200,84 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_beacon_get_tim); +struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_pspoll *pspoll; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for " + "pspoll template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + pspoll = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*pspoll)); + memset(pspoll, 0, sizeof(*pspoll)); + pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | + IEEE80211_STYPE_PSPOLL); + pspoll->aid = cpu_to_le16(ifmgd->aid); + + /* aid in PS-Poll has its two MSBs each set to 1 */ + pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); + + memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); + memcpy(pspoll->ta, vif->addr, ETH_ALEN); + + return skb; +} 
+EXPORT_SYMBOL(ieee80211_pspoll_get); + +struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_hdr_3addr *nullfunc; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " + "template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr_3addr *) skb_put(skb, + sizeof(*nullfunc)); + memset(nullfunc, 0, sizeof(*nullfunc)); + nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | + IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_TODS); + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); + memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); + memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); + + return skb; +} +EXPORT_SYMBOL(ieee80211_nullfunc_get); + void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, -- cgit v1.2.3 From 34a6eddbabd704b3c7dae9362234552267573be2 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 6 Jan 2010 16:19:24 +0200 Subject: cfg80211: Store IEs from both Beacon and Probe Response frames Store information elements from Beacon and Probe Response frames in separate buffers to allow both sets to be made available through nl80211. This allows user space applications to get access to IEs from Beacon frames even if we have received Probe Response frames from the BSS. Previously, the IEs from Probe Response frames would have overridden the IEs from Beacon frames. 
This feature is of somewhat limited use since most protocols include the same (or extended) information in Probe Response frames. However, there are a couple of exceptions where the IEs from Beacon frames could be of some use: TIM IE is only included in Beacon frames (and it would be needed to figure out the DTIM period used in the BSS) and at least some implementations of Wireless Provisioning Services seem to include the full IE only in Beacon frames. The new BSS attribute for scan results is added to allow both the IE sets to be delivered. This is done in a way that maintains the previously used behavior for applications that are not aware of the new NL80211_BSS_BEACON_IES attribute. Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 10 +++- include/net/cfg80211.h | 12 ++++- net/wireless/core.h | 3 +- net/wireless/nl80211.c | 4 ++ net/wireless/scan.c | 120 ++++++++++++++++++++++++++++++++++++------------ 5 files changed, 116 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 7a1c8c145b22..127a73015760 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -1378,13 +1378,20 @@ enum nl80211_channel_type { * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the - * raw information elements from the probe response/beacon (bin) + * raw information elements from the probe response/beacon (bin); + * if the %NL80211_BSS_BEACON_IES attribute is present, the IEs here are + * from a Probe Response frame; otherwise they are from a Beacon frame. + * However, if the driver does not indicate the source of the IEs, these + * IEs may be from either frame subtype.
* @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon * in mBm (100 * dBm) (s32) * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon * in unspecified units, scaled to 0..100 (u8) * @NL80211_BSS_STATUS: status, if this BSS is "used" * @NL80211_BSS_SEEN_MS_AGO: age of this BSS entry in ms + * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information + * elements from a Beacon frame (bin); not present if no Beacon frame has + * yet been received * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -1400,6 +1407,7 @@ enum nl80211_bss { NL80211_BSS_SIGNAL_UNSPEC, NL80211_BSS_STATUS, NL80211_BSS_SEEN_MS_AGO, + NL80211_BSS_BEACON_IES, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0d734413b5fb..2af52704e670 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -626,8 +626,14 @@ enum cfg80211_signal_type { * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order * @information_elements: the information elements (Note that there - * is no guarantee that these are well-formed!) 
+ * is no guarantee that these are well-formed!); this is a pointer to + * either the beacon_ies or proberesp_ies depending on whether Probe + * Response frame has been received * @len_information_elements: total length of the information elements + * @beacon_ies: the information elements from the last Beacon frame + * @len_beacon_ies: total length of the beacon_ies + * @proberesp_ies: the information elements from the last Probe Response frame + * @len_proberesp_ies: total length of the proberesp_ies * @signal: signal strength value (type depends on the wiphy's signal_type) * @free_priv: function pointer to free private data * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes @@ -641,6 +647,10 @@ struct cfg80211_bss { u16 capability; u8 *information_elements; size_t len_information_elements; + u8 *beacon_ies; + size_t len_beacon_ies; + u8 *proberesp_ies; + size_t len_proberesp_ies; s32 signal; diff --git a/net/wireless/core.h b/net/wireless/core.h index 30ec95f05b52..2d6a6b9c0c43 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -111,7 +111,8 @@ struct cfg80211_internal_bss { unsigned long ts; struct kref ref; atomic_t hold; - bool ies_allocated; + bool beacon_ies_allocated; + bool proberesp_ies_allocated; /* must be last because of priv member */ struct cfg80211_bss pub; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b804062e0179..4af7991a9ec8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3163,6 +3163,10 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS, res->len_information_elements, res->information_elements); + if (res->beacon_ies && res->len_beacon_ies && + res->beacon_ies != res->information_elements) + NLA_PUT(msg, NL80211_BSS_BEACON_IES, + res->len_beacon_ies, res->beacon_ies); if (res->tsf) NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf); if (res->beacon_interval) diff --git a/net/wireless/scan.c 
b/net/wireless/scan.c index 0c2cbbebca95..06b0231ee5e3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -100,8 +100,10 @@ static void bss_release(struct kref *ref) if (bss->pub.free_priv) bss->pub.free_priv(&bss->pub); - if (bss->ies_allocated) - kfree(bss->pub.information_elements); + if (bss->beacon_ies_allocated) + kfree(bss->pub.beacon_ies); + if (bss->proberesp_ies_allocated) + kfree(bss->pub.proberesp_ies); BUG_ON(atomic_read(&bss->hold)); @@ -375,8 +377,7 @@ rb_find_bss(struct cfg80211_registered_device *dev, static struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *dev, - struct cfg80211_internal_bss *res, - bool overwrite) + struct cfg80211_internal_bss *res) { struct cfg80211_internal_bss *found = NULL; const u8 *meshid, *meshcfg; @@ -418,28 +419,64 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, found->pub.capability = res->pub.capability; found->ts = res->ts; - /* overwrite IEs */ - if (overwrite) { + /* Update IEs */ + if (res->pub.proberesp_ies) { size_t used = dev->wiphy.bss_priv_size + sizeof(*res); - size_t ielen = res->pub.len_information_elements; + size_t ielen = res->pub.len_proberesp_ies; + + if (found->pub.proberesp_ies && + !found->proberesp_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.proberesp_ies, + res->pub.proberesp_ies, ielen); + found->pub.len_proberesp_ies = ielen; + } else { + u8 *ies = found->pub.proberesp_ies; + + if (found->proberesp_ies_allocated) + ies = krealloc(ies, ielen, GFP_ATOMIC); + else + ies = kmalloc(ielen, GFP_ATOMIC); + + if (ies) { + memcpy(ies, res->pub.proberesp_ies, + ielen); + found->proberesp_ies_allocated = true; + found->pub.proberesp_ies = ies; + found->pub.len_proberesp_ies = ielen; + } + } - if (!found->ies_allocated && ksize(found) >= used + ielen) { - memcpy(found->pub.information_elements, - res->pub.information_elements, ielen); - found->pub.len_information_elements = ielen; + /* Override possible earlier Beacon 
frame IEs */ + found->pub.information_elements = + found->pub.proberesp_ies; + found->pub.len_information_elements = + found->pub.len_proberesp_ies; + } + if (res->pub.beacon_ies) { + size_t used = dev->wiphy.bss_priv_size + sizeof(*res); + size_t ielen = res->pub.len_beacon_ies; + + if (found->pub.beacon_ies && + !found->beacon_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.beacon_ies, + res->pub.beacon_ies, ielen); + found->pub.len_beacon_ies = ielen; } else { - u8 *ies = found->pub.information_elements; + u8 *ies = found->pub.beacon_ies; - if (found->ies_allocated) + if (found->beacon_ies_allocated) ies = krealloc(ies, ielen, GFP_ATOMIC); else ies = kmalloc(ielen, GFP_ATOMIC); if (ies) { - memcpy(ies, res->pub.information_elements, ielen); - found->ies_allocated = true; - found->pub.information_elements = ies; - found->pub.len_information_elements = ielen; + memcpy(ies, res->pub.beacon_ies, + ielen); + found->beacon_ies_allocated = true; + found->pub.beacon_ies = ies; + found->pub.len_beacon_ies = ielen; } } } @@ -489,14 +526,26 @@ cfg80211_inform_bss(struct wiphy *wiphy, res->pub.tsf = timestamp; res->pub.beacon_interval = beacon_interval; res->pub.capability = capability; - /* point to after the private area */ - res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; - memcpy(res->pub.information_elements, ie, ielen); - res->pub.len_information_elements = ielen; + /* + * Since we do not know here whether the IEs are from a Beacon or Probe + * Response frame, we need to pick one of the options and only use it + * with the driver that does not provide the full Beacon/Probe Response + * frame. Use Beacon frame pointer to avoid indicating that this should + * override the information_elements pointer should we have received an + * earlier indication of Probe Response data. + * + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. 
+ */ + res->pub.beacon_ies = (u8 *)res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, ie, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; kref_init(&res->ref); - res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, 0); + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); if (!res) return NULL; @@ -517,7 +566,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, struct cfg80211_internal_bss *res; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); - bool overwrite; size_t privsz = wiphy->bss_priv_size; if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC && @@ -538,16 +586,28 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); - /* point to after the private area */ - res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; - memcpy(res->pub.information_elements, mgmt->u.probe_resp.variable, ielen); - res->pub.len_information_elements = ielen; + /* + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. 
+ */ + if (ieee80211_is_probe_resp(mgmt->frame_control)) { + res->pub.proberesp_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.proberesp_ies, mgmt->u.probe_resp.variable, + ielen); + res->pub.len_proberesp_ies = ielen; + res->pub.information_elements = res->pub.proberesp_ies; + res->pub.len_information_elements = res->pub.len_proberesp_ies; + } else { + res->pub.beacon_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, mgmt->u.beacon.variable, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; + } kref_init(&res->ref); - overwrite = ieee80211_is_probe_resp(mgmt->frame_control); - - res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, overwrite); + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); if (!res) return NULL; -- cgit v1.2.3 From ab13315af97919fae0e014748105fdc2e30afb2d Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 12 Jan 2010 10:42:31 +0200 Subject: mac80211: add U-APSD client support Add Unscheduled Automatic Power-Save Delivery (U-APSD) client support. The idea is that the data frames from the client trigger AP to send the buffered frames with ACs which have U-APSD enabled. This decreases latency and makes it possible to save even more power. Driver needs to use IEEE80211_HW_SUPPORTS_UAPSD to enable the feature. The current implementation assumes that firmware takes care of the wakeup and hardware needing IEEE80211_HW_PS_NULLFUNC_STACK is not yet supported. Tested with wl1251 on a Nokia N900 and Cisco Aironet 1231G AP and running various test traffic with ping. Signed-off-by: Kalle Valo Signed-off-by: John W.
Linville --- include/linux/ieee80211.h | 18 ++++++++++++++++++ include/net/mac80211.h | 7 +++++++ net/mac80211/cfg.c | 7 +++++++ net/mac80211/ieee80211_i.h | 13 ++++++++++++- net/mac80211/main.c | 4 ++++ net/mac80211/mlme.c | 31 ++++++++++++++++++++++++++++--- net/mac80211/scan.c | 18 ++++++++++++++++++ net/mac80211/util.c | 2 ++ net/mac80211/work.c | 12 ++++++++++-- 9 files changed, 106 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 602c0692c3fc..a8c6069a0d9f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -120,6 +120,24 @@ #define IEEE80211_QOS_CTL_TID_MASK 0x000F #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 +/* U-APSD queue for WMM IEs sent by AP */ +#define IEEE80211_WMM_IE_AP_QOSINFO_UAPSD (1<<7) + +/* U-APSD queues for WMM IEs sent by STA */ +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VO (1<<0) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VI (1<<1) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BK (1<<2) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BE (1<<3) +#define IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK 0x0f + +/* U-APSD max SP length for WMM IEs sent by STA */ +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL 0x00 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_2 0x01 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_4 0x02 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_6 0x03 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 +#define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 + struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e1e73c6abeff..f313a3cbabda 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -113,6 +113,7 @@ struct ieee80211_tx_queue_params { u16 cw_min; u16 cw_max; u8 aifs; + bool uapsd; }; /** @@ -929,6 +930,11 @@ enum ieee80211_tkip_key_type { * Hardware supports dynamic spatial multiplexing powersave, * ie. 
can turn off all but one chain and then wake the rest * up as required after, for example, rts/cts handshake. + * + * @IEEE80211_HW_SUPPORTS_UAPSD: + * Hardware supports Unscheduled Automatic Power Save Delivery + * (U-APSD) in managed mode. The mode is configured with + * conf_tx() operation. */ enum ieee80211_hw_flags { IEEE80211_HW_HAS_RATE_CONTROL = 1<<0, @@ -948,6 +954,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_BEACON_FILTER = 1<<14, IEEE80211_HW_SUPPORTS_STATIC_SMPS = 1<<15, IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS = 1<<16, + IEEE80211_HW_SUPPORTS_UAPSD = 1<<17, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index dc12e9466ffd..8286df5822d5 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1128,6 +1128,13 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, p.cw_max = params->cwmax; p.cw_min = params->cwmin; p.txop = params->txop; + + /* + * Setting tx queue params disables u-apsd because it's only + * called in master mode. + */ + p.uapsd = false; + if (drv_conf_tx(local, params->queue, &p)) { printk(KERN_DEBUG "%s: failed to set TX queue " "parameters for queue %d\n", diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3e4ac3f30857..3468e378509a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -58,6 +58,15 @@ struct ieee80211_local; #define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) +#define IEEE80211_DEFAULT_UAPSD_QUEUES \ + (IEEE80211_WMM_IE_STA_QOSINFO_AC_BK | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_BE | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VI | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + +#define IEEE80211_DEFAULT_MAX_SP_LEN \ + IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL + struct ieee80211_fragment_entry { unsigned long first_frag_time; unsigned int seq; @@ -78,6 +87,7 @@ struct ieee80211_bss { u8 dtim_period; bool wmm_used; + bool uapsd_supported; unsigned long last_probe_resp; @@ -285,7 +295,7 @@ struct ieee80211_work { u8 ssid[IEEE80211_MAX_SSID_LEN]; u8 ssid_len; u8 
supp_rates_len; - bool wmm_used, use_11n; + bool wmm_used, use_11n, uapsd_used; } assoc; struct { u32 duration; @@ -306,6 +316,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_11N = BIT(4), IEEE80211_STA_CSA_RECEIVED = BIT(5), IEEE80211_STA_MFP_ENABLED = BIT(6), + IEEE80211_STA_UAPSD_ENABLED = BIT(7), }; struct ieee80211_if_managed { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 468829143991..0054bba08ce1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -491,6 +491,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + WARN((local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) + && (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + "U-APSD not supported with HW_PS_NULLFUNC_STACK\n"); + /* * Calculate scan IE length -- we need this to alloc * memory and to subtract from the driver limit. It diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 86f025bc9456..39c27d83a4f2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -569,7 +569,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_tx_queue_params params; size_t left; int count; - u8 *pos; + u8 *pos, uapsd_queues = 0; if (local->hw.queues < 4) return; @@ -579,6 +579,10 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return; + + if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) + uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; + count = wmm_param[6] & 0x0f; if (count == ifmgd->wmm_last_param_set) return; @@ -593,6 +597,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; + bool uapsd = false; int queue; switch (aci) { @@ -600,22 +605,30 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, queue = 
3; if (acm) local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) + uapsd = true; break; case 2: /* AC_VI */ queue = 1; if (acm) local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) + uapsd = true; break; case 3: /* AC_VO */ queue = 0; if (acm) local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + uapsd = true; break; case 0: /* AC_BE */ default: queue = 2; if (acm) local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) + uapsd = true; break; } @@ -623,11 +636,14 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params.cw_min = ecw2cw(pos[1] & 0x0f); params.txop = get_unaligned_le16(pos + 2); + params.uapsd = uapsd; + #ifdef CONFIG_MAC80211_VERBOSE_DEBUG printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d " - "cWmin=%d cWmax=%d txop=%d\n", + "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", wiphy_name(local->hw.wiphy), queue, aci, acm, - params.aifs, params.cw_min, params.cw_max, params.txop); + params.aifs, params.cw_min, params.cw_max, params.txop, + params.uapsd); #endif if (drv_conf_tx(local, queue, &params) && local->ops->conf_tx) printk(KERN_DEBUG "%s: failed to set TX queue " @@ -1906,6 +1922,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, wk->assoc.ht_information_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_INFORMATION); + if (bss->wmm_used && bss->uapsd_supported && + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { + wk->assoc.uapsd_used = true; + ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; + } else { + wk->assoc.uapsd_used = false; + ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED; + } + ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); memcpy(wk->assoc.ssid, ssid + 2, ssid[1]); wk->assoc.ssid_len = ssid[1]; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 30cb62bb45b3..9afe2f9885dc 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -54,6 +54,23 @@ void ieee80211_rx_bss_put(struct ieee80211_local *local, cfg80211_put_bss(container_of((void *)bss, struct cfg80211_bss, priv)); } +static bool is_uapsd_supported(struct ieee802_11_elems *elems) +{ + u8 qos_info; + + if (elems->wmm_info && elems->wmm_info_len == 7 + && elems->wmm_info[5] == 1) + qos_info = elems->wmm_info[6]; + else if (elems->wmm_param && elems->wmm_param_len == 24 + && elems->wmm_param[5] == 1) + qos_info = elems->wmm_param[6]; + else + /* no valid wmm information or parameter element found */ + return false; + + return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; +} + struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, @@ -117,6 +134,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, } bss->wmm_used = elems->wmm_param || elems->wmm_info; + bss->uapsd_supported = is_uapsd_supported(elems); if (!beacon) bss->last_probe_resp = jiffies; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index a2ba6e29bd9a..e278f97c8305 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -792,6 +792,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata) break; } + qparam.uapsd = false; + drv_conf_tx(local, queue, &qparam); } } diff --git a/net/mac80211/work.c b/net/mac80211/work.c index 7c5d95b1bc04..a74fd6ee0083 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -202,7 +202,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - u8 *pos; + u8 *pos, qos_info; const u8 *ies; size_t offset = 0, noffset; int i, len, count, rates_len, supp_rates_len; @@ -375,6 +375,14 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } if (wk->assoc.wmm_used && local->hw.queues >= 4) { + if (wk->assoc.uapsd_used) { + 
qos_info = IEEE80211_DEFAULT_UAPSD_QUEUES; + qos_info |= (IEEE80211_DEFAULT_MAX_SP_LEN << + IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); + } else { + qos_info = 0; + } + pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; *pos++ = 7; /* len */ @@ -384,7 +392,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, *pos++ = 2; /* WME */ *pos++ = 0; /* WME info */ *pos++ = 1; /* WME ver */ - *pos++ = 0; + *pos++ = qos_info; } /* add any remaining custom (i.e. vendor specific here) IEs */ -- cgit v1.2.3 From 558a6669d7cb407fbb0b5aec184b5c3b9a893d30 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 12 Jan 2010 10:43:00 +0200 Subject: ieee80211: add struct ieee80211_hdr_qos The header can be used to create qos nullfunc frames, for example. Signed-off-by: Kalle Valo Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a8c6069a0d9f..842701906ae9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -157,6 +157,16 @@ struct ieee80211_hdr_3addr { __le16 seq_ctrl; } __attribute__ ((packed)); +struct ieee80211_qos_hdr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[6]; + u8 addr2[6]; + u8 addr3[6]; + __le16 seq_ctrl; + __le16 qos_ctrl; +} __attribute__ ((packed)); + /** * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set * @fc: frame control bytes in little-endian byteorder -- cgit v1.2.3 From 5040ab67a2c6d5710ba497dc52a8f7035729d7b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Jan 2010 11:14:44 +0900 Subject: libata: retry link resume if necessary Interestingly, when SIDPR is used in ata_piix, writes to DET in SControl sometimes get ignored leading to detection failure. Update sata_link_resume() such that it reads back SControl after clearing DET and retry if it's not clear. 
Signed-off-by: Tejun Heo Reported-by: fengxiangjun Reported-by: Jim Faulkner Cc: stable@kernel.org Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 38 +++++++++++++++++++++++++++++++------- include/linux/libata.h | 3 +++ 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 22ff51bdbc8a..6728328f3bea 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3790,21 +3790,45 @@ int sata_link_debounce(struct ata_link *link, const unsigned long *params, int sata_link_resume(struct ata_link *link, const unsigned long *params, unsigned long deadline) { + int tries = ATA_LINK_RESUME_TRIES; u32 scontrol, serror; int rc; if ((rc = sata_scr_read(link, SCR_CONTROL, &scontrol))) return rc; - scontrol = (scontrol & 0x0f0) | 0x300; + /* + * Writes to SControl sometimes get ignored under certain + * controllers (ata_piix SIDPR). Make sure DET actually is + * cleared. + */ + do { + scontrol = (scontrol & 0x0f0) | 0x300; + if ((rc = sata_scr_write(link, SCR_CONTROL, scontrol))) + return rc; + /* + * Some PHYs react badly if SStatus is pounded + * immediately after resuming. Delay 200ms before + * debouncing. + */ + msleep(200); - if ((rc = sata_scr_write(link, SCR_CONTROL, scontrol))) - return rc; + /* is SControl restored correctly? */ + if ((rc = sata_scr_read(link, SCR_CONTROL, &scontrol))) + return rc; + } while ((scontrol & 0xf0f) != 0x300 && --tries); - /* Some PHYs react badly if SStatus is pounded immediately - * after resuming. Delay 200ms before debouncing. 
- */ - msleep(200); + if ((scontrol & 0xf0f) != 0x300) { + ata_link_printk(link, KERN_ERR, + "failed to resume link (SControl %X)\n", + scontrol); + return 0; + } + + if (tries < ATA_LINK_RESUME_TRIES) + ata_link_printk(link, KERN_WARNING, + "link resume succeeded after %d retries\n", + ATA_LINK_RESUME_TRIES - tries); if ((rc = sata_link_debounce(link, params, deadline))) return rc; diff --git a/include/linux/libata.h b/include/linux/libata.h index 6a9c4ddd3d95..73112250862c 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -354,6 +354,9 @@ enum { /* max tries if error condition is still set after ->error_handler */ ATA_EH_MAX_TRIES = 5, + /* sometimes resuming a link requires several retries */ + ATA_LINK_RESUME_TRIES = 5, + /* how hard are we gonna try to probe/recover devices */ ATA_PROBE_MAX_TRIES = 3, ATA_EH_DEV_TRIES = 3, -- cgit v1.2.3 From 2c761270d5520dd84ab0b4e47c24d99ff8503c38 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Jan 2010 17:39:16 +1100 Subject: lib: Introduce generic list_sort function There are two copies of list_sort() in the tree already, one in the DRM code, another in ubifs. Now XFS needs this as well. Create a generic list_sort() function from the ubifs version and convert existing users to it so we don't end up with yet another copy in the tree. 
Signed-off-by: Dave Chinner Acked-by: Dave Airlie Acked-by: Artem Bityutskiy Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_modes.c | 90 ++------------------------------------ fs/ubifs/gc.c | 96 +---------------------------------------- include/linux/list_sort.h | 11 +++++ lib/Makefile | 2 +- lib/list_sort.c | 102 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 182 deletions(-) create mode 100644 include/linux/list_sort.h create mode 100644 lib/list_sort.c (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 6d81a02463a3..76d63394c776 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -1,9 +1,4 @@ /* - * The list_sort function is (presumably) licensed under the GPL (see the - * top level "COPYING" file for details). - * - * The remainder of this file is: - * * Copyright © 1997-2003 by The XFree86 Project, Inc. * Copyright © 2007 Dave Airlie * Copyright © 2007-2008 Intel Corporation @@ -36,6 +31,7 @@ */ #include +#include #include "drmP.h" #include "drm.h" #include "drm_crtc.h" @@ -855,6 +851,7 @@ EXPORT_SYMBOL(drm_mode_prune_invalid); /** * drm_mode_compare - compare modes for favorability + * @priv: unused * @lh_a: list_head for first mode * @lh_b: list_head for second mode * @@ -868,7 +865,7 @@ EXPORT_SYMBOL(drm_mode_prune_invalid); * Negative if @lh_a is better than @lh_b, zero if they're equivalent, or * positive if @lh_b is better than @lh_a. */ -static int drm_mode_compare(struct list_head *lh_a, struct list_head *lh_b) +static int drm_mode_compare(void *priv, struct list_head *lh_a, struct list_head *lh_b) { struct drm_display_mode *a = list_entry(lh_a, struct drm_display_mode, head); struct drm_display_mode *b = list_entry(lh_b, struct drm_display_mode, head); @@ -885,85 +882,6 @@ static int drm_mode_compare(struct list_head *lh_a, struct list_head *lh_b) return diff; } -/* FIXME: what we don't have a list sort function? 
*/ -/* list sort from Mark J Roberts (mjr@znex.org) */ -void list_sort(struct list_head *head, - int (*cmp)(struct list_head *a, struct list_head *b)) -{ - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; - - list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; - } - p = q; - } - - tail->next = list; - list->prev = tail; - - if (nmerges <= 1) - break; - - insize *= 2; - } - - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; -} - /** * drm_mode_sort - sort mode list * @mode_list: list to sort @@ -975,7 +893,7 @@ void list_sort(struct list_head *head, */ void drm_mode_sort(struct list_head *mode_list) { - list_sort(mode_list, drm_mode_compare); + list_sort(NULL, mode_list, drm_mode_compare); } EXPORT_SYMBOL(drm_mode_sort); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 618c2701d3a7..e5a3d8e96bb7 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -54,6 +54,7 @@ */ #include +#include #include "ubifs.h" /* @@ -107,101 +108,6 @@ static int switch_gc_head(struct ubifs_info *c) return err; } -/** - * list_sort - sort a list. 
- * @priv: private data, passed to @cmp - * @head: the list to sort - * @cmp: the elements comparison function - * - * This function has been implemented by Mark J Roberts . It - * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted - * in ascending order. - * - * The comparison function @cmp is supposed to return a negative value if @a is - * than @b, and a positive value if @a is greater than @b. If @a and @b are - * equivalent, then it does not matter what this function returns. - */ -static void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) -{ - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; - - if (list_empty(head)) - return; - - list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(priv, p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; - } - p = q; - } - - tail->next = list; - list->prev = tail; - - if (nmerges <= 1) - break; - - insize *= 2; - } - - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; -} - /** * data_nodes_cmp - compare 2 data nodes. 
* @priv: UBIFS file-system description object diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h new file mode 100644 index 000000000000..1a2df2efb771 --- /dev/null +++ b/include/linux/list_sort.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_LIST_SORT_H +#define _LINUX_LIST_SORT_H + +#include + +struct list_head; + +void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)); +#endif diff --git a/lib/Makefile b/lib/Makefile index 911b25aed1e7..3b0b4a696db9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o + string_helpers.o gcd.o list_sort.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/list_sort.c b/lib/list_sort.c new file mode 100644 index 000000000000..19d11e0bb958 --- /dev/null +++ b/lib/list_sort.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include + +/** + * list_sort - sort a list. + * @priv: private data, passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function + * + * This function has been implemented by Mark J Roberts . It + * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted + * in ascending order. + * + * The comparison function @cmp is supposed to return a negative value if @a is + * less than @b, and a positive value if @a is greater than @b. If @a and @b + * are equivalent, then it does not matter what this function returns. 
+ */ +void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) +{ + struct list_head *p, *q, *e, *list, *tail, *oldhead; + int insize, nmerges, psize, qsize, i; + + if (list_empty(head)) + return; + + list = head->next; + list_del(head); + insize = 1; + for (;;) { + p = oldhead = list; + list = tail = NULL; + nmerges = 0; + + while (p) { + nmerges++; + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next == oldhead ? NULL : q->next; + if (!q) + break; + } + + qsize = insize; + while (psize > 0 || (qsize > 0 && q)) { + if (!psize) { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } else if (!qsize || !q) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else if (cmp(priv, p, q) <= 0) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } + if (tail) + tail->next = e; + else + list = e; + e->prev = tail; + tail = e; + } + p = q; + } + + tail->next = list; + list->prev = tail; + + if (nmerges <= 1) + break; + + insize *= 2; + } + + head->next = list; + head->prev = list->prev; + list->prev->next = head; + list->prev = head; +} + +EXPORT_SYMBOL(list_sort); -- cgit v1.2.3 From bf66f18e79e34c421bbd8f6511e2c556b779df2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Jan 2010 15:09:10 -0800 Subject: rcu: Add force_quiescent_state() testing to rcutorture Add force_quiescent_state() testing to rcutorture, with a separate thread that repeatedly invokes force_quiescent_state() in bursts. This can greatly increase the probability of encountering certain types of race conditions. Suggested-by: Josh Triplett Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1262646551116-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutiny.h | 12 ++++++++ include/linux/rcutree.h | 3 ++ kernel/rcutorture.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcutree.c | 18 +++++++++++ kernel/rcutree_plugin.h | 19 ++++++++++++ 5 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 96cc307ed9f4..2b70d4e37383 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -62,6 +62,18 @@ static inline long rcu_batches_completed_bh(void) extern int rcu_expedited_torture_stats(char *page); +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + #define synchronize_rcu synchronize_sched static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 8044b1b94333..704a010f686c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -99,6 +99,9 @@ extern void rcu_check_callbacks(int cpu, int user); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); +extern void rcu_force_quiescent_state(void); +extern void rcu_bh_force_quiescent_state(void); +extern void rcu_sched_force_quiescent_state(void); #ifdef CONFIG_NO_HZ void rcu_enter_nohz(void); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..adda92bfafac 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle 
CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_stutter = 3; /* Wait time between bursts (s). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -79,6 +82,12 @@ module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(irqreader, int, 0444); MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; +static struct task_struct *fqs_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -263,6 +273,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); + void (*fqs)(void); int (*stats)(char *page); int irq_capable; char *name; @@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu" @@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .deferred_free = 
rcu_sync_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_sync" @@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_expedited" @@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .deferred_free = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh" @@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh_sync" @@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "sched" @@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .name = "sched_sync" }; @@ -650,11 +668,44 @@ static struct rcu_torture_ops sched_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = rcu_expedited_torture_stats, .irq_capable = 1, .name = "sched_expedited" }; +/* + * RCU torture force-quiescent-state kthread. 
Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. + */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (jiffies - fqs_resume_time > LONG_MAX) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -1030,10 +1081,11 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); } static struct notifier_block rcutorture_nb = { @@ -1109,6 +1161,12 @@ rcu_torture_cleanup(void) } stats_task = NULL; + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + /* Wait for all RCU callbacks to fire. 
*/ if (cur_ops->cb_barrier != NULL) @@ -1154,6 +1212,11 @@ rcu_torture_init(void) mutex_unlock(&fullstop_mutex); return -EINVAL; } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " + "fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1282,6 +1345,19 @@ rcu_torture_init(void) goto unwind; } } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } register_reboot_notifier(&rcutorture_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 55e8f6ef8195..0a4c32879398 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -156,6 +156,24 @@ long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Force a quiescent state for RCU BH. + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + /* * Does the CPU have callbacks ready to be invoked? */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..f11ebd44b454 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -61,6 +61,15 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); +/* + * Force a quiescent state for preemptible RCU. 
+ */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Record a preemptable-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -712,6 +721,16 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); +/* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From f6a8c60960bbea378142d1fa1b3d111555ee41c7 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sun, 29 Nov 2009 15:23:51 +0000 Subject: mtd: Really add ARM pismo support (Commit 7cb777a3d71f9d1f7eb149c7a504d21f24219ae8 (mtd: add ARM pismo support) intended to add this, but seems only to have patched the Makefile without touching Kconfig or providing any code...) The following patch adds support for PISMO modules found on ARM Ltd development platforms. These are MTD modules, and can have a selection of SRAM, flash or DOC devices as described by an on-board I2C EEPROM. We support SRAM and NOR flash devices only by registering appropriate conventional MTD platform devices as children of the 'pismo' device. 
Signed-off-by: Russell King Signed-off-by: David Woodhouse --- drivers/mtd/maps/Kconfig | 17 +++ drivers/mtd/maps/pismo.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/pismo.h | 17 +++ 3 files changed, 354 insertions(+) create mode 100644 drivers/mtd/maps/pismo.c create mode 100644 include/linux/mtd/pismo.h (limited to 'include/linux') diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 4c364d44ad59..2de0cc823d60 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -549,4 +549,21 @@ config MTD_VMU To build this as a module select M here, the module will be called vmu-flash. +config MTD_PISMO + tristate "MTD discovery driver for PISMO modules" + depends on I2C + depends on ARCH_VERSATILE + help + This driver allows for discovery of PISMO modules - see + <http://www.pismoworld.org/>. These are small modules containing + up to five memory devices (eg, SRAM, flash, DOC) described by an + I2C EEPROM. + + This driver does not create any MTD maps itself; instead it + creates MTD physmap and MTD SRAM platform devices. If you + enable this option, you should consider enabling MTD_PHYSMAP + and/or MTD_PLATRAM according to the devices on your module. + + When built as a module, it will be called pismo.ko + endmenu diff --git a/drivers/mtd/maps/pismo.c b/drivers/mtd/maps/pismo.c new file mode 100644 index 000000000000..c48cad271f5d --- /dev/null +++ b/drivers/mtd/maps/pismo.c @@ -0,0 +1,320 @@ +/* + * PISMO memory driver - http://www.pismoworld.org/ + * + * For ARM Realview and Versatile platforms + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PISMO_NUM_CS 5 + +struct pismo_cs_block { + u8 type; + u8 width; + __le16 access; + __le32 size; + u32 reserved[2]; + char device[32]; +} __packed; + +struct pismo_eeprom { + struct pismo_cs_block cs[PISMO_NUM_CS]; + char board[15]; + u8 sum; +} __packed; + +struct pismo_mem { + phys_addr_t base; + u32 size; + u16 access; + u8 width; + u8 type; +}; + +struct pismo_data { + struct i2c_client *client; + void (*vpp)(void *, int); + void *vpp_data; + struct platform_device *dev[PISMO_NUM_CS]; +}; + +/* FIXME: set_vpp could do with a better calling convention */ +static struct pismo_data *vpp_pismo; +static DEFINE_MUTEX(pismo_mutex); + +static int pismo_setvpp_probe_fix(struct pismo_data *pismo) +{ + mutex_lock(&pismo_mutex); + if (vpp_pismo) { + mutex_unlock(&pismo_mutex); + kfree(pismo); + return -EBUSY; + } + vpp_pismo = pismo; + mutex_unlock(&pismo_mutex); + return 0; +} + +static void pismo_setvpp_remove_fix(struct pismo_data *pismo) +{ + mutex_lock(&pismo_mutex); + if (vpp_pismo == pismo) + vpp_pismo = NULL; + mutex_unlock(&pismo_mutex); +} + +static void pismo_set_vpp(struct map_info *map, int on) +{ + struct pismo_data *pismo = vpp_pismo; + + pismo->vpp(pismo->vpp_data, on); +} +/* end of hack */ + + +static unsigned int __devinit pismo_width_to_bytes(unsigned int width) +{ + width &= 15; + if (width > 2) + return 0; + return 1 << width; +} + +static int __devinit pismo_eeprom_read(struct i2c_client *client, void *buf, + u8 addr, size_t size) +{ + int ret; + struct i2c_msg msg[] = { + { + .addr = client->addr, + .len = sizeof(addr), + .buf = &addr, + }, { + .addr = client->addr, + .flags = I2C_M_RD, + .len = size, + .buf = buf, + }, + }; + + ret = i2c_transfer(client->adapter, msg, ARRAY_SIZE(msg)); + + return ret == ARRAY_SIZE(msg) ? 
size : -EIO; +} + +static int __devinit pismo_add_device(struct pismo_data *pismo, int i, + struct pismo_mem *region, const char *name, void *pdata, size_t psize) +{ + struct platform_device *dev; + struct resource res = { }; + phys_addr_t base = region.base; + int ret; + + if (base == ~0) + return -ENXIO; + + res.start = base; + res.end = base + region->size - 1; + res.flags = IORESOURCE_MEM; + + dev = platform_device_alloc(name, i); + if (!dev) + return -ENOMEM; + dev->dev.parent = &pismo->client->dev; + + do { + ret = platform_device_add_resources(dev, &res, 1); + if (ret) + break; + + ret = platform_device_add_data(dev, pdata, psize); + if (ret) + break; + + ret = platform_device_add(dev); + if (ret) + break; + + pismo->dev[i] = dev; + return 0; + } while (0); + + platform_device_put(dev); + return ret; +} + +static int __devinit pismo_add_nor(struct pismo_data *pismo, int i, + struct pismo_mem *region) +{ + struct physmap_flash_data data = { + .width = region->width, + }; + + if (pismo->vpp) + data.set_vpp = pismo_set_vpp; + + return pismo_add_device(pismo, i, region, "physmap-flash", + &data, sizeof(data)); +} + +static int __devinit pismo_add_sram(struct pismo_data *pismo, int i, + struct pismo_mem *region) +{ + struct platdata_mtd_ram data = { + .bankwidth = region->width, + }; + + return pismo_add_device(pismo, i, region, "mtd-ram", + &data, sizeof(data)); +} + +static void __devinit pismo_add_one(struct pismo_data *pismo, int i, + const struct pismo_cs_block *cs, phys_addr_t base) +{ + struct device *dev = &pismo->client->dev; + struct pismo_mem region; + + region.base = base; + region.type = cs->type; + region.width = pismo_width_to_bytes(cs->width); + region.access = le16_to_cpu(cs->access); + region.size = le32_to_cpu(cs->size); + + if (region.width == 0) { + dev_err(dev, "cs%u: bad width: %02x, ignoring\n", i, cs->width); + return; + } + + /* + * FIXME: may need to the platforms memory controller here, but at + * the moment we assume that it has 
already been correctly setup. + * The memory controller can also tell us the base address as well. + */ + + dev_info(dev, "cs%u: %.32s: type %02x access %u00ps size %uK\n", + i, cs->device, region.type, region.access, region.size / 1024); + + switch (region.type) { + case 0: + break; + case 1: + /* static DOC */ + break; + case 2: + /* static NOR */ + pismo_add_nor(pismo, i, &region); + break; + case 3: + /* static RAM */ + pismo_add_sram(pismo, i, &region); + break; + } +} + +static int __devexit pismo_remove(struct i2c_client *client) +{ + struct pismo_data *pismo = i2c_get_clientdata(client); + int i; + + for (i = 0; i < ARRAY_SIZE(pismo->dev); i++) + platform_device_unregister(pismo->dev[i]); + + /* FIXME: set_vpp needs saner arguments */ + pismo_setvpp_remove_fix(pismo); + + kfree(pismo); + + return 0; +} + +static int __devinit pismo_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + struct pismo_pdata *pdata = client->dev.platform_data; + struct pismo_eeprom eeprom; + struct pismo_data *pismo; + int ret, i; + + if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, "functionality mismatch\n"); + return -EIO; + } + + pismo = kzalloc(sizeof(*pismo), GFP_KERNEL); + if (!pismo) + return -ENOMEM; + + /* FIXME: set_vpp needs saner arguments */ + ret = pismo_setvpp_probe_fix(pismo); + if (ret) + return ret; + + pismo->client = client; + if (pdata) { + pismo->vpp = pdata->set_vpp; + pismo->vpp_data = pdata->vpp_data; + } + i2c_set_clientdata(client, pismo); + + ret = pismo_eeprom_read(client, &eeprom, 0, sizeof(eeprom)); + if (ret < 0) { + dev_err(&client->dev, "error reading EEPROM: %d\n", ret); + return ret; + } + + dev_info(&client->dev, "%.15s board found\n", eeprom.board); + + for (i = 0; i < ARRAY_SIZE(eeprom.cs); i++) + if (eeprom.cs[i].type != 0xff) + pismo_add_one(pismo, i, &eeprom.cs[i], + pdata->cs_addrs[i]); + + return 0; +} + +static const struct
i2c_device_id pismo_id[] = { + { "pismo" }, + { }, +}; +MODULE_DEVICE_TABLE(i2c, pismo_id); + +static struct i2c_driver pismo_driver = { + .driver = { + .name = "pismo", + .owner = THIS_MODULE, + }, + .probe = pismo_probe, + .remove = __devexit_p(pismo_remove), + .id_table = pismo_id, +}; + +static int __init pismo_init(void) +{ + BUILD_BUG_ON(sizeof(struct pismo_cs_block) != 48); + BUILD_BUG_ON(sizeof(struct pismo_eeprom) != 256); + + return i2c_add_driver(&pismo_driver); +} +module_init(pismo_init); + +static void __exit pismo_exit(void) +{ + i2c_del_driver(&pismo_driver); +} +module_exit(pismo_exit); + +MODULE_AUTHOR("Russell King "); +MODULE_DESCRIPTION("PISMO memory driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mtd/pismo.h b/include/linux/mtd/pismo.h new file mode 100644 index 000000000000..8dfb7e1421c5 --- /dev/null +++ b/include/linux/mtd/pismo.h @@ -0,0 +1,17 @@ +/* + * PISMO memory driver - http://www.pismoworld.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. 
+ */ +#ifndef __LINUX_MTD_PISMO_H +#define __LINUX_MTD_PISMO_H + +struct pismo_pdata { + void (*set_vpp)(void *, int); + void *vpp_data; + phys_addr_t cs_addrs[5]; +}; + +#endif -- cgit v1.2.3 From 9ca94d7c016130f9ed77f142424ace9c19742809 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Mon, 11 Jan 2010 21:21:06 +0100 Subject: plist: Fix grammar mistake, and c-style mistake Signed-off-by: John Kacur Acked-by: Peter Zijlstra LKML-Reference: <1263241267-25204-2-git-send-email-jkacur@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/plist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/plist.h b/include/linux/plist.h index 8227f717c70f..6898985e7b38 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -45,7 +45,7 @@ * the insertion of new nodes. There are no nodes with duplicate * priorites on the list. * - * The nodes on the node_list is ordered by priority and can contain + * The nodes on the node_list are ordered by priority and can contain * entries which have the same priority. Those entries are ordered * FIFO * @@ -265,7 +265,7 @@ static inline int plist_node_empty(const struct plist_node *node) * * Assumes the plist is _not_ empty. */ -static inline struct plist_node* plist_first(const struct plist_head *head) +static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, struct plist_node, plist.node_list); -- cgit v1.2.3 From 599faa0e264fe2e7f563f87b4aad8c83e9dc46d1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 5 Jan 2010 13:29:58 +0000 Subject: genirq: Fix documentation of default chip disable() The documentation says that by default disable() will be chip->mask but in fact default_disable() is a noop. 
Signed-off-by: Mark Brown LKML-Reference: <1262698198-30392-1-git-send-email-broonie@opensource.wolfsonmicro.com> Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 451481c082b5..d13492df57a1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -90,7 +90,7 @@ struct msi_desc; * @startup: start up the interrupt (defaults to ->enable if NULL) * @shutdown: shut down the interrupt (defaults to ->disable if NULL) * @enable: enable the interrupt (defaults to chip->unmask if NULL) - * @disable: disable the interrupt (defaults to chip->mask if NULL) + * @disable: disable the interrupt * @ack: start of a new interrupt * @mask: mask an interrupt source * @mask_ack: ack and mask an interrupt source -- cgit v1.2.3 From cd8c20b650f49354722b8cc1f03320b004815a0a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 13 Jan 2010 16:02:14 +0100 Subject: netfilter: nfnetlink: netns support Make nfnl socket per-netns.
Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink.h | 8 ++--- include/net/net_namespace.h | 2 ++ net/netfilter/nf_conntrack_netlink.c | 13 ++++---- net/netfilter/nfnetlink.c | 65 +++++++++++++++++++++++------------- net/netfilter/nfnetlink_log.c | 3 +- net/netfilter/nfnetlink_queue.c | 2 +- 6 files changed, 58 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 49d321f3ccd2..53923868c9bd 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -73,11 +73,11 @@ struct nfnetlink_subsystem { extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); -extern int nfnetlink_has_listeners(unsigned int group); -extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, +extern int nfnetlink_has_listeners(struct net *net, unsigned int group); +extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo, gfp_t flags); -extern void nfnetlink_set_err(u32 pid, u32 group, int error); -extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); +extern void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); +extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); extern void nfnl_lock(void); extern void nfnl_unlock(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f307e133d14c..82b7be4db89a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -81,6 +81,8 @@ struct net { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif + struct sock *nfnl; + struct sock *nfnl_stash; #endif #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/net/netfilter/nf_conntrack_netlink.c 
b/net/netfilter/nf_conntrack_netlink.c index 59d8064eb522..d4c5d06677f9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -482,7 +482,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) } else return 0; - if (!item->report && !nfnetlink_has_listeners(group)) + if (!item->report && !nfnetlink_has_listeners(&init_net, group)) return 0; skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); @@ -559,7 +559,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC); + err = nfnetlink_send(skb, &init_net, item->pid, group, item->report, + GFP_ATOMIC); if (err == -ENOBUFS || err == -EAGAIN) return -ENOBUFS; @@ -571,7 +572,7 @@ nla_put_failure: nlmsg_failure: kfree_skb(skb); errout: - nfnetlink_set_err(0, group, -ENOBUFS); + nfnetlink_set_err(&init_net, 0, group, -ENOBUFS); return 0; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ @@ -1539,7 +1540,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) return 0; if (!item->report && - !nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) + !nfnetlink_has_listeners(&init_net, NFNLGRP_CONNTRACK_EXP_NEW)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -1562,7 +1563,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, + nfnetlink_send(skb, &init_net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, item->report, GFP_ATOMIC); return 0; @@ -1572,7 +1573,7 @@ nla_put_failure: nlmsg_failure: kfree_skb(skb); errout: - nfnetlink_set_err(0, 0, -ENOBUFS); + nfnetlink_set_err(&init_net, 0, 0, -ENOBUFS); return 0; } #endif diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index eedc0c1ac7a4..8eb0cc23ada3 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -40,7 
+40,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -static struct sock *nfnl = NULL; static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; static DEFINE_MUTEX(nfnl_mutex); @@ -101,34 +100,35 @@ nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) return &ss->cb[cb_id]; } -int nfnetlink_has_listeners(unsigned int group) +int nfnetlink_has_listeners(struct net *net, unsigned int group) { - return netlink_has_listeners(nfnl, group); + return netlink_has_listeners(net->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, u32 pid, +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo, gfp_t flags) { - return nlmsg_notify(nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -void nfnetlink_set_err(u32 pid, u32 group, int error) +void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) { - netlink_set_err(nfnl, pid, group, error); + netlink_set_err(net->nfnl, pid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) { - return netlink_unicast(nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, pid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. 
*/ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; const struct nfnetlink_subsystem *ss; int type, err; @@ -170,7 +170,7 @@ replay: if (err < 0) return err; - err = nc->call(nfnl, skb, nlh, (const struct nlattr **)cda); + err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda); if (err == -EAGAIN) goto replay; return err; @@ -184,26 +184,45 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfnl_unlock(); } -static void __exit nfnetlink_exit(void) +static int __net_init nfnetlink_net_init(struct net *net) { - printk("Removing netfilter NETLINK layer.\n"); - netlink_kernel_release(nfnl); - return; + struct sock *nfnl; + + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, NULL, THIS_MODULE); + if (!nfnl) + return -ENOMEM; + net->nfnl_stash = nfnl; + rcu_assign_pointer(net->nfnl, nfnl); + return 0; } -static int __init nfnetlink_init(void) +static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { - printk("Netfilter messages via NETLINK v%s.\n", nfversion); + struct net *net; - nfnl = netlink_kernel_create(&init_net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); - if (!nfnl) { - printk(KERN_ERR "cannot initialize nfnetlink!\n"); - return -ENOMEM; - } + list_for_each_entry(net, net_exit_list, exit_list) + rcu_assign_pointer(net->nfnl, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->nfnl_stash); +} - return 0; +static struct pernet_operations nfnetlink_net_ops = { + .init = nfnetlink_net_init, + .exit_batch = nfnetlink_net_exit_batch, +}; + +static int __init nfnetlink_init(void) +{ + printk("Netfilter messages via NETLINK v%s.\n", nfversion); + return register_pernet_subsys(&nfnetlink_net_ops); } +static void __exit nfnetlink_exit(void) +{ + printk("Removing netfilter NETLINK layer.\n"); + 
unregister_pernet_subsys(&nfnetlink_net_ops); +} module_init(nfnetlink_init); module_exit(nfnetlink_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 9de0470d557e..285e9029a9ff 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -323,7 +323,8 @@ __nfulnl_send(struct nfulnl_instance *inst) NLMSG_DONE, sizeof(struct nfgenmsg)); - status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); + status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, + MSG_DONTWAIT); inst->qlen = 0; inst->skb = NULL; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7e3fa410641e..5c589b27d6eb 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -420,7 +420,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) } /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; -- cgit v1.2.3 From 508e14b4a4fb1a824a14f2c5b8d7df67b313f8e4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2010 14:27:30 +0000 Subject: netpoll: allow execution of multiple rx_hooks per interface Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 11 +++- net/core/netpoll.c | 169 ++++++++++++++++++++++++++++++------------------ 2 files changed, 114 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2524267210d3..a765ea898549 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -21,15 +21,20 @@ struct netpoll { __be32 local_ip, remote_ip; u16 local_port, remote_port; u8 remote_mac[ETH_ALEN]; + + struct list_head rx; /* rx_np list element */ }; struct netpoll_info { atomic_t refcnt; + int rx_flags; spinlock_t rx_lock; - struct netpoll *rx_np; /* netpoll that registered an rx_hook */ + struct list_head rx_np; /* netpolls that registered an rx_hook */ + struct sk_buff_head arp_tx; /* list of arp requests to reply to */ struct sk_buff_head txq; + struct delayed_work tx_work; }; @@ -51,7 +56,7 @@ static inline int netpoll_rx(struct sk_buff *skb) unsigned long flags; int ret = 0; - if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) + if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags)) return 0; spin_lock_irqsave(&npinfo->rx_lock, flags); @@ -67,7 +72,7 @@ static inline int netpoll_rx_on(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; - return npinfo && (npinfo->rx_np || npinfo->rx_flags); + return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); } static inline int netpoll_receive_skb(struct sk_buff *skb) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 0b4d0d35ef40..7aa697253765 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -407,11 +407,24 @@ static void arp_reply(struct sk_buff *skb) __be32 sip, tip; unsigned char *sha; struct sk_buff *send_skb; - struct netpoll *np = NULL; + struct netpoll *np, *tmp; + unsigned long flags; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this is interesting at all */ + 
spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); - if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) - np = npinfo->rx_np; - if (!np) + /* No netpoll struct is using this dev */ + if (!hits) return; /* No arp on this interface */ @@ -437,77 +450,91 @@ static void arp_reply(struct sk_buff *skb) arp_ptr += skb->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - /* if we actually cared about dst hw addr, it would get copied here */ + /* If we actually cared about dst hw addr, + it would get copied here */ arp_ptr += skb->dev->addr_len; memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? */ - if (tip != np->local_ip || - ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; size = arp_hdr_len(skb->dev); - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); - if (!send_skb) - return; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - return; - } + send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + if (!send_skb) + continue; - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should always equal 1 (Ethernet). 
- */ + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } - arp_ptr=(unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). + */ - netpoll_send_skb(np, send_skb); + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); } int __netpoll_rx(struct sk_buff *skb) { int proto, len, ulen; + int hits = 0; struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npi = skb->dev->npinfo; - struct netpoll *np = npi->rx_np; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; - if (!np) + if (list_empty(&npinfo->rx_np)) goto out; + if (skb->dev->type != ARPHRD_ETHER) goto out; /* check if 
netpoll clients need ARP */ if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npi->arp_tx, skb); + skb_queue_tail(&npinfo->arp_tx, skb); return 1; } @@ -551,16 +578,23 @@ int __netpoll_rx(struct sk_buff *skb) goto out; if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; - if (np->local_ip && np->local_ip != iph->daddr) - goto out; - if (np->remote_ip && np->remote_ip != iph->saddr) - goto out; - if (np->local_port && np->local_port != ntohs(uh->dest)) - goto out; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; kfree_skb(skb); return 1; @@ -684,6 +718,7 @@ int netpoll_setup(struct netpoll *np) struct net_device *ndev = NULL; struct in_device *in_dev; struct netpoll_info *npinfo; + struct netpoll *npe, *tmp; unsigned long flags; int err; @@ -704,7 +739,7 @@ int netpoll_setup(struct netpoll *np) } npinfo->rx_flags = 0; - npinfo->rx_np = NULL; + INIT_LIST_HEAD(&npinfo->rx_np); spin_lock_init(&npinfo->rx_lock); skb_queue_head_init(&npinfo->arp_tx); @@ -785,7 +820,7 @@ int netpoll_setup(struct netpoll *np) if (np->rx_hook) { spin_lock_irqsave(&npinfo->rx_lock, flags); npinfo->rx_flags |= NETPOLL_RX_ENABLED; - npinfo->rx_np = np; + list_add_tail(&np->rx, &npinfo->rx_np); spin_unlock_irqrestore(&npinfo->rx_lock, flags); } @@ -801,9 +836,16 @@ int netpoll_setup(struct netpoll *np) return 0; release: - if (!ndev->npinfo) + if (!ndev->npinfo) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) { + npe->dev = NULL; + } + 
spin_unlock_irqrestore(&npinfo->rx_lock, flags); + kfree(npinfo); - np->dev = NULL; + } + dev_put(ndev); return err; } @@ -823,10 +865,11 @@ void netpoll_cleanup(struct netpoll *np) if (np->dev) { npinfo = np->dev->npinfo; if (npinfo) { - if (npinfo->rx_np == np) { + if (!list_empty(&npinfo->rx_np)) { spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_np = NULL; - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -- cgit v1.2.3 From 9a58a80a701bdb2d220cdab4914218df5b48d781 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 14 Jan 2010 03:10:54 -0800 Subject: proc_fops: convert drivers/isdn/ to seq_file Convert code away from ->read_proc/->write_proc interfaces. Switch to proc_create()/proc_create_data() which make addition of proc entries reliable wrt NULL ->proc_fops, NULL ->data and so on. Problem with ->read_proc et al is described here commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba "Fix rmmod/read/write races in /proc entries" [akpm@linux-foundation.org: CONFIG_PROC_FS=n build fix] Signed-off-by: Alexey Dobriyan Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- Documentation/isdn/INTERFACE.CAPI | 9 +- drivers/isdn/capi/capi.c | 99 ++++++---------- drivers/isdn/capi/capidrv.c | 55 +++------ drivers/isdn/capi/kcapi.c | 8 +- drivers/isdn/gigaset/capi.c | 75 ++++++------ drivers/isdn/hardware/avm/avmcard.h | 6 +- drivers/isdn/hardware/avm/b1.c | 54 +++++---- drivers/isdn/hardware/avm/b1dma.c | 71 ++++++------ drivers/isdn/hardware/avm/b1isa.c | 2 +- drivers/isdn/hardware/avm/b1pci.c | 4 +- drivers/isdn/hardware/avm/b1pcmcia.c | 2 +- drivers/isdn/hardware/avm/c4.c | 53 +++++---- drivers/isdn/hardware/avm/t1isa.c | 2 +- drivers/isdn/hardware/avm/t1pci.c | 2 +- drivers/isdn/hardware/eicon/capimain.c | 40 ++++--- drivers/isdn/hardware/eicon/diva_didd.c | 45 ++++---- drivers/isdn/hardware/eicon/divasi.c | 48 ++++---- drivers/isdn/hardware/eicon/divasproc.c | 198 ++++++++++++++------------------ drivers/isdn/hysdn/hycapi.c | 56 ++++----- include/linux/isdn/capilli.h | 3 +- net/bluetooth/cmtp/capi.c | 37 +++--- 21 files changed, 411 insertions(+), 458 deletions(-) (limited to 'include/linux') diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 5fe8de5cc727..f172091fb7cd 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI @@ -149,10 +149,11 @@ char *(*procinfo)(struct capi_ctr *ctrlr) pointer to a callback function returning the entry for the device in the CAPI controller info table, /proc/capi/controller -read_proc_t *ctr_read_proc - pointer to the read_proc callback function for the device's proc file - system entry, /proc/capi/controllers/; will be called with a - pointer to the device's capi_ctr structure as the last (data) argument +const struct file_operations *proc_fops + pointers to callback functions for the device's proc file + system entry, /proc/capi/controllers/; pointer to the device's + capi_ctr structure is available from struct proc_dir_entry::data + which is available from struct inode. 
Note: Callback functions except send_message() are never called in interrupt context. diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 65bf91e16a42..79f9364aded6 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -33,6 +33,7 @@ #endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */ #include #include +#include #include #include #include @@ -1407,114 +1408,84 @@ static void capinc_tty_exit(void) * /proc/capi/capi20: * minor applid nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt */ -static int proc_capidev_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capi20_proc_show(struct seq_file *m, void *v) { struct capidev *cdev; struct list_head *l; - int len = 0; read_lock(&capidev_list_lock); list_for_each(l, &capidev_list) { cdev = list_entry(l, struct capidev, list); - len += sprintf(page+len, "0 %d %lu %lu %lu %lu\n", + seq_printf(m, "0 %d %lu %lu %lu %lu\n", cdev->ap.applid, cdev->ap.nrecvctlpkt, cdev->ap.nrecvdatapkt, cdev->ap.nsentctlpkt, cdev->ap.nsentdatapkt); - if (len <= off) { - off -= len; - len = 0; - } else { - if (len-off > count) - goto endloop; - } } - -endloop: read_unlock(&capidev_list_lock); - if (len < count) - *eof = 1; - if (len > count) len = count; - if (len < 0) len = 0; - return len; + return 0; } +static int capi20_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capi20_proc_show, NULL); +} + +static const struct file_operations capi20_proc_fops = { + .owner = THIS_MODULE, + .open = capi20_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * /proc/capi/capi20ncci: * applid ncci */ -static int proc_capincci_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capi20ncci_proc_show(struct seq_file *m, void *v) { struct capidev *cdev; struct capincci *np; struct list_head *l; - int len = 0; read_lock(&capidev_list_lock); list_for_each(l, &capidev_list) { cdev = 
list_entry(l, struct capidev, list); for (np=cdev->nccis; np; np = np->next) { - len += sprintf(page+len, "%d 0x%x\n", + seq_printf(m, "%d 0x%x\n", cdev->ap.applid, np->ncci); - if (len <= off) { - off -= len; - len = 0; - } else { - if (len-off > count) - goto endloop; - } } } -endloop: read_unlock(&capidev_list_lock); - *start = page+off; - if (len < count) - *eof = 1; - if (len>count) len = count; - if (len<0) len = 0; - return len; + return 0; } -static struct procfsentries { - char *name; - mode_t mode; - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); - struct proc_dir_entry *procent; -} procfsentries[] = { - /* { "capi", S_IFDIR, 0 }, */ - { "capi/capi20", 0 , proc_capidev_read_proc }, - { "capi/capi20ncci", 0 , proc_capincci_read_proc }, +static int capi20ncci_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capi20ncci_proc_show, NULL); +} + +static const struct file_operations capi20ncci_proc_fops = { + .owner = THIS_MODULE, + .open = capi20ncci_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static void __init proc_init(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=0; i < nelem; i++) { - struct procfsentries *p = procfsentries + i; - p->procent = create_proc_entry(p->name, p->mode, NULL); - if (p->procent) p->procent->read_proc = p->read_proc; - } + proc_create("capi/capi20", 0, NULL, &capi20_proc_fops); + proc_create("capi/capi20ncci", 0, NULL, &capi20ncci_proc_fops); } static void __exit proc_exit(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=nelem-1; i >= 0; i--) { - struct procfsentries *p = procfsentries + i; - if (p->procent) { - remove_proc_entry(p->name, NULL); - p->procent = NULL; - } - } + remove_proc_entry("capi/capi20", NULL); + remove_proc_entry("capi/capi20ncci", NULL); } /* -------- init function and module interface ---------------------- */ diff --git a/drivers/isdn/capi/capidrv.c 
b/drivers/isdn/capi/capidrv.c index 66b7d7a86474..bb450152fb74 100644 --- a/drivers/isdn/capi/capidrv.c +++ b/drivers/isdn/capi/capidrv.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -2229,59 +2230,37 @@ static void lower_callback(unsigned int cmd, u32 contr, void *data) * /proc/capi/capidrv: * nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt */ -static int proc_capidrv_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int capidrv_proc_show(struct seq_file *m, void *v) { - int len = 0; - - len += sprintf(page+len, "%lu %lu %lu %lu\n", + seq_printf(m, "%lu %lu %lu %lu\n", global.ap.nrecvctlpkt, global.ap.nrecvdatapkt, global.ap.nsentctlpkt, global.ap.nsentdatapkt); - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? count : len-off); + return 0; +} + +static int capidrv_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, capidrv_proc_show, NULL); } -static struct procfsentries { - char *name; - mode_t mode; - int (*read_proc)(char *page, char **start, off_t off, - int count, int *eof, void *data); - struct proc_dir_entry *procent; -} procfsentries[] = { - /* { "capi", S_IFDIR, 0 }, */ - { "capi/capidrv", 0 , proc_capidrv_read_proc }, +static const struct file_operations capidrv_proc_fops = { + .owner = THIS_MODULE, + .open = capidrv_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static void __init proc_init(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=0; i < nelem; i++) { - struct procfsentries *p = procfsentries + i; - p->procent = create_proc_entry(p->name, p->mode, NULL); - if (p->procent) p->procent->read_proc = p->read_proc; - } + proc_create("capi/capidrv", 0, NULL, &capidrv_proc_fops); } static void __exit proc_exit(void) { - int nelem = ARRAY_SIZE(procfsentries); - int i; - - for (i=nelem-1; i >= 0; i--) { - struct 
procfsentries *p = procfsentries + i; - if (p->procent) { - remove_proc_entry(p->name, NULL); - p->procent = NULL; - } - } + remove_proc_entry("capi/capidrv", NULL); } static int __init capidrv_init(void) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index dc506ab99cac..b0bacf377c18 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -490,13 +490,7 @@ attach_capi_ctr(struct capi_ctr *card) card->traceflag = showcapimsgs; sprintf(card->procfn, "capi/controllers/%d", card->cnr); - card->procent = create_proc_entry(card->procfn, 0, NULL); - if (card->procent) { - card->procent->read_proc = - (int (*)(char *,char **,off_t,int,int *,void *)) - card->ctr_read_proc; - card->procent->data = card; - } + card->procent = proc_create_data(card->procfn, 0, NULL, card->proc_fops, card); ncards++; printk(KERN_NOTICE "kcapi: Controller [%03d]: %s attached\n", diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c index 3f5cd06af104..6f0ae32906bf 100644 --- a/drivers/isdn/gigaset/capi.c +++ b/drivers/isdn/gigaset/capi.c @@ -13,6 +13,8 @@ #include "gigaset.h" #include +#include +#include #include #include #include @@ -2106,35 +2108,22 @@ static char *gigaset_procinfo(struct capi_ctr *ctr) return ctr->name; /* ToDo: more? */ } -/** - * gigaset_ctr_read_proc() - build controller proc file entry - * @page: buffer of PAGE_SIZE bytes for receiving the entry. - * @start: unused. - * @off: unused. - * @count: unused. - * @eof: unused. - * @ctr: controller descriptor structure. 
- * - * Return value: length of generated entry - */ -static int gigaset_ctr_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctr) +static int gigaset_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctr = m->private; struct cardstate *cs = ctr->driverdata; char *s; int i; - int len = 0; - len += sprintf(page+len, "%-16s %s\n", "name", ctr->name); - len += sprintf(page+len, "%-16s %s %s\n", "dev", + + seq_printf(m, "%-16s %s\n", "name", ctr->name); + seq_printf(m, "%-16s %s %s\n", "dev", dev_driver_string(cs->dev), dev_name(cs->dev)); - len += sprintf(page+len, "%-16s %d\n", "id", cs->myid); + seq_printf(m, "%-16s %d\n", "id", cs->myid); if (cs->gotfwver) - len += sprintf(page+len, "%-16s %d.%d.%d.%d\n", "firmware", + seq_printf(m, "%-16s %d.%d.%d.%d\n", "firmware", cs->fwver[0], cs->fwver[1], cs->fwver[2], cs->fwver[3]); - len += sprintf(page+len, "%-16s %d\n", "channels", - cs->channels); - len += sprintf(page+len, "%-16s %s\n", "onechannel", - cs->onechannel ? "yes" : "no"); + seq_printf(m, "%-16s %d\n", "channels", cs->channels); + seq_printf(m, "%-16s %s\n", "onechannel", cs->onechannel ? "yes" : "no"); switch (cs->mode) { case M_UNKNOWN: @@ -2152,7 +2141,7 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "%-16s %s\n", "mode", s); + seq_printf(m, "%-16s %s\n", "mode", s); switch (cs->mstate) { case MS_UNINITIALIZED: @@ -2176,25 +2165,21 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "%-16s %s\n", "mstate", s); + seq_printf(m, "%-16s %s\n", "mstate", s); - len += sprintf(page+len, "%-16s %s\n", "running", - cs->running ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "connected", - cs->connected ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "isdn_up", - cs->isdn_up ? "yes" : "no"); - len += sprintf(page+len, "%-16s %s\n", "cidmode", - cs->cidmode ? 
"yes" : "no"); + seq_printf(m, "%-16s %s\n", "running", cs->running ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "connected", cs->connected ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "isdn_up", cs->isdn_up ? "yes" : "no"); + seq_printf(m, "%-16s %s\n", "cidmode", cs->cidmode ? "yes" : "no"); for (i = 0; i < cs->channels; i++) { - len += sprintf(page+len, "[%d]%-13s %d\n", i, "corrupted", + seq_printf(m, "[%d]%-13s %d\n", i, "corrupted", cs->bcs[i].corrupted); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "trans_down", + seq_printf(m, "[%d]%-13s %d\n", i, "trans_down", cs->bcs[i].trans_down); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "trans_up", + seq_printf(m, "[%d]%-13s %d\n", i, "trans_up", cs->bcs[i].trans_up); - len += sprintf(page+len, "[%d]%-13s %d\n", i, "chstate", + seq_printf(m, "[%d]%-13s %d\n", i, "chstate", cs->bcs[i].chstate); switch (cs->bcs[i].proto2) { case L2_BITSYNC: @@ -2209,11 +2194,23 @@ static int gigaset_ctr_read_proc(char *page, char **start, off_t off, default: s = "??"; } - len += sprintf(page+len, "[%d]%-13s %s\n", i, "proto2", s); + seq_printf(m, "[%d]%-13s %s\n", i, "proto2", s); } - return len; + return 0; } +static int gigaset_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, gigaset_proc_show, PDE(inode)->data); +} + +static const struct file_operations gigaset_proc_fops = { + .owner = THIS_MODULE, + .open = gigaset_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static struct capi_driver capi_driver_gigaset = { .name = "gigaset", @@ -2256,7 +2253,7 @@ int gigaset_isdn_register(struct cardstate *cs, const char *isdnid) iif->ctr.release_appl = gigaset_release_appl; iif->ctr.send_message = gigaset_send_message; iif->ctr.procinfo = gigaset_procinfo; - iif->ctr.ctr_read_proc = gigaset_ctr_read_proc; + iif->ctr.proc_fops = &gigaset_proc_fops; INIT_LIST_HEAD(&iif->appls); skb_queue_head_init(&iif->sendqueue); atomic_set(&iif->sendqlen, 0); diff --git 
a/drivers/isdn/hardware/avm/avmcard.h b/drivers/isdn/hardware/avm/avmcard.h index d964f07e4a56..a70e8854461d 100644 --- a/drivers/isdn/hardware/avm/avmcard.h +++ b/drivers/isdn/hardware/avm/avmcard.h @@ -556,8 +556,7 @@ u16 b1_send_message(struct capi_ctr *ctrl, struct sk_buff *skb); void b1_parse_version(avmctrl_info *card); irqreturn_t b1_interrupt(int interrupt, void *devptr); -int b1ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl); +extern const struct file_operations b1ctl_proc_fops; avmcard_dmainfo *avmcard_dma_alloc(char *name, struct pci_dev *, long rsize, long ssize); @@ -577,7 +576,6 @@ void b1dma_register_appl(struct capi_ctr *ctrl, capi_register_params *rp); void b1dma_release_appl(struct capi_ctr *ctrl, u16 appl); u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb); -int b1dmactl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl); +extern const struct file_operations b1dmactl_proc_fops; #endif /* _AVMCARD_H_ */ diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c index a7c0083e78a7..c38fa0f4c729 100644 --- a/drivers/isdn/hardware/avm/b1.c +++ b/drivers/isdn/hardware/avm/b1.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -634,18 +636,17 @@ irqreturn_t b1_interrupt(int interrupt, void *devptr) } /* ------------------------------------------------------------- */ -int b1ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int b1ctl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; - len += sprintf(page+len, "%-16s %s\n", "name", card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s 
%s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -658,20 +659,20 @@ int b1ctl_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if (card->cardtype == avm_t1isa) - len += sprintf(page+len, "%-16s %d\n", "cardnr", card->cardnr); + seq_printf(m, "%-16s %d\n", "cardnr", card->cardnr); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -685,7 +686,7 @@ int b1ctl_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -693,16 +694,25 @@ int b1ctl_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? " leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? 
count : len-off); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); + + return 0; +} + +static int b1ctl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, b1ctl_proc_show, PDE(inode)->data); } +const struct file_operations b1ctl_proc_fops = { + .owner = THIS_MODULE, + .open = b1ctl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(b1ctl_proc_fops); + /* ------------------------------------------------------------- */ #ifdef CONFIG_PCI @@ -781,8 +791,6 @@ EXPORT_SYMBOL(b1_send_message); EXPORT_SYMBOL(b1_parse_version); EXPORT_SYMBOL(b1_interrupt); -EXPORT_SYMBOL(b1ctl_read_proc); - static int __init b1_init(void) { char *p; diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 0e84aaae43fd..124550d0dbf3 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -855,21 +857,20 @@ u16 b1dma_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) /* ------------------------------------------------------------- */ -int b1dmactl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int b1dmactl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; u32 txoff, txlen, rxoff, rxlen, csr; unsigned long flags; - len += sprintf(page+len, "%-16s %s\n", "name", card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); - len += sprintf(page+len, "%-16s 0x%lx\n", "membase", card->membase); + seq_printf(m, "%-16s %s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s 0x%lx\n", 
"membase", card->membase); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -882,18 +883,18 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -907,7 +908,7 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -915,7 +916,7 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? 
" leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); spin_lock_irqsave(&card->lock, flags); @@ -930,27 +931,30 @@ int b1dmactl_read_proc(char *page, char **start, off_t off, spin_unlock_irqrestore(&card->lock, flags); - len += sprintf(page+len, "%-16s 0x%lx\n", - "csr (cached)", (unsigned long)card->csr); - len += sprintf(page+len, "%-16s 0x%lx\n", - "csr", (unsigned long)csr); - len += sprintf(page+len, "%-16s %lu\n", - "txoff", (unsigned long)txoff); - len += sprintf(page+len, "%-16s %lu\n", - "txlen", (unsigned long)txlen); - len += sprintf(page+len, "%-16s %lu\n", - "rxoff", (unsigned long)rxoff); - len += sprintf(page+len, "%-16s %lu\n", - "rxlen", (unsigned long)rxlen); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? count : len-off); + seq_printf(m, "%-16s 0x%lx\n", "csr (cached)", (unsigned long)card->csr); + seq_printf(m, "%-16s 0x%lx\n", "csr", (unsigned long)csr); + seq_printf(m, "%-16s %lu\n", "txoff", (unsigned long)txoff); + seq_printf(m, "%-16s %lu\n", "txlen", (unsigned long)txlen); + seq_printf(m, "%-16s %lu\n", "rxoff", (unsigned long)rxoff); + seq_printf(m, "%-16s %lu\n", "rxlen", (unsigned long)rxlen); + + return 0; +} + +static int b1dmactl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, b1dmactl_proc_show, PDE(inode)->data); } +const struct file_operations b1dmactl_proc_fops = { + .owner = THIS_MODULE, + .open = b1dmactl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(b1dmactl_proc_fops); + /* ------------------------------------------------------------- */ EXPORT_SYMBOL(b1dma_reset); @@ -963,7 +967,6 @@ EXPORT_SYMBOL(b1dma_reset_ctr); EXPORT_SYMBOL(b1dma_register_appl); EXPORT_SYMBOL(b1dma_release_appl); EXPORT_SYMBOL(b1dma_send_message); 
-EXPORT_SYMBOL(b1dmactl_read_proc); static int __init b1dma_init(void) { diff --git a/drivers/isdn/hardware/avm/b1isa.c b/drivers/isdn/hardware/avm/b1isa.c index 6461a32bc838..ff5390546f92 100644 --- a/drivers/isdn/hardware/avm/b1isa.c +++ b/drivers/isdn/hardware/avm/b1isa.c @@ -121,7 +121,7 @@ static int b1isa_probe(struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1isa_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/b1pci.c b/drivers/isdn/hardware/avm/b1pci.c index 5b314a2c4049..c97e4315079d 100644 --- a/drivers/isdn/hardware/avm/b1pci.c +++ b/drivers/isdn/hardware/avm/b1pci.c @@ -112,7 +112,7 @@ static int b1pci_probe(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1pci_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); cinfo->capi_ctrl.owner = THIS_MODULE; @@ -251,7 +251,7 @@ static int b1pciv4_probe(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1dma_load_firmware; cinfo->capi_ctrl.reset_ctr = b1dma_reset_ctr; cinfo->capi_ctrl.procinfo = b1pciv4_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1dmactl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/b1pcmcia.c b/drivers/isdn/hardware/avm/b1pcmcia.c index 7740403b40e1..d6391e0afeea 100644 --- a/drivers/isdn/hardware/avm/b1pcmcia.c +++ b/drivers/isdn/hardware/avm/b1pcmcia.c @@ -108,7 +108,7 @@ static int b1pcmcia_add_card(unsigned int port, 
unsigned irq, cinfo->capi_ctrl.load_firmware = b1_load_firmware; cinfo->capi_ctrl.reset_ctr = b1_reset_ctr; cinfo->capi_ctrl.procinfo = b1pcmcia_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 6833301a45fc..de6e6b311819 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -1062,19 +1064,18 @@ static char *c4_procinfo(struct capi_ctr *ctrl) return cinfo->infobuf; } -static int c4_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int c4_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; avmctrl_info *cinfo = (avmctrl_info *)(ctrl->driverdata); avmcard *card = cinfo->card; u8 flag; - int len = 0; char *s; - len += sprintf(page+len, "%-16s %s\n", "name", card->name); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->port); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); - len += sprintf(page+len, "%-16s 0x%lx\n", "membase", card->membase); + seq_printf(m, "%-16s %s\n", "name", card->name); + seq_printf(m, "%-16s 0x%x\n", "io", card->port); + seq_printf(m, "%-16s %d\n", "irq", card->irq); + seq_printf(m, "%-16s 0x%lx\n", "membase", card->membase); switch (card->cardtype) { case avm_b1isa: s = "B1 ISA"; break; case avm_b1pci: s = "B1 PCI"; break; @@ -1087,18 +1088,18 @@ static int c4_read_proc(char *page, char **start, off_t off, case avm_c2: s = "C2"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if 
((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[3]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s%s%s%s\n", "protocol", (flag & 0x01) ? " DSS1" : "", (flag & 0x02) ? " CT1" : "", @@ -1112,7 +1113,7 @@ static int c4_read_proc(char *page, char **start, off_t off, if (card->cardtype != avm_m1) { flag = ((u8 *)(ctrl->profile.manu))[5]; if (flag) - len += sprintf(page+len, "%-16s%s%s%s%s\n", + seq_printf(m, "%-16s%s%s%s%s\n", "linetype", (flag & 0x01) ? " point to point" : "", (flag & 0x02) ? " point to multipoint" : "", @@ -1120,16 +1121,24 @@ static int c4_read_proc(char *page, char **start, off_t off, (flag & 0x04) ? " leased line with D-channel" : "" ); } - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); - - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? 
count : len-off); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); + + return 0; } +static int c4_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, c4_proc_show, PDE(inode)->data); +} + +static const struct file_operations c4_proc_fops = { + .owner = THIS_MODULE, + .open = c4_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* ------------------------------------------------------------- */ static int c4_add_card(struct capicardparams *p, struct pci_dev *dev, @@ -1201,7 +1210,7 @@ static int c4_add_card(struct capicardparams *p, struct pci_dev *dev, cinfo->capi_ctrl.load_firmware = c4_load_firmware; cinfo->capi_ctrl.reset_ctr = c4_reset_ctr; cinfo->capi_ctrl.procinfo = c4_procinfo; - cinfo->capi_ctrl.ctr_read_proc = c4_read_proc; + cinfo->capi_ctrl.proc_fops = &c4_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c index 1c53fd49adb6..baeeb3c2a3ee 100644 --- a/drivers/isdn/hardware/avm/t1isa.c +++ b/drivers/isdn/hardware/avm/t1isa.c @@ -429,7 +429,7 @@ static int t1isa_probe(struct pci_dev *pdev, int cardnr) cinfo->capi_ctrl.load_firmware = t1isa_load_firmware; cinfo->capi_ctrl.reset_ctr = t1isa_reset_ctr; cinfo->capi_ctrl.procinfo = t1isa_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1ctl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1ctl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/avm/t1pci.c b/drivers/isdn/hardware/avm/t1pci.c index e6d298d75146..5a3f83098018 100644 --- a/drivers/isdn/hardware/avm/t1pci.c +++ b/drivers/isdn/hardware/avm/t1pci.c @@ -119,7 +119,7 @@ static int t1pci_add_card(struct capicardparams *p, struct pci_dev *pdev) cinfo->capi_ctrl.load_firmware = b1dma_load_firmware; cinfo->capi_ctrl.reset_ctr = b1dma_reset_ctr; cinfo->capi_ctrl.procinfo = 
t1pci_procinfo; - cinfo->capi_ctrl.ctr_read_proc = b1dmactl_read_proc; + cinfo->capi_ctrl.proc_fops = &b1dmactl_proc_fops; strcpy(cinfo->capi_ctrl.name, card->name); retval = attach_capi_ctr(&cinfo->capi_ctrl); diff --git a/drivers/isdn/hardware/eicon/capimain.c b/drivers/isdn/hardware/eicon/capimain.c index 98fcdfc7ca55..0f073cd73763 100644 --- a/drivers/isdn/hardware/eicon/capimain.c +++ b/drivers/isdn/hardware/eicon/capimain.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "os_capi.h" @@ -75,25 +76,32 @@ void diva_os_free_message_buffer(diva_os_message_buffer_s * dmb) /* * proc function for controller info */ -static int diva_ctl_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int diva_ctl_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; diva_card *card = (diva_card *) ctrl->driverdata; - int len = 0; - - len += sprintf(page + len, "%s\n", ctrl->name); - len += sprintf(page + len, "Serial No. : %s\n", ctrl->serial); - len += sprintf(page + len, "Id : %d\n", card->Id); - len += sprintf(page + len, "Channels : %d\n", card->d.channels); - - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? count : len - off); + + seq_printf(m, "%s\n", ctrl->name); + seq_printf(m, "Serial No. 
: %s\n", ctrl->serial); + seq_printf(m, "Id : %d\n", card->Id); + seq_printf(m, "Channels : %d\n", card->d.channels); + + return 0; +} + +static int diva_ctl_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, diva_ctl_proc_show, NULL); } +static const struct file_operations diva_ctl_proc_fops = { + .owner = THIS_MODULE, + .open = diva_ctl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * set additional os settings in capi_ctr struct */ @@ -102,7 +110,7 @@ void diva_os_set_controller_struct(struct capi_ctr *ctrl) ctrl->driver_name = DRIVERLNAME; ctrl->load_firmware = NULL; ctrl->reset_ctr = NULL; - ctrl->ctr_read_proc = diva_ctl_read_proc; + ctrl->proc_fops = &diva_ctl_proc_fops; ctrl->owner = THIS_MODULE; } diff --git a/drivers/isdn/hardware/eicon/diva_didd.c b/drivers/isdn/hardware/eicon/diva_didd.c index 993b14cf1778..5d06a7437824 100644 --- a/drivers/isdn/hardware/eicon/diva_didd.c +++ b/drivers/isdn/hardware/eicon/diva_didd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "platform.h" @@ -62,39 +63,41 @@ static char *getrev(const char *revision) return rev; } -static int -proc_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int divadidd_proc_show(struct seq_file *m, void *v) { - int len = 0; char tmprev[32]; strcpy(tmprev, main_revision); - len += sprintf(page + len, "%s\n", DRIVERNAME); - len += sprintf(page + len, "name : %s\n", DRIVERLNAME); - len += sprintf(page + len, "release : %s\n", DRIVERRELEASE_DIDD); - len += sprintf(page + len, "build : %s(%s)\n", + seq_printf(m, "%s\n", DRIVERNAME); + seq_printf(m, "name : %s\n", DRIVERLNAME); + seq_printf(m, "release : %s\n", DRIVERRELEASE_DIDD); + seq_printf(m, "build : %s(%s)\n", diva_didd_common_code_build, DIVA_BUILD); - len += sprintf(page + len, "revision : %s\n", getrev(tmprev)); - - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - 
return ((count < len - off) ? count : len - off); + seq_printf(m, "revision : %s\n", getrev(tmprev)); + + return 0; } +static int divadidd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, divadidd_proc_show, NULL); +} + +static const struct file_operations divadidd_proc_fops = { + .owner = THIS_MODULE, + .open = divadidd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int DIVA_INIT_FUNCTION create_proc(void) { proc_net_eicon = proc_mkdir("eicon", init_net.proc_net); if (proc_net_eicon) { - if ((proc_didd = - create_proc_entry(DRIVERLNAME, S_IFREG | S_IRUGO, - proc_net_eicon))) { - proc_didd->read_proc = proc_read; - } + proc_didd = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon, + &divadidd_proc_fops); return (1); } return (0); diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index 69e71ebe7841..f577719ab3fa 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -86,39 +87,40 @@ static void diva_um_timer_function(unsigned long data); extern struct proc_dir_entry *proc_net_eicon; static struct proc_dir_entry *um_idi_proc_entry = NULL; -static int -um_idi_proc_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int um_idi_proc_show(struct seq_file *m, void *v) { - int len = 0; char tmprev[32]; - len += sprintf(page + len, "%s\n", DRIVERNAME); - len += sprintf(page + len, "name : %s\n", DRIVERLNAME); - len += sprintf(page + len, "release : %s\n", DRIVERRELEASE_IDI); + seq_printf(m, "%s\n", DRIVERNAME); + seq_printf(m, "name : %s\n", DRIVERLNAME); + seq_printf(m, "release : %s\n", DRIVERRELEASE_IDI); strcpy(tmprev, main_revision); - len += sprintf(page + len, "revision : %s\n", getrev(tmprev)); - len += sprintf(page + len, "build : %s\n", DIVA_BUILD); - len += sprintf(page + len, "major : %d\n", major); - 
- if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? count : len - off); + seq_printf(m, "revision : %s\n", getrev(tmprev)); + seq_printf(m, "build : %s\n", DIVA_BUILD); + seq_printf(m, "major : %d\n", major); + + return 0; +} + +static int um_idi_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, um_idi_proc_show, NULL); } +static const struct file_operations um_idi_proc_fops = { + .owner = THIS_MODULE, + .open = um_idi_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int DIVA_INIT_FUNCTION create_um_idi_proc(void) { - um_idi_proc_entry = create_proc_entry(DRIVERLNAME, - S_IFREG | S_IRUGO | S_IWUSR, - proc_net_eicon); + um_idi_proc_entry = proc_create(DRIVERLNAME, S_IRUGO, proc_net_eicon, + &um_idi_proc_fops); if (!um_idi_proc_entry) return (0); - - um_idi_proc_entry->read_proc = um_idi_proc_read; - return (1); } diff --git a/drivers/isdn/hardware/eicon/divasproc.c b/drivers/isdn/hardware/eicon/divasproc.c index 040827288ec9..46d44a942624 100644 --- a/drivers/isdn/hardware/eicon/divasproc.c +++ b/drivers/isdn/hardware/eicon/divasproc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -141,14 +142,10 @@ void remove_divas_proc(void) } } -/* -** write group_optimization -*/ -static int -write_grp_opt(struct file *file, const char __user *buffer, unsigned long count, - void *data) +static ssize_t grp_opt_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; if ((count == 1) || (count == 2)) { @@ -172,14 +169,10 @@ write_grp_opt(struct file *file, const char __user *buffer, unsigned long count, return (-EINVAL); } -/* -** write dynamic_l1_down -*/ -static int -write_d_l1_down(struct 
file *file, const char __user *buffer, unsigned long count, - void *data) +static ssize_t d_l1_down_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; if ((count == 1) || (count == 2)) { @@ -203,63 +196,62 @@ write_d_l1_down(struct file *file, const char __user *buffer, unsigned long coun return (-EINVAL); } - -/* -** read dynamic_l1_down -*/ -static int -read_d_l1_down(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int d_l1_down_proc_show(struct seq_file *m, void *v) { - int len = 0; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += sprintf(page + len, "%s\n", + seq_printf(m, "%s\n", (IoAdapter->capi_cfg. cfg_1 & DIVA_XDI_CAPI_CFG_1_DYNAMIC_L1_ON) ? "1" : "0"); + return 0; +} - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); +static int d_l1_down_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, d_l1_down_proc_show, PDE(inode)->data); } -/* -** read group_optimization -*/ -static int -read_grp_opt(char *page, char **start, off_t off, int count, int *eof, - void *data) +static const struct file_operations d_l1_down_proc_fops = { + .owner = THIS_MODULE, + .open = d_l1_down_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = d_l1_down_proc_write, +}; + +static int grp_opt_proc_show(struct seq_file *m, void *v) { - int len = 0; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += sprintf(page + len, "%s\n", + seq_printf(m, "%s\n", (IoAdapter->capi_cfg. cfg_1 & DIVA_XDI_CAPI_CFG_1_GROUP_POPTIMIZATION_ON) ? "1" : "0"); + return 0; +} - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); +static int grp_opt_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, grp_opt_proc_show, PDE(inode)->data); } -/* -** info write -*/ -static int -info_write(struct file *file, const char __user *buffer, unsigned long count, - void *data) +static const struct file_operations grp_opt_proc_fops = { + .owner = THIS_MODULE, + .open = grp_opt_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = grp_opt_proc_write, +}; + +static ssize_t info_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = PDE(file->f_path.dentry->d_inode)->data; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; char c[4]; @@ -277,63 +269,46 @@ info_write(struct file *file, const char __user *buffer, unsigned long count, return (-EINVAL); } -/* -** info read -*/ -static int -info_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +static int info_proc_show(struct seq_file *m, void *v) { int i = 0; - int len = 0; char *p; char tmpser[16]; - diva_os_xdi_adapter_t *a = (diva_os_xdi_adapter_t *) data; + diva_os_xdi_adapter_t *a = m->private; PISDN_ADAPTER IoAdapter = IoAdapters[a->controller - 1]; - len += - sprintf(page + len, "Name : %s\n", - IoAdapter->Properties.Name); - len += sprintf(page + len, "DSP state : %08x\n", a->dsp_mask); - len += sprintf(page + len, "Channels : %02d\n", - IoAdapter->Properties.Channels); - len += sprintf(page + len, "E. max/used : %03d/%03d\n", + seq_printf(m, "Name : %s\n", IoAdapter->Properties.Name); + seq_printf(m, "DSP state : %08x\n", a->dsp_mask); + seq_printf(m, "Channels : %02d\n", IoAdapter->Properties.Channels); + seq_printf(m, "E. 
max/used : %03d/%03d\n", IoAdapter->e_max, IoAdapter->e_count); diva_get_vserial_number(IoAdapter, tmpser); - len += sprintf(page + len, "Serial : %s\n", tmpser); - len += - sprintf(page + len, "IRQ : %d\n", - IoAdapter->irq_info.irq_nr); - len += sprintf(page + len, "CardIndex : %d\n", a->CardIndex); - len += sprintf(page + len, "CardOrdinal : %d\n", a->CardOrdinal); - len += sprintf(page + len, "Controller : %d\n", a->controller); - len += sprintf(page + len, "Bus-Type : %s\n", + seq_printf(m, "Serial : %s\n", tmpser); + seq_printf(m, "IRQ : %d\n", IoAdapter->irq_info.irq_nr); + seq_printf(m, "CardIndex : %d\n", a->CardIndex); + seq_printf(m, "CardOrdinal : %d\n", a->CardOrdinal); + seq_printf(m, "Controller : %d\n", a->controller); + seq_printf(m, "Bus-Type : %s\n", (a->Bus == DIVAS_XDI_ADAPTER_BUS_ISA) ? "ISA" : "PCI"); - len += sprintf(page + len, "Port-Name : %s\n", a->port_name); + seq_printf(m, "Port-Name : %s\n", a->port_name); if (a->Bus == DIVAS_XDI_ADAPTER_BUS_PCI) { - len += - sprintf(page + len, "PCI-bus : %d\n", - a->resources.pci.bus); - len += - sprintf(page + len, "PCI-func : %d\n", - a->resources.pci.func); + seq_printf(m, "PCI-bus : %d\n", a->resources.pci.bus); + seq_printf(m, "PCI-func : %d\n", a->resources.pci.func); for (i = 0; i < 8; i++) { if (a->resources.pci.bar[i]) { - len += - sprintf(page + len, + seq_printf(m, "Mem / I/O %d : 0x%x / mapped : 0x%lx", i, a->resources.pci.bar[i], (unsigned long) a->resources. pci.addr[i]); if (a->resources.pci.length[i]) { - len += - sprintf(page + len, + seq_printf(m, " / length : %d", a->resources.pci. length[i]); } - len += sprintf(page + len, "\n"); + seq_putc(m, '\n'); } } } @@ -353,16 +328,25 @@ info_read(char *page, char **start, off_t off, int count, int *eof, } else { p = "ready"; } - len += sprintf(page + len, "State : %s\n", p); + seq_printf(m, "State : %s\n", p); - if (off + count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len - off) ? 
count : len - off); + return 0; +} + +static int info_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_proc_show, PDE(inode)->data); } +static const struct file_operations info_proc_fops = { + .owner = THIS_MODULE, + .open = info_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = info_proc_write, +}; + /* ** adapter proc init/de-init */ @@ -380,28 +364,20 @@ int create_adapter_proc(diva_os_xdi_adapter_t * a) return (0); a->proc_adapter_dir = (void *) de; - if (!(pe = - create_proc_entry(info_proc_name, S_IFREG | S_IRUGO | S_IWUSR, de))) + pe = proc_create_data(info_proc_name, S_IRUGO | S_IWUSR, de, + &info_proc_fops, a); + if (!pe) return (0); a->proc_info = (void *) pe; - pe->write_proc = info_write; - pe->read_proc = info_read; - pe->data = a; - if ((pe = create_proc_entry(grp_opt_proc_name, - S_IFREG | S_IRUGO | S_IWUSR, de))) { + pe = proc_create_data(grp_opt_proc_name, S_IRUGO | S_IWUSR, de, + &grp_opt_proc_fops, a); + if (pe) a->proc_grp_opt = (void *) pe; - pe->write_proc = write_grp_opt; - pe->read_proc = read_grp_opt; - pe->data = a; - } - if ((pe = create_proc_entry(d_l1_down_proc_name, - S_IFREG | S_IRUGO | S_IWUSR, de))) { + pe = proc_create_data(d_l1_down_proc_name, S_IRUGO | S_IWUSR, de, + &d_l1_down_proc_fops, a); + if (pe) a->proc_d_l1_down = (void *) pe; - pe->write_proc = write_d_l1_down; - pe->read_proc = read_d_l1_down; - pe->data = a; - } DBG_TRC(("proc entry %s created", tmp)); diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c index 4ffaa14b9fc4..fe874afa4f81 100644 --- a/drivers/isdn/hysdn/hycapi.c +++ b/drivers/isdn/hysdn/hycapi.c @@ -11,6 +11,8 @@ */ #include +#include +#include #include #include #include @@ -432,26 +434,16 @@ static u16 hycapi_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) return retval; } -/********************************************************************* -hycapi_read_proc - -Informations provided in the 
/proc/capi-entries. - -*********************************************************************/ - -static int hycapi_read_proc(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *ctrl) +static int hycapi_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; hycapictrl_info *cinfo = (hycapictrl_info *)(ctrl->driverdata); hysdn_card *card = cinfo->card; - int len = 0; char *s; -#ifdef HYCAPI_PRINTFNAMES - printk(KERN_NOTICE "hycapi_read_proc\n"); -#endif - len += sprintf(page+len, "%-16s %s\n", "name", cinfo->cardname); - len += sprintf(page+len, "%-16s 0x%x\n", "io", card->iobase); - len += sprintf(page+len, "%-16s %d\n", "irq", card->irq); + + seq_printf(m, "%-16s %s\n", "name", cinfo->cardname); + seq_printf(m, "%-16s 0x%x\n", "io", card->iobase); + seq_printf(m, "%-16s %d\n", "irq", card->irq); switch (card->brdtype) { case BD_PCCARD: s = "HYSDN Hycard"; break; @@ -461,24 +453,32 @@ static int hycapi_read_proc(char *page, char **start, off_t off, case BD_PLEXUS: s = "HYSDN Plexus30"; break; default: s = "???"; break; } - len += sprintf(page+len, "%-16s %s\n", "type", s); + seq_printf(m, "%-16s %s\n", "type", s); if ((s = cinfo->version[VER_DRIVER]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_driver", s); + seq_printf(m, "%-16s %s\n", "ver_driver", s); if ((s = cinfo->version[VER_CARDTYPE]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_cardtype", s); + seq_printf(m, "%-16s %s\n", "ver_cardtype", s); if ((s = cinfo->version[VER_SERIAL]) != NULL) - len += sprintf(page+len, "%-16s %s\n", "ver_serial", s); + seq_printf(m, "%-16s %s\n", "ver_serial", s); - len += sprintf(page+len, "%-16s %s\n", "cardname", cinfo->cardname); + seq_printf(m, "%-16s %s\n", "cardname", cinfo->cardname); - if (off+count >= len) - *eof = 1; - if (len < off) - return 0; - *start = page + off; - return ((count < len-off) ? 
count : len-off); + return 0; +} + +static int hycapi_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, hycapi_proc_show, PDE(inode)->data); } +static const struct file_operations hycapi_proc_fops = { + .owner = THIS_MODULE, + .open = hycapi_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /************************************************************** hycapi_load_firmware @@ -774,7 +774,7 @@ hycapi_capi_create(hysdn_card *card) ctrl->load_firmware = hycapi_load_firmware; ctrl->reset_ctr = hycapi_reset_ctr; ctrl->procinfo = hycapi_procinfo; - ctrl->ctr_read_proc = hycapi_read_proc; + ctrl->proc_fops = &hycapi_proc_fops; strcpy(ctrl->name, cinfo->cardname); ctrl->owner = THIS_MODULE; diff --git a/include/linux/isdn/capilli.h b/include/linux/isdn/capilli.h index 7acb87a44872..d3e5e9da0c82 100644 --- a/include/linux/isdn/capilli.h +++ b/include/linux/isdn/capilli.h @@ -50,8 +50,7 @@ struct capi_ctr { u16 (*send_message)(struct capi_ctr *, struct sk_buff *skb); char *(*procinfo)(struct capi_ctr *); - int (*ctr_read_proc)(char *page, char **start, off_t off, - int count, int *eof, struct capi_ctr *card); + const struct file_operations *proc_fops; /* filled in before calling ready callback */ u8 manu[CAPI_MANUFACTURER_LEN]; /* CAPI_GET_MANUFACTURER */ diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 97f8d68d574d..3487cfe74aec 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -21,7 +21,8 @@ */ #include - +#include +#include #include #include #include @@ -516,33 +517,37 @@ static char *cmtp_procinfo(struct capi_ctr *ctrl) return "CAPI Message Transport Protocol"; } -static int cmtp_ctr_read_proc(char *page, char **start, off_t off, int count, int *eof, struct capi_ctr *ctrl) +static int cmtp_proc_show(struct seq_file *m, void *v) { + struct capi_ctr *ctrl = m->private; struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *app; struct 
list_head *p, *n; - int len = 0; - len += sprintf(page + len, "%s\n\n", cmtp_procinfo(ctrl)); - len += sprintf(page + len, "addr %s\n", session->name); - len += sprintf(page + len, "ctrl %d\n", session->num); + seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); + seq_printf(m, "addr %s\n", session->name); + seq_printf(m, "ctrl %d\n", session->num); list_for_each_safe(p, n, &session->applications) { app = list_entry(p, struct cmtp_application, list); - len += sprintf(page + len, "appl %d -> %d\n", app->appl, app->mapping); + seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); } - if (off + count >= len) - *eof = 1; - - if (len < off) - return 0; - - *start = page + off; + return 0; +} - return ((count < len - off) ? count : len - off); +static int cmtp_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmtp_proc_show, PDE(inode)->data); } +static const struct file_operations cmtp_proc_fops = { + .owner = THIS_MODULE, + .open = cmtp_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; int cmtp_attach_device(struct cmtp_session *session) { @@ -582,7 +587,7 @@ int cmtp_attach_device(struct cmtp_session *session) session->ctrl.send_message = cmtp_send_message; session->ctrl.procinfo = cmtp_procinfo; - session->ctrl.ctr_read_proc = cmtp_ctr_read_proc; + session->ctrl.proc_fops = &cmtp_proc_fops; if (attach_capi_ctr(&session->ctrl) < 0) { BT_ERR("Can't attach new controller"); -- cgit v1.2.3 From 6d125529c6cbfe570ce3bf9a0728548f087499da Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Dec 2009 06:58:56 -0500 Subject: Fix ACC_MODE() for real commit 5300990c0370e804e49d9a59d928c5d53fb73487 had stepped on a rather nasty mess: definitions of ACC_MODE used to be different. Fixed the resulting breakage, converting them to variant that takes O_... value; all callers have that and it actually simplifies life (see tomoyo part of changes). 
Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/fs.h | 2 +- security/tomoyo/tomoyo.c | 7 +------ 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 1b26b1620664..d930f1856ed2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1620,7 +1620,7 @@ struct file *do_filp_open(int dfd, const char *pathname, open_flag |= O_DSYNC; if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(open_flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9147ca88f253..b1bcb275b596 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2463,7 +2463,7 @@ int proc_nr_files(struct ctl_table *table, int write, int __init get_filesystem_list(char *buf); -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) +#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) #endif /* __KERNEL__ */ diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 8a00ade85166..2aceebf5f354 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -80,9 +80,8 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) return tomoyo_find_next_domain(bprm); /* * Read permission is checked against interpreters using next domain. - * '1' is the result of open_to_namei_flags(O_RDONLY). 
*/ - return tomoyo_check_open_permission(domain, &bprm->file->f_path, 1); + return tomoyo_check_open_permission(domain, &bprm->file->f_path, O_RDONLY); } static int tomoyo_path_truncate(struct path *path, loff_t length, @@ -184,10 +183,6 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, static int tomoyo_dentry_open(struct file *f, const struct cred *cred) { int flags = f->f_flags; - - if ((flags + 1) & O_ACCMODE) - flags++; - flags |= f->f_flags & (O_APPEND | O_TRUNC); /* Don't check read permission here if called from do_execve(). */ if (current->in_execve) return 0; -- cgit v1.2.3 From d5f1fb53353edc38da326445267c1df0c9676df2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:55 +0800 Subject: lib: Introduce strnstr() It differs strstr() in that it limits the length to be searched in the first string. Signed-off-by: Li Zefan LKML-Reference: <4B4E8743.6030805@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/string.h | 5 ++++- lib/string.c | 27 ++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 651839a2a755..a716ee2a8adb 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -72,7 +72,10 @@ static inline __must_check char *strstrip(char *str) } #ifndef __HAVE_ARCH_STRSTR -extern char * strstr(const char *,const char *); +extern char * strstr(const char *, const char *); +#endif +#ifndef __HAVE_ARCH_STRNSTR +extern char * strnstr(const char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/lib/string.c b/lib/string.c index 9f75b4ec50b8..a1cdcfcc42d0 100644 --- a/lib/string.c +++ b/lib/string.c @@ -667,7 +667,7 @@ EXPORT_SYMBOL(memscan); */ char *strstr(const char *s1, const char *s2) { - int l1, l2; + size_t l1, l2; l2 = strlen(s2); if (!l2) @@ -684,6 +684,31 @@ char *strstr(const 
char *s1, const char *s2) EXPORT_SYMBOL(strstr); #endif +#ifndef __HAVE_ARCH_STRNSTR +/** + * strnstr - Find the first substring in a length-limited string + * @s1: The string to be searched + * @s2: The string to search for + * @len: the maximum number of characters to search + */ +char *strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l1 = len, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +EXPORT_SYMBOL(strnstr); +#endif + #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. -- cgit v1.2.3 From ad72c347e56bf3a0231b9d686e17764157d2961c Mon Sep 17 00:00:00 2001 From: Christian Pellegrin Date: Thu, 14 Jan 2010 07:08:34 +0000 Subject: can: Proper ctrlmode handling for CAN devices This patch adds error checking of ctrlmode values for CAN devices. As an example all available bits are implemented in the mcp251x driver. Signed-off-by: Christian Pellegrin Acked-by: Wolfgang Grandegger Signed-off-by: David S. 
Miller --- drivers/net/can/at91_can.c | 1 + drivers/net/can/bfin_can.c | 1 + drivers/net/can/dev.c | 2 ++ drivers/net/can/mcp251x.c | 11 ++++++++++- drivers/net/can/mscan/mscan.c | 1 + drivers/net/can/sja1000/sja1000.c | 1 + drivers/net/can/ti_hecc.c | 1 + drivers/net/can/usb/ems_usb.c | 1 + include/linux/can/dev.h | 1 + 9 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index f7287497ba6e..a2f29a38798a 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -1073,6 +1073,7 @@ static int __init at91_can_probe(struct platform_device *pdev) priv->can.bittiming_const = &at91_bittiming_const; priv->can.do_set_bittiming = at91_set_bittiming; priv->can.do_set_mode = at91_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; priv->reg_base = addr; priv->dev = dev; priv->clk = clk; diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c index 7e1926e79e98..bf7f9ba2d903 100644 --- a/drivers/net/can/bfin_can.c +++ b/drivers/net/can/bfin_can.c @@ -603,6 +603,7 @@ struct net_device *alloc_bfin_candev(void) priv->can.bittiming_const = &bfin_can_bittiming_const; priv->can.do_set_bittiming = bfin_can_set_bittiming; priv->can.do_set_mode = bfin_can_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; return dev; } diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index c1bb29f0322b..f08f1202ff00 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -592,6 +592,8 @@ static int can_changelink(struct net_device *dev, if (dev->flags & IFF_UP) return -EBUSY; cm = nla_data(data[IFLA_CAN_CTRLMODE]); + if (cm->flags & ~priv->ctrlmode_supported) + return -EOPNOTSUPP; priv->ctrlmode &= ~cm->mask; priv->ctrlmode |= cm->flags; } diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index afa2fa45fed9..bbe186b5a0ed 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -539,9 +539,14 @@ 
static void mcp251x_set_normal_mode(struct spi_device *spi) if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) { /* Put device into loopback mode */ mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_LOOPBACK); + } else if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) { + /* Put device into listen-only mode */ + mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_LISTEN_ONLY); } else { /* Put device into normal mode */ - mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_NORMAL); + mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_NORMAL | + (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT ? + CANCTRL_OSM : 0)); /* Wait for the device to enter normal mode */ timeout = jiffies + HZ; @@ -948,6 +953,10 @@ static int __devinit mcp251x_can_probe(struct spi_device *spi) priv->can.bittiming_const = &mcp251x_bittiming_const; priv->can.do_set_mode = mcp251x_do_set_mode; priv->can.clock.freq = pdata->oscillator_frequency / 2; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES | + CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY; + if (pdata->model == CAN_MCP251X_MCP2515) + priv->can.ctrlmode_supported |= CAN_CTRLMODE_ONE_SHOT; priv->net = net; dev_set_drvdata(&spi->dev, priv); diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 40827c128b65..6b7dd578d417 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -686,6 +686,7 @@ struct net_device *alloc_mscandev(void) priv->can.bittiming_const = &mscan_bittiming_const; priv->can.do_set_bittiming = mscan_do_set_bittiming; priv->can.do_set_mode = mscan_do_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; for (i = 0; i < TX_QUEUE_SIZE; i++) { priv->tx_queue[i].id = i; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 345304d779b9..ace103a44833 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -567,6 +567,7 @@ struct net_device *alloc_sja1000dev(int sizeof_priv) priv->can.bittiming_const = 
&sja1000_bittiming_const; priv->can.do_set_bittiming = sja1000_set_bittiming; priv->can.do_set_mode = sja1000_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; if (sizeof_priv) priv->priv = (void *)priv + sizeof(struct sja1000_priv); diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 7d370e32a7a8..8332e242b0be 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -909,6 +909,7 @@ static int ti_hecc_probe(struct platform_device *pdev) priv->can.bittiming_const = &ti_hecc_bittiming_const; priv->can.do_set_mode = ti_hecc_do_set_mode; priv->can.do_get_state = ti_hecc_get_state; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; ndev->irq = irq->start; ndev->flags |= IFF_ECHO; diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index ddb17e256656..bfab283ba9b1 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -1022,6 +1022,7 @@ static int ems_usb_probe(struct usb_interface *intf, dev->can.bittiming_const = &ems_usb_bittiming_const; dev->can.do_set_bittiming = ems_usb_set_bittiming; dev->can.do_set_mode = ems_usb_set_mode; + dev->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; netdev->flags |= IFF_ECHO; /* we support local echo */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 7e7c98a3e908..c8c660a79f90 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,7 @@ struct can_priv { enum can_state state; u32 ctrlmode; + u32 ctrlmode_supported; int restart_ms; struct timer_list restart_timer; -- cgit v1.2.3 From 05c2828c72c4eabf62376adfe27bd24797621f62 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 Jan 2010 06:17:09 +0000 Subject: tun: export underlying socket Tun device looks similar to a packet socket in that both pass complete frames from/to userspace. 
This patch fills in enough fields in the socket underlying tun driver to support sendmsg/recvmsg operations, and message flags MSG_TRUNC and MSG_DONTWAIT, and exports access to this socket to modules. Regular read/write behaviour is unchanged. This way, code using raw sockets to inject packets into a physical device, can support injecting packets into host network stack almost without modification. First user of this interface will be vhost virtualization accelerator. Signed-off-by: Michael S. Tsirkin Acked-by: Herbert Xu Acked-by: David S. Miller Signed-off-by: David S. Miller --- drivers/net/tun.c | 101 +++++++++++++++++++++++++++++++++++++++---------- include/linux/if_tun.h | 14 +++++++ 2 files changed, 96 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2834a01bae24..5adb3d150552 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -144,6 +144,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; tfile->tun = tun; tun->tfile = tfile; + tun->socket.file = file; dev_hold(tun->dev); sock_hold(tun->socket.sk); atomic_inc(&tfile->count); @@ -158,6 +159,7 @@ static void __tun_detach(struct tun_struct *tun) /* Detach from net device */ netif_tx_lock_bh(tun->dev); tun->tfile = NULL; + tun->socket.file = NULL; netif_tx_unlock_bh(tun->dev); /* Drop read queue */ @@ -387,7 +389,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Notify and wake up reader process */ if (tun->flags & TUN_FASYNC) kill_fasync(&tun->fasync, SIGIO, POLL_IN); - wake_up_interruptible(&tun->socket.wait); + wake_up_interruptible_poll(&tun->socket.wait, POLLIN | + POLLRDNORM | POLLRDBAND); return NETDEV_TX_OK; drop: @@ -743,7 +746,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun, len = min_t(int, skb->len, len); skb_copy_datagram_const_iovec(skb, 0, iv, total, len); - total += len; + total += skb->len; tun->dev->stats.tx_packets++; 
tun->dev->stats.tx_bytes += len; @@ -751,34 +754,23 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun, return total; } -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t tun_do_read(struct tun_struct *tun, + struct kiocb *iocb, const struct iovec *iv, + ssize_t len, int noblock) { - struct file *file = iocb->ki_filp; - struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); DECLARE_WAITQUEUE(wait, current); struct sk_buff *skb; - ssize_t len, ret = 0; - - if (!tun) - return -EBADFD; + ssize_t ret = 0; DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name); - len = iov_length(iv, count); - if (len < 0) { - ret = -EINVAL; - goto out; - } - add_wait_queue(&tun->socket.wait, &wait); while (len) { current->state = TASK_INTERRUPTIBLE; /* Read frames from the queue */ if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) { - if (file->f_flags & O_NONBLOCK) { + if (noblock) { ret = -EAGAIN; break; } @@ -805,6 +797,27 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, current->state = TASK_RUNNING; remove_wait_queue(&tun->socket.wait, &wait); + return ret; +} + +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct tun_file *tfile = file->private_data; + struct tun_struct *tun = __tun_get(tfile); + ssize_t len, ret; + + if (!tun) + return -EBADFD; + len = iov_length(iv, count); + if (len < 0) { + ret = -EINVAL; + goto out; + } + + ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK); + ret = min_t(ssize_t, ret, len); out: tun_put(tun); return ret; @@ -847,7 +860,8 @@ static void tun_sock_write_space(struct sock *sk) return; if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | + POLLWRNORM | POLLWRBAND); 
tun = tun_sk(sk)->tun; kill_fasync(&tun->fasync, SIGIO, POLL_OUT); @@ -858,6 +872,37 @@ static void tun_sock_destruct(struct sock *sk) free_netdev(tun_sk(sk)->tun->dev); } +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct tun_struct *tun = container_of(sock, struct tun_struct, socket); + return tun_get_user(tun, m->msg_iov, total_len, + m->msg_flags & MSG_DONTWAIT); +} + +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len, + int flags) +{ + struct tun_struct *tun = container_of(sock, struct tun_struct, socket); + int ret; + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) + return -EINVAL; + ret = tun_do_read(tun, iocb, m->msg_iov, total_len, + flags & MSG_DONTWAIT); + if (ret > total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; + } + return ret; +} + +/* Ops structure to mimic raw sockets with tun */ +static const struct proto_ops tun_socket_ops = { + .sendmsg = tun_sendmsg, + .recvmsg = tun_recvmsg, +}; + static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, @@ -986,6 +1031,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) goto err_free_dev; init_waitqueue_head(&tun->socket.wait); + tun->socket.ops = &tun_socket_ops; sock_init_data(&tun->socket, sk); sk->sk_write_space = tun_sock_write_space; sk->sk_sndbuf = INT_MAX; @@ -1525,6 +1571,23 @@ static void tun_cleanup(void) rtnl_link_unregister(&tun_link_ops); } +/* Get an underlying socket object from tun file. Returns error unless file is + * attached to a device. The returned object works like a packet socket, it + * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for + * holding a reference to the file for as long as the socket is in use. 
*/ +struct socket *tun_get_socket(struct file *file) +{ + struct tun_struct *tun; + if (file->f_op != &tun_fops) + return ERR_PTR(-EINVAL); + tun = tun_get(file); + if (!tun) + return ERR_PTR(-EBADFD); + tun_put(tun); + return &tun->socket; +} +EXPORT_SYMBOL_GPL(tun_get_socket); + module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 3f5fd523b49d..404abe00162c 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -86,4 +86,18 @@ struct tun_filter { __u8 addr[0][ETH_ALEN]; }; +#ifdef __KERNEL__ +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) +struct socket *tun_get_socket(struct file *); +#else +#include +#include +struct file; +struct socket; +static inline struct socket *tun_get_socket(struct file *f) +{ + return ERR_PTR(-EINVAL); +} +#endif /* CONFIG_TUN */ +#endif /* __KERNEL__ */ #endif /* __IF_TUN_H */ -- cgit v1.2.3 From 3a4d5c94e959359ece6d6b55045c3f046677f55c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 14 Jan 2010 06:17:27 +0000 Subject: vhost_net: a kernel-level virtio server What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration, bug work-arounds in userspace) - write logging is supported (good for migration) - support memory table and not just an offset (needed for kvm) common virtio related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part : this supplied me with witty comments I wouldn't be able to write myself. 
What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tap device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't seen any crashes. Compared to userspace, people reported improved latency (as I save up to 4 system calls per packet), as well as better bandwidth and CPU utilization. Features that I plan to look at in the future: - mergeable buffers - zero copy - scalability tuning: figure out the best threading model to use Note on RCU usage (this is also documented in vhost.h, near private_pointer which is the value protected by this variant of RCU): what is happening is that the rcu_dereference() is being used in a workqueue item. The role of rcu_read_lock() is taken on by the start of execution of the workqueue item, of rcu_read_unlock() by the end of execution of the workqueue item, and of synchronize_rcu() by flush_workqueue()/flush_work(). In the future we might need to apply some gcc attribute or sparse annotation to the function passed to INIT_WORK(). Paul's ack below is for this RCU usage. (Includes fixes by Alan Cox , David L Stevens , Chris Wright ) Acked-by: Rusty Russell Acked-by: Arnd Bergmann Acked-by: "Paul E. McKenney" Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. 
Miller --- MAINTAINERS | 9 + arch/ia64/kvm/Kconfig | 1 + arch/powerpc/kvm/Kconfig | 1 + arch/s390/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + drivers/Makefile | 1 + drivers/vhost/Kconfig | 11 + drivers/vhost/Makefile | 2 + drivers/vhost/net.c | 661 ++++++++++++++++++++++++++ drivers/vhost/vhost.c | 1098 ++++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 161 +++++++ include/linux/Kbuild | 1 + include/linux/miscdevice.h | 1 + include/linux/vhost.h | 130 ++++++ 14 files changed, 2079 insertions(+) create mode 100644 drivers/vhost/Kconfig create mode 100644 drivers/vhost/Makefile create mode 100644 drivers/vhost/net.c create mode 100644 drivers/vhost/vhost.c create mode 100644 drivers/vhost/vhost.h create mode 100644 include/linux/vhost.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 745643b8c344..337dffbe9a47 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5803,6 +5803,15 @@ S: Maintained F: Documentation/filesystems/vfat.txt F: fs/fat/ +VIRTIO HOST (VHOST) +M: "Michael S. Tsirkin" +L: kvm@vger.kernel.org +L: virtualization@lists.osdl.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/vhost/ +F: include/linux/vhost.h + VIA RHINE NETWORK DRIVER M: Roger Luethi S: Maintained diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index ef3e7be29caf..01c75797119c 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -47,6 +47,7 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 07703f72330e..e28841fbfb8d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -75,6 +75,7 @@ config KVM_E500 If unsure, say N. 
+source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 6ee55ae84ce2..a7251580891c 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -35,6 +35,7 @@ config KVM # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..3c4d0109ad20 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -65,6 +65,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/drivers/Makefile b/drivers/Makefile index 6ee53c7a57a1..81e36596b1e9 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_VHOST_NET) += vhost/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig new file mode 100644 index 000000000000..9f409f447aea --- /dev/null +++ b/drivers/vhost/Kconfig @@ -0,0 +1,11 @@ +config VHOST_NET + tristate "Host kernel accelerator for virtio net (EXPERIMENTAL)" + depends on NET && EVENTFD && EXPERIMENTAL + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest networking with virtio_net. Not to be confused with virtio_net + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_net. 
+ diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile new file mode 100644 index 000000000000..72dd02050bb9 --- /dev/null +++ b/drivers/vhost/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_VHOST_NET) += vhost_net.o +vhost_net-y := vhost.o net.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 000000000000..4c8928319e1d --- /dev/null +++ b/drivers/vhost/net.c @@ -0,0 +1,661 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-net server in host kernel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "vhost.h" + +/* Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. */ +#define VHOST_NET_WEIGHT 0x80000 + +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + +enum vhost_net_poll_state { + VHOST_NET_POLL_DISABLED = 0, + VHOST_NET_POLL_STARTED = 1, + VHOST_NET_POLL_STOPPED = 2, +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + struct vhost_poll poll[VHOST_NET_VQ_MAX]; + /* Tells us whether we are polling a socket for TX. + * We only do this when socket buffer fills up. + * Protected by tx vq lock. */ + enum vhost_net_poll_state tx_poll_state; +}; + +/* Pop first len bytes from iovec. Return number of segments used. 
*/ +static int move_iovec_hdr(struct iovec *from, struct iovec *to, + size_t len, int iov_count) +{ + int seg = 0; + size_t size; + while (len && seg < iov_count) { + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + from->iov_len -= size; + from->iov_base += size; + len -= size; + ++from; + ++to; + ++seg; + } + return seg; +} + +/* Caller must have TX VQ lock */ +static void tx_poll_stop(struct vhost_net *net) +{ + if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) + return; + vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); + net->tx_poll_state = VHOST_NET_POLL_STOPPED; +} + +/* Caller must have TX VQ lock */ +static void tx_poll_start(struct vhost_net *net, struct socket *sock) +{ + if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) + return; + vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); + net->tx_poll_state = VHOST_NET_POLL_STARTED; +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_tx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + unsigned head, out, in, s; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + size_t len, total_len = 0; + int err, wmem; + size_t hdr_size; + struct socket *sock = rcu_dereference(vq->private_data); + if (!sock) + return; + + wmem = atomic_read(&sock->sk->sk_wmem_alloc); + if (wmem >= sock->sk->sk_sndbuf) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_disable_notify(vq); + + if (wmem < sock->sk->sk_sndbuf * 2) + tx_poll_stop(net); + hdr_size = vq->hdr_size; + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, + NULL, NULL); + /* Nothing new? Wait for eventfd to tell us they refilled. 
*/ + if (head == vq->num) { + wmem = atomic_read(&sock->sk->sk_wmem_alloc); + if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { + tx_poll_start(net, sock); + set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + break; + } + if (unlikely(vhost_enable_notify(vq))) { + vhost_disable_notify(vq); + continue; + } + break; + } + if (in) { + vq_err(vq, "Unexpected descriptor format for TX: " + "out %d, int %d\n", out, in); + break; + } + /* Skip header. TODO: support TSO. */ + s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); + msg.msg_iovlen = out; + len = iov_length(vq->iov, out); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for TX: " + "%zd expected %zd\n", + iov_length(vq->hdr, s), hdr_size); + break; + } + /* TODO: Check specific error and bomb out unless ENOBUFS? */ + err = sock->ops->sendmsg(NULL, sock, &msg, len); + if (unlikely(err < 0)) { + vhost_discard_vq_desc(vq); + tx_poll_start(net, sock); + break; + } + if (err != len) + pr_err("Truncated TX packet: " + " len %d != %zd\n", err, len); + vhost_add_used_and_signal(&net->dev, vq, head, 0); + total_len += len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_rx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + unsigned head, out, in, log, s; + struct vhost_log *vq_log; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. 
*/ + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + size_t len, total_len = 0; + int err; + size_t hdr_size; + struct socket *sock = rcu_dereference(vq->private_data); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_disable_notify(vq); + hdr_size = vq->hdr_size; + + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, + vq_log, &log); + /* OK, now we need to know about added descriptors. */ + if (head == vq->num) { + if (unlikely(vhost_enable_notify(vq))) { + /* They have slipped one in as we were + * doing that: check again. */ + vhost_disable_notify(vq); + continue; + } + /* Nothing new? Wait for eventfd to tell us + * they refilled. */ + break; + } + /* We don't need to be notified again. */ + if (out) { + vq_err(vq, "Unexpected descriptor format for RX: " + "out %d, int %d\n", + out, in); + break; + } + /* Skip header. TODO: support TSO/mergeable rx buffers. */ + s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); + msg.msg_iovlen = in; + len = iov_length(vq->iov, in); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for RX: " + "%zd expected %zd\n", + iov_length(vq->hdr, s), hdr_size); + break; + } + err = sock->ops->recvmsg(NULL, sock, &msg, + len, MSG_DONTWAIT | MSG_TRUNC); + /* TODO: Check specific error and bomb out unless EAGAIN? */ + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + /* TODO: Should check and handle checksum. 
*/ + if (err > len) { + pr_err("Discarded truncated rx packet: " + " len %d > %zd\n", err, len); + vhost_discard_vq_desc(vq); + continue; + } + len = err; + err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); + if (err) { + vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", + vq->iov->iov_base, err); + break; + } + len += hdr_size; + vhost_add_used_and_signal(&net->dev, vq, head, len); + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, len); + total_len += len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +static void handle_tx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); +} + +static void handle_rx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); +} + +static void handle_tx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + handle_tx(net); +} + +static void handle_rx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + handle_rx(net); +} + +static int vhost_net_open(struct inode *inode, struct file *f) +{ + struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + int r; + if (!n) + return -ENOMEM; + n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; + r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + if (r < 0) { + kfree(n); + return r; + } + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, 
handle_rx_net, POLLIN); + n->tx_poll_state = VHOST_NET_POLL_DISABLED; + + f->private_data = n; + + return 0; +} + +static void vhost_net_disable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + if (!vq->private_data) + return; + if (vq == n->vqs + VHOST_NET_VQ_TX) { + tx_poll_stop(n); + n->tx_poll_state = VHOST_NET_POLL_DISABLED; + } else + vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); +} + +static void vhost_net_enable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct socket *sock = vq->private_data; + if (!sock) + return; + if (vq == n->vqs + VHOST_NET_VQ_TX) { + n->tx_poll_state = VHOST_NET_POLL_STOPPED; + tx_poll_start(n, sock); + } else + vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); +} + +static struct socket *vhost_net_stop_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct socket *sock; + + mutex_lock(&vq->mutex); + sock = vq->private_data; + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, NULL); + mutex_unlock(&vq->mutex); + return sock; +} + +static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, + struct socket **rx_sock) +{ + *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); + *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); +} + +static void vhost_net_flush_vq(struct vhost_net *n, int index) +{ + vhost_poll_flush(n->poll + index); + vhost_poll_flush(&n->dev.vqs[index].poll); +} + +static void vhost_net_flush(struct vhost_net *n) +{ + vhost_net_flush_vq(n, VHOST_NET_VQ_TX); + vhost_net_flush_vq(n, VHOST_NET_VQ_RX); +} + +static int vhost_net_release(struct inode *inode, struct file *f) +{ + struct vhost_net *n = f->private_data; + struct socket *tx_sock; + struct socket *rx_sock; + + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + vhost_dev_cleanup(&n->dev); + if (tx_sock) + fput(tx_sock->file); + if (rx_sock) + fput(rx_sock->file); + /* We do an extra flush before freeing memory, + * since jobs can re-queue themselves. 
*/ + vhost_net_flush(n); + kfree(n); + return 0; +} + +static struct socket *get_raw_socket(int fd) +{ + struct { + struct sockaddr_ll sa; + char buf[MAX_ADDR_LEN]; + } uaddr; + int uaddr_len = sizeof uaddr, r; + struct socket *sock = sockfd_lookup(fd, &r); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + /* Parameter checking */ + if (sock->sk->sk_type != SOCK_RAW) { + r = -ESOCKTNOSUPPORT; + goto err; + } + + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, + &uaddr_len, 0); + if (r) + goto err; + + if (uaddr.sa.sll_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto err; + } + return sock; +err: + fput(sock->file); + return ERR_PTR(r); +} + +static struct socket *get_tun_socket(int fd) +{ + struct file *file = fget(fd); + struct socket *sock; + if (!file) + return ERR_PTR(-EBADF); + sock = tun_get_socket(file); + if (IS_ERR(sock)) + fput(file); + return sock; +} + +static struct socket *get_socket(int fd) +{ + struct socket *sock; + /* special case to disable backend */ + if (fd == -1) + return NULL; + sock = get_raw_socket(fd); + if (!IS_ERR(sock)) + return sock; + sock = get_tun_socket(fd); + if (!IS_ERR(sock)) + return sock; + return ERR_PTR(-ENOTSOCK); +} + +static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) +{ + struct socket *sock, *oldsock; + struct vhost_virtqueue *vq; + int r; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto err; + + if (index >= VHOST_NET_VQ_MAX) { + r = -ENOBUFS; + goto err; + } + vq = n->vqs + index; + mutex_lock(&vq->mutex); + + /* Verify that ring has been setup correctly. 
*/ + if (!vhost_vq_access_ok(vq)) { + r = -EFAULT; + goto err; + } + sock = get_socket(fd); + if (IS_ERR(sock)) { + r = PTR_ERR(sock); + goto err; + } + + /* start polling new socket */ + oldsock = vq->private_data; + if (sock == oldsock) + goto done; + + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, sock); + vhost_net_enable_vq(n, vq); + mutex_unlock(&vq->mutex); +done: + if (oldsock) { + vhost_net_flush_vq(n, index); + fput(oldsock->file); + } +err: + mutex_unlock(&n->dev.mutex); + return r; +} + +static long vhost_net_reset_owner(struct vhost_net *n) +{ + struct socket *tx_sock = NULL; + struct socket *rx_sock = NULL; + long err; + mutex_lock(&n->dev.mutex); + err = vhost_dev_check_owner(&n->dev); + if (err) + goto done; + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + err = vhost_dev_reset_owner(&n->dev); +done: + mutex_unlock(&n->dev.mutex); + if (tx_sock) + fput(tx_sock->file); + if (rx_sock) + fput(rx_sock->file); + return err; +} + +static int vhost_net_set_features(struct vhost_net *n, u64 features) +{ + size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 
+ sizeof(struct virtio_net_hdr) : 0; + int i; + mutex_lock(&n->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&n->dev)) { + mutex_unlock(&n->dev.mutex); + return -EFAULT; + } + n->dev.acked_features = features; + smp_wmb(); + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + mutex_lock(&n->vqs[i].mutex); + n->vqs[i].hdr_size = hdr_size; + mutex_unlock(&n->vqs[i].mutex); + } + vhost_net_flush(n); + mutex_unlock(&n->dev.mutex); + return 0; +} + +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_net *n = f->private_data; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + struct vhost_vring_file backend; + u64 features; + int r; + switch (ioctl) { + case VHOST_NET_SET_BACKEND: + r = copy_from_user(&backend, argp, sizeof backend); + if (r < 0) + return r; + return vhost_net_set_backend(n, backend.index, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_FEATURES; + return copy_to_user(featurep, &features, sizeof features); + case VHOST_SET_FEATURES: + r = copy_from_user(&features, featurep, sizeof features); + if (r < 0) + return r; + if (features & ~VHOST_FEATURES) + return -EOPNOTSUPP; + return vhost_net_set_features(n, features); + case VHOST_RESET_OWNER: + return vhost_net_reset_owner(n); + default: + mutex_lock(&n->dev.mutex); + r = vhost_dev_ioctl(&n->dev, ioctl, arg); + vhost_net_flush(n); + mutex_unlock(&n->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +const static struct file_operations vhost_net_fops = { + .owner = THIS_MODULE, + .release = vhost_net_release, + .unlocked_ioctl = vhost_net_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_net_compat_ioctl, +#endif + .open = vhost_net_open, +}; + +static struct miscdevice vhost_net_misc = { + VHOST_NET_MINOR, + 
"vhost-net", + &vhost_net_fops, +}; + +int vhost_net_init(void) +{ + int r = vhost_init(); + if (r) + goto err_init; + r = misc_register(&vhost_net_misc); + if (r) + goto err_reg; + return 0; +err_reg: + vhost_cleanup(); +err_init: + return r; + +} +module_init(vhost_net_init); + +void vhost_net_exit(void) +{ + misc_deregister(&vhost_net_misc); + vhost_cleanup(); +} +module_exit(vhost_net_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c new file mode 100644 index 000000000000..c8c25dbc5857 --- /dev/null +++ b/drivers/vhost/vhost.c @@ -0,0 +1,1098 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin + * + * Inspiration, some code, and most witty comments come from + * Documentation/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Generic code for virtio server in host kernel. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "vhost.h" + +enum { + VHOST_MEMORY_MAX_NREGIONS = 64, + VHOST_MEMORY_F_LOG = 0x1, +}; + +static struct workqueue_struct *vhost_workqueue; + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + poll = container_of(pt, struct vhost_poll, table); + + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll; + poll = container_of(wait, struct vhost_poll, wait); + if (!((unsigned long)key & poll->mask)) + return 0; + + queue_work(vhost_workqueue, &poll->work); + return 0; +} + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask) +{ + INIT_WORK(&poll->work, func); + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; +} + +/* Start polling a file. We add ourselves to file's wait queue. The caller must + * keep a reference to a file until after vhost_poll_stop is called. */ +void vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + unsigned long mask; + mask = file->f_op->poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); +} + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + remove_wait_queue(poll->wqh, &poll->wait); +} + +/* Flush any work that has been scheduled. When calling this, don't hold any + * locks that are also used by the callback. 
*/ +void vhost_poll_flush(struct vhost_poll *poll) +{ + flush_work(&poll->work); +} + +void vhost_poll_queue(struct vhost_poll *poll) +{ + queue_work(vhost_workqueue, &poll->work); +} + +static void vhost_vq_reset(struct vhost_dev *dev, + struct vhost_virtqueue *vq) +{ + vq->num = 1; + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + vq->last_avail_idx = 0; + vq->avail_idx = 0; + vq->last_used_idx = 0; + vq->used_flags = 0; + vq->used_flags = 0; + vq->log_used = false; + vq->log_addr = -1ull; + vq->hdr_size = 0; + vq->private_data = NULL; + vq->log_base = NULL; + vq->error_ctx = NULL; + vq->error = NULL; + vq->kick = NULL; + vq->call_ctx = NULL; + vq->call = NULL; +} + +long vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue *vqs, int nvqs) +{ + int i; + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + dev->log_ctx = NULL; + dev->log_file = NULL; + dev->memory = NULL; + dev->mm = NULL; + + for (i = 0; i < dev->nvqs; ++i) { + dev->vqs[i].dev = dev; + mutex_init(&dev->vqs[i].mutex); + vhost_vq_reset(dev, dev->vqs + i); + if (dev->vqs[i].handle_kick) + vhost_poll_init(&dev->vqs[i].poll, + dev->vqs[i].handle_kick, + POLLIN); + } + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_check_owner(struct vhost_dev *dev) +{ + /* Are you the owner? If not, I don't think you mean to do that */ + return dev->mm == current->mm ? 0 : -EPERM; +} + +/* Caller should have device mutex */ +static long vhost_dev_set_owner(struct vhost_dev *dev) +{ + /* Is there an owner already? */ + if (dev->mm) + return -EBUSY; + /* No owner, become one */ + dev->mm = get_task_mm(current); + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_reset_owner(struct vhost_dev *dev) +{ + struct vhost_memory *memory; + + /* Restore memory to default empty mapping. 
*/ + memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL); + if (!memory) + return -ENOMEM; + + vhost_dev_cleanup(dev); + + memory->nregions = 0; + dev->memory = memory; + return 0; +} + +/* Caller should have device mutex */ +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { + vhost_poll_stop(&dev->vqs[i].poll); + vhost_poll_flush(&dev->vqs[i].poll); + } + if (dev->vqs[i].error_ctx) + eventfd_ctx_put(dev->vqs[i].error_ctx); + if (dev->vqs[i].error) + fput(dev->vqs[i].error); + if (dev->vqs[i].kick) + fput(dev->vqs[i].kick); + if (dev->vqs[i].call_ctx) + eventfd_ctx_put(dev->vqs[i].call_ctx); + if (dev->vqs[i].call) + fput(dev->vqs[i].call); + vhost_vq_reset(dev, dev->vqs + i); + } + if (dev->log_ctx) + eventfd_ctx_put(dev->log_ctx); + dev->log_ctx = NULL; + if (dev->log_file) + fput(dev->log_file); + dev->log_file = NULL; + /* No one will access memory at this point */ + kfree(dev->memory); + dev->memory = NULL; + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} + +static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +{ + u64 a = addr / VHOST_PAGE_SIZE / 8; + /* Make sure 64 bit math will not overflow. */ + if (a > ULONG_MAX - (unsigned long)log_base || + a + (unsigned long)log_base > ULONG_MAX) + return -EFAULT; + + return access_ok(VERIFY_WRITE, log_base + a, + (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); +} + +/* Caller should have vq mutex and device mutex. 
*/ +static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem, + int log_all) +{ + int i; + for (i = 0; i < mem->nregions; ++i) { + struct vhost_memory_region *m = mem->regions + i; + unsigned long a = m->userspace_addr; + if (m->memory_size > ULONG_MAX) + return 0; + else if (!access_ok(VERIFY_WRITE, (void __user *)a, + m->memory_size)) + return 0; + else if (log_all && !log_access_ok(log_base, + m->guest_phys_addr, + m->memory_size)) + return 0; + } + return 1; +} + +/* Can we switch to this memory table? */ +/* Caller should have device mutex but not vq mutex */ +static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, + int log_all) +{ + int i; + for (i = 0; i < d->nvqs; ++i) { + int ok; + mutex_lock(&d->vqs[i].mutex); + /* If ring is inactive, will check when it's enabled. */ + if (d->vqs[i].private_data) + ok = vq_memory_access_ok(d->vqs[i].log_base, mem, + log_all); + else + ok = 1; + mutex_unlock(&d->vqs[i].mutex); + if (!ok) + return 0; + } + return 1; +} + +static int vq_access_ok(unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + return access_ok(VERIFY_READ, desc, num * sizeof *desc) && + access_ok(VERIFY_READ, avail, + sizeof *avail + num * sizeof *avail->ring) && + access_ok(VERIFY_WRITE, used, + sizeof *used + num * sizeof *used->ring); +} + +/* Can we log writes? */ +/* Caller should have device mutex but not vq mutex */ +int vhost_log_access_ok(struct vhost_dev *dev) +{ + return memory_access_ok(dev, dev->memory, 1); +} + +/* Verify access for write logging. 
*/ +/* Caller should have vq mutex and device mutex */ +static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) +{ + return vq_memory_access_ok(log_base, vq->dev->memory, + vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && + (!vq->log_used || log_access_ok(log_base, vq->log_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring)); +} + +/* Can we start vq? */ +/* Caller should have vq mutex and device mutex */ +int vhost_vq_access_ok(struct vhost_virtqueue *vq) +{ + return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) && + vq_log_access_ok(vq, vq->log_base); +} + +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) +{ + struct vhost_memory mem, *newmem, *oldmem; + unsigned long size = offsetof(struct vhost_memory, regions); + long r; + r = copy_from_user(&mem, m, size); + if (r) + return r; + if (mem.padding) + return -EOPNOTSUPP; + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) + return -E2BIG; + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); + if (!newmem) + return -ENOMEM; + + memcpy(newmem, &mem, size); + r = copy_from_user(newmem->regions, m->regions, + mem.nregions * sizeof *m->regions); + if (r) { + kfree(newmem); + return r; + } + + if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) + return -EFAULT; + oldmem = d->memory; + rcu_assign_pointer(d->memory, newmem); + synchronize_rcu(); + kfree(oldmem); + return 0; +} + +static int init_used(struct vhost_virtqueue *vq, + struct vring_used __user *used) +{ + int r = put_user(vq->used_flags, &used->flags); + if (r) + return r; + return get_user(vq->last_used_idx, &used->idx); +} + +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) +{ + struct file *eventfp, *filep = NULL, + *pollstart = NULL, *pollstop = NULL; + struct eventfd_ctx *ctx = NULL; + u32 __user *idxp = argp; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + struct vhost_vring_file f; + struct 
vhost_vring_addr a; + u32 idx; + long r; + + r = get_user(idx, idxp); + if (r < 0) + return r; + if (idx > d->nvqs) + return -ENOBUFS; + + vq = d->vqs + idx; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + /* Resizing ring with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { + r = -EINVAL; + break; + } + vq->num = s.num; + break; + case VHOST_SET_VRING_BASE: + /* Moving base with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; + /* Forget the cached index value. */ + vq->avail_idx = vq->last_avail_idx; + break; + case VHOST_GET_VRING_BASE: + s.index = idx; + s.num = vq->last_avail_idx; + r = copy_to_user(argp, &s, sizeof s); + break; + case VHOST_SET_VRING_ADDR: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { + r = -EOPNOTSUPP; + break; + } + /* For 32bit, verify that the top 32bits of the user + data are set to zero. */ + if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || + (u64)(unsigned long)a.used_user_addr != a.used_user_addr || + (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { + r = -EFAULT; + break; + } + if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) || + (a.used_user_addr & (sizeof *vq->used->ring - 1)) || + (a.log_guest_addr & (sizeof *vq->used->ring - 1))) { + r = -EINVAL; + break; + } + + /* We only verify access here if backend is configured. + * If it is not, we don't as size might not have been setup. + * We will verify when backend is configured. 
*/ + if (vq->private_data) { + if (!vq_access_ok(vq->num, + (void __user *)(unsigned long)a.desc_user_addr, + (void __user *)(unsigned long)a.avail_user_addr, + (void __user *)(unsigned long)a.used_user_addr)) { + r = -EINVAL; + break; + } + + /* Also validate log access for used ring if enabled. */ + if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && + !log_access_ok(vq->log_base, a.log_guest_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring)) { + r = -EINVAL; + break; + } + } + + r = init_used(vq, (struct vring_used __user *)(unsigned long) + a.used_user_addr); + if (r) + break; + vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); + vq->desc = (void __user *)(unsigned long)a.desc_user_addr; + vq->avail = (void __user *)(unsigned long)a.avail_user_addr; + vq->log_addr = a.log_guest_addr; + vq->used = (void __user *)(unsigned long)a.used_user_addr; + break; + case VHOST_SET_VRING_KICK: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->kick) { + pollstop = filep = vq->kick; + pollstart = vq->kick = eventfp; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_CALL: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->call) { + filep = vq->call; + ctx = vq->call_ctx; + vq->call = eventfp; + vq->call_ctx = eventfp ? + eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_ERR: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->error) { + filep = vq->error; + vq->error = eventfp; + ctx = vq->error_ctx; + vq->error_ctx = eventfp ? 
+ eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + default: + r = -ENOIOCTLCMD; + } + + if (pollstop && vq->handle_kick) + vhost_poll_stop(&vq->poll); + + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + + if (pollstart && vq->handle_kick) + vhost_poll_start(&vq->poll, vq->kick); + + mutex_unlock(&vq->mutex); + + if (pollstop && vq->handle_kick) + vhost_poll_flush(&vq->poll); + return r; +} + +/* Caller must have device mutex */ +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct file *eventfp, *filep = NULL; + struct eventfd_ctx *ctx = NULL; + u64 p; + long r; + int i, fd; + + /* If you are not the owner, you can become one */ + if (ioctl == VHOST_SET_OWNER) { + r = vhost_dev_set_owner(d); + goto done; + } + + /* You must be the owner to do anything else */ + r = vhost_dev_check_owner(d); + if (r) + goto done; + + switch (ioctl) { + case VHOST_SET_MEM_TABLE: + r = vhost_set_memory(d, argp); + break; + case VHOST_SET_LOG_BASE: + r = copy_from_user(&p, argp, sizeof p); + if (r < 0) + break; + if ((u64)(unsigned long)p != p) { + r = -EFAULT; + break; + } + for (i = 0; i < d->nvqs; ++i) { + struct vhost_virtqueue *vq; + void __user *base = (void __user *)(unsigned long)p; + vq = d->vqs + i; + mutex_lock(&vq->mutex); + /* If ring is inactive, will check when it's enabled. */ + if (vq->private_data && !vq_log_access_ok(vq, base)) + r = -EFAULT; + else + vq->log_base = base; + mutex_unlock(&vq->mutex); + } + break; + case VHOST_SET_LOG_FD: + r = get_user(fd, (int __user *)argp); + if (r < 0) + break; + eventfp = fd == -1 ? NULL : eventfd_fget(fd); + if (IS_ERR(eventfp)) { + r = PTR_ERR(eventfp); + break; + } + if (eventfp != d->log_file) { + filep = d->log_file; + ctx = d->log_ctx; + d->log_ctx = eventfp ? 
+ eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i].mutex); + d->vqs[i].log_ctx = d->log_ctx; + mutex_unlock(&d->vqs[i].mutex); + } + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + break; + default: + r = vhost_set_vring(d, ioctl, argp); + break; + } +done: + return r; +} + +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, + __u64 addr, __u32 len) +{ + struct vhost_memory_region *reg; + int i; + /* linear search is not brilliant, but we really have on the order of 6 + * regions in practice */ + for (i = 0; i < mem->nregions; ++i) { + reg = mem->regions + i; + if (reg->guest_phys_addr <= addr && + reg->guest_phys_addr + reg->memory_size - 1 >= addr) + return reg; + } + return NULL; +} + +/* TODO: This is really inefficient. We need something like get_user() + * (instruction directly accesses the data, with an exception table entry + * returning -EFAULT). See Documentation/x86/exception-tables.txt. 
+ */ +static int set_bit_to_user(int nr, void __user *addr) +{ + unsigned long log = (unsigned long)addr; + struct page *page; + void *base; + int bit = nr + (log % PAGE_SIZE) * 8; + int r; + r = get_user_pages_fast(log, 1, 1, &page); + if (r) + return r; + base = kmap_atomic(page, KM_USER0); + set_bit(bit, base); + kunmap_atomic(base, KM_USER0); + set_page_dirty_lock(page); + put_page(page); + return 0; +} + +static int log_write(void __user *log_base, + u64 write_address, u64 write_length) +{ + int r; + if (!write_length) + return 0; + write_address /= VHOST_PAGE_SIZE; + for (;;) { + u64 base = (u64)(unsigned long)log_base; + u64 log = base + write_address / 8; + int bit = write_address % 8; + if ((u64)(unsigned long)log != log) + return -EFAULT; + r = set_bit_to_user(bit, (void __user *)(unsigned long)log); + if (r < 0) + return r; + if (write_length <= VHOST_PAGE_SIZE) + break; + write_length -= VHOST_PAGE_SIZE; + write_address += VHOST_PAGE_SIZE; + } + return r; +} + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len) +{ + int i, r; + + /* Make sure data written is seen before log. */ + wmb(); + for (i = 0; i < log_num; ++i) { + u64 l = min(log[i].len, len); + r = log_write(vq->log_base, log[i].addr, l); + if (r < 0) + return r; + len -= l; + if (!len) + return 0; + } + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + /* Length written exceeds what we have stored. This is a bug. 
*/ + BUG(); + return 0; +} + +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, + struct iovec iov[], int iov_size) +{ + const struct vhost_memory_region *reg; + struct vhost_memory *mem; + struct iovec *_iov; + u64 s = 0; + int ret = 0; + + rcu_read_lock(); + + mem = rcu_dereference(dev->memory); + while ((u64)len > s) { + u64 size; + if (ret >= iov_size) { + ret = -ENOBUFS; + break; + } + reg = find_region(mem, addr, len); + if (!reg) { + ret = -EFAULT; + break; + } + _iov = iov + ret; + size = reg->memory_size - addr + reg->guest_phys_addr; + _iov->iov_len = min((u64)len, size); + _iov->iov_base = (void *)(unsigned long) + (reg->userspace_addr + addr - reg->guest_phys_addr); + s += size; + addr += size; + ++ret; + } + + rcu_read_unlock(); + return ret; +} + +/* Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, + * or -1U if we're at the end. */ +static unsigned next_desc(struct vring_desc *desc) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & VRING_DESC_F_NEXT)) + return -1U; + + /* Check they're not leading us off end of descriptors. */ + next = desc->next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + /* We will use the result as an index in an array, so most + * architectures only need a compiler barrier here. 
*/ + read_barrier_depends(); + + return next; +} + +static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num, + struct vring_desc *indirect) +{ + struct vring_desc desc; + unsigned int i = 0, count, found = 0; + int ret; + + /* Sanity check */ + if (indirect->len % sizeof desc) { + vq_err(vq, "Invalid length in indirect descriptor: " + "len 0x%llx not multiple of 0x%zx\n", + (unsigned long long)indirect->len, + sizeof desc); + return -EINVAL; + } + + ret = translate_desc(dev, indirect->addr, indirect->len, vq->indirect, + ARRAY_SIZE(vq->indirect)); + if (ret < 0) { + vq_err(vq, "Translation failure %d in indirect.\n", ret); + return ret; + } + + /* We will use the result as an address to read from, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + count = indirect->len / sizeof desc; + /* Buffers are chained via a 16 bit next field, so + * we can have at most 2^16 of these. 
*/ + if (count > USHORT_MAX + 1) { + vq_err(vq, "Indirect buffer length too big: %d\n", + indirect->len); + return -E2BIG; + } + + do { + unsigned iov_count = *in_num + *out_num; + if (++found > count) { + vq_err(vq, "Loop detected: last one at %u " + "indirect size %u\n", + i, count); + return -EINVAL; + } + if (memcpy_fromiovec((unsigned char *)&desc, vq->indirect, + sizeof desc)) { + vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", + i, (size_t)indirect->addr + i * sizeof desc); + return -EINVAL; + } + if (desc.flags & VRING_DESC_F_INDIRECT) { + vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", + i, (size_t)indirect->addr + i * sizeof desc); + return -EINVAL; + } + + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, + iov_size - iov_count); + if (ret < 0) { + vq_err(vq, "Translation failure %d indirect idx %d\n", + ret, i); + return ret; + } + /* If this is an input descriptor, increment that count. */ + if (desc.flags & VRING_DESC_F_WRITE) { + *in_num += ret; + if (unlikely(log)) { + log[*log_num].addr = desc.addr; + log[*log_num].len = desc.len; + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Indirect descriptor " + "has out after in: idx %d\n", i); + return -EINVAL; + } + *out_num += ret; + } + } while ((i = next_desc(&desc)) != -1); + return 0; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which + * is never a valid descriptor number) if none was found. 
*/ +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num) +{ + struct vring_desc desc; + unsigned int i, head, found = 0; + u16 last_avail_idx; + int ret; + + /* Check it isn't doing very strange things with descriptor numbers. */ + last_avail_idx = vq->last_avail_idx; + if (get_user(vq->avail_idx, &vq->avail->idx)) { + vq_err(vq, "Failed to access avail idx at %p\n", + &vq->avail->idx); + return vq->num; + } + + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, vq->avail_idx); + return vq->num; + } + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->avail_idx == last_avail_idx) + return vq->num; + + /* Only get avail ring entries after they have been exposed by guest. */ + rmb(); + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + last_avail_idx, + &vq->avail->ring[last_avail_idx % vq->num]); + return vq->num; + } + + /* If their number is silly, that's an error. */ + if (head >= vq->num) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return vq->num; + } + + /* When we start there are none of either input nor output. 
*/ + *out_num = *in_num = 0; + if (unlikely(log)) + *log_num = 0; + + i = head; + do { + unsigned iov_count = *in_num + *out_num; + if (i >= vq->num) { + vq_err(vq, "Desc index is %u > %u, head = %u", + i, vq->num, head); + return vq->num; + } + if (++found > vq->num) { + vq_err(vq, "Loop detected: last one at %u " + "vq size %u head %u\n", + i, vq->num, head); + return vq->num; + } + ret = copy_from_user(&desc, vq->desc + i, sizeof desc); + if (ret) { + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", + i, vq->desc + i); + return vq->num; + } + if (desc.flags & VRING_DESC_F_INDIRECT) { + ret = get_indirect(dev, vq, iov, iov_size, + out_num, in_num, + log, log_num, &desc); + if (ret < 0) { + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); + return vq->num; + } + continue; + } + + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, + iov_size - iov_count); + if (ret < 0) { + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); + return vq->num; + } + if (desc.flags & VRING_DESC_F_WRITE) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += ret; + if (unlikely(log)) { + log[*log_num].addr = desc.addr; + log[*log_num].len = desc.len; + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return vq->num; + } + *out_num += ret; + } + } while ((i = next_desc(&desc)) != -1); + + /* On success, increment avail index. */ + vq->last_avail_idx++; + return head; +} + +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) +{ + vq->last_avail_idx--; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. 
*/ +int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vq->used->ring[vq->last_used_idx % vq->num]; + if (put_user(head, &used->id)) { + vq_err(vq, "Failed to write used id"); + return -EFAULT; + } + if (put_user(len, &used->len)) { + vq_err(vq, "Failed to write used len"); + return -EFAULT; + } + /* Make sure buffer is written before we update index. */ + wmb(); + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { + vq_err(vq, "Failed to increment used idx"); + return -EFAULT; + } + if (unlikely(vq->log_used)) { + /* Make sure data is seen before log. */ + wmb(); + log_write(vq->log_base, vq->log_addr + sizeof *vq->used->ring * + (vq->last_used_idx % vq->num), + sizeof *vq->used->ring); + log_write(vq->log_base, vq->log_addr, sizeof *vq->used->ring); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + vq->last_used_idx++; + return 0; +} + +/* This actually signals the guest, using eventfd. */ +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __u16 flags = 0; + if (get_user(flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return; + } + + /* If they don't want an interrupt, don't signal, unless empty. */ + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && + (vq->avail_idx != vq->last_avail_idx || + !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY))) + return; + + /* Signal the Guest tell them we used something up. */ + if (vq->call_ctx) + eventfd_signal(vq->call_ctx, 1); +} + +/* And here's the combo meal deal. Supersize me! */ +void vhost_add_used_and_signal(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_signal(dev, vq); +} + +/* OK, now we need to know about added descriptors. 
*/ +bool vhost_enable_notify(struct vhost_virtqueue *vq) +{ + u16 avail_idx; + int r; + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) + return false; + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) { + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + return false; + } + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + mb(); + r = get_user(avail_idx, &vq->avail->idx); + if (r) { + vq_err(vq, "Failed to check avail idx at %p: %d\n", + &vq->avail->idx, r); + return false; + } + + return avail_idx != vq->last_avail_idx; +} + +/* We don't need to be notified again. */ +void vhost_disable_notify(struct vhost_virtqueue *vq) +{ + int r; + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) + return; + vq->used_flags |= VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); +} + +int vhost_init(void) +{ + vhost_workqueue = create_singlethread_workqueue("vhost"); + if (!vhost_workqueue) + return -ENOMEM; + return 0; +} + +void vhost_cleanup(void) +{ + destroy_workqueue(vhost_workqueue); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h new file mode 100644 index 000000000000..44591ba9b07a --- /dev/null +++ b/drivers/vhost/vhost.h @@ -0,0 +1,161 @@ +#ifndef _VHOST_H +#define _VHOST_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vhost_device; + +enum { + /* Enough place for all fragments, head, and virtio net header. */ + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. */ +struct vhost_poll { + poll_table table; + wait_queue_head_t *wqh; + wait_queue_t wait; + /* struct which will handle all actual work. 
*/ + struct work_struct work; + unsigned long mask; +}; + +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask); +void vhost_poll_start(struct vhost_poll *poll, struct file *file); +void vhost_poll_stop(struct vhost_poll *poll); +void vhost_poll_flush(struct vhost_poll *poll); +void vhost_poll_queue(struct vhost_poll *poll); + +struct vhost_log { + u64 addr; + u64 len; +}; + +/* The virtqueue structure describes a queue attached to a device. */ +struct vhost_virtqueue { + struct vhost_dev *dev; + + /* The actual ring of buffers. */ + struct mutex mutex; + unsigned int num; + struct vring_desc __user *desc; + struct vring_avail __user *avail; + struct vring_used __user *used; + struct file *kick; + struct file *call; + struct file *error; + struct eventfd_ctx *call_ctx; + struct eventfd_ctx *error_ctx; + struct eventfd_ctx *log_ctx; + + struct vhost_poll poll; + + /* The routine to call when the Guest pings us, or timeout. */ + work_func_t handle_kick; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* Caches available index value from user. */ + u16 avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* Used flags */ + u16 used_flags; + + /* Log writes to used structure. */ + bool log_used; + u64 log_addr; + + struct iovec indirect[VHOST_NET_MAX_SG]; + struct iovec iov[VHOST_NET_MAX_SG]; + struct iovec hdr[VHOST_NET_MAX_SG]; + size_t hdr_size; + /* We use a kind of RCU to access private pointer. + * All readers access it from workqueue, which makes it possible to + * flush the workqueue instead of synchronize_rcu. Therefore readers do + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of + * work item execution acts instead of rcu_read_lock() and the end of + * work item execution acts instead of rcu_read_lock(). + * Writers use virtqueue mutex. 
*/ + void *private_data; + /* Log write descriptors */ + void __user *log_base; + struct vhost_log log[VHOST_NET_MAX_SG]; +}; + +struct vhost_dev { + /* Readers use RCU to access memory table pointer + * log base pointer and features. + * Writers use mutex below.*/ + struct vhost_memory *memory; + struct mm_struct *mm; + struct mutex mutex; + unsigned acked_features; + struct vhost_virtqueue *vqs; + int nvqs; + struct file *log_file; + struct eventfd_ctx *log_ctx; +}; + +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_check_owner(struct vhost_dev *); +long vhost_dev_reset_owner(struct vhost_dev *); +void vhost_dev_cleanup(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); +int vhost_vq_access_ok(struct vhost_virtqueue *vq); +int vhost_log_access_ok(struct vhost_dev *); + +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, + struct iovec iov[], unsigned int iov_count, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *); + +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); +void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, + unsigned int head, int len); +void vhost_disable_notify(struct vhost_virtqueue *); +bool vhost_enable_notify(struct vhost_virtqueue *); + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len); + +int vhost_init(void); +void vhost_cleanup(void); + +#define vq_err(vq, fmt, ...) 
do { \ + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +enum { + VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1 << VIRTIO_RING_F_INDIRECT_DESC) | + (1 << VHOST_F_LOG_ALL) | + (1 << VHOST_NET_F_VIRTIO_NET_HDR), +}; + +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) +{ + unsigned acked_features = rcu_dereference(dev->acked_features); + return acked_features & (1 << bit); +} + +#endif diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 756f831cbdd5..d93080748a91 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -362,6 +362,7 @@ unifdef-y += uio.h unifdef-y += unistd.h unifdef-y += usbdevice_fs.h unifdef-y += utsname.h +unifdef-y += vhost.h unifdef-y += videodev2.h unifdef-y += videodev.h unifdef-y += virtio_config.h diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index adaf3c15e449..8b5f7cc0fba6 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -30,6 +30,7 @@ #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 +#define VHOST_NET_MINOR 233 #define MISC_DYNAMIC_MINOR 255 struct device; diff --git a/include/linux/vhost.h b/include/linux/vhost.h new file mode 100644 index 000000000000..e847f1e30756 --- /dev/null +++ b/include/linux/vhost.h @@ -0,0 +1,130 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include +#include +#include +#include +#include + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; /* Pass -1 to unbind from file. 
*/ + +}; + +struct vhost_vring_addr { + unsigned int index; + /* Option flags. */ + unsigned int flags; + /* Flag values: */ + /* Whether log address is valid. If set enables logging. */ +#define VHOST_VRING_F_LOG 0 + + /* Start of array of descriptors (virtually contiguous) */ + __u64 desc_user_addr; + /* Used structure address. Must be 32 bit aligned */ + __u64 used_user_addr; + /* Available structure address. Must be 16 bit aligned */ + __u64 avail_user_addr; + /* Logging support. */ + /* Log writes to used structure, at offset calculated from specified + * address. Address must be 32 bit aligned. */ + __u64 log_guest_addr; +}; + +struct vhost_memory_region { + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; + __u64 flags_padding; /* No flags are currently specified. */ +}; + +/* All region addresses and sizes must be 4K aligned. */ +#define VHOST_PAGE_SIZE 0x1000 + +struct vhost_memory { + __u32 nregions; + __u32 padding; + struct vhost_memory_region regions[0]; +}; + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_OWNER_SET to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Write logging setup. 
*/ +/* Memory writes can optionally be logged by setting bit at an offset + * (calculated from the physical address) from specified log base. + * The bit is set using an atomic 32 bit operation. */ +/* Set base address for logging. */ +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Specify an eventfd file descriptor to signal on log write. */ +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) + +/* Ring setup. */ +/* Set number of descriptors in ring. This parameter can not + * be modified while ring is running (bound to a device). */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Set addresses for the ring. */ +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. */ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have beed used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) + +/* VHOST_NET specific defines */ + +/* Attach virtio net ring to a raw socket, or tap device. + * The socket must be already bound to an ethernet device, this device will be + * used for transmit. Pass fd -1 to unbind from the socket and the transmit + * device. This can be used to stop the ring (e.g. for migration). */ +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* Feature bits */ +/* Log all write descriptors. 
Can be changed while device is active. */ +#define VHOST_F_LOG_ALL 26 +/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ +#define VHOST_NET_F_VIRTIO_NET_HDR 27 + +#endif -- cgit v1.2.3 From c955fe8e0bdd7be7a6bc2d49245d570a816f7cc5 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Thu, 15 Oct 2009 14:31:30 +0400 Subject: POWER: Add support for cycle_count Signed-off-by: Alexey Starikovskiy Signed-off-by: Len Brown --- drivers/power/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index c790e0c77d4b..ff05e6189768 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -99,6 +99,7 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(present), POWER_SUPPLY_ATTR(online), POWER_SUPPLY_ATTR(technology), + POWER_SUPPLY_ATTR(cycle_count), POWER_SUPPLY_ATTR(voltage_max), POWER_SUPPLY_ATTR(voltage_min), POWER_SUPPLY_ATTR(voltage_max_design), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index b5d096d3a9be..ebd2b8fb00d0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -82,6 +82,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_ONLINE, POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_CYCLE_COUNT, POWER_SUPPLY_PROP_VOLTAGE_MAX, POWER_SUPPLY_PROP_VOLTAGE_MIN, POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, -- cgit v1.2.3 From 889ff0150661512d79484219612b7e2e024b6c07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 20:04:47 +0100 Subject: perf/core: Split context's event group list into pinned and non-pinned lists Split-up struct perf_event_context::group_list into pinned_groups and flexible_groups (non-pinned). This first appears to be useless as it duplicates various loops around the group list handlings. 
But it scales better in the fast-path in perf_sched_in(). We no longer iterate twice through the entire list to separate pinned and non-pinned scheduling. Instead we iterate through two distinct lists. Another desired effect is that it makes it easier to define distinct scheduling rules on both. Changes in v2: - Respectively rename pinned_grp_list and volatile_grp_list into pinned_groups and flexible_groups as per Ingo's suggestion. - Various cleanups Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 3 +- kernel/perf_event.c | 227 ++++++++++++++++++++++++++++++--------------- 2 files changed, 153 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9a1d276db754..cdbc2aa64a0b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -683,7 +683,8 @@ struct perf_event_context { */ struct mutex mutex; - struct list_head group_list; + struct list_head pinned_groups; + struct list_head flexible_groups; struct list_head event_list; int nr_events; int nr_active; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27f69a04541d..c9f8a757649d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -289,6 +289,15 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +312,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { + if (group_leader == event) { + struct list_head *list; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } else { list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -355,8 +367,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * to the context list directly: */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + struct list_head *list; - list_move_tail(&sibling->group_entry, &ctx->group_list); + list = ctx_group_list(event, ctx); + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; } } @@ -1056,7 +1070,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } perf_enable(); @@ -1271,9 +1288,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; if (event->cpu != -1 && event->cpu != cpu) continue; @@ -1291,15 +1307,10 @@ __perf_event_sched_in(struct perf_event_context *ctx, } } - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. - */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) continue; - /* * Listen to the 'cpu' scheduling filter constraint * of events: @@ -1453,8 +1464,13 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->pinned_groups); + break; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->flexible_groups); break; } perf_enable(); @@ -1490,6 +1506,21 @@ void perf_event_task_tick(struct task_struct *curr) perf_event_task_sched_in(curr); } +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. 
@@ -1500,6 +1531,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1510,14 +1542,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; } /* @@ -1591,7 +1625,8 @@ __perf_event_init_context(struct perf_event_context *ctx, { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; @@ -5032,7 +5067,11 @@ void perf_event_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, group_entry) __perf_event_exit_task(child_event, child_ctx, child); @@ -5041,7 +5080,8 @@ again: * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. 
*/ - if (!list_empty(&child_ctx->group_list)) + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) goto again; mutex_unlock(&child_ctx->mutex); @@ -5049,6 +5089,24 @@ again: put_ctx(child_ctx); } +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); +} + /* * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. @@ -5063,36 +5121,70 @@ void perf_event_free_task(struct task_struct *task) mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { - struct perf_event *parent = event->parent; + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + perf_free_event(event, ctx); - if (WARN_ON_ONCE(!parent)) - continue; + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - fput(parent->filp); + mutex_unlock(&ctx->mutex); - list_del_event(event, ctx); - free_event(event); + put_ctx(ctx); +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx = child->perf_event_ctxp; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; } - if (!list_empty(&ctx->group_list)) - goto again; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit 
events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ - mutex_unlock(&ctx->mutex); + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; - put_ctx(ctx); + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; } + /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx = NULL, *parent_ctx; + struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5130,41 +5222,22 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - - if (!event->attr.inherit) { - inherited_all = 0; - continue; - } - - if (!child->perf_event_ctxp) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. 
- */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); - if (!child_ctx) { - ret = -ENOMEM; - break; - } - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - } + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) + break; + } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) break; - } } + child_ctx = child->perf_event_ctxp; + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent @@ -5213,7 +5286,9 @@ static void __perf_event_exit_cpu(void *info) struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event *event, *tmp; - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_event_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } static void perf_event_exit_cpu(int cpu) -- cgit v1.2.3 From 5908cdc85eb30f8d07f2cb11d4a62334d7229048 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 20:53:14 +0100 Subject: list: Introduce list_rotate_left() Bring a new list_rotate_left() helper that rotates a list to the left. This is useful for code that needs to round-robin elements whose queue priority increases from tail to head.
Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/list.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 969f6e92d089..5d9c6558e8ab 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -205,6 +205,20 @@ static inline int list_empty_careful(const struct list_head *head) return (next == head) && (next == head->prev); } +/** + * list_rotate_left - rotate the list to the left + * @head: the head of the list + */ +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. -- cgit v1.2.3 From d6f962b57bfaab62891c7abbf1469212a56d6103 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 10 Jan 2010 01:25:51 +0100 Subject: perf: Export software-only event group characteristic as a flag Before scheduling an event group, we first check if a group can go on. We begin by checking whether the group is made of software-only events, in which case that alone is enough to know that the group can be scheduled in. For that purpose, we iterate through the whole group, which is wasteful as we could do this check when we add/delete an event to a group. So we create a group_flags field in perf event that can host characteristics from a group of events, starting with a first PERF_GROUP_SOFTWARE flag that reduces the check on the fast path.
Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 5 +++++ kernel/perf_event.c | 30 +++++++++++------------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cdbc2aa64a0b..c6f812e4d058 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -565,6 +565,10 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, int, struct perf_sample_data *, struct pt_regs *regs); +enum perf_group_flag { + PERF_GROUP_SOFTWARE = 0x1, +}; + /** * struct perf_event - performance event kernel representation: */ @@ -574,6 +578,7 @@ struct perf_event { struct list_head event_entry; struct list_head sibling_list; int nr_siblings; + int group_flags; struct perf_event *group_leader; struct perf_event *output; const struct pmu *pmu; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bbebe2832639..eae6ff693604 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -315,9 +315,16 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (group_leader == event) { struct list_head *list; + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); } else { + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -372,6 +379,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } @@ -699,24 +709,6 @@ 
group_error: return -EAGAIN; } -/* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; - - return 1; -} - /* * Work out whether we can put this event group on the CPU now. */ @@ -727,7 +719,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware -- cgit v1.2.3 From 7e105057a34c83cea542dacc55ff0528bce67afa Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Fri, 15 Jan 2010 17:01:02 -0800 Subject: kfifo: fix kfifo_out_locked race bug Fix a wrong optimization in include/linux/kfifo.h which could cause a race in kfifo_out_locked. 
Signed-off-by: Stefani Seibold Reported-by: Johan Hovold Cc: Pete Zaitcev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 7c6b32a1421c..c4ac88b3c302 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -228,13 +228,6 @@ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, ret = kfifo_out(fifo, to, n); - /* - * optimization: if the FIFO is empty, set the indices to 0 - * so we don't wrap the next time - */ - if (kfifo_is_empty(fifo)) - kfifo_reset(fifo); - spin_unlock_irqrestore(lock, flags); return ret; -- cgit v1.2.3 From 2427b8e3eaea3719e53bbed7b3375382c3aa6f13 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Jan 2010 17:01:11 -0800 Subject: tty.h: make tty_port_get() static inline I get a few dozen of these warnings when using gcc (GCC) 4.4.1 20090725 (Red Hat 4.4.1-2): In file included from mmotm-2010-0113-1217/init/do_mounts.c:5: mmotm-2010-0113-1217/include/linux/tty.h: In function 'tty_port_get': mmotm-2010-0113-1217/include/linux/tty.h:469: warning: '______f' is static but declared in inline function 'tty_port_get' which is not static so make the function static inline. 
[akpm@linux-foundation.org: may as well convert tty_port_users() also] Signed-off-by: Randy Dunlap Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tty.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index ef3a2947b102..6abfcf5b5887 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -464,7 +464,7 @@ extern int tty_port_alloc_xmit_buf(struct tty_port *port); extern void tty_port_free_xmit_buf(struct tty_port *port); extern void tty_port_put(struct tty_port *port); -extern inline struct tty_port *tty_port_get(struct tty_port *port) +static inline struct tty_port *tty_port_get(struct tty_port *port) { if (port) kref_get(&port->kref); @@ -486,7 +486,7 @@ extern void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp); -extern inline int tty_port_users(struct tty_port *port) +static inline int tty_port_users(struct tty_port *port) { return port->count + port->blocked_open; } -- cgit v1.2.3 From 8ecc2951534af10e04ddb5e5ff5c6d217b79f5c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:12 -0800 Subject: kfifo: use void * pointers for user buffers The pointers to user buffers are currently unsigned char *, which requires a lot of casting in the caller for any non-char typed buffers. Use void * instead. 
Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 10 +++++----- kernel/kfifo.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index c4ac88b3c302..6fb495ea956a 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -104,15 +104,15 @@ union { \ #undef __kfifo_initializer -extern void kfifo_init(struct kfifo *fifo, unsigned char *buffer, +extern void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size); extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask); extern void kfifo_free(struct kfifo *fifo); extern unsigned int kfifo_in(struct kfifo *fifo, - const unsigned char *from, unsigned int len); + const void *from, unsigned int len); extern __must_check unsigned int kfifo_out(struct kfifo *fifo, - unsigned char *to, unsigned int len); + void *to, unsigned int len); /** * kfifo_reset - removes the entire FIFO contents @@ -194,7 +194,7 @@ static inline __must_check unsigned int kfifo_avail(struct kfifo *fifo) * bytes copied. */ static inline unsigned int kfifo_in_locked(struct kfifo *fifo, - const unsigned char *from, unsigned int n, spinlock_t *lock) + const void *from, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; @@ -219,7 +219,7 @@ static inline unsigned int kfifo_in_locked(struct kfifo *fifo, * @to buffer and returns the number of copied bytes. 
*/ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, - unsigned char *to, unsigned int n, spinlock_t *lock) + void *to, unsigned int n, spinlock_t *lock) { unsigned long flags; unsigned int ret; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index e92d519f93b1..ab615e695052 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,7 +28,7 @@ #include #include -static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, +static void _kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { fifo->buffer = buffer; @@ -44,7 +44,7 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @size: the size of the internal buffer, this have to be a power of 2. * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) +void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); @@ -235,7 +235,7 @@ EXPORT_SYMBOL(__kfifo_in_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, +unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len) { len = min(kfifo_avail(fifo), len); @@ -277,7 +277,7 @@ EXPORT_SYMBOL(__kfifo_out_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) { len = min(kfifo_len(fifo), len); -- cgit v1.2.3 From 64ce1037c5434b1d036cd99ecaee6e00496bc2e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:15 -0800 Subject: kfifo: sanitize *_user error handling Right now for kfifo_*_user it's not easily possible to distinguish between a user copy failing and the FIFO not containing enough data.
The problem is that both conditions are multiplexed into the same return code. Avoid this by moving the "copy length" into a separate output parameter and only return 0/-EFAULT in the main return value. I didn't fully adapt the weird "record" variants, those seem to be unused anyways and were rather messy (should they be just removed?) I would appreciate some double checking if I did all the conversions correctly. Signed-off-by: Andi Kleen Cc: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 8 +++--- kernel/kfifo.c | 76 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 6fb495ea956a..86ad50a900c8 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -235,11 +235,11 @@ static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo, extern void kfifo_skip(struct kfifo *fifo, unsigned int len); -extern __must_check unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int n); +extern __must_check int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int n, unsigned *lenout); -extern __must_check unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int n); +extern __must_check int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int n, unsigned *lenout); /* * __kfifo_add_out internal helper function for updating the out offset diff --git a/kernel/kfifo.c b/kernel/kfifo.c index ab615e695052..b50bb622e8b0 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -159,8 +159,9 @@ static inline void __kfifo_out_data(struct kfifo *fifo, memcpy(to + l, fifo->buffer, len - l); } -static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned 
int off) +static inline int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned *lenout) { unsigned int l; int ret; @@ -177,16 +178,20 @@ static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); - - if (unlikely(ret)) - return ret + len - l; + if (unlikely(ret)) { + *lenout = ret; + return -EFAULT; + } + *lenout = l; /* then put the rest (if any) at the beginning of the buffer */ - return copy_from_user(fifo->buffer, from + l, len - l); + ret = copy_from_user(fifo->buffer, from + l, len - l); + *lenout += ret ? ret : len - l; + return ret ? -EFAULT : 0; } -static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off) +static inline int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off, unsigned *lenout) { unsigned int l; int ret; @@ -203,12 +208,21 @@ static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); ret = copy_to_user(to, fifo->buffer + off, l); - - if (unlikely(ret)) - return ret + len - l; + *lenout = l; + if (unlikely(ret)) { + *lenout -= ret; + return -EFAULT; + } /* then get the rest (if any) from the beginning of the buffer */ - return copy_to_user(to + l, fifo->buffer, len - l); + len -= l; + ret = copy_to_user(to + l, fifo->buffer, len); + if (unlikely(ret)) { + *lenout += len - ret; + return -EFAULT; + } + *lenout += len; + return 0; } unsigned int __kfifo_in_n(struct kfifo *fifo, @@ -299,10 +313,13 @@ EXPORT_SYMBOL(__kfifo_out_generic); unsigned int __kfifo_from_user_n(struct kfifo *fifo, const void __user *from, unsigned int len, unsigned int recsize) { + unsigned total; + if (kfifo_avail(fifo) < len + recsize) 
return len + 1; - return __kfifo_from_user_data(fifo, from, len, recsize); + __kfifo_from_user_data(fifo, from, len, recsize, &total); + return total; } EXPORT_SYMBOL(__kfifo_from_user_n); @@ -313,18 +330,21 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @len: the length of the data to be added. * * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. + * FIFO depending and returns -EFAULT/0. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned *total) { + int ret; len = min(kfifo_avail(fifo), len); - len -= __kfifo_from_user_data(fifo, from, len, 0); + ret = __kfifo_from_user_data(fifo, from, len, 0, total); + if (ret) + return ret; __kfifo_add_in(fifo, len); - return len; + return 0; } EXPORT_SYMBOL(kfifo_from_user); @@ -339,17 +359,17 @@ unsigned int __kfifo_to_user_n(struct kfifo *fifo, void __user *to, unsigned int len, unsigned int reclen, unsigned int recsize) { - unsigned int ret; + unsigned int ret, total; if (kfifo_len(fifo) < reclen + recsize) return len; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); if (likely(ret == 0)) __kfifo_add_out(fifo, reclen + recsize); - return ret; + return total; } EXPORT_SYMBOL(__kfifo_to_user_n); @@ -358,20 +378,22 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. + @ @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. + * @to buffer and 0 or -EFAULT. 
* * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned *lenout) { + int ret; len = min(kfifo_len(fifo), len); - len -= __kfifo_to_user_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); - return len; + ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); + __kfifo_add_out(fifo, *lenout); + return ret; } EXPORT_SYMBOL(kfifo_to_user); -- cgit v1.2.3 From a5b9e2c1063046421ce01dcf5ddd7ec12567f3e1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:16 -0800 Subject: kfifo: add kfifo_out_peek In some upcoming code it's useful to peek into a FIFO without permanently removing data. This patch implements a new kfifo_out_peek() to do this. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 3 +++ kernel/kfifo.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 86ad50a900c8..7ad6d32dd673 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -113,6 +113,9 @@ extern unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len); extern __must_check unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len); +extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo, + void *to, unsigned int len, unsigned offset); + /** * kfifo_reset - removes the entire FIFO contents diff --git a/kernel/kfifo.c b/kernel/kfifo.c index b50bb622e8b0..7384f120be87 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -302,6 +302,27 @@ unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) }
EXPORT_SYMBOL(kfifo_out); +/** + * kfifo_out_peek - copy some data from the FIFO, but do not remove it + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @offset: offset into the fifo + * + * This function copies at most @len bytes at @offset from the FIFO + * into the @to buffer and returns the number of copied bytes. + * The data is not removed from the FIFO. + */ +unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, + unsigned offset) +{ + len = min(kfifo_len(fifo), len + offset); + + __kfifo_out_data(fifo, to, len, offset); + return len; +} +EXPORT_SYMBOL(kfifo_out_peek); + unsigned int __kfifo_out_generic(struct kfifo *fifo, void *to, unsigned int len, unsigned int recsize, unsigned int *total) -- cgit v1.2.3 From d994ffc247f7c4a48b848f10c4c01c9b06411ada Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:17 -0800 Subject: kfifo: add kfifo_initialized Simple inline that checks if kfifo_init() has been executed on a fifo. This is useful for walking all per CPU fifos, when some of them might not have been brought up yet. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 7ad6d32dd673..c8618243ca5a 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -116,6 +116,16 @@ extern __must_check unsigned int kfifo_out(struct kfifo *fifo, extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, unsigned offset); +/** + * kfifo_initialized - Check if kfifo is initialized. + * @fifo: fifo to check + * Return %true if FIFO is initialized, otherwise %false. + * Assumes the fifo was 0 before. 
+ */ +static inline bool kfifo_initialized(struct kfifo *fifo) +{ + return fifo->buffer != 0; +} /** * kfifo_reset - removes the entire FIFO contents -- cgit v1.2.3 From 5dab600e6a153ceb64832f608069e6c08185411a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:17 -0800 Subject: kfifo: document everywhere that size has to be power of two On my first try using them I missed that the fifos need to be power of two, resulting in a runtime bug. Document that requirement everywhere (and fix one grammar bug) Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfifo.h | 4 ++-- kernel/kfifo.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index c8618243ca5a..6f6c5f300af6 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -67,7 +67,7 @@ struct kfifo { /** * DECLARE_KFIFO - macro to declare a kfifo and the associated buffer * @name: name of the declared kfifo datatype - * @size: size of the fifo buffer + * @size: size of the fifo buffer. Must be a power of two. * * Note1: the macro can be used inside struct or union declaration * Note2: the macro creates two objects: @@ -91,7 +91,7 @@ union { \ /** * DEFINE_KFIFO - macro to define and initialize a kfifo * @name: name of the declared kfifo datatype - * @size: size of the fifo buffer + * @size: size of the fifo buffer. Must be a power of two. 
* * Note1: the macro can be used for global and local kfifo data type variables * Note2: the macro creates two objects: diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 7384f120be87..32c5c15d750d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -41,7 +41,7 @@ static void _kfifo_init(struct kfifo *fifo, void *buffer, * kfifo_init - initialize a FIFO using a preallocated buffer * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. + * @size: the size of the internal buffer, this has to be a power of 2. * */ void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) -- cgit v1.2.3 From cc8ef6eb21e964b1c5eb97b2d0e8ac9893e1bf86 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 15 Jan 2010 17:01:22 -0800 Subject: kernel.h: add BUILD_BUG_ON_NOT_POWER_OF_2() Add BUILD_BUG_ON_NOT_POWER_OF_2() When code relies on a constant being a power of 2: #define FOO 512 /* must be a power of 2 */ it would be nice to be able to do: BUILD_BUG_ON(!is_power_of_2(FOO)); However applying an inline function does not result in a compile-time constant that can be used with BUILD_BUG_ON(), so trying that gives results in: error: bit-field '' width not an integer constant As suggested by akpm, rather than monkeying around with is_power_of_2() and risking gcc warts about constant expressions, just create a macro BUILD_BUG_ON_NOT_POWER_OF_2() to encapsulate this common requirement. Signed-off-by: Roland Dreier Cc: Bart Van Assche Cc: David Dillow Cc: "Robert P. J. 
Day" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3fc9f5aab5f8..328bca609b9b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -734,6 +734,10 @@ struct sysinfo { /* Force a compilation error if condition is constant and true */ #define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) +/* Force a compilation error if a constant expression is not a power of 2 */ +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) + /* Force a compilation error if condition is true, but also produce a result (of value 0 and type size_t), so the expression can be used e.g. in a structure initializer (or where-ever else comma expressions -- cgit v1.2.3 From 1e2ae599d37e60958c03ca5e46b1f657619a30cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:33 -0800 Subject: nommu: struct vm_region's vm_usage count need not be atomic The vm_usage count field in struct vm_region does not need to be atomic as it's only ever modified whilst nommu_region_sem is write locked.
Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- mm/nommu.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 84d020bed083..80cfa78a8cf6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -122,7 +122,7 @@ struct vm_region { unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ - atomic_t vm_usage; /* region usage count */ + int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; diff --git a/mm/nommu.c b/mm/nommu.c index 17773862619b..5e39294f8ea8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -552,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(&region->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(&region->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -1205,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(&region->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1272,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1289,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL;
vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1444,7 +1444,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, /* we're only permitted to split anonymous regions that have a single * owner */ if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + vma->vm_region->vm_usage != 1) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1518,7 +1518,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(&region->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); -- cgit v1.2.3 From efc1a3b16930c41d64ffefde16b87d82f603a8a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:35 -0800 Subject: nommu: don't need get_unmapped_area() for NOMMU get_unmapped_area() is unnecessary for NOMMU as no-one calls it. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 7 +++++-- mm/nommu.c | 21 --------------------- mm/util.c | 2 +- 4 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 80cfa78a8cf6..36f96271306c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -205,10 +205,12 @@ struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; struct vm_area_struct * mmap_cache; /* last find_vma result */ +#ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void (*unmap_area) (struct mm_struct *mm, unsigned long addr); +#endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d4991be9d53..6f7bba93929b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -377,6 +377,8 @@ extern int sysctl_max_map_count; #include +#ifdef CONFIG_MMU +extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -386,6 +388,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags); extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); +#else +static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +#endif #if USE_SPLIT_PTLOCKS /* @@ -2491,8 +2496,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -extern void arch_pick_mmap_layout(struct mm_struct *mm); - #ifdef CONFIG_TRACING extern void __trace_special(void *__tr, void *__data, diff --git a/mm/nommu.c b/mm/nommu.c index d6dd656264a2..32be0cf51ba6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1760,27 +1760,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - /* * Check that a process has enough memory to allocate 
a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/util.c b/mm/util.c index 7c35ad95f927..834db7be240f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; -- cgit v1.2.3 From 7e6608724c640924aad1d556d17df33ebaa6124d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:39 -0800 Subject: nommu: fix shared mmap after truncate shrinkage problems Fix a problem in NOMMU mmap with ramfs whereby a shared mmap can happen over the end of a truncation. The problem is that ramfs_nommu_check_mappings() checks the reduced file size against the VMA tree, but not the vm_region tree. The following sequence of events can cause the problem: fd = open("/tmp/x", O_RDWR|O_TRUNC|O_CREAT, 0600); ftruncate(fd, 32 * 1024); a = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); b = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); munmap(a, 32 * 1024); ftruncate(fd, 16 * 1024); c = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); Mapping 'a' creates a vm_region covering 32KB of the file. Mapping 'b' sees that the vm_region from 'a' is covering the region it wants and so shares it, pinning it in memory. Mapping 'a' then goes away and the file is truncated to the end of VMA 'b'. However, the region allocated by 'a' is still in effect, and has _not_ been reduced. Mapping 'c' is then created, and because there's a vm_region covering the desired region, get_unmapped_area() is _not_ called to repeat the check, and the mapping is granted, even though the pages from the latter half of the mapping have been discarded.
However: d = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); Mapping 'd' should work, and should end up sharing the region allocated by 'a'. To deal with this, we shrink the vm_region struct during the truncation, lest do_mmap_pgoff() take it as licence to share the full region automatically without calling the get_unmapped_area() file op again. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 31 +------------------------- include/linux/mm.h | 1 + mm/nommu.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 266531343aae..1739a4aba25f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -121,35 +121,6 @@ add_error: return ret; } -/*****************************************************************************/ -/* - * check that file shrinkage doesn't leave any VMAs dangling in midair - */ -static int ramfs_nommu_check_mappings(struct inode *inode, - size_t newsize, size_t size) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - down_write(&nommu_region_sem); - - /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - newsize >> PAGE_SHIFT, - (size + PAGE_SIZE - 1) >> PAGE_SHIFT - ) { - /* found one - only interested if it's shared out of the page - * cache */ - if (vma->vm_flags & VM_SHARED) { - up_write(&nommu_region_sem); - return -ETXTBSY; /* not quite true, but near enough */ - } - } - - up_write(&nommu_region_sem); - return 0; -} - /*****************************************************************************/ /* * @@ -169,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) /* check that a decrease in size doesn't cut off any shared mappings */ if (newsize < size) { - 
ret = ramfs_nommu_check_mappings(inode, newsize, size); + ret = nommu_shrink_inode_mappings(inode, size, newsize); if (ret < 0) return ret; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2265f28eb47a..60c467bfbabd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1089,6 +1089,7 @@ extern void zone_pcp_update(struct zone *zone); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; +extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/mm/nommu.c b/mm/nommu.c index 32be0cf51ba6..48a2ecfaf059 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1914,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. 
+ */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} -- cgit v1.2.3 From 73c89c15b959adf06366722c4be8d2eddec0a529 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Sun, 17 Jan 2010 21:52:11 +1100 Subject: crypto: gcm - Add RFC4543 wrapper for GCM This patch adds the RFC4543 (GMAC) wrapper for GCM similar to the existing RFC4106 wrapper. The main differences between GCM and GMAC are the contents of the AAD and that the plaintext is empty for the latter. 
Signed-off-by: Tobias Brunner Signed-off-by: Herbert Xu --- crypto/gcm.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pfkeyv2.h | 1 + net/xfrm/xfrm_algo.c | 16 +++ 3 files changed, 304 insertions(+) (limited to 'include/linux') diff --git a/crypto/gcm.c b/crypto/gcm.c index c6547130624c..2f5fbba6576c 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -37,6 +37,19 @@ struct crypto_rfc4106_ctx { u8 nonce[4]; }; +struct crypto_rfc4543_ctx { + struct crypto_aead *child; + u8 nonce[4]; +}; + +struct crypto_rfc4543_req_ctx { + u8 auth_tag[16]; + struct scatterlist cipher[1]; + struct scatterlist payload[2]; + struct scatterlist assoc[2]; + struct aead_request subreq; +}; + struct crypto_gcm_ghash_ctx { unsigned int cryptlen; struct scatterlist *src; @@ -1047,6 +1060,272 @@ static struct crypto_template crypto_rfc4106_tmpl = { .module = THIS_MODULE, }; +static inline struct crypto_rfc4543_req_ctx *crypto_rfc4543_reqctx( + struct aead_request *req) +{ + unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); + + return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); +} + +static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + struct crypto_aead *child = ctx->child; + int err; + + if (keylen < 4) + return -EINVAL; + + keylen -= 4; + memcpy(ctx->nonce, key + keylen, 4); + + crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_aead_setkey(child, key, keylen); + crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & + CRYPTO_TFM_RES_MASK); + + return err; +} + +static int crypto_rfc4543_setauthsize(struct crypto_aead *parent, + unsigned int authsize) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + + if (authsize != 16) + return -EINVAL; + + return crypto_aead_setauthsize(ctx->child, authsize); +} + +/* 
this is the same as crypto_authenc_chain */ +static void crypto_rfc4543_chain(struct scatterlist *head, + struct scatterlist *sg, int chain) +{ + if (chain) { + head->length += sg->length; + sg = scatterwalk_sg_next(sg); + } + + if (sg) + scatterwalk_sg_chain(head, 2, sg); + else + sg_mark_end(head); +} + +static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req, + int enc) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq = &rctx->subreq; + struct scatterlist *dst = req->dst; + struct scatterlist *cipher = rctx->cipher; + struct scatterlist *payload = rctx->payload; + struct scatterlist *assoc = rctx->assoc; + unsigned int authsize = crypto_aead_authsize(aead); + unsigned int assoclen = req->assoclen; + struct page *dstp; + u8 *vdst; + u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child), + crypto_aead_alignmask(ctx->child) + 1); + + memcpy(iv, ctx->nonce, 4); + memcpy(iv + 4, req->iv, 8); + + /* construct cipher/plaintext */ + if (enc) + memset(rctx->auth_tag, 0, authsize); + else + scatterwalk_map_and_copy(rctx->auth_tag, dst, + req->cryptlen - authsize, + authsize, 0); + + sg_init_one(cipher, rctx->auth_tag, authsize); + + /* construct the aad */ + dstp = sg_page(dst); + vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset; + + sg_init_table(payload, 2); + sg_set_buf(payload, req->iv, 8); + crypto_rfc4543_chain(payload, dst, vdst == req->iv + 8); + assoclen += 8 + req->cryptlen - (enc ? 0 : authsize); + + sg_init_table(assoc, 2); + sg_set_page(assoc, sg_page(req->assoc), req->assoc->length, + req->assoc->offset); + crypto_rfc4543_chain(assoc, payload, 0); + + aead_request_set_tfm(subreq, ctx->child); + aead_request_set_callback(subreq, req->base.flags, req->base.complete, + req->base.data); + aead_request_set_crypt(subreq, cipher, cipher, enc ? 
0 : authsize, iv); + aead_request_set_assoc(subreq, assoc, assoclen); + + return subreq; +} + +static int crypto_rfc4543_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq; + int err; + + subreq = crypto_rfc4543_crypt(req, 1); + err = crypto_aead_encrypt(subreq); + if (err) + return err; + + scatterwalk_map_and_copy(rctx->auth_tag, req->dst, req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int crypto_rfc4543_decrypt(struct aead_request *req) +{ + req = crypto_rfc4543_crypt(req, 0); + + return crypto_aead_decrypt(req); +} + +static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = (void *)tfm->__crt_alg; + struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_aead *aead; + unsigned long align; + + aead = crypto_spawn_aead(spawn); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + ctx->child = aead; + + align = crypto_aead_alignmask(aead); + align &= ~(crypto_tfm_ctx_alignment() - 1); + tfm->crt_aead.reqsize = sizeof(struct crypto_rfc4543_req_ctx) + + ALIGN(crypto_aead_reqsize(aead), + crypto_tfm_ctx_alignment()) + + align + 16; + + return 0; +} + +static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_aead(ctx->child); +} + +static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb) +{ + struct crypto_attr_type *algt; + struct crypto_instance *inst; + struct crypto_aead_spawn *spawn; + struct crypto_alg *alg; + const char *ccm_name; + int err; + + algt = crypto_get_attr_type(tb); + err = PTR_ERR(algt); + if (IS_ERR(algt)) + return ERR_PTR(err); + + if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask) + return ERR_PTR(-EINVAL); + + ccm_name = crypto_attr_alg_name(tb[1]); + err = PTR_ERR(ccm_name); + 
if (IS_ERR(ccm_name)) + return ERR_PTR(err); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return ERR_PTR(-ENOMEM); + + spawn = crypto_instance_ctx(inst); + crypto_set_aead_spawn(spawn, inst); + err = crypto_grab_aead(spawn, ccm_name, 0, + crypto_requires_sync(algt->type, algt->mask)); + if (err) + goto out_free_inst; + + alg = crypto_aead_spawn_alg(spawn); + + err = -EINVAL; + + /* We only support 16-byte blocks. */ + if (alg->cra_aead.ivsize != 16) + goto out_drop_alg; + + /* Not a stream cipher? */ + if (alg->cra_blocksize != 1) + goto out_drop_alg; + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_name) >= CRYPTO_MAX_ALG_NAME || + snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_driver_name) >= + CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD; + inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = 1; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = &crypto_nivaead_type; + + inst->alg.cra_aead.ivsize = 8; + inst->alg.cra_aead.maxauthsize = 16; + + inst->alg.cra_ctxsize = sizeof(struct crypto_rfc4543_ctx); + + inst->alg.cra_init = crypto_rfc4543_init_tfm; + inst->alg.cra_exit = crypto_rfc4543_exit_tfm; + + inst->alg.cra_aead.setkey = crypto_rfc4543_setkey; + inst->alg.cra_aead.setauthsize = crypto_rfc4543_setauthsize; + inst->alg.cra_aead.encrypt = crypto_rfc4543_encrypt; + inst->alg.cra_aead.decrypt = crypto_rfc4543_decrypt; + + inst->alg.cra_aead.geniv = "seqiv"; + +out: + return inst; + +out_drop_alg: + crypto_drop_aead(spawn); +out_free_inst: + kfree(inst); + inst = ERR_PTR(err); + goto out; +} + +static void crypto_rfc4543_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_rfc4543_tmpl = { + .name = 
"rfc4543", + .alloc = crypto_rfc4543_alloc, + .free = crypto_rfc4543_free, + .module = THIS_MODULE, +}; + static int __init crypto_gcm_module_init(void) { int err; @@ -1067,8 +1346,14 @@ static int __init crypto_gcm_module_init(void) if (err) goto out_undo_gcm; + err = crypto_register_template(&crypto_rfc4543_tmpl); + if (err) + goto out_undo_rfc4106; + return 0; +out_undo_rfc4106: + crypto_unregister_template(&crypto_rfc4106_tmpl); out_undo_gcm: crypto_unregister_template(&crypto_gcm_tmpl); out_undo_base: @@ -1081,6 +1366,7 @@ out: static void __exit crypto_gcm_module_exit(void) { kfree(gcm_zeroes); + crypto_unregister_template(&crypto_rfc4543_tmpl); crypto_unregister_template(&crypto_rfc4106_tmpl); crypto_unregister_template(&crypto_gcm_tmpl); crypto_unregister_template(&crypto_gcm_base_tmpl); @@ -1094,3 +1380,4 @@ MODULE_DESCRIPTION("Galois/Counter Mode"); MODULE_AUTHOR("Mikko Herranen "); MODULE_ALIAS("gcm_base"); MODULE_ALIAS("rfc4106"); +MODULE_ALIAS("rfc4543"); diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h index 228b0b6306b0..0b80c806631f 100644 --- a/include/linux/pfkeyv2.h +++ b/include/linux/pfkeyv2.h @@ -315,6 +315,7 @@ struct sadb_x_kmaddress { #define SADB_X_EALG_AES_GCM_ICV12 19 #define SADB_X_EALG_AES_GCM_ICV16 20 #define SADB_X_EALG_CAMELLIACBC 22 +#define SADB_X_EALG_NULL_AES_GMAC 23 #define SADB_EALG_MAX 253 /* last EALG */ /* private allocations should use 249-255 (RFC2407) */ #define SADB_X_EALG_SERPENTCBC 252 /* draft-ietf-ipsec-ciph-aes-cbc-00 */ diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 743c0134a6a9..8b4d6e3246e5 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -125,6 +125,22 @@ static struct xfrm_algo_desc aead_list[] = { .sadb_alg_maxbits = 256 } }, +{ + .name = "rfc4543(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, }; 
static struct xfrm_algo_desc aalg_list[] = { -- cgit v1.2.3 From 8b0e58a70a7a41443c779de074288035b014cb94 Mon Sep 17 00:00:00 2001 From: Stephane Chatty Date: Wed, 13 Jan 2010 21:52:34 +0100 Subject: HID: let hid-input accept digitizers Extended IS_INPUT_APPLICATION to accept digitizers that are actual input devices (touchscreens, light pens, touch pads, white boards) Signed-off-by: Stephane Chatty Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 87093652dda8..b978c1e2e74d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -663,7 +663,7 @@ struct hid_ll_driver { /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ -#define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || (a == 0x000d0002)) +#define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || ((a >= 0x000d0002) && (a <= 0x000d0006))) /* HID core API */ -- cgit v1.2.3 From a83d8e8d099fc373a5ca7112ad08c553bb2c180f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 18 Jan 2010 08:21:13 +0100 Subject: netfilter: xtables: add struct xt_mtchk_param::net Some complex match modules (like xt_hashlimit/xt_recent) want netns information at constructor and destructor time. We probably can play games at match destruction time, because netns can be passed in object, but I think it's cleaner to explicitly pass netns. Add ->net, make sure it's set from ebtables/iptables/ip6tables code.
Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 1 + net/bridge/netfilter/ebtables.c | 14 +++++++++----- net/ipv4/netfilter/ip_tables.c | 24 ++++++++++++++---------- net/ipv6/netfilter/ip6_tables.c | 14 ++++++++------ 4 files changed, 32 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 378f27ae7772..88261b9829a7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -205,6 +205,7 @@ struct xt_match_param { * @hook_mask: via which hooks the new rule is reachable */ struct xt_mtchk_param { + struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index bd1c65425d4f..c77bab986696 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -619,7 +619,9 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) } static inline int -ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, +ebt_check_entry(struct ebt_entry *e, + struct net *net, + struct ebt_table_info *newinfo, const char *name, unsigned int *cnt, struct ebt_cl_stack *cl_s, unsigned int udc_cnt) { @@ -671,6 +673,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, } i = 0; + mtpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; mtpar.hook_mask = tgpar.hook_mask = hookmask; @@ -808,7 +811,8 @@ letscontinue: } /* do the parsing of the table/chains/entries/matches/watchers/targets, heh */ -static int translate_table(char *name, struct ebt_table_info *newinfo) +static int translate_table(struct net *net, char *name, + struct ebt_table_info *newinfo) { unsigned int i, j, k, udc_cnt; int ret; @@ -917,7 +921,7 @@ static int translate_table(char *name, struct ebt_table_info *newinfo) /* used to know what we need to 
clean up if something goes wrong */ i = 0; ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_check_entry, newinfo, name, &i, cl_s, udc_cnt); + ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); if (ret != 0) { EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_cleanup_entry, &i); @@ -1017,7 +1021,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_counterstmp; - ret = translate_table(tmp.name, newinfo); + ret = translate_table(net, tmp.name, newinfo); if (ret != 0) goto free_counterstmp; @@ -1154,7 +1158,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) newinfo->hook_entry[i] = p + ((char *)repl->hook_entry[i] - repl->entries); } - ret = translate_table(repl->name, newinfo); + ret = translate_table(net, repl->name, newinfo); if (ret != 0) { BUGPRINT("Translate_table failed\n"); goto free_chainstack; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 572330a552ef..a069d72d9482 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -661,8 +661,8 @@ static int check_target(struct ipt_entry *e, const char *name) } static int -find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ipt_entry *e, struct net *net, const char *name, + unsigned int size, unsigned int *i) { struct ipt_entry_target *t; struct xt_target *target; @@ -675,6 +675,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; @@ -798,7 +799,8 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, +translate_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, 
void *entry0, @@ -860,7 +862,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + find_check_entry, net, name, size, &i); if (ret != 0) { IPT_ENTRY_ITERATE(entry0, newinfo->size, @@ -1303,7 +1305,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, + ret = translate_table(net, tmp.name, tmp.valid_hooks, newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) @@ -1655,7 +1657,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, } static int -compat_check_entry(struct ipt_entry *e, const char *name, +compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int *i) { struct xt_mtchk_param mtpar; @@ -1663,6 +1665,7 @@ compat_check_entry(struct ipt_entry *e, const char *name, int ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; @@ -1684,7 +1687,8 @@ compat_check_entry(struct ipt_entry *e, const char *name, } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1773,7 +1777,7 @@ translate_compat_table(const char *name, i = 0; ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + net, name, &i); if (ret) { j -= i; COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, @@ -1833,7 +1837,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -2086,7 +2090,7 @@ struct xt_table *ipt_register_table(struct net 
*net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, + ret = translate_table(net, table->name, table->valid_hooks, newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 480d7f8c9802..a825940a92ef 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -693,8 +693,8 @@ static int check_target(struct ip6t_entry *e, const char *name) } static int -find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size, unsigned int *i) { struct ip6t_entry_target *t; struct xt_target *target; @@ -707,6 +707,7 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; @@ -830,7 +831,8 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, +translate_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, void *entry0, @@ -892,7 +894,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + find_check_entry, net, name, size, &i); if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, @@ -1336,7 +1338,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, + ret = translate_table(net, tmp.name, tmp.valid_hooks, newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) 
@@ -2121,7 +2123,7 @@ struct xt_table *ip6t_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, + ret = translate_table(net, table->name, table->valid_hooks, newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, -- cgit v1.2.3 From f54e9367f8499a9bf6b2afbc0dce63e1d53c525a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 18 Jan 2010 08:25:47 +0100 Subject: netfilter: xtables: add struct xt_mtdtor_param::net Add ->net to match destructor list like ->net in constructor list. Make sure it's set in ebtables/iptables/ip6tables; this requires propagating netns up to *_unregister_table(). Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_bridge/ebtables.h | 2 +- include/linux/netfilter_ipv4/ip_tables.h | 2 +- include/linux/netfilter_ipv6/ip6_tables.h | 2 +- net/bridge/netfilter/ebtable_broute.c | 2 +- net/bridge/netfilter/ebtable_filter.c | 2 +- net/bridge/netfilter/ebtable_nat.c | 2 +- net/bridge/netfilter/ebtables.c | 19 ++++++++-------- net/ipv4/netfilter/ip_tables.c | 25 +++++++++++---------- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv4/netfilter/nf_nat_rule.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 37 +++++++++++++++++-------------- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- 19 files changed, 59 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 88261b9829a7..3caf5e151102 100644 --- a/include/linux/netfilter/x_tables.h +++
b/include/linux/netfilter/x_tables.h @@ -216,6 +216,7 @@ struct xt_mtchk_param { /* Match destructor parameters */ struct xt_mtdtor_param { + struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 3cc40c131cc3..1c6f0c5f530e 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -289,7 +289,7 @@ struct ebt_table { ~(__alignof__(struct ebt_replace)-1)) extern struct ebt_table *ebt_register_table(struct net *net, const struct ebt_table *table); -extern void ebt_unregister_table(struct ebt_table *table); +extern void ebt_unregister_table(struct net *net, struct ebt_table *table); extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, struct ebt_table *table); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 27b3f5807305..8d1f273d350b 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -242,7 +242,7 @@ extern void ipt_init(void) __init; extern struct xt_table *ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl); -extern void ipt_unregister_table(struct xt_table *table); +extern void ipt_unregister_table(struct net *net, struct xt_table *table); /* Standard entry. 
*/ struct ipt_standard { diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index b31050d20ae4..d2952d2fa658 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -300,7 +300,7 @@ extern void ip6t_init(void) __init; extern struct xt_table *ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl); -extern void ip6t_unregister_table(struct xt_table *table); +extern void ip6t_unregister_table(struct net *net, struct xt_table *table); extern unsigned int ip6t_do_table(struct sk_buff *skb, unsigned int hook, const struct net_device *in, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index d32ab13e728c..ae3f106c3908 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -71,7 +71,7 @@ static int __net_init broute_net_init(struct net *net) static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net->xt.broute_table); + ebt_unregister_table(net, net->xt.broute_table); } static struct pernet_operations broute_net_ops = { diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 60b1a6ca7185..42e6bd094574 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -107,7 +107,7 @@ static int __net_init frame_filter_net_init(struct net *net) static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net->xt.frame_filter); + ebt_unregister_table(net, net->xt.frame_filter); } static struct pernet_operations frame_filter_net_ops = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 4a98804203b0..6dc2f878ae05 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -107,7 +107,7 @@ static int __net_init frame_nat_net_init(struct net *net) static void 
__net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net->xt.frame_nat); + ebt_unregister_table(net, net->xt.frame_nat); } static struct pernet_operations frame_nat_net_ops = { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c77bab986696..1aa0e4c1f52d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -561,13 +561,14 @@ ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, } static inline int -ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) +ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.match; par.matchinfo = m->data; par.family = NFPROTO_BRIDGE; @@ -595,7 +596,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) } static inline int -ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) +ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) { struct xt_tgdtor_param par; struct ebt_entry_target *t; @@ -606,7 +607,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) if (cnt && (*cnt)-- == 0) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); - EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); par.target = t->u.target; @@ -731,7 +732,7 @@ ebt_check_entry(struct ebt_entry *e, cleanup_watchers: EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j); cleanup_matches: - EBT_MATCH_ITERATE(e, ebt_cleanup_match, &i); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); return ret; } @@ -924,7 +925,7 @@ static int translate_table(struct net *net, char *name, ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); if (ret != 0) { EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_cleanup_entry, &i); + ebt_cleanup_entry, net, &i); } vfree(cl_s); 
return ret; @@ -1074,7 +1075,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); vfree(table->entries); if (table->chainstack) { @@ -1091,7 +1092,7 @@ free_unlock: mutex_unlock(&ebt_mutex); free_iterate: EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ @@ -1208,7 +1209,7 @@ out: return ERR_PTR(ret); } -void ebt_unregister_table(struct ebt_table *table) +void ebt_unregister_table(struct net *net, struct ebt_table *table) { int i; @@ -1220,7 +1221,7 @@ void ebt_unregister_table(struct ebt_table *table) list_del(&table->list); mutex_unlock(&ebt_mutex); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, - ebt_cleanup_entry, NULL); + ebt_cleanup_entry, net, NULL); if (table->private->nentries) module_put(table->me); vfree(table->private->entries); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a069d72d9482..cfaba0e2e6fc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -553,13 +553,14 @@ mark_source_chains(struct xt_table_info *newinfo, } static int -cleanup_match(struct ipt_entry_match *m, unsigned int *i) +cleanup_match(struct ipt_entry_match *m, struct net *net, unsigned int *i) { struct xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; @@ -705,7 +706,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, err: module_put(t->u.kernel.target->me); cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + IPT_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -775,7 +776,7 @@ 
check_entry_size_and_hooks(struct ipt_entry *e, } static int -cleanup_entry(struct ipt_entry *e, unsigned int *i) +cleanup_entry(struct ipt_entry *e, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; struct ipt_entry_target *t; @@ -784,7 +785,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, cleanup_match, NULL); + IPT_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ipt_get_target(e); par.target = t->u.kernel.target; @@ -866,7 +867,7 @@ translate_table(struct net *net, if (ret != 0) { IPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, net, &i); return ret; } @@ -1260,7 +1261,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + net, NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1320,7 +1321,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1682,7 +1683,7 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, return 0; cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + IPT_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -1782,7 +1783,7 @@ translate_compat_table(struct net *net, j -= i; COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, compat_release_entry, &j); - IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, net, &i); xt_free_table_info(newinfo); return ret; } @@ -1853,7 +1854,7 @@ 
compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -2112,7 +2113,7 @@ out: return ERR_PTR(ret); } -void ipt_unregister_table(struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -2122,7 +2123,7 @@ void ipt_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, net, NULL); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index df566cbd68e5..dee90eb8aa47 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -138,7 +138,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_filter); + ipt_unregister_table(net, net->ipv4.iptable_filter); } static struct pernet_operations iptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index fae78c3076c4..e07bf242343a 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -208,7 +208,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_mangle); + ipt_unregister_table(net, net->ipv4.iptable_mangle); } static struct pernet_operations 
iptable_mangle_net_ops = { diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 993edc23be09..40f2b9f611a2 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -100,7 +100,7 @@ static int __net_init iptable_raw_net_init(struct net *net) static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_raw); + ipt_unregister_table(net, net->ipv4.iptable_raw); } static struct pernet_operations iptable_raw_net_ops = { diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 3bd3d6388da5..7ce2366e4305 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -138,7 +138,7 @@ static int __net_init iptable_security_net_init(struct net *net) static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_security); + ipt_unregister_table(net, net->ipv4.iptable_security); } static struct pernet_operations iptable_security_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 9e81e0dfb4ec..85da34fdc755 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -195,7 +195,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net) static void __net_exit nf_nat_rule_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.nat_table); + ipt_unregister_table(net, net->ipv4.nat_table); } static struct pernet_operations nf_nat_rule_net_ops = { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index a825940a92ef..9f1d45f2ba8f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -585,13 +585,14 @@ mark_source_chains(struct xt_table_info *newinfo, } static int -cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +cleanup_match(struct ip6t_entry_match *m, struct net *net, unsigned int *i) { struct 
xt_mtdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; @@ -737,7 +738,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, err: module_put(t->u.kernel.target->me); cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + IP6T_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } @@ -807,7 +808,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, } static int -cleanup_entry(struct ip6t_entry *e, unsigned int *i) +cleanup_entry(struct ip6t_entry *e, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; struct ip6t_entry_target *t; @@ -816,7 +817,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + IP6T_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ip6t_get_target(e); par.target = t->u.kernel.target; @@ -898,7 +899,7 @@ translate_table(struct net *net, if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, net, &i); return ret; } @@ -1293,7 +1294,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + net, NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1353,7 +1354,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1692,14 +1693,15 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, return ret; } -static int 
compat_check_entry(struct ip6t_entry *e, const char *name, - unsigned int *i) +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name, unsigned int *i) { unsigned int j; int ret; struct xt_mtchk_param mtpar; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; @@ -1716,12 +1718,13 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name, return 0; cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + IP6T_MATCH_ITERATE(e, cleanup_match, net, &j); return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1810,12 +1813,12 @@ translate_compat_table(const char *name, i = 0; ret = IP6T_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + net, name, &i); if (ret) { j -= i; COMPAT_IP6T_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, compat_release_entry, &j); - IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, net, &i); xt_free_table_info(newinfo); return ret; } @@ -1870,7 +1873,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1886,7 +1889,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, net, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -2144,7 +2147,7 @@ out: return ERR_PTR(ret); } -void ip6t_unregister_table(struct xt_table *table) +void 
ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -2154,7 +2157,7 @@ void ip6t_unregister_table(struct xt_table *table) /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, net, NULL); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index ad378efd0eb8..33ddfe53e18d 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -131,7 +131,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_filter); + ip6t_unregister_table(net, net->ipv6.ip6table_filter); } static struct pernet_operations ip6table_filter_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a929c19d30e3..9bc483f000e5 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -182,7 +182,7 @@ static int __net_init ip6table_mangle_net_init(struct net *net) static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_mangle); + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); } static struct pernet_operations ip6table_mangle_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index ed1a1180f3b3..4c90b552e433 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -94,7 +94,7 @@ static int __net_init ip6table_raw_net_init(struct net *net) static void __net_exit ip6table_raw_net_exit(struct net *net) { - 
ip6t_unregister_table(net->ipv6.ip6table_raw); + ip6t_unregister_table(net, net->ipv6.ip6table_raw); } static struct pernet_operations ip6table_raw_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 41b444c60934..baa8d4ef3b0a 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -134,7 +134,7 @@ static int __net_init ip6table_security_net_init(struct net *net) static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_security); + ip6t_unregister_table(net, net->ipv6.ip6table_security); } static struct pernet_operations ip6table_security_net_ops = { -- cgit v1.2.3 From 9dffe2a32b0deef52605d50527c0d240b15cabf7 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 4 Jan 2010 18:05:00 +0000 Subject: mfd: Correct WM835x ISINK ramp time defines The constants used to specify ISINK ramp times for WM835x had the wrong shifts so that the on times applied to the off ramp and vice versa. The masks for the bitfields are correct. 
Signed-off-by: Mark Brown Cc: stable@kernel.org Signed-off-by: Samuel Ortiz --- include/linux/mfd/wm8350/pmic.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8350/pmic.h b/include/linux/mfd/wm8350/pmic.h index be3264e286e0..e786fe9841ef 100644 --- a/include/linux/mfd/wm8350/pmic.h +++ b/include/linux/mfd/wm8350/pmic.h @@ -666,20 +666,20 @@ #define WM8350_ISINK_FLASH_DUR_64MS (1 << 8) #define WM8350_ISINK_FLASH_DUR_96MS (2 << 8) #define WM8350_ISINK_FLASH_DUR_1024MS (3 << 8) -#define WM8350_ISINK_FLASH_ON_INSTANT (0 << 4) -#define WM8350_ISINK_FLASH_ON_0_25S (1 << 4) -#define WM8350_ISINK_FLASH_ON_0_50S (2 << 4) -#define WM8350_ISINK_FLASH_ON_1_00S (3 << 4) -#define WM8350_ISINK_FLASH_ON_1_95S (1 << 4) -#define WM8350_ISINK_FLASH_ON_3_91S (2 << 4) -#define WM8350_ISINK_FLASH_ON_7_80S (3 << 4) -#define WM8350_ISINK_FLASH_OFF_INSTANT (0 << 0) -#define WM8350_ISINK_FLASH_OFF_0_25S (1 << 0) -#define WM8350_ISINK_FLASH_OFF_0_50S (2 << 0) -#define WM8350_ISINK_FLASH_OFF_1_00S (3 << 0) -#define WM8350_ISINK_FLASH_OFF_1_95S (1 << 0) -#define WM8350_ISINK_FLASH_OFF_3_91S (2 << 0) -#define WM8350_ISINK_FLASH_OFF_7_80S (3 << 0) +#define WM8350_ISINK_FLASH_ON_INSTANT (0 << 0) +#define WM8350_ISINK_FLASH_ON_0_25S (1 << 0) +#define WM8350_ISINK_FLASH_ON_0_50S (2 << 0) +#define WM8350_ISINK_FLASH_ON_1_00S (3 << 0) +#define WM8350_ISINK_FLASH_ON_1_95S (1 << 0) +#define WM8350_ISINK_FLASH_ON_3_91S (2 << 0) +#define WM8350_ISINK_FLASH_ON_7_80S (3 << 0) +#define WM8350_ISINK_FLASH_OFF_INSTANT (0 << 4) +#define WM8350_ISINK_FLASH_OFF_0_25S (1 << 4) +#define WM8350_ISINK_FLASH_OFF_0_50S (2 << 4) +#define WM8350_ISINK_FLASH_OFF_1_00S (3 << 4) +#define WM8350_ISINK_FLASH_OFF_1_95S (1 << 4) +#define WM8350_ISINK_FLASH_OFF_3_91S (2 << 4) +#define WM8350_ISINK_FLASH_OFF_7_80S (3 << 4) /* * Regulator Interrupts. 
-- cgit v1.2.3 From 64e8867ba8098b69889c1af94997a5ba2348fb26 Mon Sep 17 00:00:00 2001 From: Ian Molton Date: Wed, 6 Jan 2010 13:51:48 +0100 Subject: mfd: tmio_mmc hardware abstraction for CNF area This patch abstracts out the CNF area code from tmio_mmc which is not present in all hardware that can use this driver. This is required so that we can support non-toshiba based hardware. ASIC3 support by Philipp Zabel Signed-off-by: Ian Molton Signed-off-by: Magnus Damm Signed-off-by: Samuel Ortiz --- drivers/mfd/Makefile | 6 +-- drivers/mfd/asic3.c | 40 ++++++++++++--- drivers/mfd/t7l66xb.c | 55 +++++++++++++------- drivers/mfd/tc6387xb.c | 119 ++++++++++++++++++++++++++++++++------------ drivers/mfd/tc6393xb.c | 56 +++++++++++++++++---- drivers/mfd/tmio_core.c | 52 +++++++++++++++++++ drivers/mmc/host/tmio_mmc.c | 59 +++++++--------------- drivers/mmc/host/tmio_mmc.h | 46 +++-------------- include/linux/mfd/tmio.h | 39 +++++++++++++++ 9 files changed, 323 insertions(+), 149 deletions(-) create mode 100644 drivers/mfd/tmio_core.c (limited to 'include/linux') diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index ca2f2c4ff05e..8f0d18409ede 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -11,9 +11,9 @@ obj-$(CONFIG_HTC_PASIC3) += htc-pasic3.o obj-$(CONFIG_MFD_DM355EVM_MSP) += dm355evm_msp.o -obj-$(CONFIG_MFD_T7L66XB) += t7l66xb.o -obj-$(CONFIG_MFD_TC6387XB) += tc6387xb.o -obj-$(CONFIG_MFD_TC6393XB) += tc6393xb.o +obj-$(CONFIG_MFD_T7L66XB) += t7l66xb.o tmio_core.o +obj-$(CONFIG_MFD_TC6387XB) += tc6387xb.o tmio_core.o +obj-$(CONFIG_MFD_TC6393XB) += tc6393xb.o tmio_core.o obj-$(CONFIG_MFD_WM8400) += wm8400-core.o wm831x-objs := wm831x-core.o wm831x-irq.o wm831x-otp.o diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index e22128c3e9a8..95c1e6bd1729 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -80,6 +80,7 @@ struct asic3 { u16 irq_bothedge[4]; struct gpio_chip gpio; struct device *dev; + void __iomem *tmio_cnf; struct asic3_clk 
clocks[ARRAY_SIZE(asic3_clk_init)]; }; @@ -685,8 +686,24 @@ static struct mfd_cell asic3_cell_ds1wm = { .resources = ds1wm_resources, }; +static void asic3_mmc_pwr(struct platform_device *pdev, int state) +{ + struct asic3 *asic = dev_get_drvdata(pdev->dev.parent); + + tmio_core_mmc_pwr(asic->tmio_cnf, 1 - asic->bus_shift, state); +} + +static void asic3_mmc_clk_div(struct platform_device *pdev, int state) +{ + struct asic3 *asic = dev_get_drvdata(pdev->dev.parent); + + tmio_core_mmc_clk_div(asic->tmio_cnf, 1 - asic->bus_shift, state); +} + static struct tmio_mmc_data asic3_mmc_data = { - .hclk = 24576000, + .hclk = 24576000, + .set_pwr = asic3_mmc_pwr, + .set_clk_div = asic3_mmc_clk_div, }; static struct resource asic3_mmc_resources[] = { @@ -695,11 +712,6 @@ static struct resource asic3_mmc_resources[] = { .end = ASIC3_SD_CTRL_BASE + 0x3ff, .flags = IORESOURCE_MEM, }, - { - .start = ASIC3_SD_CONFIG_BASE, - .end = ASIC3_SD_CONFIG_BASE + 0x1ff, - .flags = IORESOURCE_MEM, - }, { .start = 0, .end = 0, @@ -743,6 +755,10 @@ static int asic3_mmc_enable(struct platform_device *pdev) asic3_set_register(asic, ASIC3_OFFSET(SDHWCTRL, SDCONF), ASIC3_SDHWCTRL_SDPWR, 1); + /* ASIC3_SD_CTRL_BASE assumes 32-bit addressing, TMIO is 16-bit */ + tmio_core_mmc_enable(asic->tmio_cnf, 1 - asic->bus_shift, + ASIC3_SD_CTRL_BASE >> 1); + return 0; } @@ -797,10 +813,15 @@ static int __init asic3_mfd_probe(struct platform_device *pdev, asic3_cell_ds1wm.data_size = sizeof(asic3_cell_ds1wm); /* MMC */ + asic->tmio_cnf = ioremap((ASIC3_SD_CONFIG_BASE >> asic->bus_shift) + + mem_sdio->start, 0x400 >> asic->bus_shift); + if (!asic->tmio_cnf) { + ret = -ENOMEM; + dev_dbg(asic->dev, "Couldn't ioremap SD_CONFIG\n"); + goto out; + } asic3_mmc_resources[0].start >>= asic->bus_shift; asic3_mmc_resources[0].end >>= asic->bus_shift; - asic3_mmc_resources[1].start >>= asic->bus_shift; - asic3_mmc_resources[1].end >>= asic->bus_shift; asic3_cell_mmc.platform_data = &asic3_cell_mmc; 
asic3_cell_mmc.data_size = sizeof(asic3_cell_mmc); @@ -820,7 +841,10 @@ static int __init asic3_mfd_probe(struct platform_device *pdev, static void asic3_mfd_remove(struct platform_device *pdev) { + struct asic3 *asic = platform_get_drvdata(pdev); + mfd_remove_devices(&pdev->dev); + iounmap(asic->tmio_cnf); } /* Core */ diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c index 0a255c1f1ce7..bcf4687d4af5 100644 --- a/drivers/mfd/t7l66xb.c +++ b/drivers/mfd/t7l66xb.c @@ -38,6 +38,19 @@ enum { T7L66XB_CELL_MMC, }; +static const struct resource t7l66xb_mmc_resources[] = { + { + .start = 0x800, + .end = 0x9ff, + .flags = IORESOURCE_MEM, + }, + { + .start = IRQ_T7L66XB_MMC, + .end = IRQ_T7L66XB_MMC, + .flags = IORESOURCE_IRQ, + }, +}; + #define SCR_REVID 0x08 /* b Revision ID */ #define SCR_IMR 0x42 /* b Interrupt Mask */ #define SCR_DEV_CTL 0xe0 /* b Device control */ @@ -83,6 +96,9 @@ static int t7l66xb_mmc_enable(struct platform_device *mmc) spin_unlock_irqrestore(&t7l66xb->lock, flags); + tmio_core_mmc_enable(t7l66xb->scr + 0x200, 0, + t7l66xb_mmc_resources[0].start & 0xfffe); + return 0; } @@ -106,28 +122,28 @@ static int t7l66xb_mmc_disable(struct platform_device *mmc) return 0; } +static void t7l66xb_mmc_pwr(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct t7l66xb *t7l66xb = platform_get_drvdata(dev); + + tmio_core_mmc_pwr(t7l66xb->scr + 0x200, 0, state); +} + +static void t7l66xb_mmc_clk_div(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct t7l66xb *t7l66xb = platform_get_drvdata(dev); + + tmio_core_mmc_clk_div(t7l66xb->scr + 0x200, 0, state); +} + /*--------------------------------------------------------------------------*/ static struct tmio_mmc_data t7166xb_mmc_data = { .hclk = 24000000, -}; - -static const struct resource t7l66xb_mmc_resources[] = { - { - .start = 0x800, - .end = 0x9ff, - .flags = 
IORESOURCE_MEM, - }, - { - .start = 0x200, - .end = 0x2ff, - .flags = IORESOURCE_MEM, - }, - { - .start = IRQ_T7L66XB_MMC, - .end = IRQ_T7L66XB_MMC, - .flags = IORESOURCE_IRQ, - }, + .set_pwr = t7l66xb_mmc_pwr, + .set_clk_div = t7l66xb_mmc_clk_div, }; static const struct resource t7l66xb_nand_resources[] = { @@ -282,6 +298,9 @@ static int t7l66xb_resume(struct platform_device *dev) if (pdata && pdata->resume) pdata->resume(dev); + tmio_core_mmc_enable(t7l66xb->scr + 0x200, 0, + t7l66xb_mmc_resources[0].start & 0xfffe); + return 0; } #else diff --git a/drivers/mfd/tc6387xb.c b/drivers/mfd/tc6387xb.c index 3280ab33f88a..5c7f04343d5c 100644 --- a/drivers/mfd/tc6387xb.c +++ b/drivers/mfd/tc6387xb.c @@ -22,28 +22,52 @@ enum { TC6387XB_CELL_MMC, }; +struct tc6387xb { + void __iomem *scr; + struct clk *clk32k; + struct resource rscr; +}; + +static struct resource tc6387xb_mmc_resources[] = { + { + .start = 0x800, + .end = 0x9ff, + .flags = IORESOURCE_MEM, + }, + { + .start = 0, + .end = 0, + .flags = IORESOURCE_IRQ, + }, +}; + +/*--------------------------------------------------------------------------*/ + #ifdef CONFIG_PM static int tc6387xb_suspend(struct platform_device *dev, pm_message_t state) { - struct clk *clk32k = platform_get_drvdata(dev); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); struct tc6387xb_platform_data *pdata = dev->dev.platform_data; if (pdata && pdata->suspend) pdata->suspend(dev); - clk_disable(clk32k); + clk_disable(tc6387xb->clk32k); return 0; } static int tc6387xb_resume(struct platform_device *dev) { - struct clk *clk32k = platform_get_drvdata(dev); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); struct tc6387xb_platform_data *pdata = dev->dev.platform_data; - clk_enable(clk32k); + clk_enable(tc6387xb->clk32k); if (pdata && pdata->resume) pdata->resume(dev); + tmio_core_mmc_resume(tc6387xb->scr + 0x200, 0, + tc6387xb_mmc_resources[0].start & 0xfffe); + return 0; } #else @@ -53,12 +77,32 @@ static int tc6387xb_resume(struct 
platform_device *dev) /*--------------------------------------------------------------------------*/ +static void tc6387xb_mmc_pwr(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); + + tmio_core_mmc_pwr(tc6387xb->scr + 0x200, 0, state); +} + +static void tc6387xb_mmc_clk_div(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); + + tmio_core_mmc_clk_div(tc6387xb->scr + 0x200, 0, state); +} + + static int tc6387xb_mmc_enable(struct platform_device *mmc) { struct platform_device *dev = to_platform_device(mmc->dev.parent); - struct clk *clk32k = platform_get_drvdata(dev); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); - clk_enable(clk32k); + clk_enable(tc6387xb->clk32k); + + tmio_core_mmc_enable(tc6387xb->scr + 0x200, 0, + tc6387xb_mmc_resources[0].start & 0xfffe); return 0; } @@ -66,36 +110,20 @@ static int tc6387xb_mmc_enable(struct platform_device *mmc) static int tc6387xb_mmc_disable(struct platform_device *mmc) { struct platform_device *dev = to_platform_device(mmc->dev.parent); - struct clk *clk32k = platform_get_drvdata(dev); + struct tc6387xb *tc6387xb = platform_get_drvdata(dev); - clk_disable(clk32k); + clk_disable(tc6387xb->clk32k); return 0; } -/*--------------------------------------------------------------------------*/ - static struct tmio_mmc_data tc6387xb_mmc_data = { .hclk = 24000000, + .set_pwr = tc6387xb_mmc_pwr, + .set_clk_div = tc6387xb_mmc_clk_div, }; -static struct resource tc6387xb_mmc_resources[] = { - { - .start = 0x800, - .end = 0x9ff, - .flags = IORESOURCE_MEM, - }, - { - .start = 0x200, - .end = 0x2ff, - .flags = IORESOURCE_MEM, - }, - { - .start = 0, - .end = 0, - .flags = IORESOURCE_IRQ, - }, -}; +/*--------------------------------------------------------------------------*/ static struct mfd_cell 
tc6387xb_cells[] = { [TC6387XB_CELL_MMC] = { @@ -111,8 +139,9 @@ static struct mfd_cell tc6387xb_cells[] = { static int tc6387xb_probe(struct platform_device *dev) { struct tc6387xb_platform_data *pdata = dev->dev.platform_data; - struct resource *iomem; + struct resource *iomem, *rscr; struct clk *clk32k; + struct tc6387xb *tc6387xb; int irq, ret; iomem = platform_get_resource(dev, IORESOURCE_MEM, 0); @@ -120,18 +149,40 @@ static int tc6387xb_probe(struct platform_device *dev) return -EINVAL; } + tc6387xb = kzalloc(sizeof *tc6387xb, GFP_KERNEL); + if (!tc6387xb) + return -ENOMEM; + ret = platform_get_irq(dev, 0); if (ret >= 0) irq = ret; else - goto err_resource; + goto err_no_irq; clk32k = clk_get(&dev->dev, "CLK_CK32K"); if (IS_ERR(clk32k)) { ret = PTR_ERR(clk32k); + goto err_no_clk; + } + + rscr = &tc6387xb->rscr; + rscr->name = "tc6387xb-core"; + rscr->start = iomem->start; + rscr->end = iomem->start + 0xff; + rscr->flags = IORESOURCE_MEM; + + ret = request_resource(iomem, rscr); + if (ret) goto err_resource; + + tc6387xb->scr = ioremap(rscr->start, rscr->end - rscr->start + 1); + if (!tc6387xb->scr) { + ret = -ENOMEM; + goto err_ioremap; } - platform_set_drvdata(dev, clk32k); + + tc6387xb->clk32k = clk32k; + platform_set_drvdata(dev, tc6387xb); if (pdata && pdata->enable) pdata->enable(dev); @@ -149,8 +200,13 @@ static int tc6387xb_probe(struct platform_device *dev) if (!ret) return 0; - clk_put(clk32k); +err_ioremap: + release_resource(&tc6387xb->rscr); err_resource: + clk_put(clk32k); +err_no_clk: +err_no_irq: + kfree(tc6387xb); return ret; } @@ -195,3 +251,4 @@ MODULE_DESCRIPTION("Toshiba TC6387XB core driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Ian Molton"); MODULE_ALIAS("platform:tc6387xb"); + diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c index 1429a7341a9a..4bc5a08a2b09 100644 --- a/drivers/mfd/tc6393xb.c +++ b/drivers/mfd/tc6393xb.c @@ -136,10 +136,6 @@ static int tc6393xb_nand_enable(struct platform_device *nand) return 0; } 
-static struct tmio_mmc_data tc6393xb_mmc_data = { - .hclk = 24000000, -}; - static struct resource __devinitdata tc6393xb_nand_resources[] = { { .start = 0x1000, @@ -164,11 +160,6 @@ static struct resource __devinitdata tc6393xb_mmc_resources[] = { .end = 0x9ff, .flags = IORESOURCE_MEM, }, - { - .start = 0x200, - .end = 0x2ff, - .flags = IORESOURCE_MEM, - }, { .start = IRQ_TC6393_MMC, .end = IRQ_TC6393_MMC, @@ -346,6 +337,50 @@ int tc6393xb_lcd_mode(struct platform_device *fb, } EXPORT_SYMBOL(tc6393xb_lcd_mode); +static int tc6393xb_mmc_enable(struct platform_device *mmc) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6393xb *tc6393xb = platform_get_drvdata(dev); + + tmio_core_mmc_enable(tc6393xb->scr + 0x200, 0, + tc6393xb_mmc_resources[0].start & 0xfffe); + + return 0; +} + +static int tc6393xb_mmc_resume(struct platform_device *mmc) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6393xb *tc6393xb = platform_get_drvdata(dev); + + tmio_core_mmc_resume(tc6393xb->scr + 0x200, 0, + tc6393xb_mmc_resources[0].start & 0xfffe); + + return 0; +} + +static void tc6393xb_mmc_pwr(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6393xb *tc6393xb = platform_get_drvdata(dev); + + tmio_core_mmc_pwr(tc6393xb->scr + 0x200, 0, state); +} + +static void tc6393xb_mmc_clk_div(struct platform_device *mmc, int state) +{ + struct platform_device *dev = to_platform_device(mmc->dev.parent); + struct tc6393xb *tc6393xb = platform_get_drvdata(dev); + + tmio_core_mmc_clk_div(tc6393xb->scr + 0x200, 0, state); +} + +static struct tmio_mmc_data tc6393xb_mmc_data = { + .hclk = 24000000, + .set_pwr = tc6393xb_mmc_pwr, + .set_clk_div = tc6393xb_mmc_clk_div, +}; + static struct mfd_cell __devinitdata tc6393xb_cells[] = { [TC6393XB_CELL_NAND] = { .name = "tmio-nand", @@ -355,6 +390,8 @@ static struct mfd_cell __devinitdata tc6393xb_cells[] = { }, 
[TC6393XB_CELL_MMC] = { .name = "tmio-mmc", + .enable = tc6393xb_mmc_enable, + .resume = tc6393xb_mmc_resume, .driver_data = &tc6393xb_mmc_data, .num_resources = ARRAY_SIZE(tc6393xb_mmc_resources), .resources = tc6393xb_mmc_resources, @@ -836,3 +873,4 @@ MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Ian Molton, Dmitry Baryshkov and Dirk Opfer"); MODULE_DESCRIPTION("tc6393xb Toshiba Mobile IO Controller"); MODULE_ALIAS("platform:tc6393xb"); + diff --git a/drivers/mfd/tmio_core.c b/drivers/mfd/tmio_core.c new file mode 100644 index 000000000000..eddc19ae464b --- /dev/null +++ b/drivers/mfd/tmio_core.c @@ -0,0 +1,52 @@ +/* + * Copyright(c) 2009 Ian Molton + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base) +{ + /* Enable the MMC/SD Control registers */ + sd_config_write16(cnf, shift, CNF_CMD, SDCREN); + sd_config_write32(cnf, shift, CNF_CTL_BASE, base & 0xfffe); + + /* Disable SD power during suspend */ + sd_config_write8(cnf, shift, CNF_PWR_CTL_3, 0x01); + + /* The below is required but why? FIXME */ + sd_config_write8(cnf, shift, CNF_STOP_CLK_CTL, 0x1f); + + /* Power down SD bus */ + sd_config_write8(cnf, shift, CNF_PWR_CTL_2, 0x00); + + return 0; +} +EXPORT_SYMBOL(tmio_core_mmc_enable); + +int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base) +{ + + /* Enable the MMC/SD Control registers */ + sd_config_write16(cnf, shift, CNF_CMD, SDCREN); + sd_config_write32(cnf, shift, CNF_CTL_BASE, base & 0xfffe); + + return 0; +} +EXPORT_SYMBOL(tmio_core_mmc_resume); + +void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state) +{ + sd_config_write8(cnf, shift, CNF_PWR_CTL_2, state ? 
0x02 : 0x00); +} +EXPORT_SYMBOL(tmio_core_mmc_pwr); + +void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state) +{ + sd_config_write8(cnf, shift, CNF_SD_CLK_MODE, state ? 1 : 0); +} +EXPORT_SYMBOL(tmio_core_mmc_clk_div); + diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 7cccc8523747..e22c3fa3516a 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -46,7 +46,9 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock) clk |= 0x100; } - sd_config_write8(host, CNF_SD_CLK_MODE, clk >> 22); + if (host->set_clk_div) + host->set_clk_div(host->pdev, (clk>>22) & 1); + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clk & 0x1ff); } @@ -427,12 +429,13 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) /* Power sequence - OFF -> ON -> UP */ switch (ios->power_mode) { case MMC_POWER_OFF: /* power down SD bus */ - sd_config_write8(host, CNF_PWR_CTL_2, 0x00); + if (host->set_pwr) + host->set_pwr(host->pdev, 0); tmio_mmc_clk_stop(host); break; case MMC_POWER_ON: /* power up SD bus */ - - sd_config_write8(host, CNF_PWR_CTL_2, 0x02); + if (host->set_pwr) + host->set_pwr(host->pdev, 1); break; case MMC_POWER_UP: /* start bus clock */ tmio_mmc_clk_start(host); @@ -485,21 +488,15 @@ static int tmio_mmc_resume(struct platform_device *dev) { struct mfd_cell *cell = (struct mfd_cell *)dev->dev.platform_data; struct mmc_host *mmc = platform_get_drvdata(dev); - struct tmio_mmc_host *host = mmc_priv(mmc); int ret = 0; /* Tell the MFD core we are ready to be enabled */ - if (cell->enable) { - ret = cell->enable(dev); + if (cell->resume) { + ret = cell->resume(dev); if (ret) goto out; } - /* Enable the MMC/SD Control registers */ - sd_config_write16(host, CNF_CMD, SDCREN); - sd_config_write32(host, CNF_CTL_BASE, - (dev->resource[0].start >> host->bus_shift) & 0xfffe); - mmc_resume_host(mmc); out: @@ -514,17 +511,16 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) { struct 
mfd_cell *cell = (struct mfd_cell *)dev->dev.platform_data; struct tmio_mmc_data *pdata; - struct resource *res_ctl, *res_cnf; + struct resource *res_ctl; struct tmio_mmc_host *host; struct mmc_host *mmc; int ret = -EINVAL; - if (dev->num_resources != 3) + if (dev->num_resources != 2) goto out; res_ctl = platform_get_resource(dev, IORESOURCE_MEM, 0); - res_cnf = platform_get_resource(dev, IORESOURCE_MEM, 1); - if (!res_ctl || !res_cnf) + if (!res_ctl) goto out; pdata = cell->driver_data; @@ -539,8 +535,12 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) host = mmc_priv(mmc); host->mmc = mmc; + host->pdev = dev; platform_set_drvdata(dev, mmc); + host->set_pwr = pdata->set_pwr; + host->set_clk_div = pdata->set_clk_div; + /* SD control register space size is 0x200, 0x400 for bus_shift=1 */ host->bus_shift = resource_size(res_ctl) >> 10; @@ -548,10 +548,6 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) if (!host->ctl) goto host_free; - host->cnf = ioremap(res_cnf->start, resource_size(res_cnf)); - if (!host->cnf) - goto unmap_ctl; - mmc->ops = &tmio_mmc_ops; mmc->caps = MMC_CAP_4_BIT_DATA; mmc->f_max = pdata->hclk; @@ -562,23 +558,9 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) if (cell->enable) { ret = cell->enable(dev); if (ret) - goto unmap_cnf; + goto unmap_ctl; } - /* Enable the MMC/SD Control registers */ - sd_config_write16(host, CNF_CMD, SDCREN); - sd_config_write32(host, CNF_CTL_BASE, - (dev->resource[0].start >> host->bus_shift) & 0xfffe); - - /* Disable SD power during suspend */ - sd_config_write8(host, CNF_PWR_CTL_3, 0x01); - - /* The below is required but why? 
FIXME */ - sd_config_write8(host, CNF_STOP_CLK_CTL, 0x1f); - - /* Power down SD bus*/ - sd_config_write8(host, CNF_PWR_CTL_2, 0x00); - tmio_mmc_clk_stop(host); reset(host); @@ -586,14 +568,14 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) if (ret >= 0) host->irq = ret; else - goto unmap_cnf; + goto unmap_ctl; disable_mmc_irqs(host, TMIO_MASK_ALL); ret = request_irq(host->irq, tmio_mmc_irq, IRQF_DISABLED | IRQF_TRIGGER_FALLING, dev_name(&dev->dev), host); if (ret) - goto unmap_cnf; + goto unmap_ctl; mmc_add_host(mmc); @@ -605,8 +587,6 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) return 0; -unmap_cnf: - iounmap(host->cnf); unmap_ctl: iounmap(host->ctl); host_free: @@ -626,7 +606,6 @@ static int __devexit tmio_mmc_remove(struct platform_device *dev) mmc_remove_host(mmc); free_irq(host->irq, host); iounmap(host->ctl); - iounmap(host->cnf); mmc_free_host(mmc); } diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 9fa998594974..692dc23363b9 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -11,26 +11,6 @@ #include -#define CNF_CMD 0x04 -#define CNF_CTL_BASE 0x10 -#define CNF_INT_PIN 0x3d -#define CNF_STOP_CLK_CTL 0x40 -#define CNF_GCLK_CTL 0x41 -#define CNF_SD_CLK_MODE 0x42 -#define CNF_PIN_STATUS 0x44 -#define CNF_PWR_CTL_1 0x48 -#define CNF_PWR_CTL_2 0x49 -#define CNF_PWR_CTL_3 0x4a -#define CNF_CARD_DETECT_MODE 0x4c -#define CNF_SD_SLOT 0x50 -#define CNF_EXT_GCLK_CTL_1 0xf0 -#define CNF_EXT_GCLK_CTL_2 0xf1 -#define CNF_EXT_GCLK_CTL_3 0xf9 -#define CNF_SD_LED_EN_1 0xfa -#define CNF_SD_LED_EN_2 0xfe - -#define SDCREN 0x2 /* Enable access to MMC CTL regs. 
(flag in COMMAND_REG)*/ - #define CTL_SD_CMD 0x00 #define CTL_ARG_REG 0x04 #define CTL_STOP_INTERNAL_ACTION 0x08 @@ -110,7 +90,6 @@ struct tmio_mmc_host { - void __iomem *cnf; void __iomem *ctl; unsigned long bus_shift; struct mmc_command *cmd; @@ -119,10 +98,16 @@ struct tmio_mmc_host { struct mmc_host *mmc; int irq; + /* Callbacks for clock / power control */ + void (*set_pwr)(struct platform_device *host, int state); + void (*set_clk_div)(struct platform_device *host, int state); + /* pio related stuff */ struct scatterlist *sg_ptr; unsigned int sg_len; unsigned int sg_off; + + struct platform_device *pdev; }; #include @@ -163,25 +148,6 @@ static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); } -static inline void sd_config_write8(struct tmio_mmc_host *host, int addr, - u8 val) -{ - writeb(val, host->cnf + (addr << host->bus_shift)); -} - -static inline void sd_config_write16(struct tmio_mmc_host *host, int addr, - u16 val) -{ - writew(val, host->cnf + (addr << host->bus_shift)); -} - -static inline void sd_config_write32(struct tmio_mmc_host *host, int addr, - u32 val) -{ - writew(val, host->cnf + (addr << host->bus_shift)); - writew(val >> 16, host->cnf + ((addr + 2) << host->bus_shift)); -} - #include #include diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 6b9c5d06690c..9cb1834deffa 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -2,6 +2,8 @@ #define MFD_TMIO_H #include +#include +#include #define tmio_ioread8(addr) readb(addr) #define tmio_ioread16(addr) readw(addr) @@ -18,11 +20,48 @@ writew((val) >> 16, (addr) + 2); \ } while (0) +#define CNF_CMD 0x04 +#define CNF_CTL_BASE 0x10 +#define CNF_INT_PIN 0x3d +#define CNF_STOP_CLK_CTL 0x40 +#define CNF_GCLK_CTL 0x41 +#define CNF_SD_CLK_MODE 0x42 +#define CNF_PIN_STATUS 0x44 +#define CNF_PWR_CTL_1 0x48 +#define CNF_PWR_CTL_2 0x49 +#define CNF_PWR_CTL_3 0x4a +#define 
CNF_CARD_DETECT_MODE 0x4c +#define CNF_SD_SLOT 0x50 +#define CNF_EXT_GCLK_CTL_1 0xf0 +#define CNF_EXT_GCLK_CTL_2 0xf1 +#define CNF_EXT_GCLK_CTL_3 0xf9 +#define CNF_SD_LED_EN_1 0xfa +#define CNF_SD_LED_EN_2 0xfe + +#define SDCREN 0x2 /* Enable access to MMC CTL regs. (flag in COMMAND_REG)*/ + +#define sd_config_write8(base, shift, reg, val) \ + tmio_iowrite8((val), (base) + ((reg) << (shift))) +#define sd_config_write16(base, shift, reg, val) \ + tmio_iowrite16((val), (base) + ((reg) << (shift))) +#define sd_config_write32(base, shift, reg, val) \ + do { \ + tmio_iowrite16((val), (base) + ((reg) << (shift))); \ + tmio_iowrite16((val) >> 16, (base) + ((reg + 2) << (shift))); \ + } while (0) + +int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); +int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); +void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state); +void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); + /* * data for the MMC controller */ struct tmio_mmc_data { const unsigned int hclk; + void (*set_pwr)(struct platform_device *host, int state); + void (*set_clk_div)(struct platform_device *host, int state); }; /* -- cgit v1.2.3 From ec51b7f538c440bfa5a4d538133c659071c02155 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Tue, 19 Jan 2010 00:27:58 -0800 Subject: Input: ad7879 - support auxiliary GPIOs via gpiolib Drop the simple fancy sysfs hooks for the aux GPIOs and expose these via the gpiolib interface so that other drivers can use them. 
Signed-off-by: Michael Hennerich Signed-off-by: Mike Frysinger Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ad7879.c | 197 ++++++++++++++++++++++++++----------- include/linux/spi/ad7879.h | 12 ++- 2 files changed, 149 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c index c21e6d3a8844..794d070c6900 100644 --- a/drivers/input/touchscreen/ad7879.c +++ b/drivers/input/touchscreen/ad7879.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -132,7 +133,9 @@ struct ad7879 { struct input_dev *input; struct work_struct work; struct timer_list timer; - +#ifdef CONFIG_GPIOLIB + struct gpio_chip gc; +#endif struct mutex mutex; unsigned disabled:1; /* P: mutex */ @@ -150,11 +153,9 @@ struct ad7879 { u8 median; u16 x_plate_ohms; u16 pressure_max; - u16 gpio_init; u16 cmd_crtl1; u16 cmd_crtl2; u16 cmd_crtl3; - unsigned gpio:1; }; static int ad7879_read(bus_device *, u8); @@ -237,24 +238,6 @@ static irqreturn_t ad7879_irq(int irq, void *handle) static void ad7879_setup(struct ad7879 *ts) { - ts->cmd_crtl3 = AD7879_YPLUS_BIT | - AD7879_XPLUS_BIT | - AD7879_Z2_BIT | - AD7879_Z1_BIT | - AD7879_TEMPMASK_BIT | - AD7879_AUXVBATMASK_BIT | - AD7879_GPIOALERTMASK_BIT; - - ts->cmd_crtl2 = AD7879_PM(AD7879_PM_DYN) | AD7879_DFR | - AD7879_AVG(ts->averaging) | - AD7879_MFS(ts->median) | - AD7879_FCD(ts->first_conversion_delay) | - ts->gpio_init; - - ts->cmd_crtl1 = AD7879_MODE_INT | AD7879_MODE_SEQ1 | - AD7879_ACQ(ts->acquisition_time) | - AD7879_TMR(ts->pen_down_acc_interval); - ad7879_write(ts->bus, AD7879_REG_CTRL2, ts->cmd_crtl2); ad7879_write(ts->bus, AD7879_REG_CTRL3, ts->cmd_crtl3); ad7879_write(ts->bus, AD7879_REG_CTRL1, ts->cmd_crtl1); @@ -324,48 +307,132 @@ static ssize_t ad7879_disable_store(struct device *dev, static DEVICE_ATTR(disable, 0664, ad7879_disable_show, ad7879_disable_store); -static ssize_t ad7879_gpio_show(struct device *dev, - struct 
device_attribute *attr, char *buf) +static struct attribute *ad7879_attributes[] = { + &dev_attr_disable.attr, + NULL +}; + +static const struct attribute_group ad7879_attr_group = { + .attrs = ad7879_attributes, +}; + +#ifdef CONFIG_GPIOLIB +static int ad7879_gpio_direction_input(struct gpio_chip *chip, + unsigned gpio) { - struct ad7879 *ts = dev_get_drvdata(dev); + struct ad7879 *ts = container_of(chip, struct ad7879, gc); + int err; - return sprintf(buf, "%u\n", ts->gpio); + mutex_lock(&ts->mutex); + ts->cmd_crtl2 |= AD7879_GPIO_EN | AD7879_GPIODIR | AD7879_GPIOPOL; + err = ad7879_write(ts->bus, AD7879_REG_CTRL2, ts->cmd_crtl2); + mutex_unlock(&ts->mutex); + + return err; } -static ssize_t ad7879_gpio_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static int ad7879_gpio_direction_output(struct gpio_chip *chip, + unsigned gpio, int level) { - struct ad7879 *ts = dev_get_drvdata(dev); - unsigned long val; - int error; + struct ad7879 *ts = container_of(chip, struct ad7879, gc); + int err; - error = strict_strtoul(buf, 10, &val); - if (error) - return error; + mutex_lock(&ts->mutex); + ts->cmd_crtl2 &= ~AD7879_GPIODIR; + ts->cmd_crtl2 |= AD7879_GPIO_EN | AD7879_GPIOPOL; + if (level) + ts->cmd_crtl2 |= AD7879_GPIO_DATA; + else + ts->cmd_crtl2 &= ~AD7879_GPIO_DATA; + + err = ad7879_write(ts->bus, AD7879_REG_CTRL2, ts->cmd_crtl2); + mutex_unlock(&ts->mutex); + + return err; +} + +static int ad7879_gpio_get_value(struct gpio_chip *chip, unsigned gpio) +{ + struct ad7879 *ts = container_of(chip, struct ad7879, gc); + u16 val; mutex_lock(&ts->mutex); - ts->gpio = !!val; - error = ad7879_write(ts->bus, AD7879_REG_CTRL2, - ts->gpio ? - ts->cmd_crtl2 & ~AD7879_GPIO_DATA : - ts->cmd_crtl2 | AD7879_GPIO_DATA); + val = ad7879_read(ts->bus, AD7879_REG_CTRL2); mutex_unlock(&ts->mutex); - return error ? 
: count; + return !!(val & AD7879_GPIO_DATA); } -static DEVICE_ATTR(gpio, 0664, ad7879_gpio_show, ad7879_gpio_store); +static void ad7879_gpio_set_value(struct gpio_chip *chip, + unsigned gpio, int value) +{ + struct ad7879 *ts = container_of(chip, struct ad7879, gc); -static struct attribute *ad7879_attributes[] = { - &dev_attr_disable.attr, - &dev_attr_gpio.attr, - NULL -}; + mutex_lock(&ts->mutex); + if (value) + ts->cmd_crtl2 |= AD7879_GPIO_DATA; + else + ts->cmd_crtl2 &= ~AD7879_GPIO_DATA; -static const struct attribute_group ad7879_attr_group = { - .attrs = ad7879_attributes, -}; + ad7879_write(ts->bus, AD7879_REG_CTRL2, ts->cmd_crtl2); + mutex_unlock(&ts->mutex); +} + +static int __devinit ad7879_gpio_add(struct device *dev) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + struct ad7879_platform_data *pdata = dev->platform_data; + int ret = 0; + + if (pdata->gpio_export) { + ts->gc.direction_input = ad7879_gpio_direction_input; + ts->gc.direction_output = ad7879_gpio_direction_output; + ts->gc.get = ad7879_gpio_get_value; + ts->gc.set = ad7879_gpio_set_value; + ts->gc.can_sleep = 1; + ts->gc.base = pdata->gpio_base; + ts->gc.ngpio = 1; + ts->gc.label = "AD7879-GPIO"; + ts->gc.owner = THIS_MODULE; + ts->gc.dev = dev; + + ret = gpiochip_add(&ts->gc); + if (ret) + dev_err(dev, "failed to register gpio %d\n", + ts->gc.base); + } + + return ret; +} + +/* + * We mark ad7879_gpio_remove inline so there is a chance the code + * gets discarded when not needed. We can't do __devinit/__devexit + * markup since it is used in both probe and remove methods. 
+ */ +static inline void ad7879_gpio_remove(struct device *dev) +{ + struct ad7879 *ts = dev_get_drvdata(dev); + struct ad7879_platform_data *pdata = dev->platform_data; + int ret; + + if (pdata->gpio_export) { + ret = gpiochip_remove(&ts->gc); + if (ret) + dev_err(dev, "failed to remove gpio %d\n", + ts->gc.base); + } +} +#else +static inline int ad7879_gpio_add(struct device *dev) +{ + return 0; +} + +static inline void ad7879_gpio_remove(struct device *dev) +{ +} +#endif static int __devinit ad7879_construct(bus_device *bus, struct ad7879 *ts) { @@ -403,12 +470,6 @@ static int __devinit ad7879_construct(bus_device *bus, struct ad7879 *ts) ts->pen_down_acc_interval = pdata->pen_down_acc_interval; ts->median = pdata->median; - if (pdata->gpio_output) - ts->gpio_init = AD7879_GPIO_EN | - (pdata->gpio_default ? 0 : AD7879_GPIO_DATA); - else - ts->gpio_init = AD7879_GPIO_EN | AD7879_GPIODIR; - snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(&bus->dev)); input_dev->name = "AD7879 Touchscreen"; @@ -446,6 +507,23 @@ static int __devinit ad7879_construct(bus_device *bus, struct ad7879 *ts) goto err_free_mem; } + ts->cmd_crtl3 = AD7879_YPLUS_BIT | + AD7879_XPLUS_BIT | + AD7879_Z2_BIT | + AD7879_Z1_BIT | + AD7879_TEMPMASK_BIT | + AD7879_AUXVBATMASK_BIT | + AD7879_GPIOALERTMASK_BIT; + + ts->cmd_crtl2 = AD7879_PM(AD7879_PM_DYN) | AD7879_DFR | + AD7879_AVG(ts->averaging) | + AD7879_MFS(ts->median) | + AD7879_FCD(ts->first_conversion_delay); + + ts->cmd_crtl1 = AD7879_MODE_INT | AD7879_MODE_SEQ1 | + AD7879_ACQ(ts->acquisition_time) | + AD7879_TMR(ts->pen_down_acc_interval); + ad7879_setup(ts); err = request_irq(bus->irq, ad7879_irq, @@ -460,15 +538,21 @@ static int __devinit ad7879_construct(bus_device *bus, struct ad7879 *ts) if (err) goto err_free_irq; - err = input_register_device(input_dev); + err = ad7879_gpio_add(&bus->dev); if (err) goto err_remove_attr; + err = input_register_device(input_dev); + if (err) + goto err_remove_gpio; + dev_info(&bus->dev, "Rev.%d 
touchscreen, irq %d\n", revid >> 8, bus->irq); return 0; +err_remove_gpio: + ad7879_gpio_remove(&bus->dev); err_remove_attr: sysfs_remove_group(&bus->dev.kobj, &ad7879_attr_group); err_free_irq: @@ -481,6 +565,7 @@ err_free_mem: static int __devexit ad7879_destroy(bus_device *bus, struct ad7879 *ts) { + ad7879_gpio_remove(&bus->dev); ad7879_disable(ts); sysfs_remove_group(&ts->bus->dev.kobj, &ad7879_attr_group); free_irq(ts->bus->irq, ts); diff --git a/include/linux/spi/ad7879.h b/include/linux/spi/ad7879.h index 4231104c9afa..6334cee1a3be 100644 --- a/include/linux/spi/ad7879.h +++ b/include/linux/spi/ad7879.h @@ -28,8 +28,12 @@ struct ad7879_platform_data { * 1 = 4, 2 = 8, 3 = 16 (median > averaging) */ u8 median; - /* 1 = AUX/VBAT/GPIO set to GPIO Output */ - u8 gpio_output; - /* Initial GPIO pin state (valid if gpio_output = 1) */ - u8 gpio_default; + /* 1 = AUX/VBAT/GPIO export GPIO to gpiolib + * requires CONFIG_GPIOLIB + */ + bool gpio_export; + /* identifies the first GPIO number handled by this chip; + * or, if negative, requests dynamic ID allocation. + */ + s32 gpio_base; }; -- cgit v1.2.3 From d2d4e780aff2fab46a792ebc89f80d1a6872b325 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:20:28 +0000 Subject: ide: add drive->pio_mode field Add pio_mode field to ide_drive_t matching pio_mode field used in struct ata_device. The validity of the field is restricted to ->set_pio_mode method only currently in IDE subsystem. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. 
Miller --- drivers/ide/ide-devsets.c | 2 ++ drivers/ide/ide-probe.c | 2 ++ drivers/ide/ide-xfer-mode.c | 3 +++ include/linux/ide.h | 1 + 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 1099bf7cf968..cb3341ce655c 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -105,6 +105,8 @@ static int set_pio_mode(ide_drive_t *drive, int arg) return -ENOSYS; if (set_pio_mode_abuse(drive->hwif, arg)) { + drive->pio_mode = arg + XFER_PIO_0; + if (arg == 8 || arg == 9) { unsigned long flags; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 4d76ba473097..9a9f10f4cf9f 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1043,6 +1043,8 @@ static void ide_port_init_devices(ide_hwif_t *hwif) if (hwif->host_flags & IDE_HFLAG_NO_UNMASK_IRQS) drive->dev_flags |= IDE_DFLAG_NO_UNMASK; + drive->pio_mode = XFER_PIO_0; + if (port_ops && port_ops->init_dev) port_ops->init_dev(drive); } diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 46d203ce60cc..cdae463f6b41 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -135,6 +135,7 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) * set transfer mode on the device in ->set_pio_mode method... 
*/ if (port_ops->set_dma_mode == NULL) { + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return 0; } @@ -142,9 +143,11 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { if (ide_config_drive_speed(drive, mode)) return -1; + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return 0; } else { + drive->pio_mode = mode; port_ops->set_pio_mode(drive, mode - XFER_PIO_0); return ide_config_drive_speed(drive, mode); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 0ec612959042..b5d2e9655059 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -515,6 +515,7 @@ struct ide_drive_s { u8 init_speed; /* transfer rate set at boot */ u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ + u8 pio_mode; /* for ->set_pio_mode _only_ */ u8 dn; /* now wide spread use */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ -- cgit v1.2.3 From 3fccaa192b9501e79a57e02e62b6bf420d2b461e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:20:35 +0000 Subject: ide: add drive->dma_mode field Add dma_mode field to ide_drive_t matching dma_mode field used in struct ata_device. The validity of the field is restricted to ->set_dma_mode method only currently in IDE subsystem. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. 
Miller --- drivers/ide/aec62xx.c | 1 + drivers/ide/ide-xfer-mode.c | 2 ++ include/linux/ide.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 878f8ec6dbe1..4c869872eb9a 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -136,6 +136,7 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) { + drive->dma_mode = pio + XFER_PIO_0; drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0); } diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index cdae463f6b41..c2323869d92a 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -167,9 +167,11 @@ int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) { if (ide_config_drive_speed(drive, mode)) return -1; + drive->dma_mode = mode; port_ops->set_dma_mode(drive, mode); return 0; } else { + drive->dma_mode = mode; port_ops->set_dma_mode(drive, mode); return ide_config_drive_speed(drive, mode); } diff --git a/include/linux/ide.h b/include/linux/ide.h index b5d2e9655059..746ef9fdabcb 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -516,6 +516,7 @@ struct ide_drive_s { u8 current_speed; /* current transfer rate set */ u8 desired_speed; /* desired transfer rate set */ u8 pio_mode; /* for ->set_pio_mode _only_ */ + u8 dma_mode; /* for ->dma_pio_mode _only_ */ u8 dn; /* now wide spread use */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ -- cgit v1.2.3 From e085b3cae85af47eb0a3eda3186bd898310fb322 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 19 Jan 2010 01:44:41 -0800 Subject: ide: change ->set_pio_mode method parameters Change ->set_pio_mode method parameters to match ->set_piomode method used in struct ata_port_operations. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. 
Miller --- drivers/ide/aec62xx.c | 6 +++--- drivers/ide/ali14xx.c | 3 ++- drivers/ide/alim15x3.c | 7 +++---- drivers/ide/amd74xx.c | 4 ++-- drivers/ide/at91_ide.c | 5 +++-- drivers/ide/atiixp.c | 7 ++++--- drivers/ide/au1xxx-ide.c | 5 ++--- drivers/ide/cmd640.c | 3 ++- drivers/ide/cmd64x.c | 4 +++- drivers/ide/cs5520.c | 7 ++++--- drivers/ide/cs5530.c | 7 ++++--- drivers/ide/cs5535.c | 6 +++--- drivers/ide/cs5536.c | 7 ++++--- drivers/ide/cy82c693.c | 5 ++--- drivers/ide/dtc2278.c | 4 ++-- drivers/ide/hpt366.c | 4 ++-- drivers/ide/ht6560b.c | 3 ++- drivers/ide/ide-devsets.c | 4 ++-- drivers/ide/ide-xfer-mode.c | 6 +++--- drivers/ide/it8172.c | 10 +++++----- drivers/ide/it8213.c | 14 +++++++------- drivers/ide/it821x.c | 6 +++--- drivers/ide/jmicron.c | 2 +- drivers/ide/opti621.c | 6 +++--- drivers/ide/palm_bk3710.c | 5 +++-- drivers/ide/pdc202xx_new.c | 4 ++-- drivers/ide/pdc202xx_old.c | 4 ++-- drivers/ide/piix.c | 14 +++++++------- drivers/ide/pmac.c | 5 ++--- drivers/ide/qd65xx.c | 10 ++++------ drivers/ide/sc1200.c | 4 ++-- drivers/ide/scc_pata.c | 6 +++--- drivers/ide/serverworks.c | 5 +++-- drivers/ide/siimage.c | 6 +++--- drivers/ide/sis5513.c | 4 ++-- drivers/ide/sl82c105.c | 5 +++-- drivers/ide/slc90e66.c | 13 +++++++------ drivers/ide/tc86c001.c | 4 ++-- drivers/ide/triflex.c | 4 ++-- drivers/ide/tx4938ide.c | 5 ++--- drivers/ide/tx4939ide.c | 4 ++-- drivers/ide/umc8672.c | 5 +++-- drivers/ide/via82cxxx.c | 6 +++--- include/linux/ide.h | 2 +- 44 files changed, 129 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 4c869872eb9a..3790847361c3 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -134,10 +134,10 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) local_irq_restore(flags); } -static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void aec_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - drive->dma_mode = pio + 
XFER_PIO_0; - drive->hwif->port_ops->set_dma_mode(drive, pio + XFER_PIO_0); + drive->dma_mode = drive->pio_mode; + hwif->port_ops->set_dma_mode(drive, drive->dma_mode); } static int init_chipset_aec62xx(struct pci_dev *dev) diff --git a/drivers/ide/ali14xx.c b/drivers/ide/ali14xx.c index 90da1f953ed0..25b9fe3a9f8e 100644 --- a/drivers/ide/ali14xx.c +++ b/drivers/ide/ali14xx.c @@ -109,13 +109,14 @@ static DEFINE_SPINLOCK(ali14xx_lock); * This function computes timing parameters * and sets controller registers accordingly. */ -static void ali14xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ali14xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int driveNum; int time1, time2; u8 param1, param2, param3, param4; unsigned long flags; int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio); /* calculate timing, according to PIO mode */ diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 8f03cce055fa..28cee1055f76 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -63,15 +63,14 @@ static void ali_fifo_control(ide_hwif_t *hwif, ide_drive_t *drive, int on) /** * ali_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Program the controller for the given PIO mode. */ -static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ali_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int bus_speed = ide_pci_clk ? 
ide_pci_clk : 33; unsigned long T = 1000000 / bus_speed; /* PCI clock based */ @@ -79,7 +78,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 unit = drive->dn & 1; struct ide_timing t; - ide_timing_compute(drive, XFER_PIO_0 + pio, &t, T, 1); + ide_timing_compute(drive, drive->pio_mode, &t, T, 1); t.setup = clamp_val(t.setup, 1, 8) & 7; t.active = clamp_val(t.active, 1, 8) & 7; diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 108e9b676859..3eee7be7ca6f 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -108,9 +108,9 @@ static void amd_set_drive(ide_drive_t *drive, const u8 speed) * amd_set_pio_mode() is a callback from upper layers for PIO-only tuning. */ -static void amd_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void amd_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - amd_set_drive(drive, XFER_PIO_0 + pio); + amd_set_drive(drive, drive->pio_mode); } static void amd7409_cable_detect(struct pci_dev *dev) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index 248219a89a68..000a78e5246c 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -172,11 +172,12 @@ static void at91_ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, leave_16bit(chipselect, mode); } -static void at91_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void at91_ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { struct ide_timing *timing; - u8 chipselect = drive->hwif->select_data; + u8 chipselect = hwif->select_data; int use_iordy = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; pdbg("chipselect %u pio %u\n", chipselect, pio); diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c index 837322b10a4c..b6848dfb93b0 100644 --- a/drivers/ide/atiixp.c +++ b/drivers/ide/atiixp.c @@ -42,19 +42,20 @@ static DEFINE_SPINLOCK(atiixp_lock); /** * atiixp_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO 
mode. */ -static void atiixp_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void atiixp_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long flags; int timing_shift = (drive->dn ^ 1) * 8; u32 pio_timing_data; u16 pio_mode_data; + const u8 pio = drive->pio_mode - XFER_PIO_0; spin_lock_irqsave(&atiixp_lock, flags); diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index 87cef0c440ad..c90e9b0a9f6e 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -99,12 +99,11 @@ static void au1xxx_output_data(ide_drive_t *drive, struct ide_cmd *cmd, } #endif -static void au1xxx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void au1xxx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int mem_sttime = 0, mem_stcfg = au_readl(MEM_STCFG2); - /* set pio mode! */ - switch(pio) { + switch (drive->pio_mode - XFER_PIO_0) { case 0: mem_sttime = SBC_IDE_TIMING(PIO0); diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c index 1a32d62ed86b..c7d46a3d347a 100644 --- a/drivers/ide/cmd640.c +++ b/drivers/ide/cmd640.c @@ -572,9 +572,10 @@ static void cmd640_set_mode(ide_drive_t *drive, unsigned int index, program_drive_counts(drive, index); } -static void cmd640_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cmd640_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned int index = 0, cycle_time; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 b; switch (pio) { diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index 9f89f3116df0..0b11745937e7 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -127,8 +127,10 @@ static void cmd64x_program_timings(ide_drive_t *drive, u8 mode) * Special cases are 8: prefetch off, 9: prefetch on (both never worked) */ -static void cmd64x_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cmd64x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + 
const u8 pio = drive->pio_mode - XFER_PIO_0; + /* * Filter out the prefetch control values * to prevent PIO5 from being programmed diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index 09f98ed0731f..b8094f049f3e 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -57,11 +57,11 @@ static struct pio_clocks cs5520_pio_clocks[]={ {1, 2, 1} }; -static void cs5520_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5520_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *pdev = to_pci_dev(hwif->dev); int controller = drive->dn > 1 ? 1 : 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* 8bit CAT/CRT - 8bit command timing for channel */ pci_write_config_byte(pdev, 0x62 + controller, @@ -85,7 +85,8 @@ static void cs5520_set_dma_mode(ide_drive_t *drive, const u8 speed) { printk(KERN_ERR "cs55x0: bad ide timing.\n"); - cs5520_set_pio_mode(drive, 0); + drive->pio_mode = XFER_PIO_0 + 0; + cs5520_set_pio_mode(drive->hwif, drive); } static const struct ide_port_ops cs5520_port_ops = { diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index 40bf05eddf6e..4ced40255ad6 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -41,8 +41,8 @@ static unsigned int cs5530_pio_timings[2][5] = { /** * cs5530_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Handles setting of PIO mode for the chipset. * @@ -50,10 +50,11 @@ static unsigned int cs5530_pio_timings[2][5] = { * will have valid default PIO timings set up before we get here. 
*/ -static void cs5530_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5530_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - unsigned long basereg = CS5530_BASEREG(drive->hwif); + unsigned long basereg = CS5530_BASEREG(hwif); unsigned int format = (inl(basereg + 4) >> 31) & 1; + const u8 pio = drive->pio_mode - XFER_PIO_0; outl(cs5530_pio_timings[format][pio], basereg + ((drive->dn & 1)<<3)); } diff --git a/drivers/ide/cs5535.c b/drivers/ide/cs5535.c index b883838adc24..7974415ea89f 100644 --- a/drivers/ide/cs5535.c +++ b/drivers/ide/cs5535.c @@ -142,15 +142,15 @@ static void cs5535_set_dma_mode(ide_drive_t *drive, const u8 speed) /** * cs5535_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * A callback from the upper layers for PIO-only tuning. */ -static void cs5535_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5535_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - cs5535_set_speed(drive, XFER_PIO_0 + pio); + cs5535_set_speed(drive, drive->pio_mode); } static u8 cs5535_cable_detect(ide_hwif_t *hwif) diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index 9623b852c616..b518ef0e9a35 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -125,11 +125,11 @@ static u8 cs5536_cable_detect(ide_hwif_t *hwif) /** * cs5536_set_pio_mode - PIO timing setup + * @hwif: ATA port * @drive: ATA device - * @pio: PIO mode number */ -static void cs5536_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cs5536_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 drv_timings[5] = { 0x98, 0x55, 0x32, 0x21, 0x20, @@ -143,11 +143,12 @@ static void cs5536_set_pio_mode(ide_drive_t *drive, const u8 pio) 0x99, 0x92, 0x90, 0x22, 0x20, }; - struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); + struct pci_dev *pdev = to_pci_dev(hwif->dev); ide_drive_t *pair = ide_get_pair_dev(drive); int cshift = (drive->dn & 1) ? 
IDE_CAST_D1_SHIFT : IDE_CAST_D0_SHIFT; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 cast; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 cmd_pio = pio; if (pair) diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c index fbf3dcc26577..ead65c394f00 100644 --- a/drivers/ide/cy82c693.c +++ b/drivers/ide/cy82c693.c @@ -80,9 +80,8 @@ static void cy82c693_set_dma_mode(ide_drive_t *drive, const u8 mode) outb(data, CY82_DATA_PORT); } -static void cy82c693_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void cy82c693_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int bus_speed = ide_pci_clk ? ide_pci_clk : 33; const unsigned long T = 1000000 / bus_speed; @@ -101,7 +100,7 @@ static void cy82c693_set_pio_mode(ide_drive_t *drive, const u8 pio) } } - ide_timing_compute(drive, XFER_PIO_0 + pio, &t, T, 1); + ide_timing_compute(drive, drive->pio_mode, &t, T, 1); time_16 = clamp_val(t.recover - 1, 0, 15) | (clamp_val(t.active - 1, 0, 15) << 4); diff --git a/drivers/ide/dtc2278.c b/drivers/ide/dtc2278.c index c6b138122981..6929f7fce93a 100644 --- a/drivers/ide/dtc2278.c +++ b/drivers/ide/dtc2278.c @@ -68,11 +68,11 @@ static void sub22 (char b, char c) static DEFINE_SPINLOCK(dtc2278_lock); -static void dtc2278_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void dtc2278_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long flags; - if (pio >= 3) { + if (drive->pio_mode >= XFER_PIO_3) { spin_lock_irqsave(&dtc2278_lock, flags); /* * This enables PIO mode4 (3?) 
on the first interface diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 4d90ac2dbb1b..f1dec519a9e6 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -651,9 +651,9 @@ static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) pci_write_config_dword(dev, itr_addr, new_itr); } -static void hpt3xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void hpt3xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - hpt3xx_set_mode(drive, XFER_PIO_0 + pio); + hpt3xx_set_mode(drive, drive->pio_mode); } static void hpt3xx_maskproc(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c index aafed8060e17..d81e49680c3f 100644 --- a/drivers/ide/ht6560b.c +++ b/drivers/ide/ht6560b.c @@ -279,9 +279,10 @@ static void ht_set_prefetch(ide_drive_t *drive, u8 state) #endif } -static void ht6560b_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void ht6560b_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long flags, config; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 timing; switch (pio) { diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index cb3341ce655c..c6935c78757c 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -112,10 +112,10 @@ static int set_pio_mode(ide_drive_t *drive, int arg) /* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */ spin_lock_irqsave(&hwif->lock, flags); - port_ops->set_pio_mode(drive, arg); + port_ops->set_pio_mode(hwif, drive); spin_unlock_irqrestore(&hwif->lock, flags); } else - port_ops->set_pio_mode(drive, arg); + port_ops->set_pio_mode(hwif, drive); } else { int keep_dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index c2323869d92a..a62fb03fc1cc 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -136,7 +136,7 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) */ if (port_ops->set_dma_mode == NULL) { 
drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return 0; } @@ -144,11 +144,11 @@ int ide_set_pio_mode(ide_drive_t *drive, const u8 mode) if (ide_config_drive_speed(drive, mode)) return -1; drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return 0; } else { drive->pio_mode = mode; - port_ops->set_pio_mode(drive, mode - XFER_PIO_0); + port_ops->set_pio_mode(hwif, drive); return ide_config_drive_speed(drive, mode); } } diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c index 0d266a5b524d..9dfdc8741a7b 100644 --- a/drivers/ide/it8172.c +++ b/drivers/ide/it8172.c @@ -37,12 +37,12 @@ #define DRV_NAME "IT8172" -static void it8172_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it8172_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u16 drive_enables; u32 drive_timing; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * The highest value of DIOR/DIOW pulse width and recovery time @@ -98,14 +98,14 @@ static void it8172_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x4a, reg4a | u_speed); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); pci_write_config_byte(dev, 0x4a, reg4a & ~a_speed); - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; - it8172_set_pio_mode(drive, pio); + it8172_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c index 47976167796a..492c07d5f4f3 100644 --- a/drivers/ide/it8213.c +++ b/drivers/ide/it8213.c @@ -17,15 +17,14 @@ /** * it8213_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO mode. 
*/ -static void it8213_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it8213_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = 0x40; @@ -35,6 +34,7 @@ static void it8213_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 slave_data; static DEFINE_SPINLOCK(tune_lock); int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; static const u8 timings[][2] = { { 0, 0 }, @@ -120,7 +120,6 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x54, reg54 & ~v_flag); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); @@ -132,11 +131,12 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - it8213_set_pio_mode(drive, pio); + it8213_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 51aa745246dc..69becb7b9656 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -228,18 +228,18 @@ static void it821x_clock_strategy(ide_drive_t *drive) /** * it821x_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Tune the host to the desired PIO mode taking into the consideration * the maximum PIO mode supported by the other device on the cable. 
*/ -static void it821x_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void it821x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct it821x_dev *itdev = ide_get_hwifdata(hwif); ide_drive_t *pair = ide_get_pair_dev(drive); + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 unit = drive->dn & 1, set_pio = pio; /* Spec says 89 ref driver uses 88 */ diff --git a/drivers/ide/jmicron.c b/drivers/ide/jmicron.c index bf2be6431b20..ebffb904ed24 100644 --- a/drivers/ide/jmicron.c +++ b/drivers/ide/jmicron.c @@ -80,7 +80,7 @@ static u8 jmicron_cable_detect(ide_hwif_t *hwif) return ATA_CBL_PATA80; } -static void jmicron_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void jmicron_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/opti621.c b/drivers/ide/opti621.c index 2052788fab7a..1a53a4c375ed 100644 --- a/drivers/ide/opti621.c +++ b/drivers/ide/opti621.c @@ -62,12 +62,12 @@ static u8 read_reg(int reg) return ret; } -static void opti621_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void opti621_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; ide_drive_t *pair = ide_get_pair_dev(drive); unsigned long flags; - unsigned long mode = XFER_PIO_0 + pio, pair_mode; + unsigned long mode = drive->pio_mode, pair_mode; + const u8 pio = mode - XFER_PIO_0; u8 tim, misc, addr_pio = pio, clk; /* DRDY is default 2 (by OPTi Databook) */ diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index f8eddf05ecb8..0f262d07c378 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -203,12 +203,13 @@ static void palm_bk3710_set_dma_mode(ide_drive_t *drive, u8 xferspeed) } } -static void palm_bk3710_set_pio_mode(ide_drive_t *drive, u8 pio) +static void palm_bk3710_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned int cycle_time; int is_slave = drive->dn & 1; ide_drive_t *mate; - void __iomem *base = (void 
*)drive->hwif->dma_base; + void __iomem *base = (void *)hwif->dma_base; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * Obtain the drive PIO data for tuning the Palm Chip registers diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index 65ba8239e7b5..874acd2bb6e6 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -167,11 +167,11 @@ static void pdcnew_set_dma_mode(ide_drive_t *drive, const u8 speed) } } -static void pdcnew_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pdcnew_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 adj = (drive->dn & 1) ? 0x08 : 0x00; + const u8 pio = drive->pio_mode - XFER_PIO_0; if (max_dma_rate(dev) == 4) { set_indexed_reg(hwif, 0x0c + adj, pio_timings[pio].reg0c); diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 1d20594ee420..402aab7f3baa 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -76,9 +76,9 @@ static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) } } -static void pdc202xx_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pdc202xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - pdc202xx_set_mode(drive, XFER_PIO_0 + pio); + pdc202xx_set_mode(drive, drive->pio_mode); } static int pdc202xx_test_irq(ide_hwif_t *hwif) diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index bf14f39bd3a7..64b3041daa60 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -59,15 +59,14 @@ static int no_piix_dma; /** * piix_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Set the interface PIO mode based upon the settings done by AMI BIOS. 
*/ -static void piix_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void piix_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = hwif->channel ? 0x42 : 0x40; @@ -77,6 +76,7 @@ static void piix_set_pio_mode(ide_drive_t *drive, const u8 pio) u8 slave_data; static DEFINE_SPINLOCK(tune_lock); int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* ISP RTC */ static const u8 timings[][2]= { @@ -176,7 +176,6 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x54, reg54 & ~v_flag); } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_byte(dev, 0x48, reg48 & ~u_flag); @@ -188,11 +187,12 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - piix_set_pio_mode(drive, pio); + piix_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 7a4e788cab2f..a167968a2d42 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -496,12 +496,11 @@ static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl) /* * Old tuning functions (called on hdparm -p), sets up drive PIO timings */ -static void -pmac_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void pmac_ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *tim = ide_timing_find_mode(XFER_PIO_0 + pio); u32 *timings, t; unsigned accessTicks, recTicks; diff --git 
a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 74696edc8d1d..3f0244fd8e62 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -189,15 +189,13 @@ static void qd_set_timing (ide_drive_t *drive, u8 timing) printk(KERN_DEBUG "%s: %#x\n", drive->name, timing); } -static void qd6500_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void qd6500_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { u16 *id = drive->id; int active_time = 175; int recovery_time = 415; /* worst case values from the dos driver */ - /* - * FIXME: use "pio" value - */ + /* FIXME: use drive->pio_mode value */ if (!qd_find_disk_type(drive, &active_time, &recovery_time) && (id[ATA_ID_OLD_PIO_MODES] & 0xff) && (id[ATA_ID_FIELD_VALID] & 2) && id[ATA_ID_EIDE_PIO] >= 240) { @@ -211,9 +209,9 @@ static void qd6500_set_pio_mode(ide_drive_t *drive, const u8 pio) active_time, recovery_time)); } -static void qd6580_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void qd6580_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; + const u8 pio = drive->pio_mode - XFER_PIO_0; struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio); unsigned int cycle_time; int active_time = 175; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index d467478d68da..bb0166e460ab 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -193,10 +193,10 @@ static int sc1200_dma_end(ide_drive_t *drive) * will have valid default PIO timings set up before we get here. 
*/ -static void sc1200_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sc1200_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; int mode = -1; + const u8 pio = drive->pio_mode - XFER_PIO_0; /* * bad abuse of ->set_pio_mode interface diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 1104bb301eb9..23e16e4460ee 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -199,16 +199,15 @@ scc_ide_outsl(unsigned long port, void *addr, u32 count) /** * scc_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Load the timing settings for this device mode into the * controller. */ -static void scc_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void scc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct scc_ports *ports = ide_get_hwifdata(hwif); unsigned long ctl_base = ports->ctl; unsigned long cckctrl_port = ctl_base + 0xff0; @@ -216,6 +215,7 @@ static void scc_set_pio_mode(ide_drive_t *drive, const u8 pio) unsigned long pioct_port = ctl_base + 0x004; unsigned long reg; int offset; + const u8 pio = drive->pio_mode - XFER_PIO_0; reg = in_be32((void __iomem *)cckctrl_port); if (reg & CCKCTRL_ATACLKOEN) { diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index 657f0433ec50..a56bc51ae032 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -106,12 +106,13 @@ static u8 svwks_csb_check (struct pci_dev *dev) return 0; } -static void svwks_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 pio_modes[] = { 0x5d, 0x47, 0x34, 0x22, 0x20 }; static const u8 drive_pci[] = { 0x41, 0x40, 0x43, 0x42 }; - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 pio = drive->pio_mode - XFER_PIO_0; pci_write_config_byte(dev, 
drive_pci[drive->dn], pio_modes[pio]); diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index d95df528562f..97266958f744 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -229,19 +229,18 @@ static u8 sil_sata_udma_filter(ide_drive_t *drive) /** * sil_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * Load the timing settings for this device mode into the * controller. */ -static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) +static void sil_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u16 tf_speed[] = { 0x328a, 0x2283, 0x1281, 0x10c3, 0x10c1 }; static const u16 data_speed[] = { 0x328a, 0x2283, 0x1104, 0x10c3, 0x10c1 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); ide_drive_t *pair = ide_get_pair_dev(drive); u32 speedt = 0; @@ -249,6 +248,7 @@ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) unsigned long addr = siimage_seldev(drive, 0x04); unsigned long tfaddr = siimage_selreg(hwif, 0x02); unsigned long base = (unsigned long)hwif->hwif_data; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 tf_pio = pio; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; u8 addr_mask = hwif->channel ? (mmio ? 
0xF4 : 0x84) diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index 468706082fb5..5a0192060531 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -290,10 +290,10 @@ static void config_drive_art_rwp(ide_drive_t *drive) pci_write_config_byte(dev, 0x4b, rw_prefetch); } -static void sis_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sis_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { config_drive_art_rwp(drive); - sis_program_timings(drive, XFER_PIO_0 + pio); + sis_program_timings(drive, drive->pio_mode); } static void sis_ata133_program_udma_timings(ide_drive_t *drive, const u8 mode) diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 3c2bbf0057ea..419cd3bc6c84 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -63,12 +63,13 @@ static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio) /* * Configure the chipset for PIO mode. */ -static void sl82c105_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void sl82c105_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long timings = (unsigned long)ide_get_drivedata(drive); int reg = 0x44 + drive->dn * 4; u16 drv_ctrl; + const u8 pio = drive->pio_mode - XFER_PIO_0; drv_ctrl = get_pio_timings(drive, pio); diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index 1ccfb40e7215..019777522cd2 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -18,9 +18,8 @@ static DEFINE_SPINLOCK(slc90e66_lock); -static void slc90e66_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void slc90e66_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int is_slave = drive->dn & 1; int master_port = hwif->channel ? 
0x42 : 0x40; @@ -29,6 +28,8 @@ static void slc90e66_set_pio_mode(ide_drive_t *drive, const u8 pio) u16 master_data; u8 slave_data; int control = 0; + const u8 pio = drive->pio_mode - XFER_PIO_0; + /* ISP RTC */ static const u8 timings[][2] = { { 0, 0 }, @@ -98,7 +99,6 @@ static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) } } else { const u8 mwdma_to_pio[] = { 0, 3, 4 }; - u8 pio; if (reg48 & u_flag) pci_write_config_word(dev, 0x48, reg48 & ~u_flag); @@ -106,11 +106,12 @@ static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) pci_write_config_word(dev, 0x4a, reg4a & ~a_speed); if (speed >= XFER_MW_DMA_0) - pio = mwdma_to_pio[speed - XFER_MW_DMA_0]; + drive->pio_mode = + mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0; else - pio = 2; /* only SWDMA2 is allowed */ + drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */ - slc90e66_set_pio_mode(drive, pio); + slc90e66_set_pio_mode(hwif, drive); } } diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index 05a93d6baecc..f2cb62bf3f22 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -41,9 +41,9 @@ static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) outw(scr, scr_port); } -static void tc86c001_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tc86c001_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - tc86c001_set_mode(drive, XFER_PIO_0 + pio); + tc86c001_set_mode(drive, drive->pio_mode); } /* diff --git a/drivers/ide/triflex.c b/drivers/ide/triflex.c index 8773c3ba7462..d34a7eecdea5 100644 --- a/drivers/ide/triflex.c +++ b/drivers/ide/triflex.c @@ -82,9 +82,9 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) pci_write_config_dword(dev, channel_offset, triflex_timings); } -static void triflex_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void triflex_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - triflex_set_mode(drive, XFER_PIO_0 + pio); + triflex_set_mode(drive, drive->pio_mode); } static const struct 
ide_port_ops triflex_port_ops = { diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c index fd59c0d235b5..326d4683488b 100644 --- a/drivers/ide/tx4938ide.c +++ b/drivers/ide/tx4938ide.c @@ -56,11 +56,10 @@ static void tx4938ide_tune_ebusc(unsigned int ebus_ch, &tx4938_ebuscptr->cr[ebus_ch]); } -static void tx4938ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tx4938ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct tx4938ide_platform_info *pdata = hwif->dev->platform_data; - u8 safe = pio; + u8 safe = drive->pio_mode - XFER_PIO_0; ide_drive_t *pair; pair = ide_get_pair_dev(drive); diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 64b58ecc3f0e..5228a4786de5 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -104,11 +104,11 @@ static void tx4939ide_writeb(u8 val, void __iomem *base, u32 reg) #define TX4939IDE_BASE(hwif) ((void __iomem *)(hwif)->extra_base) -static void tx4939ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void tx4939ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; int is_slave = drive->dn; u32 mask, val; + const u8 pio = drive->pio_mode - XFER_PIO_0; u8 safe = pio; ide_drive_t *pair; diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c index 60f936e2319c..47adcd09cb26 100644 --- a/drivers/ide/umc8672.c +++ b/drivers/ide/umc8672.c @@ -104,10 +104,11 @@ static void umc_set_speeds(u8 speeds[]) speeds[0], speeds[1], speeds[2], speeds[3]); } -static void umc_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void umc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif, *mate = hwif->mate; + ide_hwif_t *mate = hwif->mate; unsigned long uninitialized_var(flags); + const u8 pio = drive->pio_mode - XFER_PIO_0; printk("%s: setting umc8672 to PIO mode%d (speed %d)\n", drive->name, pio, pio_to_umc[pio]); diff --git a/drivers/ide/via82cxxx.c 
b/drivers/ide/via82cxxx.c index fbecf8ea8207..6d995fc9d4f5 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -208,15 +208,15 @@ static void via_set_drive(ide_drive_t *drive, const u8 speed) /** * via_set_pio_mode - set host controller for PIO mode + * @hwif: port * @drive: drive - * @pio: PIO mode number * * A callback from the upper layers for PIO-only tuning. */ -static void via_set_pio_mode(ide_drive_t *drive, const u8 pio) +static void via_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - via_set_drive(drive, XFER_PIO_0 + pio); + via_set_drive(drive, drive->pio_mode); } static struct via_isa_bridge *via_config_find(struct pci_dev **isa) diff --git a/include/linux/ide.h b/include/linux/ide.h index 746ef9fdabcb..803ec306883c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -624,7 +624,7 @@ extern const struct ide_tp_ops default_tp_ops; */ struct ide_port_ops { void (*init_dev)(ide_drive_t *); - void (*set_pio_mode)(ide_drive_t *, const u8); + void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); void (*set_dma_mode)(ide_drive_t *, const u8); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); -- cgit v1.2.3 From 8776168ca2151850164af1de5565d01f7b8b2c53 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 19 Jan 2010 01:45:29 -0800 Subject: ide: change ->set_dma_mode method parameters Change ->set_dma_mode method parameters to match ->set_dmamode method used in struct ata_port_operations. 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/aec62xx.c | 10 +++++----- drivers/ide/alim15x3.c | 6 +++--- drivers/ide/amd74xx.c | 7 ++++--- drivers/ide/atiixp.c | 7 ++++--- drivers/ide/au1xxx-ide.c | 4 ++-- drivers/ide/cmd64x.c | 4 ++-- drivers/ide/cs5520.c | 4 ++-- drivers/ide/cs5530.c | 6 +++--- drivers/ide/cs5535.c | 6 +++--- drivers/ide/cs5536.c | 7 ++++--- drivers/ide/cy82c693.c | 4 ++-- drivers/ide/hpt366.c | 7 ++++--- drivers/ide/icside.c | 3 ++- drivers/ide/ide-xfer-mode.c | 4 ++-- drivers/ide/it8172.c | 4 ++-- drivers/ide/it8213.c | 6 +++--- drivers/ide/it821x.c | 6 ++++-- drivers/ide/jmicron.c | 4 ++-- drivers/ide/palm_bk3710.c | 5 +++-- drivers/ide/pdc202xx_new.c | 4 ++-- drivers/ide/pdc202xx_old.c | 7 ++++--- drivers/ide/piix.c | 6 +++--- drivers/ide/pmac.c | 4 ++-- drivers/ide/sc1200.c | 4 ++-- drivers/ide/scc_pata.c | 6 +++--- drivers/ide/serverworks.c | 4 ++-- drivers/ide/sgiioc4.c | 2 +- drivers/ide/siimage.c | 6 +++--- drivers/ide/sis5513.c | 4 +++- drivers/ide/sl82c105.c | 3 ++- drivers/ide/slc90e66.c | 4 ++-- drivers/ide/tc86c001.c | 7 ++++--- drivers/ide/triflex.c | 8 ++++---- drivers/ide/tx4939ide.c | 4 ++-- drivers/ide/via82cxxx.c | 9 +++++---- include/linux/ide.h | 2 +- 36 files changed, 101 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c index 3790847361c3..57d00caefc86 100644 --- a/drivers/ide/aec62xx.c +++ b/drivers/ide/aec62xx.c @@ -81,15 +81,15 @@ static u8 pci_bus_clock_list_ultra (u8 speed, struct chipset_bus_clock_list_entr return chipset_table->ultra_settings; } -static void aec6210_set_mode(ide_drive_t *drive, const u8 speed) +static void aec6210_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct chipset_bus_clock_list_entry *bus_clock = host->host_priv; u16 d_conf = 0; u8 ultra = 0, ultra_conf = 0; u8 tmp0 = 0, tmp1 = 0, 
tmp2 = 0; + const u8 speed = drive->dma_mode; unsigned long flags; local_irq_save(flags); @@ -109,15 +109,15 @@ static void aec6210_set_mode(ide_drive_t *drive, const u8 speed) local_irq_restore(flags); } -static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) +static void aec6260_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct chipset_bus_clock_list_entry *bus_clock = host->host_priv; u8 unit = drive->dn & 1; u8 tmp1 = 0, tmp2 = 0; u8 ultra = 0, drive_conf = 0, ultra_conf = 0; + const u8 speed = drive->dma_mode; unsigned long flags; local_irq_save(flags); @@ -137,7 +137,7 @@ static void aec6260_set_mode(ide_drive_t *drive, const u8 speed) static void aec_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { drive->dma_mode = drive->pio_mode; - hwif->port_ops->set_dma_mode(drive, drive->dma_mode); + hwif->port_ops->set_dma_mode(hwif, drive); } static int init_chipset_aec62xx(struct pci_dev *dev) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 28cee1055f76..6f0debae4e27 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -121,16 +121,16 @@ static u8 ali_udma_filter(ide_drive_t *drive) /** * ali_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Configure the hardware for the desired IDE transfer mode. 
*/ -static void ali_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void ali_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 speed = drive->dma_mode; u8 speed1 = speed; u8 unit = drive->dn & 1; u8 tmpbyte = 0x00; diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 3eee7be7ca6f..b7e105338205 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -79,14 +79,14 @@ static void amd_set_speed(struct pci_dev *dev, u8 dn, u8 udma_mask, * to a desired transfer mode. It also can be called by upper layers. */ -static void amd_set_drive(ide_drive_t *drive, const u8 speed) +static void amd_set_drive(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); ide_drive_t *peer = ide_get_pair_dev(drive); struct ide_timing t, p; int T, UT; u8 udma_mask = hwif->ultra_mask; + const u8 speed = drive->dma_mode; T = 1000000000 / amd_clock; UT = (udma_mask == ATA_UDMA2) ? T : (T / 2); @@ -110,7 +110,8 @@ static void amd_set_drive(ide_drive_t *drive, const u8 speed) static void amd_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - amd_set_drive(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + amd_set_drive(hwif, drive); } static void amd7409_cable_detect(struct pci_dev *dev) diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c index b6848dfb93b0..15f0ead89f5c 100644 --- a/drivers/ide/atiixp.c +++ b/drivers/ide/atiixp.c @@ -75,21 +75,22 @@ static void atiixp_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * atiixp_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Set a ATIIXP host controller to the desired DMA mode. This involves * programming the right timing data into the PCI configuration space. 
*/ -static void atiixp_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void atiixp_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); + struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long flags; int timing_shift = (drive->dn ^ 1) * 8; u32 tmp32; u16 tmp16; u16 udma_ctl = 0; + const u8 speed = drive->dma_mode; spin_lock_irqsave(&atiixp_lock, flags); diff --git a/drivers/ide/au1xxx-ide.c b/drivers/ide/au1xxx-ide.c index c90e9b0a9f6e..e2fd378ba9de 100644 --- a/drivers/ide/au1xxx-ide.c +++ b/drivers/ide/au1xxx-ide.c @@ -160,11 +160,11 @@ static void au1xxx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) au_writel(mem_stcfg,MEM_STCFG2); } -static void auide_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void auide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int mem_sttime = 0, mem_stcfg = au_readl(MEM_STCFG2); - switch(speed) { + switch (drive->dma_mode) { #ifdef CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA case XFER_MW_DMA_2: mem_sttime = SBC_IDE_TIMING(MDMA2); diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c index 0b11745937e7..a65a69171250 100644 --- a/drivers/ide/cmd64x.c +++ b/drivers/ide/cmd64x.c @@ -141,12 +141,12 @@ static void cmd64x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) cmd64x_program_timings(drive, XFER_PIO_0 + pio); } -static void cmd64x_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cmd64x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 unit = drive->dn & 0x01; u8 regU = 0, pciU = hwif->channel ? UDIDETCR1 : UDIDETCR0; + const u8 speed = drive->dma_mode; pci_read_config_byte(dev, pciU, &regU); regU &= ~(unit ? 
0xCA : 0x35); diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c index b8094f049f3e..2c1e5f7cd261 100644 --- a/drivers/ide/cs5520.c +++ b/drivers/ide/cs5520.c @@ -81,12 +81,12 @@ static void cs5520_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) (cs5520_pio_clocks[pio].assert)); } -static void cs5520_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cs5520_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { printk(KERN_ERR "cs55x0: bad ide timing.\n"); drive->pio_mode = XFER_PIO_0 + 0; - cs5520_set_pio_mode(drive->hwif, drive); + cs5520_set_pio_mode(hwif, drive); } static const struct ide_port_ops cs5520_port_ops = { diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c index 4ced40255ad6..4dc4eb92b076 100644 --- a/drivers/ide/cs5530.c +++ b/drivers/ide/cs5530.c @@ -100,12 +100,12 @@ out: return mask; } -static void cs5530_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cs5530_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long basereg; unsigned int reg, timings = 0; - switch (mode) { + switch (drive->dma_mode) { case XFER_UDMA_0: timings = 0x00921250; break; case XFER_UDMA_1: timings = 0x00911140; break; case XFER_UDMA_2: timings = 0x00911030; break; @@ -113,7 +113,7 @@ static void cs5530_set_dma_mode(ide_drive_t *drive, const u8 mode) case XFER_MW_DMA_1: timings = 0x00012121; break; case XFER_MW_DMA_2: timings = 0x00002020; break; } - basereg = CS5530_BASEREG(drive->hwif); + basereg = CS5530_BASEREG(hwif); reg = inl(basereg + 4); /* get drive0 config register */ timings |= reg & 0x80000000; /* preserve PIO format bit */ if ((drive-> dn & 1) == 0) { /* are we configuring drive0? 
*/ diff --git a/drivers/ide/cs5535.c b/drivers/ide/cs5535.c index 7974415ea89f..740002b2f3e8 100644 --- a/drivers/ide/cs5535.c +++ b/drivers/ide/cs5535.c @@ -129,15 +129,15 @@ static void cs5535_set_speed(ide_drive_t *drive, const u8 speed) /** * cs5535_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Programs the chipset for DMA mode. */ -static void cs5535_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void cs5535_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - cs5535_set_speed(drive, speed); + cs5535_set_speed(drive, drive->dma_mode); } /** diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c index b518ef0e9a35..70871fbc3c0a 100644 --- a/drivers/ide/cs5536.c +++ b/drivers/ide/cs5536.c @@ -173,11 +173,11 @@ static void cs5536_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * cs5536_set_dma_mode - DMA timing setup + * @hwif: ATA port * @drive: ATA device - * @mode: DMA mode */ -static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cs5536_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 udma_timings[6] = { 0xc2, 0xc1, 0xc0, 0xc4, 0xc5, 0xc6, @@ -187,10 +187,11 @@ static void cs5536_set_dma_mode(ide_drive_t *drive, const u8 mode) 0x67, 0x21, 0x20, }; - struct pci_dev *pdev = to_pci_dev(drive->hwif->dev); + struct pci_dev *pdev = to_pci_dev(hwif->dev); int dshift = (drive->dn & 1) ? 
IDE_D1_SHIFT : IDE_D0_SHIFT; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u32 etc; + const u8 mode = drive->dma_mode; cs5536_read(pdev, ETC, &etc); diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c index ead65c394f00..9383f67deae1 100644 --- a/drivers/ide/cy82c693.c +++ b/drivers/ide/cy82c693.c @@ -53,9 +53,9 @@ * set DMA mode a specific channel for CY82C693 */ -static void cy82c693_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void cy82c693_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; + const u8 mode = drive->dma_mode; u8 single = (mode & 0x10) >> 4, index = 0, data = 0; index = hwif->channel ? CY82_INDEX_CHANNEL1 : CY82_INDEX_CHANNEL0; diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index f1dec519a9e6..b885c1d548f5 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -627,14 +627,14 @@ static u32 get_speed_setting(u8 speed, struct hpt_info *info) return info->timings->clock_table[info->clock][i]; } -static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) +static void hpt3xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); struct hpt_timings *t = info->timings; u8 itr_addr = 0x40 + (drive->dn * 4); u32 old_itr = 0; + const u8 speed = drive->dma_mode; u32 new_itr = get_speed_setting(speed, info); u32 itr_mask = speed < XFER_MW_DMA_0 ? t->pio_mask : (speed < XFER_UDMA_0 ? 
t->dma_mask : @@ -653,7 +653,8 @@ static void hpt3xx_set_mode(ide_drive_t *drive, const u8 speed) static void hpt3xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - hpt3xx_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + hpt3xx_set_mode(hwif, drive); } static void hpt3xx_maskproc(ide_drive_t *drive, int mask) diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c index 0f67f1abbbd3..26b6c0a1f772 100644 --- a/drivers/ide/icside.c +++ b/drivers/ide/icside.c @@ -185,10 +185,11 @@ static const expansioncard_ops_t icside_ops_arcin_v6 = { * MW1 80 50 50 150 C * MW2 70 25 25 120 C */ -static void icside_set_dma_mode(ide_drive_t *drive, const u8 xfer_mode) +static void icside_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { unsigned long cycle_time; int use_dma_info = 0; + const u8 xfer_mode = drive->dma_mode; switch (xfer_mode) { case XFER_MW_DMA_2: diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index a62fb03fc1cc..9b549e4d1848 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -168,11 +168,11 @@ int ide_set_dma_mode(ide_drive_t *drive, const u8 mode) if (ide_config_drive_speed(drive, mode)) return -1; drive->dma_mode = mode; - port_ops->set_dma_mode(drive, mode); + port_ops->set_dma_mode(hwif, drive); return 0; } else { drive->dma_mode = mode; - port_ops->set_dma_mode(drive, mode); + port_ops->set_dma_mode(hwif, drive); return ide_config_drive_speed(drive, mode); } } diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c index 9dfdc8741a7b..560e66d07659 100644 --- a/drivers/ide/it8172.c +++ b/drivers/ide/it8172.c @@ -77,14 +77,14 @@ static void it8172_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) pci_write_config_dword(dev, 0x44, drive_timing); } -static void it8172_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it8172_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); int a_speed = 
3 << (drive->dn * 4); int u_flag = 1 << drive->dn; int u_speed = 0; u8 reg48, reg4a; + const u8 speed = drive->dma_mode; pci_read_config_byte(dev, 0x48, &reg48); pci_read_config_byte(dev, 0x4a, &reg4a); diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c index 492c07d5f4f3..46816ba26416 100644 --- a/drivers/ide/it8213.c +++ b/drivers/ide/it8213.c @@ -74,15 +74,14 @@ static void it8213_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * it8213_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the ITE chipset for the DMA mode. */ -static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it8213_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = 0x40; int a_speed = 3 << (drive->dn * 4); @@ -92,6 +91,7 @@ static void it8213_set_dma_mode(ide_drive_t *drive, const u8 speed) int u_speed = 0; u16 reg4042, reg4a; u8 reg48, reg54, reg55; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); pci_read_config_byte(dev, 0x48, &reg48); diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c index 69becb7b9656..56b79194156b 100644 --- a/drivers/ide/it821x.c +++ b/drivers/ide/it821x.c @@ -393,14 +393,16 @@ static int it821x_dma_end(ide_drive_t *drive) /** * it821x_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the ITE chipset for the desired DMA mode. */ -static void it821x_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void it821x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + const u8 speed = drive->dma_mode; + /* * MWDMA tuning is really hard because our MWDMA and PIO * timings are kept in the same place. 
We can switch in the diff --git a/drivers/ide/jmicron.c b/drivers/ide/jmicron.c index ebffb904ed24..74c2c4a6d909 100644 --- a/drivers/ide/jmicron.c +++ b/drivers/ide/jmicron.c @@ -86,13 +86,13 @@ static void jmicron_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * jmicron_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @mode: DMA mode * * As the JMicron snoops for timings we don't need to do anything here. */ -static void jmicron_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void jmicron_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 0f262d07c378..35448c91b8c8 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -188,10 +188,11 @@ static void palm_bk3710_setpiomode(void __iomem *base, ide_drive_t *mate, writel(val32, base + BK3710_REGRCVR); } -static void palm_bk3710_set_dma_mode(ide_drive_t *drive, u8 xferspeed) +static void palm_bk3710_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { int is_slave = drive->dn & 1; - void __iomem *base = (void *)drive->hwif->dma_base; + void __iomem *base = (void *)hwif->dma_base; + const u8 xferspeed = drive->dma_mode; if (xferspeed >= XFER_UDMA_0) { palm_bk3710_setudmamode(base, is_slave, diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c index 874acd2bb6e6..9546fe2a93f7 100644 --- a/drivers/ide/pdc202xx_new.c +++ b/drivers/ide/pdc202xx_new.c @@ -129,11 +129,11 @@ static struct udma_timing { { 0x1a, 0x01, 0xcb }, /* UDMA mode 6 */ }; -static void pdcnew_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void pdcnew_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 adj = (drive->dn & 1) ? 
0x08 : 0x00; + const u8 speed = drive->dma_mode; /* * IDE core issues SETFEATURES_XFER to the drive first (thanks to diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 402aab7f3baa..07cd37516ba6 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -21,11 +21,11 @@ #define DRV_NAME "pdc202xx_old" -static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) +static void pdc202xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 drive_pci = 0x60 + (drive->dn << 2); + const u8 speed = drive->dma_mode; u8 AP = 0, BP = 0, CP = 0; u8 TA = 0, TB = 0, TC = 0; @@ -78,7 +78,8 @@ static void pdc202xx_set_mode(ide_drive_t *drive, const u8 speed) static void pdc202xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - pdc202xx_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + pdc202xx_set_mode(hwif, drive); } static int pdc202xx_test_irq(ide_hwif_t *hwif) diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c index 64b3041daa60..1bdca49e5a03 100644 --- a/drivers/ide/piix.c +++ b/drivers/ide/piix.c @@ -127,16 +127,15 @@ static void piix_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * piix_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Set a PIIX host controller to the desired DMA mode. This involves * programming the right timing data into the PCI configuration space. */ -static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void piix_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = hwif->channel ? 
0x42 : 0x40; int a_speed = 3 << (drive->dn * 4); @@ -147,6 +146,7 @@ static void piix_set_dma_mode(ide_drive_t *drive, const u8 speed) int sitre; u16 reg4042, reg4a; u8 reg48, reg54, reg55; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); sitre = (reg4042 & 0x4000) ? 1 : 0; diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index a167968a2d42..9fae1fb1468b 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -777,14 +777,14 @@ set_timings_mdma(ide_drive_t *drive, int intf_type, u32 *timings, u32 *timings2, #endif } -static void pmac_ide_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void pmac_ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; pmac_ide_hwif_t *pmif = (pmac_ide_hwif_t *)dev_get_drvdata(hwif->gendev.parent); int ret = 0; u32 *timings, *timings2, tl[2]; u8 unit = drive->dn & 1; + const u8 speed = drive->dma_mode; timings = &pmif->timings[unit]; timings2 = &pmif->timings[unit+2]; diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c index bb0166e460ab..134f1fd13866 100644 --- a/drivers/ide/sc1200.c +++ b/drivers/ide/sc1200.c @@ -122,13 +122,13 @@ out: return mask; } -static void sc1200_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void sc1200_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned int reg, timings; unsigned short pci_clock; unsigned int basereg = hwif->channel ? 
0x50 : 0x40; + const u8 mode = drive->dma_mode; static const u32 udma_timing[3][3] = { { 0x00921250, 0x00911140, 0x00911030 }, diff --git a/drivers/ide/scc_pata.c b/drivers/ide/scc_pata.c index 23e16e4460ee..e9d4b441d1c3 100644 --- a/drivers/ide/scc_pata.c +++ b/drivers/ide/scc_pata.c @@ -231,16 +231,15 @@ static void scc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * scc_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Load the timing settings for this device mode into the * controller. */ -static void scc_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void scc_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct scc_ports *ports = ide_get_hwifdata(hwif); unsigned long ctl_base = ports->ctl; unsigned long cckctrl_port = ctl_base + 0xff0; @@ -254,6 +253,7 @@ static void scc_set_dma_mode(ide_drive_t *drive, const u8 speed) int offset, idx; unsigned long reg; unsigned long jcactsel; + const u8 speed = drive->dma_mode; reg = in_be32((void __iomem *)cckctrl_port); if (reg & CCKCTRL_ATACLKOEN) { diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c index a56bc51ae032..35fb8dabb55d 100644 --- a/drivers/ide/serverworks.c +++ b/drivers/ide/serverworks.c @@ -128,14 +128,14 @@ static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) } } -static void svwks_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void svwks_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 udma_modes[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05 }; static const u8 dma_modes[] = { 0x77, 0x21, 0x20 }; static const u8 drive_pci2[] = { 0x45, 0x44, 0x47, 0x46 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); + const u8 speed = drive->dma_mode; u8 unit = drive->dn & 1; u8 ultra_enable = 0, ultra_timing = 0, dma_timing = 0; diff --git a/drivers/ide/sgiioc4.c b/drivers/ide/sgiioc4.c index b7d61dc64096..e3ea591f66d3 
100644 --- a/drivers/ide/sgiioc4.c +++ b/drivers/ide/sgiioc4.c @@ -255,7 +255,7 @@ static int sgiioc4_dma_end(ide_drive_t *drive) return dma_stat; } -static void sgiioc4_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sgiioc4_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { } diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 97266958f744..2009ac2ff658 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -289,19 +289,18 @@ static void sil_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /** * sil_set_dma_mode - set host controller for DMA mode + * @hwif: port * @drive: drive - * @speed: DMA mode * * Tune the SiI chipset for the desired DMA mode. */ -static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sil_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static const u8 ultra6[] = { 0x0F, 0x0B, 0x07, 0x05, 0x03, 0x02, 0x01 }; static const u8 ultra5[] = { 0x0C, 0x07, 0x05, 0x04, 0x02, 0x01 }; static const u16 dma[] = { 0x2208, 0x10C2, 0x10C1 }; - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long base = (unsigned long)hwif->hwif_data; u16 ultra = 0, multi = 0; @@ -311,6 +310,7 @@ static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) : (mmio ? 0xB4 : 0x80); unsigned long ma = siimage_seldev(drive, 0x08); unsigned long ua = siimage_seldev(drive, 0x0C); + const u8 speed = drive->dma_mode; scsc = sil_ioread8 (dev, base + (mmio ? 
0x4A : 0x8A)); mode = sil_ioread8 (dev, base + addr_mask); diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index 5a0192060531..db7f4e761dbc 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -340,8 +340,10 @@ static void sis_program_udma_timings(ide_drive_t *drive, const u8 mode) sis_ata33_program_udma_timings(drive, mode); } -static void sis_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sis_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { + const u8 speed = drive->dma_mode; + if (speed >= XFER_UDMA_0) sis_program_udma_timings(drive, speed); else diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c index 419cd3bc6c84..f21dc2ad7682 100644 --- a/drivers/ide/sl82c105.c +++ b/drivers/ide/sl82c105.c @@ -92,11 +92,12 @@ static void sl82c105_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /* * Configure the chipset for DMA mode. */ -static void sl82c105_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void sl82c105_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { static u16 mwdma_timings[] = {0x0707, 0x0201, 0x0200}; unsigned long timings = (unsigned long)ide_get_drivedata(drive); u16 drv_ctrl; + const u8 speed = drive->dma_mode; drv_ctrl = mwdma_timings[speed - XFER_MW_DMA_0]; diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c index 019777522cd2..864ffe0e26d9 100644 --- a/drivers/ide/slc90e66.c +++ b/drivers/ide/slc90e66.c @@ -72,14 +72,14 @@ static void slc90e66_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) spin_unlock_irqrestore(&slc90e66_lock, flags); } -static void slc90e66_set_dma_mode(ide_drive_t *drive, const u8 speed) +static void slc90e66_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u8 maslave = hwif->channel ? 
0x42 : 0x40; int sitre = 0, a_speed = 7 << (drive->dn * 4); int u_speed = 0, u_flag = 1 << drive->dn; u16 reg4042, reg44, reg48, reg4a; + const u8 speed = drive->dma_mode; pci_read_config_word(dev, maslave, &reg4042); sitre = (reg4042 & 0x4000) ? 1 : 0; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index f2cb62bf3f22..e444d24934b3 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -13,11 +13,11 @@ #define DRV_NAME "tc86c001" -static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) +static void tc86c001_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; unsigned long scr_port = hwif->config_data + (drive->dn ? 0x02 : 0x00); u16 mode, scr = inw(scr_port); + const u8 speed = drive->dma_mode; switch (speed) { case XFER_UDMA_4: mode = 0x00c0; break; @@ -43,7 +43,8 @@ static void tc86c001_set_mode(ide_drive_t *drive, const u8 speed) static void tc86c001_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - tc86c001_set_mode(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + tc86c001_set_mode(hwif, drive); } /* diff --git a/drivers/ide/triflex.c b/drivers/ide/triflex.c index d34a7eecdea5..7953447eae0f 100644 --- a/drivers/ide/triflex.c +++ b/drivers/ide/triflex.c @@ -34,9 +34,8 @@ #define DRV_NAME "triflex" -static void triflex_set_mode(ide_drive_t *drive, const u8 speed) +static void triflex_set_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; struct pci_dev *dev = to_pci_dev(hwif->dev); u32 triflex_timings = 0; u16 timing = 0; @@ -44,7 +43,7 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) pci_read_config_dword(dev, channel_offset, &triflex_timings); - switch(speed) { + switch (drive->dma_mode) { case XFER_MW_DMA_2: timing = 0x0103; break; @@ -84,7 +83,8 @@ static void triflex_set_mode(ide_drive_t *drive, const u8 speed) static void triflex_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - triflex_set_mode(drive, drive->pio_mode); + 
drive->dma_mode = drive->pio_mode; + triflex_set_mode(hwif, drive); } static const struct ide_port_ops triflex_port_ops = { diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 5228a4786de5..f210633a3d57 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -125,10 +125,10 @@ static void tx4939ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) /* tx4939ide_tf_load_fixup() will set the Sys_Ctl register */ } -static void tx4939ide_set_dma_mode(ide_drive_t *drive, const u8 mode) +static void tx4939ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; u32 mask, val; + const u8 mode = drive->dma_mode; /* Update Data Transfer Mode for this drive. */ if (mode >= XFER_UDMA_0) diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c index 6d995fc9d4f5..6769fe252b07 100644 --- a/drivers/ide/via82cxxx.c +++ b/drivers/ide/via82cxxx.c @@ -169,22 +169,22 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) /** * via_set_drive - configure transfer mode + * @hwif: port * @drive: Drive to set up - * @speed: desired speed * * via_set_drive() computes timing values configures the chipset to * a desired transfer mode. It also can be called by upper layers. 
*/ -static void via_set_drive(ide_drive_t *drive, const u8 speed) +static void via_set_drive(ide_hwif_t *hwif, ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; ide_drive_t *peer = ide_get_pair_dev(drive); struct pci_dev *dev = to_pci_dev(hwif->dev); struct ide_host *host = pci_get_drvdata(dev); struct via82cxxx_dev *vdev = host->host_priv; struct ide_timing t, p; unsigned int T, UT; + const u8 speed = drive->dma_mode; T = 1000000000 / via_clock; @@ -216,7 +216,8 @@ static void via_set_drive(ide_drive_t *drive, const u8 speed) static void via_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { - via_set_drive(drive, drive->pio_mode); + drive->dma_mode = drive->pio_mode; + via_set_drive(hwif, drive); } static struct via_isa_bridge *via_config_find(struct pci_dev **isa) diff --git a/include/linux/ide.h b/include/linux/ide.h index 803ec306883c..53ecdba82d72 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -625,7 +625,7 @@ extern const struct ide_tp_ops default_tp_ops; struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); - void (*set_dma_mode)(ide_drive_t *, const u8); + void (*set_dma_mode)(struct hwif_s *, ide_drive_t *); int (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); -- cgit v1.2.3 From 220c58bc6d1198c4c4e69a385d364602c38b6b1c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 18 Jan 2010 07:22:38 +0000 Subject: ide: make ide_get_best_pio_mode() static Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. 
Miller --- drivers/ide/ide-xfer-mode.c | 3 +-- include/linux/ide.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c index 9b549e4d1848..5fc8d5c17de9 100644 --- a/drivers/ide/ide-xfer-mode.c +++ b/drivers/ide/ide-xfer-mode.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL(ide_xfer_verbose); * This is used by most chipset support modules when "auto-tuning". */ -u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) +static u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) { u16 *id = drive->id; int pio_mode = -1, overridden = 0; @@ -105,7 +105,6 @@ u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode) return pio_mode; } -EXPORT_SYMBOL_GPL(ide_get_best_pio_mode); int ide_pio_need_iordy(ide_drive_t *drive, const u8 pio) { diff --git a/include/linux/ide.h b/include/linux/ide.h index 53ecdba82d72..97e6ab435184 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1496,7 +1496,6 @@ int ide_timing_compute(ide_drive_t *, u8, struct ide_timing *, int, int); #ifdef CONFIG_IDE_XFER_MODE int ide_scan_pio_blacklist(char *); const char *ide_xfer_verbose(u8); -u8 ide_get_best_pio_mode(ide_drive_t *, u8, u8); int ide_pio_need_iordy(ide_drive_t *, const u8); int ide_set_pio_mode(ide_drive_t *, u8); int ide_set_dma_mode(ide_drive_t *, u8); -- cgit v1.2.3 From 4f9c85a1b03bfa5c0a0d8488a3a7766f3c9fb756 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 18 Jan 2010 05:37:16 +0000 Subject: phylib: Move workqueue initialization to a proper place commit 541cd3ee00a4fe975b22fac6a3bc846bacef37f7 ("phylib: Fix deadlock on resume") caused TI DaVinci EMAC ethernet driver to oops upon resume: PM: resume of devices complete after 237.098 msecs Restarting tasks ... done. kernel BUG at kernel/workqueue.c:354! Unable to handle kernel NULL pointer dereference at virtual address 00000000 [...] 
Backtrace: [] (__bug+0x0/0x2c) from [] (queue_delayed_work_on+0x74/0xf8) [] (queue_delayed_work_on+0x0/0xf8) from [] (queue_delayed_work+0x2c/0x30) The oops pops up because TI DaVinci EMAC driver detaches PHY on suspend and attaches it back on resume. Attaching makes phylib call phy_start_machine() that initializes a workqueue. On the other hand, PHY's resume routine will call phy_start_machine() again, and that will cause the oops since we just destroyed the already scheduled workqueue. This patch fixes the issue by moving workqueue initialization to phy_device_create(). p.s. We don't see this oops with ucc_geth and gianfar drivers because they perform a fine-grained suspend, i.e. they just stop the PHYs without detaching. Reported-by: Sekhar Nori Signed-off-by: Anton Vorontsov Tested-by: Sekhar Nori Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 4 +--- drivers/net/phy/phy_device.c | 1 + include/linux/phy.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index b0e9f9c51721..0295097d6c44 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -410,7 +410,6 @@ EXPORT_SYMBOL(phy_start_aneg); static void phy_change(struct work_struct *work); -static void phy_state_machine(struct work_struct *work); /** * phy_start_machine - start PHY state machine tracking @@ -430,7 +429,6 @@ void phy_start_machine(struct phy_device *phydev, { phydev->adjust_state = handler; - INIT_DELAYED_WORK(&phydev->state_queue, phy_state_machine); schedule_delayed_work(&phydev->state_queue, HZ); } @@ -761,7 +759,7 @@ EXPORT_SYMBOL(phy_start); * phy_state_machine - Handle the state machine * @work: work_struct that describes the work to be done */ -static void phy_state_machine(struct work_struct *work) +void phy_state_machine(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct phy_device *phydev = diff --git a/drivers/net/phy/phy_device.c 
b/drivers/net/phy/phy_device.c index 8212b2b93422..adbc0fded130 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -177,6 +177,7 @@ struct phy_device* phy_device_create(struct mii_bus *bus, int addr, int phy_id) dev->state = PHY_DOWN; mutex_init(&dev->lock); + INIT_DELAYED_WORK(&dev->state_queue, phy_state_machine); return dev; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 7968defd2fa7..6a7eb402165d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -485,6 +485,7 @@ void phy_driver_unregister(struct phy_driver *drv); int phy_driver_register(struct phy_driver *new_driver); void phy_prepare_link(struct phy_device *phydev, void (*adjust_link)(struct net_device *)); +void phy_state_machine(struct work_struct *work); void phy_start_machine(struct phy_device *phydev, void (*handler)(struct net_device *)); void phy_stop_machine(struct phy_device *phydev); -- cgit v1.2.3 From 11380a4b2d86fae9a6bce75c9373668cc323fe57 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jan 2010 13:46:10 -0800 Subject: net: Unexport napi_gro_flush(). Nothing outside of net/core/dev.c uses it. Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3fccc85b1a0..468a11dea58c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1527,7 +1527,6 @@ extern int netif_rx(struct sk_buff *skb); extern int netif_rx_ni(struct sk_buff *skb); #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); -extern void napi_gro_flush(struct napi_struct *napi); extern gro_result_t dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index a008f6987a95..5747b9edc1bb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2582,7 +2582,7 @@ out: return netif_receive_skb(skb); } -void napi_gro_flush(struct napi_struct *napi) +static void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -2595,7 +2595,6 @@ void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } -EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.3 From 552e450929a7298cc8834fd2824a60b2e914f70e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 20 Jan 2010 13:49:45 -0700 Subject: spi/dw_spi: refine the IRQ mode working flow Now dw_spi core fully supports 3 transfer modes: pure polling, DMA and IRQ mode. 
IRQ mode will use the FIFO half empty as the IRQ trigger, so each interface driver need set the fifo_len, so that core driver can handle it properly Signed-off-by: Feng Tang Signed-off-by: Grant Likely --- drivers/spi/dw_spi.c | 64 ++++++++++++++++++++++++++++------------------ drivers/spi/dw_spi_pci.c | 1 + include/linux/spi/dw_spi.h | 1 + 3 files changed, 41 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/dw_spi.c b/drivers/spi/dw_spi.c index 521d680af289..1bb709b3920f 100644 --- a/drivers/spi/dw_spi.c +++ b/drivers/spi/dw_spi.c @@ -358,6 +358,8 @@ static void transfer_complete(struct dw_spi *dws) static irqreturn_t interrupt_transfer(struct dw_spi *dws) { u16 irq_status, irq_mask = 0x3f; + u32 int_level = dws->fifo_len / 2; + u32 left; irq_status = dw_readw(dws, isr) & irq_mask; /* Error handling */ @@ -369,22 +371,23 @@ static irqreturn_t interrupt_transfer(struct dw_spi *dws) return IRQ_HANDLED; } - /* INT comes from tx */ - if (dws->tx && (irq_status & SPI_INT_TXEI)) { - while (dws->tx < dws->tx_end) + if (irq_status & SPI_INT_TXEI) { + spi_mask_intr(dws, SPI_INT_TXEI); + + left = (dws->tx_end - dws->tx) / dws->n_bytes; + left = (left > int_level) ? 
int_level : left; + + while (left--) dws->write(dws); + dws->read(dws); - if (dws->tx == dws->tx_end) { - spi_mask_intr(dws, SPI_INT_TXEI); + /* Re-enable the IRQ if there is still data left to tx */ + if (dws->tx_end > dws->tx) + spi_umask_intr(dws, SPI_INT_TXEI); + else transfer_complete(dws); - } } - /* INT comes from rx */ - if (dws->rx && (irq_status & SPI_INT_RXFI)) { - if (dws->read(dws)) - transfer_complete(dws); - } return IRQ_HANDLED; } @@ -428,6 +431,7 @@ static void pump_transfers(unsigned long data) u8 bits = 0; u8 imask = 0; u8 cs_change = 0; + u16 txint_level = 0; u16 clk_div = 0; u32 speed = 0; u32 cr0 = 0; @@ -438,6 +442,9 @@ static void pump_transfers(unsigned long data) chip = dws->cur_chip; spi = message->spi; + if (unlikely(!chip->clk_div)) + chip->clk_div = dws->max_freq / chip->speed_hz; + if (message->state == ERROR_STATE) { message->status = -EIO; goto early_exit; @@ -492,7 +499,7 @@ static void pump_transfers(unsigned long data) /* clk_div doesn't support odd number */ clk_div = dws->max_freq / speed; - clk_div = (clk_div >> 1) << 1; + clk_div = (clk_div + 1) & 0xfffe; chip->speed_hz = speed; chip->clk_div = clk_div; @@ -535,11 +542,16 @@ static void pump_transfers(unsigned long data) /* Check if current transfer is a DMA transaction */ dws->dma_mapped = map_dma_buffers(dws); + /* + * Interrupt mode + * we only need set the TXEI IRQ, as TX/RX always happen syncronizely + */ if (!dws->dma_mapped && !chip->poll_mode) { - if (dws->rx) - imask |= SPI_INT_RXFI; - if (dws->tx) - imask |= SPI_INT_TXEI; + int templen = dws->len / dws->n_bytes; + txint_level = dws->fifo_len / 2; + txint_level = (templen > txint_level) ? txint_level : templen; + + imask |= SPI_INT_TXEI; dws->transfer_handler = interrupt_transfer; } @@ -549,21 +561,23 @@ static void pump_transfers(unsigned long data) * 2. clk_div is changed * 3. 
control value changes */ - if (dw_readw(dws, ctrl0) != cr0 || cs_change || clk_div) { + if (dw_readw(dws, ctrl0) != cr0 || cs_change || clk_div || imask) { spi_enable_chip(dws, 0); if (dw_readw(dws, ctrl0) != cr0) dw_writew(dws, ctrl0, cr0); + spi_set_clk(dws, clk_div ? clk_div : chip->clk_div); + spi_chip_sel(dws, spi->chip_select); + /* Set the interrupt mask, for poll mode just diable all int */ spi_mask_intr(dws, 0xff); - if (!chip->poll_mode) + if (imask) spi_umask_intr(dws, imask); + if (txint_level) + dw_writew(dws, txfltr, txint_level); - spi_set_clk(dws, clk_div ? clk_div : chip->clk_div); - spi_chip_sel(dws, spi->chip_select); spi_enable_chip(dws, 1); - if (cs_change) dws->prev_chip = chip; } @@ -712,11 +726,11 @@ static int dw_spi_setup(struct spi_device *spi) } chip->bits_per_word = spi->bits_per_word; + if (!spi->max_speed_hz) { + dev_err(&spi->dev, "No max speed HZ parameter\n"); + return -EINVAL; + } chip->speed_hz = spi->max_speed_hz; - if (chip->speed_hz) - chip->clk_div = 25000000 / chip->speed_hz; - else - chip->clk_div = 8; /* default value */ chip->tmode = 0; /* Tx & Rx */ /* Default SPI mode is SCPOL = 0, SCPH = 0 */ diff --git a/drivers/spi/dw_spi_pci.c b/drivers/spi/dw_spi_pci.c index 7980f1443ce1..1f0735f9cc76 100644 --- a/drivers/spi/dw_spi_pci.c +++ b/drivers/spi/dw_spi_pci.c @@ -73,6 +73,7 @@ static int __devinit spi_pci_probe(struct pci_dev *pdev, dws->num_cs = 4; dws->max_freq = 25000000; /* for Moorestwon */ dws->irq = pdev->irq; + dws->fifo_len = 40; /* FIFO has 40 words buffer */ ret = dw_spi_add_host(dws); if (ret) diff --git a/include/linux/spi/dw_spi.h b/include/linux/spi/dw_spi.h index 51b3e771a9a3..1a127a31e017 100644 --- a/include/linux/spi/dw_spi.h +++ b/include/linux/spi/dw_spi.h @@ -90,6 +90,7 @@ struct dw_spi { unsigned long paddr; u32 iolen; int irq; + u32 fifo_len; /* depth of the FIFO buffer */ u32 max_freq; /* max bus freq supported */ u16 bus_num; -- cgit v1.2.3 From 04a723ea9c53ba608b0411aa36948bb57c51a08e Mon Sep 17 
00:00:00 2001 From: Sarah Sharp Date: Wed, 6 Jan 2010 10:16:51 -0800 Subject: USB: Fix duplicate sysfs problem after device reset. Borislav Petkov reports issues with duplicate sysfs endpoint files after a resume from a hibernate. It turns out that the code to support alternate settings under xHCI has issues when a device with a non-default alternate setting is reset during the hibernate: [ 427.681810] Restarting tasks ... [ 427.681995] hub 1-0:1.0: state 7 ports 6 chg 0004 evt 0000 [ 427.682019] usb usb3: usb resume [ 427.682030] ohci_hcd 0000:00:12.0: wakeup root hub [ 427.682191] hub 1-0:1.0: port 2, status 0501, change 0000, 480 Mb/s [ 427.682205] usb 1-2: usb wakeup-resume [ 427.682226] usb 1-2: finish reset-resume [ 427.682886] done. [ 427.734658] ehci_hcd 0000:00:12.2: port 2 high speed [ 427.734663] ehci_hcd 0000:00:12.2: GetStatus port 2 status 001005 POWER sig=se0 PE CONNECT [ 427.746682] hub 3-0:1.0: hub_reset_resume [ 427.746693] hub 3-0:1.0: trying to enable port power on non-switchable hub [ 427.786715] usb 1-2: reset high speed USB device using ehci_hcd and address 2 [ 427.839653] ehci_hcd 0000:00:12.2: port 2 high speed [ 427.839666] ehci_hcd 0000:00:12.2: GetStatus port 2 status 001005 POWER sig=se0 PE CONNECT [ 427.847717] ohci_hcd 0000:00:12.0: GetStatus roothub.portstatus [1] = 0x00010100 CSC PPS [ 427.915497] hub 1-2:1.0: remove_intf_ep_devs: if: ffff88022f9e8800 ->ep_devs_created: 1 [ 427.915774] hub 1-2:1.0: remove_intf_ep_devs: bNumEndpoints: 1 [ 427.915934] hub 1-2:1.0: if: ffff88022f9e8800: endpoint devs removed. 
[ 427.916158] hub 1-2:1.0: create_intf_ep_devs: if: ffff88022f9e8800 ->ep_devs_created: 0, ->unregistering: 0 [ 427.916434] hub 1-2:1.0: create_intf_ep_devs: bNumEndpoints: 1 [ 427.916609] ep_81: create, parent hub [ 427.916632] ------------[ cut here ]------------ [ 427.916644] WARNING: at fs/sysfs/dir.c:477 sysfs_add_one+0x82/0x96() [ 427.916649] Hardware name: System Product Name [ 427.916653] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:12.2/usb1/1-2/1-2:1.0/ep_81' [ 427.916658] Modules linked in: binfmt_misc kvm_amd kvm powernow_k8 cpufreq_ondemand cpufreq_powersave cpufreq_userspace freq_table cpufreq_conservative ipv6 vfat fat +8250_pnp 8250 pcspkr ohci_hcd serial_core k10temp edac_core [ 427.916694] Pid: 278, comm: khubd Not tainted 2.6.33-rc2-00187-g08d869a-dirty #13 [ 427.916699] Call Trace: The problem is caused by a mismatch between the USB core's view of the device state and the USB device and xHCI host's view of the device state. After the device reset and re-configuration, the device and the xHCI host think they are using alternate setting 0 of all interfaces. However, the USB core keeps track of the old state, which may include non-zero alternate settings. It uses intf->cur_altsetting to keep the endpoint sysfs files for the old state across the reset. The bandwidth allocation functions need to know what the xHCI host thinks the current alternate settings are, so original patch set intf->cur_altsetting to the alternate setting 0. This caused duplicate endpoint files to be created. The solution is to not set intf->cur_altsetting before calling usb_set_interface() in usb_reset_and_verify_device(). Instead, we add a new flag to struct usb_interface to tell usb_hcd_alloc_bandwidth() to use alternate setting 0 as the currently installed alternate setting. 
Signed-off-by: Sarah Sharp Tested-by: Borislav Petkov Cc: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 18 ++++++++++++++++++ drivers/usb/core/hub.c | 15 +++++---------- include/linux/usb.h | 1 + 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 0495fa651225..80995ef0868c 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1684,6 +1684,24 @@ int usb_hcd_alloc_bandwidth(struct usb_device *udev, } } if (cur_alt && new_alt) { + struct usb_interface *iface = usb_ifnum_to_if(udev, + cur_alt->desc.bInterfaceNumber); + + if (iface->resetting_device) { + /* + * The USB core just reset the device, so the xHCI host + * and the device will think alt setting 0 is installed. + * However, the USB core will pass in the alternate + * setting installed before the reset as cur_alt. Dig + * out the alternate setting 0 structure, or the first + * alternate setting if a broken device doesn't have alt + * setting 0. + */ + cur_alt = usb_altnum_to_altsetting(iface, 0); + if (!cur_alt) + cur_alt = &iface->altsetting[0]; + } + /* Drop all the endpoints in the current alt setting */ for (i = 0; i < cur_alt->desc.bNumEndpoints; i++) { ret = hcd->driver->drop_endpoint(hcd, udev, diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index b9f5fcd713e2..35cc8b9ba1f5 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -3695,19 +3695,14 @@ static int usb_reset_and_verify_device(struct usb_device *udev) usb_enable_interface(udev, intf, true); ret = 0; } else { - /* We've just reset the device, so it will think alt - * setting 0 is installed. For usb_set_interface() to - * work properly, we need to set the current alternate - * interface setting to 0 (or the first alt setting, if - * the device doesn't have alt setting 0). 
+ /* Let the bandwidth allocation function know that this + * device has been reset, and it will have to use + * alternate setting 0 as the current alternate setting. */ - intf->cur_altsetting = - usb_find_alt_setting(config, i, 0); - if (!intf->cur_altsetting) - intf->cur_altsetting = - &config->intf_cache[i]->altsetting[0]; + intf->resetting_device = 1; ret = usb_set_interface(udev, desc->bInterfaceNumber, desc->bAlternateSetting); + intf->resetting_device = 0; } if (ret < 0) { dev_err(&udev->dev, "failed to restore interface %d " diff --git a/include/linux/usb.h b/include/linux/usb.h index e101a2d04d75..d7ace1b80f09 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -192,6 +192,7 @@ struct usb_interface { unsigned needs_altsetting0:1; /* switch to altsetting 0 is pending */ unsigned needs_binding:1; /* needs delayed unbind/rebind */ unsigned reset_running:1; + unsigned resetting_device:1; /* true: bandwidth alloc after reset */ struct device dev; /* interface specific device info */ struct device *usb_dev; -- cgit v1.2.3 From 3bf127637e22ddf95e67e10a23c339cee3d52429 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 21 Jan 2010 00:02:36 -0800 Subject: Input: sh_keysc - add mode 4 and mode 5 support Add Mode 4 and Mode 5 support to the SH_KEYSC driver. These modes allow slightly larger key pad matrixes. While at it, make use of resource_size(). 
Signed-off-by: Magnus Damm Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/sh_keysc.c | 6 +++--- include/linux/input/sh_keysc.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/sh_keysc.c b/drivers/input/keyboard/sh_keysc.c index 076111fc72d2..25706f802258 100644 --- a/drivers/input/keyboard/sh_keysc.c +++ b/drivers/input/keyboard/sh_keysc.c @@ -36,6 +36,8 @@ static const struct { [SH_KEYSC_MODE_1] = { 0, 6, 5 }, [SH_KEYSC_MODE_2] = { 1, 5, 6 }, [SH_KEYSC_MODE_3] = { 2, 4, 7 }, + [SH_KEYSC_MODE_4] = { 3, 6, 6 }, + [SH_KEYSC_MODE_5] = { 4, 6, 7 }, }; struct sh_keysc_priv { @@ -122,8 +124,6 @@ static irqreturn_t sh_keysc_isr(int irq, void *dev_id) return IRQ_HANDLED; } -#define res_size(res) ((res)->end - (res)->start + 1) - static int __devinit sh_keysc_probe(struct platform_device *pdev) { struct sh_keysc_priv *priv; @@ -164,7 +164,7 @@ static int __devinit sh_keysc_probe(struct platform_device *pdev) memcpy(&priv->pdata, pdev->dev.platform_data, sizeof(priv->pdata)); pdata = &priv->pdata; - priv->iomem_base = ioremap_nocache(res->start, res_size(res)); + priv->iomem_base = ioremap_nocache(res->start, resource_size(res)); if (priv->iomem_base == NULL) { dev_err(&pdev->dev, "failed to remap I/O memory\n"); error = -ENXIO; diff --git a/include/linux/input/sh_keysc.h b/include/linux/input/sh_keysc.h index c211b5cf08e6..2aff38bcf2ba 100644 --- a/include/linux/input/sh_keysc.h +++ b/include/linux/input/sh_keysc.h @@ -1,10 +1,11 @@ #ifndef __SH_KEYSC_H__ #define __SH_KEYSC_H__ -#define SH_KEYSC_MAXKEYS 30 +#define SH_KEYSC_MAXKEYS 42 struct sh_keysc_info { - enum { SH_KEYSC_MODE_1, SH_KEYSC_MODE_2, SH_KEYSC_MODE_3 } mode; + enum { SH_KEYSC_MODE_1, SH_KEYSC_MODE_2, SH_KEYSC_MODE_3, + SH_KEYSC_MODE_4, SH_KEYSC_MODE_5 } mode; int scan_timing; /* 0 -> 7, see KYCR1, SCN[2:0] */ int delay; int kycr2_delay; -- cgit v1.2.3 From 50b926e439620c469565e8be0f28be78f5fca1ce Mon Sep 17 00:00:00 2001 
From: Mike Galbraith Date: Mon, 4 Jan 2010 14:44:56 +0100 Subject: sched: Fix vmark regression on big machines SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't enabled, leading to many cache misses on large machines as we traverse looking for an idle shared cache to wake to. Change the enabler of select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the sibling domain level. Reported-by: Lin Ming Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1262612696.15495.15.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/topology.h | 2 +- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 57e63579bfdd..5b81156780b1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -99,7 +99,7 @@ int arch_update_cpu_topology(void); | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ , \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42ac3c9f66f6..8fe7ee81c552 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { -- cgit v1.2.3 From 3d45fd804a95055ecab5b3eed81f5ab2dbb047a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 17:12:46 +0100 Subject: sched: Remove the sched_class load_balance methods Take out the sched_class methods for load-balancing. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ------ kernel/sched.c | 26 ------------------- kernel/sched_fair.c | 66 +++++++++++++++++++++++++++---------------------- kernel/sched_idletask.c | 21 ---------------- kernel/sched_rt.c | 20 --------------- 5 files changed, 37 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f842db03ce..50d685cde70e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1087,14 +1087,6 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); - unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio); - - int (*move_one_task) (struct rq *this_rq, int this_cpu, - struct rq *busiest, struct sched_domain *sd, - enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index 13a2acf18b2d..c0be07932a8d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1390,32 +1390,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, 
struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator); -#endif - /* Time spent by the tasks of the cpu accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... user mode */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5116b81d7727..faf9a2f099ab 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1851,6 +1851,24 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: + */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); + + static unsigned long __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, @@ -1929,8 +1947,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, #endif static int -move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". 
+ * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { struct cfs_rq *busy_cfs_rq; struct rq_iterator cfs_rq_iterator; @@ -2094,16 +2124,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; + unsigned long total_load_moved = 0, load_moved; int this_best_prio = this_rq->curr->prio; do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, + load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); - class = class->next; + + total_load_moved += load_moved; #ifdef CONFIG_PREEMPT /* @@ -2114,7 +2143,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; #endif - } while (class && max_load_move > total_load_moved); + } while (load_moved && max_load_move > total_load_moved); return total_load_moved > 0; } @@ -2145,25 +2174,6 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - const struct sched_class *class; - - for_each_class(class) { - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - } - - return 0; -} /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain @@ -3873,8 +3883,6 @@ static const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - .load_balance = load_balance_fair, - .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 01332bfc61a7..a8a6d8a50947 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } -#ifdef CONFIG_SMP -static unsigned long -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return 0; -} - -static int -move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - return 0; -} -#endif - static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } @@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - - .load_balance = load_balance_idle, - .move_one_task = move_one_task_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 072b3fcee8d8..502bb614e40a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1481,24 +1481,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static unsigned long 
-load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - /* don't touch RT tasks */ - return 0; -} - -static int -move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - /* don't touch RT tasks */ - return 0; -} - static void set_cpus_allowed_rt(struct task_struct *p, const struct cpumask *new_mask) { @@ -1746,8 +1728,6 @@ static const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .load_balance = load_balance_rt, - .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, -- cgit v1.2.3 From 7c9414385ebfdd87cc542d4e7e3bb0dbb2d3ce25 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Wed, 20 Jan 2010 13:26:18 +0100 Subject: sched: Remove USER_SCHED Remove the USER_SCHED feature. It has been scheduled to be removed in 2.6.34 as per http://marc.info/?l=linux-kernel&m=125728479022976&w=2 Signed-off-by: Dhaval Giani Signed-off-by: Peter Zijlstra LKML-Reference: <1263990378.24844.3.camel@localhost> Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 15 -- include/linux/sched.h | 14 +- init/Kconfig | 81 +++----- kernel/ksysfs.c | 8 - kernel/sched.c | 114 +---------- kernel/sys.c | 5 - kernel/user.c | 305 ----------------------------- 7 files changed, 38 insertions(+), 504 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 870d190fe617..04a3fc3d139b 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,21 +6,6 @@ be removed from this file. 
--------------------------- -What: USER_SCHED -When: 2.6.34 - -Why: USER_SCHED was implemented as a proof of concept for group scheduling. - The effect of USER_SCHED can already be achieved from userspace with - the help of libcgroup. The removal of USER_SCHED will also simplify - the scheduler code with the removal of one major ifdef. There are also - issues USER_SCHED has with USER_NS. A decision was taken not to fix - those and instead remove USER_SCHED. Also new group scheduling - features will not be implemented for USER_SCHED. - -Who: Dhaval Giani - ---------------------------- - What: PRISM54 When: 2.6.34 diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d685cde70e..8b079735ae5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -731,14 +731,6 @@ struct user_struct { uid_t uid; struct user_namespace *user_ns; -#ifdef CONFIG_USER_SCHED - struct task_group *tg; -#ifdef CONFIG_SYSFS - struct kobject kobj; - struct delayed_work work; -#endif -#endif - #ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif @@ -2502,13 +2494,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED extern struct task_group init_task_group; -#ifdef CONFIG_USER_SCHED -extern struct task_group root_task_group; -extern void set_tg_uid(struct user_struct *user); -#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); diff --git a/init/Kconfig b/init/Kconfig index a23da9f01803..e9fa3007a6fc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -435,57 +435,6 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -config GROUP_SCHED - bool "Group CPU scheduler" - depends on EXPERIMENTAL - default n - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. 
- In order to create a group from arbitrary set of processes, use - CONFIG_CGROUPS. (See Control Group support.) - -config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on GROUP_SCHED - default GROUP_SCHED - -config RT_GROUP_SCHED - bool "Group scheduling for SCHED_RR/FIFO" - depends on EXPERIMENTAL - depends on GROUP_SCHED - default n - help - This feature lets you explicitly allocate real CPU bandwidth - to users or control groups (depending on the "Basis for grouping tasks" - setting below. If enabled, it will also make it impossible to - schedule realtime tasks for non-root users until you allocate - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.txt for more information. - -choice - depends on GROUP_SCHED - prompt "Basis for grouping tasks" - default USER_SCHED - -config USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal CPU bandwidth to each user. - -config CGROUP_SCHED - bool "Control groups" - depends on CGROUPS - help - This option allows you to create arbitrary task groups - using the "cgroup" pseudo filesystem and control - the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups/cgroups.txt for more - information on "cgroup" pseudo filesystem. - -endchoice - menuconfig CGROUPS boolean "Control Group support" help @@ -606,6 +555,36 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + depends on EXPERIMENTAL && CGROUPS + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
+ +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on EXPERIMENTAL + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to users or control groups (depending on the "Basis for grouping tasks" + setting below. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. + +endif #CGROUP_SCHED + endif # CGROUPS config MM_OWNER diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..6b1ccc3f0205 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -197,16 +197,8 @@ static int __init ksysfs_init(void) goto group_exit; } - /* create the /sys/kernel/uids/ directory */ - error = uids_sysfs_init(); - if (error) - goto notes_exit; - return 0; -notes_exit: - if (notes_size > 0) - sysfs_remove_bin_file(kernel_kobj, &notes_attr); group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: diff --git a/kernel/sched.c b/kernel/sched.c index c0be07932a8d..41e76d325648 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) */ static DEFINE_MUTEX(sched_domains_mutex); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED #include @@ -243,13 +243,7 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED - uid_t uid; -#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ @@ -274,35 +268,7 @@ struct task_group { struct list_head children; }; -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void
set_tg_uid(struct user_struct *user) -{ - user->tg->uid = user->uid; -} - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -318,11 +284,7 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ /* * A weight of 0 or 1 can cause arithmetics problems. 
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_USER_SCHED - rcu_read_lock(); - tg = __task_cred(p)->user->tg; - rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) +#ifdef CONFIG_CGROUP_SCHED tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p) return NULL; } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -7678,9 +7636,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_USER_SCHED - alloc_size *= 2; -#endif #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif @@ -7694,13 +7649,6 @@ void __init sched_init(void) init_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -7709,13 +7657,6 @@ void __init sched_init(void) init_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { @@ -7735,22 +7676,13 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&init_task_group.rt_bandwidth, global_rt_period(), 
global_rt_runtime()); -#ifdef CONFIG_USER_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); -#ifdef CONFIG_USER_SCHED - INIT_LIST_HEAD(&root_task_group.children); - init_task_group.parent = &root_task_group; - list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), @@ -7790,25 +7722,6 @@ void __init sched_init(void) * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - root_task_group.shares = NICE_0_LOAD; - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); - /* - * In case of task-groups formed thr' the user id of tasks, - * init_task_group represents tasks belonging to root user. - * Hence it forms a sibling of all subsequent groups formed. - * In this case, init_task_group gets only a fraction of overall - * system cpu resource, based on the weight assigned to root - * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished - * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_tg_cfs_rq) and having one entity represent this group of - * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
- */ - init_tg_cfs_entry(&init_task_group, - &per_cpu(init_tg_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, - root_task_group.se[i]); - #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7817,12 +7730,6 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); - init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq_var, i), - &per_cpu(init_sched_rt_entity, i), i, 1, - root_task_group.rt_se[i]); #endif #endif @@ -8218,7 +8125,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -8327,7 +8234,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares) @@ -8469,13 +8376,6 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } -#ifdef CONFIG_USER_SCHED - if (tg == &root_task_group) { - period = global_rt_period(); - runtime = global_rt_runtime(); - } -#endif - /* * Cannot have more runtime than the period. 
*/ diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..f75bf0936f47 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -569,11 +569,6 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70c..766467b3bcb7 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -56,9 +56,6 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .user_ns = &init_user_ns, -#ifdef CONFIG_USER_SCHED - .tg = &init_task_group, -#endif }; /* @@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up) put_user_ns(up->user_ns); } -#ifdef CONFIG_USER_SCHED - -static void sched_destroy_user(struct user_struct *up) -{ - sched_destroy_group(up->tg); -} - -static int sched_create_user(struct user_struct *up) -{ - int rc = 0; - - up->tg = sched_create_group(&root_task_group); - if (IS_ERR(up->tg)) - rc = -ENOMEM; - - set_tg_uid(up); - - return rc; -} - -#else /* CONFIG_USER_SCHED */ - -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } - -#endif /* CONFIG_USER_SCHED */ - -#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) - -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - /* possibly resurrect an "almost deleted" object */ - if (atomic_inc_return(&user->__count) == 1) - cancel_delayed_work(&user->work); - return user; - } - } - - return NULL; -} - -static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ -static DEFINE_MUTEX(uids_mutex); - -static inline void uids_mutex_lock(void) -{ - 
mutex_lock(&uids_mutex); -} - -static inline void uids_mutex_unlock(void) -{ - mutex_unlock(&uids_mutex); -} - -/* uid directory attributes */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static ssize_t cpu_shares_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); -} - -static ssize_t cpu_shares_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long shares; - int rc; - - sscanf(buf, "%lu", &shares); - - rc = sched_group_set_shares(up->tg, shares); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_share_attr = - __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static ssize_t cpu_rt_runtime_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); -} - -static ssize_t cpu_rt_runtime_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_runtime; - int rc; - - sscanf(buf, "%ld", &rt_runtime); - - rc = sched_group_set_rt_runtime(up->tg, rt_runtime); - - return (rc ? 
rc : size); -} - -static struct kobj_attribute cpu_rt_runtime_attr = - __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); - -static ssize_t cpu_rt_period_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); -} - -static ssize_t cpu_rt_period_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_period; - int rc; - - sscanf(buf, "%lu", &rt_period); - - rc = sched_group_set_rt_period(up->tg, rt_period); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_rt_period_attr = - __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); -#endif - -/* default attributes per uid directory */ -static struct attribute *uids_attributes[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - &cpu_share_attr.attr, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - &cpu_rt_runtime_attr.attr, - &cpu_rt_period_attr.attr, -#endif - NULL -}; - -/* the lifetime of user_struct is not managed by the core (now) */ -static void uids_release(struct kobject *kobj) -{ - return; -} - -static struct kobj_type uids_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = uids_attributes, - .release = uids_release, -}; - -/* - * Create /sys/kernel/uids//cpu_share file for this user - * We do not create this file for users in a user namespace (until - * sysfs tagging is implemented). - * - * See Documentation/scheduler/sched-design-CFS.txt for ramifications. 
- */ -static int uids_user_create(struct user_struct *up) -{ - struct kobject *kobj = &up->kobj; - int error; - - memset(kobj, 0, sizeof(struct kobject)); - if (up->user_ns != &init_user_ns) - return 0; - kobj->kset = uids_kset; - error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); - if (error) { - kobject_put(kobj); - goto done; - } - - kobject_uevent(kobj, KOBJ_ADD); -done: - return error; -} - -/* create these entries in sysfs: - * "/sys/kernel/uids" directory - * "/sys/kernel/uids/0" directory (for root user) - * "/sys/kernel/uids/0/cpu_share" file (for root user) - */ -int __init uids_sysfs_init(void) -{ - uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); - if (!uids_kset) - return -ENOMEM; - - return uids_user_create(&root_user); -} - -/* delayed work function to remove sysfs directory for a user and free up - * corresponding structures. - */ -static void cleanup_user_struct(struct work_struct *w) -{ - struct user_struct *up = container_of(w, struct user_struct, work.work); - unsigned long flags; - int remove_user = 0; - - /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() - * atomic. - */ - uids_mutex_lock(); - - spin_lock_irqsave(&uidhash_lock, flags); - if (atomic_read(&up->__count) == 0) { - uid_hash_remove(up); - remove_user = 1; - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - if (!remove_user) - goto done; - - if (up->user_ns == &init_user_ns) { - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); - } - - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - -done: - uids_mutex_unlock(); -} - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. 
- */ -static void free_user(struct user_struct *up, unsigned long flags) -{ - INIT_DELAYED_WORK(&up->work, cleanup_user_struct); - schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); - spin_unlock_irqrestore(&uidhash_lock, flags); -} - -#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ - static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; @@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -int uids_sysfs_init(void) { return 0; } -static inline int uids_user_create(struct user_struct *up) { return 0; } -static inline void uids_mutex_lock(void) { } -static inline void uids_mutex_unlock(void) { } - /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } -#endif - -#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) -/* - * We need to check if a setuid can take place. This function should be called - * before successfully completing the setuid. - */ -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - - return sched_rt_can_attach(up->tg, tsk); - -} -#else -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - return 1; -} -#endif - /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). @@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() * atomic. 
*/ - uids_mutex_lock(); - spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) new->uid = uid; atomic_set(&new->__count, 1); - if (sched_create_user(new) < 0) - goto out_free_user; - new->user_ns = get_user_ns(ns); - if (uids_user_create(new)) - goto out_destoy_sched; - /* * Before adding this, check whether we raced * on adding the same user already.. @@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); } - uids_mutex_unlock(); - return up; -out_destoy_sched: - sched_destroy_user(new); put_user_ns(new->user_ns); -out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: - uids_mutex_unlock(); return NULL; } -- cgit v1.2.3 From 92b6759857ea3ad19bc6871044e373f6251841d3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 14:02:16 +0100 Subject: perf: Change the is_software_event() definition The is_software_event() definition always confuses me because its an exclusive expression, make it an inclusive one. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c66b34f75eea..8fa71874113f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -814,9 +814,14 @@ extern int perf_event_overflow(struct perf_event *event, int nmi, */ static inline int is_software_event(struct perf_event *event) { - return (event->attr.type != PERF_TYPE_RAW) && - (event->attr.type != PERF_TYPE_HARDWARE) && - (event->attr.type != PERF_TYPE_HW_CACHE); + switch (event->attr.type) { + case PERF_TYPE_SOFTWARE: + case PERF_TYPE_TRACEPOINT: + /* for now the breakpoint stuff also works as software event */ + case PERF_TYPE_BREAKPOINT: + return 1; + } + return 0; } extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; -- cgit v1.2.3 From 83fe518a839e317480e50a138ef4acd73510d7ce Mon Sep 17 00:00:00 2001 From: George Shore Date: Thu, 21 Jan 2010 11:40:48 +0000 Subject: spi/dw_spi: enable platform specific chipselect. The driver core allows for a platform-specific chipselect assert/deassert function, however the chipselect function in the core doesn't take advantage of this fact. This enables the use of a custom function, should it be defined. 
Signed-off-by: George Shore Signed-off-by: Grant Likely --- include/linux/spi/dw_spi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/dw_spi.h b/include/linux/spi/dw_spi.h index 1a127a31e017..cc813f95a2f2 100644 --- a/include/linux/spi/dw_spi.h +++ b/include/linux/spi/dw_spi.h @@ -172,6 +172,10 @@ static inline void spi_chip_sel(struct dw_spi *dws, u16 cs) { if (cs > dws->num_cs) return; + + if (dws->cs_control) + dws->cs_control(1); + dw_writel(dws, ser, 1 << cs); } -- cgit v1.2.3 From ea87bb7853168434f4a82426dd1ea8421f9e604d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2010 20:58:57 +0000 Subject: sched: Extend enqueue_task to allow head queueing The ability of enqueueing a task to the head of a SCHED_FIFO priority list is required to fix some violations of POSIX scheduling policy. Extend the related functions with a "head" argument. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Tested-by: Carsten Emde Tested-by: Mathias Weber LKML-Reference: <20100120171629.734886007@linutronix.de> --- include/linux/sched.h | 3 ++- kernel/sched.c | 13 +++++++------ kernel/sched_fair.c | 3 ++- kernel/sched_rt.c | 3 ++- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8b079735ae5f..b35c0c7130c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,7 +1067,8 @@ struct sched_domain; struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, + bool head); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); diff --git a/kernel/sched.c b/kernel/sched.c index 41e76d325648..f47560ff3346 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1856,13 +1856,14 @@ static void update_avg(u64 *avg, u64 
sample) *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; } @@ -1892,7 +1893,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); + enqueue_task(rq, p, wakeup, false); inc_nr_running(rq); } @@ -4236,7 +4237,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -4280,7 +4281,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -8230,7 +8231,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, 0, false); task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 22231ccb2f98..0e7a7af9cf8b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq) * increased. 
Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 502bb614e40a..38076dabb44a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -878,7 +878,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct sched_rt_entity *rt_se = &p->rt; -- cgit v1.2.3 From d0dd2de0d055f0ffb1e2ecdc21380de9d12a85e2 Mon Sep 17 00:00:00 2001 From: Andriy Tkachuk Date: Wed, 20 Jan 2010 13:55:06 +0200 Subject: mac80211: Account HT Control field in Data frame hdrlen according to 802.11n-2009 ieee80211_hdrlen() should account for the new HT Control field in 802.11 data frame header introduced by IEEE 802.11n standard. According to 802.11n-2009 HT Control field is present in data frames when both of the following are met: 1. It is QoS data frame. 2. Order bit is set in Frame Control field. The change might be totally compatible with legacy non-11n aware frames, because 802.11-2007 standard states that "all QoS STAs set this subfield to 0". Signed-off-by: Andriy V. Tkachuk Acked-by: Benoit Papillault Signed-off-by: John W.
Linville --- include/linux/ieee80211.h | 2 ++ net/wireless/util.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 842701906ae9..19984958ab7b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -138,6 +138,8 @@ #define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 +#define IEEE80211_HT_CTL_LEN 4 + struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; diff --git a/net/wireless/util.c b/net/wireless/util.c index 23557c1d0a9c..be2ab8c59e3a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -227,8 +227,11 @@ unsigned int ieee80211_hdrlen(__le16 fc) if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; - if (ieee80211_is_data_qos(fc)) + if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; + if (ieee80211_has_order(fc)) + hdrlen += IEEE80211_HT_CTL_LEN; + } goto out; } -- cgit v1.2.3 From a271623f871dda970319ca15dfad3a8c8c36249f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 22 Jan 2010 10:13:10 +0000 Subject: netdev: remove certain HAVE_ macros After netdev_ops compat code HAVE_* macros aren't needed, in fact they _will_ result in compile breakage for out of tree drivers. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 468a11dea58c..b5fb51d0b8b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -621,30 +621,21 @@ struct net_device_ops { struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); -#define HAVE_CHANGE_RX_FLAGS void (*ndo_change_rx_flags)(struct net_device *dev, int flags); -#define HAVE_SET_RX_MODE void (*ndo_set_rx_mode)(struct net_device *dev); -#define HAVE_MULTICAST void (*ndo_set_multicast_list)(struct net_device *dev); -#define HAVE_SET_MAC_ADDR int (*ndo_set_mac_address)(struct net_device *dev, void *addr); -#define HAVE_VALIDATE_ADDR int (*ndo_validate_addr)(struct net_device *dev); -#define HAVE_PRIVATE_IOCTL int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); -#define HAVE_SET_CONFIG int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); -#define HAVE_CHANGE_MTU int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); -#define HAVE_TX_TIMEOUT void (*ndo_tx_timeout) (struct net_device *dev); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); @@ -656,7 +647,6 @@ struct net_device_ops { void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); #ifdef CONFIG_NET_POLL_CONTROLLER -#define HAVE_NETDEV_POLL void (*ndo_poll_controller)(struct net_device *dev); #endif #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) -- cgit v1.2.3 From cb289d6244a37cf932c571d6deb0daa8030f931b Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 13 Jan 2010 09:34:36 -0800 Subject: eventfd - allow atomic read and waitqueue remove KVM needs a way to atomically remove itself from the eventfd ->poll() wait queue head, in order to correctly handle its IRQfd deassign operation.
This patch introduces such API, plus a way to read an eventfd from its context. Signed-off-by: Davide Libenzi Signed-off-by: Avi Kivity --- fs/eventfd.c | 89 ++++++++++++++++++++++++++++++++++++++++--------- include/linux/eventfd.h | 16 +++++++++ 2 files changed, 90 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index d26402ff06ea..7758cc382ef0 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. 
+ * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { - struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); - if (count < sizeof(ucnt)) - return -EINVAL; spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = sizeof(ucnt); - else if (!(file->f_flags & O_NONBLOCK)) { + res = 0; + else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = sizeof(ucnt); + res = 0; break; } if (signal_pending(current)) { @@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res > 0)) { - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; - ctx->count -= ucnt; + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) - return -EFAULT; return res; } +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? 
-EFAULT : sizeof(cnt); +} static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 94dd10366a78..91bb4f27238c 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -10,6 +10,7 @@ #include #include +#include /* * CAREFUL: Check include/asm-generic/fcntl.h when defining @@ -34,6 +35,9 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); int eventfd_signal(struct eventfd_ctx *ctx, int n); +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt); #else /* CONFIG_EVENTFD */ @@ -61,6 +65,18 @@ static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) } +static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, + __u64 *cnt) +{ + return -ENOSYS; +} + +static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, + wait_queue_t *wait, __u64 *cnt) +{ + return -ENOSYS; +} + #endif #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3 From 9df5f74194871ebd0e51ef5ad2eca5084acaaaba Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 25 Jan 2010 11:42:20 -0600 Subject: mm: add coherence API for DMA to vmalloc/vmap areas On Virtually Indexed architectures (which don't do automatic alias resolution in their caches), we have to flush via the correct virtual address to prepare pages for DMA. 
On some architectures (like arm) we cannot prevent the CPU from doing data movein along the alias (and thus giving stale read data), so we not only have to introduce a flush API to push dirty cache lines out, but also an invalidate API to kill inconsistent cache lines that may have moved in before DMA changed the data Signed-off-by: James Bottomley --- Documentation/cachetlb.txt | 24 ++++++++++++++++++++++++ include/linux/highmem.h | 6 ++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index da42ab414c48..b231414bb8bc 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -377,3 +377,27 @@ maps this page at its virtual address. All the functionality of flush_icache_page can be implemented in flush_dcache_page and update_mmu_cache. In 2.7 the hope is to remove this interface completely. + +The final category of APIs is for I/O to deliberately aliased address +ranges inside the kernel. Such aliases are set up by use of the +vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O +subsystem assumes that the user mapping and kernel offset mapping are +the only aliases. This isn't true for vmap aliases, so anything in +the kernel trying to do I/O to vmap areas must manually manage +coherency. It must do this by flushing the vmap range before doing +I/O and invalidating it after the I/O returns. + + void flush_kernel_vmap_range(void *vaddr, int size) + flushes the kernel cache for a given virtual address range in + the vmap area. This is to make sure that any data the kernel + modified in the vmap range is made visible to the physical + page. The design is to make this area safe to perform I/O on. + Note that this API does *not* also flush the offset map alias + of the area. 
+ + void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates + the cache for a given virtual address range in the vmap area + which prevents the processor from making the cache stale by + speculatively reading data while the I/O was occurring to the + physical pages. This is only necessary for data reads into the + vmap area. diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 211ff4497269..adfe1013b2bd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -17,6 +17,12 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page static inline void flush_kernel_dcache_page(struct page *page) { } +static inline void flush_kernel_vmap_range(void *vaddr, int size) +{ +} +static inline void invalidate_kernel_vmap_range(void *vaddr, int size) +{ +} #endif #include -- cgit v1.2.3 From 32e7bfc41110bc8f29ec0f293c3bcee6645fef34 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Jan 2010 13:36:10 -0800 Subject: net: use helpers to access uc list V2 This patch introduces three macros to work with uc list from net drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/bnx2.c | 5 ++--- drivers/net/e1000/e1000_main.c | 4 ++-- drivers/net/igb/igb_main.c | 7 ++++--- drivers/net/ixgbe/ixgbe_common.c | 7 +++---- drivers/net/ixgbe/ixgbe_common.h | 2 +- drivers/net/ixgbe/ixgbe_main.c | 2 +- drivers/net/ixgbe/ixgbe_type.h | 4 ++-- drivers/net/mv643xx_eth.c | 3 +-- drivers/net/niu.c | 4 ++-- drivers/net/stmmac/dwmac1000_core.c | 10 +++++----- drivers/net/virtio_net.c | 12 +++++++----- drivers/s390/net/qeth_l2_main.c | 2 +- include/linux/netdevice.h | 5 +++++ net/core/dev.c | 4 ++-- 14 files changed, 38 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index d83512d3e02f..a7b6b12c1c05 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -48,7 +48,6 @@ #include #include #include -#include #if defined(CONFIG_CNIC) || defined(CONFIG_CNIC_MODULE) #define BCM_CNIC 1 @@ -3579,14 +3578,14 @@ bnx2_set_rx_mode(struct net_device *dev) sort_mode |= BNX2_RPM_SORT_USER0_MC_HSH_EN; } - if (dev->uc.count > BNX2_MAX_UNICAST_ADDRESSES) { + if (netdev_uc_count(dev) > BNX2_MAX_UNICAST_ADDRESSES) { rx_mode |= BNX2_EMAC_RX_MODE_PROMISCUOUS; sort_mode |= BNX2_RPM_SORT_USER0_PROM_EN | BNX2_RPM_SORT_USER0_PROM_VLAN; } else if (!(dev->flags & IFF_PROMISC)) { /* Add all entries into to the match filter list */ i = 0; - list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { bnx2_set_mac_addr(bp, ha->addr, i + BNX2_START_UNICAST_ADDRESS_INDEX); sort_mode |= (1 << diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 87f575ca427d..2ce88c5f75c5 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2139,7 +2139,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) rctl |= E1000_RCTL_VFE; } - if (netdev->uc.count > rar_entries - 1) { + if (netdev_uc_count(netdev) > rar_entries - 1) { rctl |= E1000_RCTL_UPE; } else if (!(netdev->flags & IFF_PROMISC)) { rctl &= ~E1000_RCTL_UPE; @@ 
-2162,7 +2162,7 @@ static void e1000_set_rx_mode(struct net_device *netdev) */ i = 1; if (use_uc) - list_for_each_entry(ha, &netdev->uc.list, list) { + netdev_for_each_uc_addr(ha, netdev) { if (i == rar_entries) break; e1000_rar_set(hw, ha->addr, i++); diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index d9679493c635..01cc29483e2f 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -2905,12 +2905,13 @@ static int igb_write_uc_addr_list(struct net_device *netdev) int count = 0; /* return ENOMEM indicating insufficient memory for addresses */ - if (netdev->uc.count > rar_entries) + if (netdev_uc_count(netdev) > rar_entries) return -ENOMEM; - if (netdev->uc.count && rar_entries) { + if (!netdev_uc_empty(netdev) && rar_entries) { struct netdev_hw_addr *ha; - list_for_each_entry(ha, &netdev->uc.list, list) { + + netdev_for_each_uc_addr(ha, netdev) { if (!rar_entries) break; igb_rar_set_qsel(adapter, ha->addr, diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c index 276c2aaa800b..eb49020903c1 100644 --- a/drivers/net/ixgbe/ixgbe_common.c +++ b/drivers/net/ixgbe/ixgbe_common.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include "ixgbe.h" @@ -1347,7 +1346,7 @@ static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq) /** * ixgbe_update_uc_addr_list_generic - Updates MAC list of secondary addresses * @hw: pointer to hardware structure - * @uc_list: the list of new addresses + * @netdev: pointer to net device structure * * The given list replaces any existing list. Clears the secondary addrs from * receive address registers. Uses unused receive address registers for the @@ -1357,7 +1356,7 @@ static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq) * manually putting the device into promiscuous mode. 
**/ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, - struct list_head *uc_list) + struct net_device *netdev) { u32 i; u32 old_promisc_setting = hw->addr_ctrl.overflow_promisc; @@ -1381,7 +1380,7 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, } /* Add the new addresses */ - list_for_each_entry(ha, uc_list, list) { + netdev_for_each_uc_addr(ha, netdev) { hw_dbg(hw, " Adding the secondary addresses:\n"); ixgbe_add_uc_addr(hw, ha->addr, 0); } diff --git a/drivers/net/ixgbe/ixgbe_common.h b/drivers/net/ixgbe/ixgbe_common.h index dfff0ffaa502..13606d4809c9 100644 --- a/drivers/net/ixgbe/ixgbe_common.h +++ b/drivers/net/ixgbe/ixgbe_common.h @@ -60,7 +60,7 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr func); s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, - struct list_head *uc_list); + struct net_device *netdev); s32 ixgbe_enable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_disable_mc_generic(struct ixgbe_hw *hw); s32 ixgbe_enable_rx_dma_generic(struct ixgbe_hw *hw, u32 regval); diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index ee41d331a35f..439645d2aeef 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -2568,7 +2568,7 @@ void ixgbe_set_rx_mode(struct net_device *netdev) IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctrl); /* reprogram secondary unicast list */ - hw->mac.ops.update_uc_addr_list(hw, &netdev->uc.list); + hw->mac.ops.update_uc_addr_list(hw, netdev); /* reprogram multicast list */ addr_count = netdev->mc_count; diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h index b4caa7011a2b..0db67c19b2c4 100644 --- a/drivers/net/ixgbe/ixgbe_type.h +++ b/drivers/net/ixgbe/ixgbe_type.h @@ -30,7 +30,7 @@ #include #include -#include +#include /* Vendor ID */ #define IXGBE_INTEL_VENDOR_ID 0x8086 @@ -2405,7 +2405,7 @@ struct ixgbe_mac_operations { s32 (*set_vmdq)(struct ixgbe_hw *, u32, 
u32); s32 (*clear_vmdq)(struct ixgbe_hw *, u32, u32); s32 (*init_rx_addrs)(struct ixgbe_hw *); - s32 (*update_uc_addr_list)(struct ixgbe_hw *, struct list_head *); + s32 (*update_uc_addr_list)(struct ixgbe_hw *, struct net_device *); s32 (*update_mc_addr_list)(struct ixgbe_hw *, u8 *, u32, ixgbe_mc_addr_itr); s32 (*enable_mc)(struct ixgbe_hw *); diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index af67af55efe7..e24072a9a979 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -55,7 +55,6 @@ #include #include #include -#include static char mv643xx_eth_driver_name[] = "mv643xx_eth"; static char mv643xx_eth_driver_version[] = "1.4"; @@ -1697,7 +1696,7 @@ static u32 uc_addr_filter_mask(struct net_device *dev) return 0; nibbles = 1 << (dev->dev_addr[5] & 0x0f); - list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { if (memcmp(dev->dev_addr, ha->addr, 5)) return 0; if ((dev->dev_addr[5] ^ ha->addr[5]) & 0xf0) diff --git a/drivers/net/niu.c b/drivers/net/niu.c index 0e260cfbff7b..af9a8647c7e8 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -6372,7 +6372,7 @@ static void niu_set_rx_mode(struct net_device *dev) if ((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 0)) np->flags |= NIU_FLAGS_MCAST; - alt_cnt = dev->uc.count; + alt_cnt = netdev_uc_count(dev); if (alt_cnt > niu_num_alt_addr(np)) { alt_cnt = 0; np->flags |= NIU_FLAGS_PROMISC; @@ -6381,7 +6381,7 @@ static void niu_set_rx_mode(struct net_device *dev) if (alt_cnt) { int index = 0; - list_for_each_entry(ha, &dev->uc.list, list) { + netdev_for_each_uc_addr(ha, dev) { err = niu_set_alt_mac(np, index, ha->addr); if (err) printk(KERN_WARNING PFX "%s: Error %d " diff --git a/drivers/net/stmmac/dwmac1000_core.c b/drivers/net/stmmac/dwmac1000_core.c index 928eac05b912..d812e9cdb3db 100644 --- a/drivers/net/stmmac/dwmac1000_core.c +++ b/drivers/net/stmmac/dwmac1000_core.c @@ -83,7 +83,7 @@ static void dwmac1000_set_filter(struct net_device 
*dev) unsigned int value = 0; DBG(KERN_INFO "%s: # mcasts %d, # unicast %d\n", - __func__, dev->mc_count, dev->uc.count); + __func__, dev->mc_count, netdev_uc_count(dev)); if (dev->flags & IFF_PROMISC) value = GMAC_FRAME_FILTER_PR; @@ -117,7 +117,7 @@ static void dwmac1000_set_filter(struct net_device *dev) } /* Handle multiple unicast addresses (perfect filtering)*/ - if (dev->uc.count > GMAC_MAX_UNICAST_ADDRESSES) + if (netdev_uc_count(dev) > GMAC_MAX_UNICAST_ADDRESSES) /* Switch to promiscuous mode is more than 16 addrs are required */ value |= GMAC_FRAME_FILTER_PR; @@ -125,9 +125,9 @@ static void dwmac1000_set_filter(struct net_device *dev) int reg = 1; struct netdev_hw_addr *ha; - list_for_each_entry(ha, &dev->uc.list, list) { - dwmac1000_set_umac_addr(ioaddr, ha->addr, reg); - reg++; + netdev_for_each_uc_addr(ha, dev) { + dwmac1000_set_umac_addr(ioaddr, ha->addr, reg); + reg++; } } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c708ecc3cb2e..088332a943f7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -675,6 +675,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) struct virtio_net_ctrl_mac *mac_data; struct dev_addr_list *addr; struct netdev_hw_addr *ha; + int uc_count; void *buf; int i; @@ -701,8 +702,9 @@ static void virtnet_set_rx_mode(struct net_device *dev) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? 
"en" : "dis"); + uc_count = netdev_uc_count(dev); /* MAC filter - use one buffer for both lists */ - mac_data = buf = kzalloc(((dev->uc.count + dev->mc_count) * ETH_ALEN) + + mac_data = buf = kzalloc(((uc_count + dev->mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); if (!buf) { dev_warn(&dev->dev, "No memory for MAC address buffer\n"); @@ -712,16 +714,16 @@ static void virtnet_set_rx_mode(struct net_device *dev) sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ - mac_data->entries = dev->uc.count; + mac_data->entries = uc_count; i = 0; - list_for_each_entry(ha, &dev->uc.list, list) + netdev_for_each_uc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, - sizeof(mac_data->entries) + (dev->uc.count * ETH_ALEN)); + sizeof(mac_data->entries) + (uc_count * ETH_ALEN)); /* multicast list and count fill the end */ - mac_data = (void *)&mac_data->macs[dev->uc.count][0]; + mac_data = (void *)&mac_data->macs[uc_count][0]; mac_data->entries = dev->mc_count; addr = dev->mc_list; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index c3258b0dd649..51fde6f2e0b8 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -622,7 +622,7 @@ static void qeth_l2_set_multicast_list(struct net_device *dev) for (dm = dev->mc_list; dm; dm = dm->next) qeth_l2_add_mc(card, dm->da_addr, 0); - list_for_each_entry(ha, &dev->uc.list, list) + netdev_for_each_uc_addr(ha, dev) qeth_l2_add_mc(card, ha->addr, 1); spin_unlock_bh(&card->mclock); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b5fb51d0b8b1..93a32a5ca74f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -263,6 +263,11 @@ struct netdev_hw_addr_list { int count; }; +#define netdev_uc_count(dev) ((dev)->uc.count) +#define netdev_uc_empty(dev) ((dev)->uc.count == 0) +#define netdev_for_each_uc_addr(ha, dev) \ + list_for_each_entry(ha, 
&dev->uc.list, list) + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ atomic_t hh_refcnt; /* number of users */ diff --git a/net/core/dev.c b/net/core/dev.c index 4fad9db417b1..2cba5c521e56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3665,10 +3665,10 @@ void __dev_set_rx_mode(struct net_device *dev) /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ - if (dev->uc.count > 0 && !dev->uc_promisc) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1); dev->uc_promisc = 1; - } else if (dev->uc.count == 0 && dev->uc_promisc) { + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1); dev->uc_promisc = 0; } -- cgit v1.2.3 From 6d3faf6f431bafb25f4b9926c50a7e5c267738c6 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 24 Jan 2010 14:48:00 +0100 Subject: firewire: cdev: add_descriptor documentation fix struct fw_cdev_add_descriptor.length is in quadlets, not in bytes. Also remove any doubts about the endianness of descriptor data. 
Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 1f716d9f714b..520ecf86cbb3 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -380,7 +380,7 @@ struct fw_cdev_initiate_bus_reset { * @immediate: If non-zero, immediate key to insert before pointer * @key: Upper 8 bits of root directory pointer * @data: Userspace pointer to contents of descriptor block - * @length: Length of descriptor block data, in bytes + * @length: Length of descriptor block data, in quadlets * @handle: Handle to the descriptor, written by the kernel * * Add a descriptor block and optionally a preceding immediate key to the local @@ -394,6 +394,8 @@ struct fw_cdev_initiate_bus_reset { * If not 0, the @immediate field specifies an immediate key which will be * inserted before the root directory pointer. * + * @immediate, @key, and @data array elements are CPU-endian quadlets. + * * If successful, the kernel adds the descriptor and writes back a handle to the * kernel-side object to be used for later removal of the descriptor block and * immediate key. -- cgit v1.2.3 From abd50713944c8ea9e0af5b7bffa0aacae21cc91a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2010 18:50:16 +0100 Subject: perf: Reimplement frequency driven sampling There was a bug in the old period code that caused intel_pmu_enable_all() or native_write_msr_safe() to show up quite high in the profiles. In staring at that code it made my head hurt, so I rewrote it in a hopefully simpler fashion. It's now fully symmetric between tick and overflow driven adjustments and uses less data to boot. The only complication is that it basically wants to do a u128 division. The code approximates that in a rather simple truncate-until-it-fits fashion, taking care to balance the terms while truncating. 
This version does not generate that sampling artefact. Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +- kernel/perf_event.c | 132 +++++++++++++++++++++++++++++++-------------- 2 files changed, 94 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c6f812e4d058..72b2615600d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -498,9 +498,8 @@ struct hw_perf_event { atomic64_t period_left; u64 interrupts; - u64 freq_count; - u64 freq_interrupts; - u64 freq_stamp; + u64 freq_time_stamp; + u64 freq_count_stamp; #endif }; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index edc46b92b508..251fb9552492 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task) static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_event *event, u64 events) +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. 
+ */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (atomic64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - 
hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + event->pmu->read(event); + now = atomic64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); } raw_spin_unlock(&ctx->lock); } @@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (event->attr.freq) { u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; + s64 delta = now - hwc->freq_time_stamp; - hwc->freq_stamp = now; + hwc->freq_time_stamp = now; - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); } /* -- cgit v1.2.3 From 0531b2aac59c2296570ac52bfc032ef2ace7d5e1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 27 Jan 2010 09:20:03 -0800 Subject: mm: add new 'read_cache_page_gfp()' helper function It's a simplified 'read_cache_page()' which takes a page allocation flag, so that different paths can control how aggressive the memory allocations are that populate a address space. In particular, the intel GPU object mapping code wants to be able to do a certain amount of own internal memory management by automatically shrinking the address space when memory starts getting tight. 
This allows it to dynamically use different memory allocation policies on a per-allocation basis, rather than depend on the (static) address space gfp policy. The actual new function is a one-liner, but re-organizing the helper functions to the point where you can do this with a single line of code is what most of the patch is all about. Tested-by: Chris Wilson Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 + mm/filemap.c | 100 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 70 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed5d7501e181..3c62ed408492 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -253,6 +253,8 @@ extern struct page * read_cache_page_async(struct address_space *mapping, extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); +extern struct page * read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0eb6cb..e3736923220e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1661,31 +1662,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function 
to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1710,8 +1698,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. 
+ */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. 
+ */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); -- cgit v1.2.3 From 6016a363f6b56b46b24655bcfc0499b715851cf3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 28 Jan 2010 14:06:53 -0700 Subject: of: unify phandle name in struct device_node In struct device_node, the phandle is named 'linux_phandle' for PowerPC and MicroBlaze, and 'node' for SPARC. There is no good reason for the difference, it is just an artifact of the code diverging over a couple of years. This patch renames both to simply .phandle. Note: the .node also existed in PowerPC/MicroBlaze, but the only user seems to be arch/powerpc/platforms/powermac/pfunc_core.c. It doesn't look like the assignment between .linux_phandle and .node is significantly different enough to warrant the separate code paths unless ibm,phandle properties actually appear in Apple device trees. I think it is safe to eliminate the old .node property and use phandle everywhere. Signed-off-by: Grant Likely Acked-by: David S. 
Miller Tested-by: Wolfram Sang Acked-by: Benjamin Herrenschmidt --- arch/microblaze/kernel/of_platform.c | 2 +- arch/microblaze/kernel/prom.c | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/prom.c | 6 +++--- arch/powerpc/platforms/cell/spu_manage.c | 6 +++--- arch/powerpc/platforms/powermac/pfunc_core.c | 2 +- arch/sparc/kernel/devices.c | 2 +- arch/sparc/kernel/of_device_32.c | 2 +- arch/sparc/kernel/of_device_64.c | 2 +- arch/sparc/kernel/prom_common.c | 8 ++++---- arch/sparc/kernel/smp_64.c | 2 +- drivers/of/fdt.c | 7 +++---- drivers/sbus/char/openprom.c | 10 +++++----- drivers/video/aty/atyfb_base.c | 2 +- include/linux/of.h | 5 +---- sound/aoa/fabrics/layout.c | 2 +- 16 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c index acf4574d0f18..1c6d684996d7 100644 --- a/arch/microblaze/kernel/of_platform.c +++ b/arch/microblaze/kernel/of_platform.c @@ -185,7 +185,7 @@ EXPORT_SYMBOL(of_find_device_by_node); static int of_dev_phandle_match(struct device *dev, void *data) { phandle *ph = data; - return to_of_device(dev)->node->linux_phandle == *ph; + return to_of_device(dev)->node->phandle == *ph; } struct of_device *of_find_device_by_phandle(phandle ph) diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 46407e643926..6eff83a71218 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -342,7 +342,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) read_lock(&devtree_lock); for (np = allnodes; np != NULL; np = np->allnext) - if (np->linux_phandle == handle) + if (np->phandle == handle) break; of_node_get(np); read_unlock(&devtree_lock); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 1a4fc0d11a03..666d08db319e 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -214,7 +214,7 @@ 
EXPORT_SYMBOL(of_find_device_by_node); static int of_dev_phandle_match(struct device *dev, void *data) { phandle *ph = data; - return to_of_device(dev)->node->linux_phandle == *ph; + return to_of_device(dev)->node->phandle == *ph; } struct of_device *of_find_device_by_phandle(phandle ph) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index deccd91d7e81..1ed2ec2ea05b 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -778,7 +778,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) read_lock(&devtree_lock); for (np = allnodes; np != 0; np = np->allnext) - if (np->linux_phandle == handle) + if (np->phandle == handle) break; of_node_get(np); read_unlock(&devtree_lock); @@ -907,9 +907,9 @@ static int of_finish_dynamic_node(struct device_node *node) if (machine_is(powermac)) return -ENODEV; - /* fix up new node's linux_phandle field */ + /* fix up new node's phandle field */ if ((ibm_phandle = of_get_property(node, "ibm,phandle", NULL))) - node->linux_phandle = *ibm_phandle; + node->phandle = *ibm_phandle; out: of_node_put(parent); diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 4c506c1463cd..891f18e337a2 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -457,7 +457,7 @@ neighbour_spu(int cbe, struct device_node *target, struct device_node *avoid) continue; vic_handles = of_get_property(spu_dn, "vicinity", &lenp); for (i=0; i < (lenp / sizeof(phandle)); i++) { - if (vic_handles[i] == target->linux_phandle) + if (vic_handles[i] == target->phandle) return spu; } } @@ -499,7 +499,7 @@ static void init_affinity_node(int cbe) if (strcmp(name, "spe") == 0) { spu = devnode_spu(cbe, vic_dn); - avoid_ph = last_spu_dn->linux_phandle; + avoid_ph = last_spu_dn->phandle; } else { /* * "mic-tm" and "bif0" nodes do not have @@ -514,7 +514,7 @@ static void init_affinity_node(int cbe) last_spu->has_mem_affinity = 1; 
spu->has_mem_affinity = 1; } - avoid_ph = vic_dn->linux_phandle; + avoid_ph = vic_dn->phandle; } list_add_tail(&spu->aff_list, &last_spu->aff_list); diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c index 96d5ce50364e..ede49e78a8da 100644 --- a/arch/powerpc/platforms/powermac/pfunc_core.c +++ b/arch/powerpc/platforms/powermac/pfunc_core.c @@ -842,7 +842,7 @@ struct pmf_function *__pmf_find_function(struct device_node *target, list_for_each_entry(func, &dev->functions, link) { if (name && strcmp(name, func->name)) continue; - if (func->phandle && target->node != func->phandle) + if (func->phandle && target->phandle != func->phandle) continue; if ((func->flags & flags) == 0) continue; diff --git a/arch/sparc/kernel/devices.c b/arch/sparc/kernel/devices.c index b171ae8de90d..b062de9424a4 100644 --- a/arch/sparc/kernel/devices.c +++ b/arch/sparc/kernel/devices.c @@ -59,7 +59,7 @@ static int __cpu_find_by(int (*compare)(int, int, void *), void *compare_arg, cur_inst = 0; for_each_node_by_type(dp, "cpu") { - int err = check_cpu_node(dp->node, &cur_inst, + int err = check_cpu_node(dp->phandle, &cur_inst, compare, compare_arg, prom_node, mid); if (!err) { diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c index 4c26eb59e742..09138d403c7f 100644 --- a/arch/sparc/kernel/of_device_32.c +++ b/arch/sparc/kernel/of_device_32.c @@ -433,7 +433,7 @@ build_resources: if (!parent) dev_set_name(&op->dev, "root"); else - dev_set_name(&op->dev, "%08x", dp->node); + dev_set_name(&op->dev, "%08x", dp->phandle); if (of_device_register(op)) { printk("%s: Could not register of device.\n", diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 881947e59e95..036f18ae59a6 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -666,7 +666,7 @@ static struct of_device * __init scan_one_device(struct device_node *dp, if (!parent) 
dev_set_name(&op->dev, "root"); else - dev_set_name(&op->dev, "%08x", dp->node); + dev_set_name(&op->dev, "%08x", dp->phandle); if (of_device_register(op)) { printk("%s: Could not register of device.\n", diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index d80a65d9e893..5832e13dfeeb 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -42,7 +42,7 @@ struct device_node *of_find_node_by_phandle(phandle handle) struct device_node *np; for (np = allnodes; np; np = np->allnext) - if (np->node == handle) + if (np->phandle == handle) break; return np; @@ -89,7 +89,7 @@ int of_set_property(struct device_node *dp, const char *name, void *val, int len void *old_val = prop->value; int ret; - ret = prom_setprop(dp->node, name, val, len); + ret = prom_setprop(dp->phandle, name, val, len); err = -EINVAL; if (ret >= 0) { @@ -236,7 +236,7 @@ static struct device_node * __init prom_create_node(phandle node, dp->name = get_one_property(node, "name"); dp->type = get_one_property(node, "device_type"); - dp->node = node; + dp->phandle = node; dp->properties = build_prop_list(node); @@ -313,7 +313,7 @@ void __init prom_build_devicetree(void) nextp = &allnodes->allnext; allnodes->child = prom_build_tree(allnodes, - prom_getchild(allnodes->node), + prom_getchild(allnodes->phandle), &nextp); of_console_init(); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index aa36223497b9..eb14844a0021 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -370,7 +370,7 @@ static int __cpuinit smp_boot_one_cpu(unsigned int cpu) } else { struct device_node *dp = of_find_node_by_cpuid(cpu); - prom_startcpu(dp->node, entry, cookie); + prom_startcpu(dp->phandle, entry, cookie); } for (timeout = 0; timeout < 50000; timeout++) { diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 616a4767a950..7f8861121a31 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -310,12 +310,11 @@ unsigned long __init 
unflatten_dt_node(unsigned long mem, __alignof__(struct property)); if (allnextpp) { if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; + if (np->phandle == 0) + np->phandle = *((u32 *)*p); } if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); + np->phandle = *((u32 *)*p); pp->name = pname; pp->length = sz; pp->value = (void *)*p; diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 75ac19b1192f..fc2f676e984d 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -233,7 +233,7 @@ static int opromnext(void __user *argp, unsigned int cmd, struct device_node *dp ph = 0; if (dp) - ph = dp->node; + ph = dp->phandle; data->current_node = dp; *((int *) op->oprom_array) = ph; @@ -256,7 +256,7 @@ static int oprompci2node(void __user *argp, struct device_node *dp, struct openp dp = pci_device_to_OF_node(pdev); data->current_node = dp; - *((int *)op->oprom_array) = dp->node; + *((int *)op->oprom_array) = dp->phandle; op->oprom_size = sizeof(int); err = copyout(argp, op, bufsize + sizeof(int)); @@ -273,7 +273,7 @@ static int oprompath2node(void __user *argp, struct device_node *dp, struct open dp = of_find_node_by_path(op->oprom_array); if (dp) - ph = dp->node; + ph = dp->phandle; data->current_node = dp; *((int *)op->oprom_array) = ph; op->oprom_size = sizeof(int); @@ -540,7 +540,7 @@ static int opiocgetnext(unsigned int cmd, void __user *argp) } } if (dp) - nd = dp->node; + nd = dp->phandle; if (copy_to_user(argp, &nd, sizeof(phandle))) return -EFAULT; @@ -570,7 +570,7 @@ static int openprom_bsd_ioctl(struct inode * inode, struct file * file, case OPIOCGETOPTNODE: BUILD_BUG_ON(sizeof(phandle) != sizeof(int)); - if (copy_to_user(argp, &options_node->node, sizeof(phandle))) + if (copy_to_user(argp, &options_node->phandle, sizeof(phandle))) return -EFAULT; return 0; diff --git a/drivers/video/aty/atyfb_base.c 
b/drivers/video/aty/atyfb_base.c index 913b4a47ae52..bb20987d58af 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -3104,7 +3104,7 @@ static int __devinit atyfb_setup_sparc(struct pci_dev *pdev, } dp = pci_device_to_OF_node(pdev); - if (node == dp->node) { + if (node == dp->phandle) { struct fb_var_screeninfo *var = &default_var; unsigned int N, P, Q, M, T, R; u32 v_total, h_total; diff --git a/include/linux/of.h b/include/linux/of.h index d4c014a35ea5..dbabf86e0b7a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -39,10 +39,7 @@ struct of_irq_controller; struct device_node { const char *name; const char *type; - phandle node; -#if !defined(CONFIG_SPARC) - phandle linux_phandle; -#endif + phandle phandle; char *full_name; struct property *properties; diff --git a/sound/aoa/fabrics/layout.c b/sound/aoa/fabrics/layout.c index 586965f9605f..7a437da05646 100644 --- a/sound/aoa/fabrics/layout.c +++ b/sound/aoa/fabrics/layout.c @@ -768,7 +768,7 @@ static int check_codec(struct aoa_codec *codec, "required property %s not present\n", propname); return -ENODEV; } - if (*ref != codec->node->linux_phandle) { + if (*ref != codec->node->phandle) { printk(KERN_INFO "snd-aoa-fabric-layout: " "%s doesn't match!\n", propname); return -ENODEV; -- cgit v1.2.3 From 430ad5a600a83956749307b13257c464c3826b55 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 28 Jan 2010 09:32:29 +0800 Subject: perf: Factorize trace events raw sample buffer operations Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to gather the common code that operates on raw events sampling buffer. This cleans up redundant code between regular trace events, syscall events and kprobe events. 
Changelog v1->v2: - Rename function name as per Masami and Frederic's suggestion - Add __kprobes for ftrace_perf_buf_prepare() and make ftrace_perf_buf_submit() inline as per Masami's suggestion - Export ftrace_perf_buf_prepare since modules will use it Signed-off-by: Xiao Guangrong Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <4B60E92D.9000808@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 18 ++++++-- include/trace/ftrace.h | 48 +++------------------ kernel/trace/trace_event_profile.c | 52 ++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 86 +++++--------------------------------- kernel/trace/trace_syscalls.c | 71 +++++-------------------------- 5 files changed, 88 insertions(+), 187 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0a09e758c7d3..cd95919d9ff3 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -5,6 +5,7 @@ #include #include #include +#include struct trace_array; struct tracer; @@ -138,9 +139,6 @@ struct ftrace_event_call { #define FTRACE_MAX_PROFILE_SIZE 2048 -extern char *perf_trace_buf; -extern char *perf_trace_buf_nmi; - #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ @@ -195,6 +193,20 @@ extern void ftrace_profile_disable(int event_id); extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); +extern void * +ftrace_perf_buf_prepare(int size, unsigned short type, int *rctxp, + unsigned long *irq_flags); + +static inline void +ftrace_perf_buf_submit(void *raw_data, int size, int rctx, u64 addr, + u64 count, unsigned long irq_flags) +{ + struct trace_entry *entry = raw_data; + + perf_tp_event(entry->type, addr, count, raw_data, size); + 
perf_swevent_put_recursion_context(rctx); + local_irq_restore(irq_flags); +} #endif #endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 4a46a60c2077..f2c09e4d656c 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -850,22 +850,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \ proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ - extern int perf_swevent_get_recursion_context(void); \ - extern void perf_swevent_put_recursion_context(int rctx); \ - extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ - struct trace_entry *ent; \ int __entry_size; \ int __data_size; \ - char *trace_buf; \ - char *raw_data; \ - int __cpu; \ int rctx; \ - int pc; \ - \ - pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ @@ -875,42 +865,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \ if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \ "profile buffer not large enough")) \ return; \ - \ - local_irq_save(irq_flags); \ - \ - rctx = perf_swevent_get_recursion_context(); \ - if (rctx < 0) \ - goto end_recursion; \ - \ - __cpu = smp_processor_id(); \ - \ - if (in_nmi()) \ - trace_buf = rcu_dereference(perf_trace_buf_nmi); \ - else \ - trace_buf = rcu_dereference(perf_trace_buf); \ - \ - if (!trace_buf) \ - goto end; \ - \ - raw_data = per_cpu_ptr(trace_buf, __cpu); \ - \ - *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ - entry = (struct ftrace_raw_##call *)raw_data; \ - ent = &entry->ent; \ - tracing_generic_entry_update(ent, irq_flags, pc); \ - ent->type = event_call->id; \ - \ + entry = (struct ftrace_raw_##call *)ftrace_perf_buf_prepare( \ + __entry_size, event_call->id, &rctx, &irq_flags); \ + if (!entry) \ + return; \ tstruct \ \ { 
assign; } \ \ - perf_tp_event(event_call->id, __addr, __count, entry, \ - __entry_size); \ - \ -end: \ - perf_swevent_put_recursion_context(rctx); \ -end_recursion: \ - local_irq_restore(irq_flags); \ + ftrace_perf_buf_submit(entry, __entry_size, rctx, __addr, \ + __count, irq_flags); \ } #undef DEFINE_EVENT diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 9e25573242cf..f0d693005075 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -6,14 +6,12 @@ */ #include +#include #include "trace.h" -char *perf_trace_buf; -EXPORT_SYMBOL_GPL(perf_trace_buf); - -char *perf_trace_buf_nmi; -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); +static char *perf_trace_buf; +static char *perf_trace_buf_nmi; typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; @@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id) } mutex_unlock(&event_mutex); } + +__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) +{ + struct trace_entry *entry; + char *trace_buf, *raw_data; + int pc, cpu; + + pc = preempt_count(); + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(*irq_flags); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + goto err_recursion; + + cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto err; + + raw_data = per_cpu_ptr(trace_buf, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + entry = (struct trace_entry *)raw_data; + tracing_generic_entry_update(entry, *irq_flags, pc); + entry->type = type; + + return raw_data; +err: + perf_swevent_put_recursion_context(*rctxp); +err_recursion: + local_irq_restore(*irq_flags); + return NULL; +} +EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); diff --git 
a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d6266cad6953..2e28ee36646f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1243,14 +1243,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1258,45 +1254,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, 
irq_flags); return 0; } @@ -1308,14 +1275,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1323,46 +1286,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kretprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ret_ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); return 0; } diff --git 
a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f694f66d75b0..4e332b9e449c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; - char *trace_buf; - char *raw_data; int syscall_nr; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) "profile buffer not large enough")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + sys_data->enter_event->id, &rctx, &flags); + if (!rec) + return; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysenter_enable(struct ftrace_event_call *call) @@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; - char *trace_buf; - char *raw_data; int rctx; int size; - int cpu; syscall_nr = 
syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) "exit event has grown above profile buffer size")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - - rec = (struct syscall_trace_exit *)raw_data; + rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + sys_data->exit_event->id, &rctx, &flags); + if (!rec) + return; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysexit_enable(struct ftrace_event_call *call) -- cgit v1.2.3 From bb209c8287d2d55ec4a67e3933346e0a3ee0da76 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 26 Jan 2010 17:10:03 +0000 Subject: powerpc/pci: Add calls to set_pcie_port_type() and set_pcie_hotplug_bridge() We are missing these when building the pci_dev from scratch off the Open Firmware device-tree Signed-off-by: Benjamin Herrenschmidt Acked-by: Jesse Barnes --- arch/powerpc/kernel/pci_of_scan.c | 2 ++ drivers/pci/probe.c | 4 ++-- include/linux/pci.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 7311fdfb9bf8..693eb9a25bfa 100644 --- 
a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -140,6 +140,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, dev->devfn = devfn; dev->multifunction = 0; /* maybe a lie? */ dev->needs_freset = 0; /* pcie fundamental reset required */ + set_pcie_port_type(dev); dev->vendor = get_int_prop(node, "vendor-id", 0xffff); dev->device = get_int_prop(node, "device-id", 0xffff); @@ -164,6 +165,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, /* a PCI-PCI bridge */ dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; dev->rom_base_reg = PCI_ROM_ADDRESS1; + set_pcie_hotplug_bridge(dev); } else if (!strcmp(type, "cardbus")) { dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; } else { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 98ffb2de22e9..446e4a94d7d3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -681,7 +681,7 @@ static void pci_read_irq(struct pci_dev *dev) dev->irq = irq; } -static void set_pcie_port_type(struct pci_dev *pdev) +void set_pcie_port_type(struct pci_dev *pdev) { int pos; u16 reg16; @@ -695,7 +695,7 @@ static void set_pcie_port_type(struct pci_dev *pdev) pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; } -static void set_pcie_hotplug_bridge(struct pci_dev *pdev) +void set_pcie_hotplug_bridge(struct pci_dev *pdev) { int pos; u16 reg16; diff --git a/include/linux/pci.h b/include/linux/pci.h index 174e5392e51e..c1968f464c38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -756,6 +756,10 @@ pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); +/* For use by arch with custom probe code */ +void set_pcie_port_type(struct pci_dev *pdev); +void set_pcie_hotplug_bridge(struct pci_dev *pdev); + /* Functions for PCI Hotplug drivers to use */ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap); #ifdef CONFIG_HOTPLUG -- cgit v1.2.3 From 
cb6ecf6f7afece066265e243657b0ac28150a7b2 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Thu, 28 Jan 2010 22:28:27 -0800 Subject: Input: add the ABS_MT_PRESSURE event For pressure-based multi-touch devices, a direct way to send sensor intensity data per finger is needed. This patch adds the ABS_MT_PRESSURE event to the MT protocol. Requested-by: Yoonyoung Shim Requested-by: Mika Kuoppala Requested-by: Peter Hutterer Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 1 + include/linux/input.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 30b503b8d67b..86cb2d2196ff 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -46,6 +46,7 @@ static unsigned int input_abs_bypass_init_data[] __initdata = { ABS_MT_TOOL_TYPE, ABS_MT_BLOB_ID, ABS_MT_TRACKING_ID, + ABS_MT_PRESSURE, 0 }; static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)]; diff --git a/include/linux/input.h b/include/linux/input.h index 7be8a6537b57..735ceaf1bc2d 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -660,6 +660,7 @@ struct input_absinfo { #define ABS_MT_TOOL_TYPE 0x37 /* Type of touching device */ #define ABS_MT_BLOB_ID 0x38 /* Group a set of packets as a blob */ #define ABS_MT_TRACKING_ID 0x39 /* Unique ID of initiated contact */ +#define ABS_MT_PRESSURE 0x3a /* Pressure on contact area */ #define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) -- cgit v1.2.3 From 9f41699ed067fa695faff8e2e9981b2550abec62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2010 15:59:29 +0100 Subject: bitops: Provide compile time HWEIGHT{8,16,32,64} Provide compile time versions of hweight. 
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner LKML-Reference: <20100122155535.797688466@chello.nl> [ Remove some whitespace damage while we are at it ] Signed-off-by: Ingo Molnar --- include/linux/bitops.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index c05a29cb9bb2..ba0fd1eb4af7 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -25,7 +25,7 @@ static __inline__ int get_bitmask_order(unsigned int count) { int order; - + order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } @@ -33,7 +33,7 @@ static __inline__ int get_bitmask_order(unsigned int count) static __inline__ int get_count_order(unsigned int count) { int order; - + order = fls(count) - 1; if (count & (count - 1)) order++; @@ -45,6 +45,20 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); } +#define HWEIGHT8(w) \ + ( (!!((w) & (1ULL << 0))) + \ + (!!((w) & (1ULL << 1))) + \ + (!!((w) & (1ULL << 2))) + \ + (!!((w) & (1ULL << 3))) + \ + (!!((w) & (1ULL << 4))) + \ + (!!((w) & (1ULL << 5))) + \ + (!!((w) & (1ULL << 6))) + \ + (!!((w) & (1ULL << 7))) ) + +#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8(w >> 8)) +#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16(w >> 16)) +#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32(w >> 32)) + /** * rol32 - rotate a 32-bit value left * @word: value to rotate -- cgit v1.2.3 From 184f412c3341cd24fbd26604634a5800b83dbdc3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Jan 2010 08:39:39 +0100 Subject: perf, x86: Clean up event constraints code a bit - Remove stray debug code - Improve ugly macros a bit - Remove some whitespace damage - (Also fix up some accumulated damage in perf_event.h) Signed-off-by: Ingo Molnar Cc: Stephane Eranian Cc: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/cpu/perf_event.c | 37 ++++++++----------------------------- include/linux/perf_event.h | 24 +++++++++++------------- 2 files changed, 19 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66de282ad2fb..fdbe24842271 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,24 +93,19 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define EVENT_CONSTRAINT(c, n, m) { \ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = HWEIGHT64((u64)(n)), \ } -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define INTEL_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, 
INTEL_ARCH_FIXED_MASK) +#define EVENT_CONSTRAINT_END EVENT_CONSTRAINT(0, 0, 0) -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->cmask; (e)++) +#define for_each_event_constraint(e, c) for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -1276,14 +1271,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; -#if 0 - pr_debug("CPU%d fast config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - hwc->idx, - assign ? 'y' : 'n'); -#endif - set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; @@ -1333,14 +1320,6 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; -#if 0 - pr_debug("CPU%d slow config=0x%llx idx=%d assign=%c\n", - smp_processor_id(), - hwc->config, - j, - assign ? 'y' : 'n'); -#endif - set_bit(j, used_mask); if (assign) @@ -2596,9 +2575,9 @@ static const struct pmu pmu = { * validate a single event group * * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 72b2615600d8..953c17731e0d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -290,7 +290,7 @@ struct perf_event_mmap_page { }; #define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) -#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) #define PERF_RECORD_MISC_USER (2 << 0) #define PERF_RECORD_MISC_HYPERVISOR (3 << 0) @@ -356,8 +356,8 @@ enum perf_event_type { * u64 stream_id; * }; */ - PERF_RECORD_THROTTLE = 5, - PERF_RECORD_UNTHROTTLE = 6, + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, /* * struct { @@ -371,10 +371,10 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, tid; + * struct perf_event_header header; + * u32 pid, tid; * - * struct read_format values; + * struct read_format values; * }; */ PERF_RECORD_READ = 8, @@ -412,7 +412,7 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * }; */ - PERF_RECORD_SAMPLE = 9, + PERF_RECORD_SAMPLE = 9, PERF_RECORD_MAX, /* non-ABI */ }; @@ -752,8 +752,7 @@ extern int perf_max_events; extern const struct pmu *hw_perf_event_init(struct perf_event *event); extern void perf_event_task_sched_in(struct task_struct *task); -extern void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next); +extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); extern void perf_event_task_tick(struct task_struct *task); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); @@ -853,8 +852,7 @@ extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern void perf_event_init(void); -extern void perf_tp_event(int event_id, u64 addr, u64 count, - void *record, int entry_size); +extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int 
entry_size); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags @@ -895,13 +893,13 @@ static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } static inline void -perf_bp_event(struct perf_event *event, void *data) { } +perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } -static inline int perf_swevent_get_recursion_context(void) { return -1; } +static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } -- cgit v1.2.3 From 488991e28e55b4fbca8067edf0259f69d1a6f92c Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 29 Jan 2010 09:04:08 +0100 Subject: block: Added in stricter no merge semantics for block I/O Updated 'nomerges' tunable to accept a value of '2' - indicating that _no_ merges at all are to be attempted (not even the simple one-hit cache). The following table illustrates the additional benefit - 5 minute runs of a random I/O load were applied to a dozen devices on a 16-way x86_64 system. nomerges Throughput %System Improvement (tput / %sys) -------- ------------ ----------- ------------------------- 0 12.45 MB/sec 0.669365609 1 12.50 MB/sec 0.641519199 0.40% / 2.71% 2 12.52 MB/sec 0.639849750 0.56% / 2.96% Signed-off-by: Alan D. 
Brunelle Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 14 ++++++++++++++ Documentation/block/queue-sysfs.txt | 10 +++++----- block/blk-sysfs.c | 11 +++++++---- block/elevator.c | 11 ++++++++++- include/linux/blkdev.h | 3 +++ 5 files changed, 39 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index d2f90334bb93..4873c759d535 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -128,3 +128,17 @@ Description: preferred request size for workloads where sustained throughput is desired. If no optimal I/O size is reported this file contains 0. + +What: /sys/block/<disk>/queue/nomerges +Date: January 2010 +Contact: +Description: + Standard I/O elevator operations include attempts to + merge contiguous I/Os. For known random I/O loads these + attempts will always fail and result in extra cycles + being spent in the kernel. This allows one to turn off + this behavior in one of two ways: When set to 1, complex + merge checks are disabled, but the simple one-shot merges + with the previous I/O request are enabled. When set to 2, + all merge tries are disabled. The default value is 0 - + which enables all types of merge tries. diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index e164403f60e1..f65274081c8d 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -25,11 +25,11 @@ size allowed by the hardware. nomerges (RW) ------------- -This enables the user to disable the lookup logic involved with IO merging -requests in the block layer. Merging may still occur through a direct -1-hit cache, since that comes for (almost) free. The IO scheduler will not -waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults -to 0, enabling all merges. 
+This enables the user to disable the lookup logic involved with IO +merging requests in the block layer. By default (0) all merges are +enabled. When set to 1 only simple one-hit merges will be tried. When +set to 2 no merge algorithms will be tried (including one-hit or more +complex tree/hash lookups). nr_requests (RW) ---------------- diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8606c9543fdd..e85442415db3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -189,7 +189,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nomerges(q), page); + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); } static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, @@ -199,10 +200,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&nm, page, count); spin_lock_irq(q->queue_lock); - if (nm) + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) queue_flag_set(QUEUE_FLAG_NOMERGES, q); - else - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + else if (nm) + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/elevator.c b/block/elevator.c index 9ad5ccc4c5ee..ee3a883840f2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -473,6 +473,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) struct request *__rq; int ret; + /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q)) + return ELEVATOR_NO_MERGE; + /* * First try one-hit cache. 
*/ @@ -484,7 +493,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) } } - if (blk_queue_nomerges(q)) + if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ffb13ad35716..f71f5c58620c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -463,6 +463,7 @@ struct request_queue #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_CQ 16 /* hardware does queuing */ #define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 18 /* No extended merges */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ @@ -589,6 +590,8 @@ enum { #define blk_queue_queuing(q) test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) +#define blk_queue_noxmerges(q) \ + test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) -- cgit v1.2.3 From 1f5b8f8a2031ae9507eb67743cad4d424739bfff Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 28 Jan 2010 15:02:41 -0800 Subject: ntp: Make time_esterror and time_maxerror static Make time_esterror and time_maxerror static as no one uses them outside of ntp.c Signed-off-by: John Stultz Cc: richard@rsk.demon.co.uk LKML-Reference: <1264719761.3437.47.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 3 --- kernel/time/ntp.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 94f8faecdcbc..7a082b32d8e1 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -238,9 +238,6 @@ extern int tickadj; /* 
amount of adjustment per tick */ * phase-lock loop variables */ extern int time_status; /* clock synchronization status bits */ -extern long time_maxerror; /* maximum error */ -extern long time_esterror; /* estimated error */ - extern long time_adjust; /* The amount of adjtime left */ extern void ntp_init(void); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910e..74b1b37b1595 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -58,10 +58,10 @@ static s64 time_offset; static long time_constant = 2; /* maximum error (usecs): */ -long time_maxerror = NTP_PHASE_LIMIT; +static long time_maxerror = NTP_PHASE_LIMIT; /* estimated error (usecs): */ -long time_esterror = NTP_PHASE_LIMIT; +static long time_esterror = NTP_PHASE_LIMIT; /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; -- cgit v1.2.3 From 221af7f87b97431e3ee21ce4b0e77d5411cf1549 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jan 2010 22:14:42 -0800 Subject: Split 'flush_old_exec' into two functions 'flush_old_exec()' is the point of no return when doing an execve(), and it is pretty badly misnamed. It doesn't just flush the old executable environment, it also starts up the new one. Which is very inconvenient for things like setting up the new personality, because we want the new personality to affect the starting of the new environment, but at the same time we do _not_ want the new personality to take effect if flushing the old one fails. As a result, the x86-64 '32-bit' personality is actually done using this insane "I'm going to change the ABI, but I haven't done it yet" bit (TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the personality, but just the "pending" bit, so that "flush_thread()" can do the actual personality magic. 
This patch in no way changes any of that insanity, but it does split the 'flush_old_exec()' function up into a preparatory part that can fail (still called flush_old_exec()), and a new part that will actually set up the new exec environment (setup_new_exec()). All callers are changed to trivially comply with the new world order. Signed-off-by: H. Peter Anvin Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/sh/kernel/process_64.c | 2 +- arch/x86/ia32/ia32_aout.c | 10 ++++++---- fs/binfmt_aout.c | 1 + fs/binfmt_elf.c | 27 ++------------------------- fs/binfmt_elf_fdpic.c | 3 +++ fs/binfmt_flat.c | 1 + fs/binfmt_som.c | 1 + fs/exec.c | 26 ++++++++++++++++---------- include/linux/binfmts.h | 1 + include/linux/sched.h | 2 +- 10 files changed, 33 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 31f80c61b031..ec79faf6f021 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -368,7 +368,7 @@ void exit_thread(void) void flush_thread(void) { - /* Called by fs/exec.c (flush_old_exec) to remove traces of a + /* Called by fs/exec.c (setup_new_exec) to remove traces of a * previously running executable. 
*/ #ifdef CONFIG_SH_FPU if (last_task_used_math == current) { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2a4d073d2cf1..435d2a5323da 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,15 +308,17 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) return retval; - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); clear_thread_flag(TIF_ABI_PENDING); + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; + current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); current->mm->end_data = ex.a_data + diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 346b69405363..fdd397099172 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -264,6 +264,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) #else set_personality(PER_LINUX); #endif + setup_new_exec(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index edd90c49003c..fd5b2ea5d299 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; - /* - * The early SET_PERSONALITY here is so that the lookup - * for the interpreter happens in the namespace of the - * to-be-execed image. SET_PERSONALITY can select an - * alternate root. - * - * However, SET_PERSONALITY is NOT allowed to switch - * this task into the new images's memory mapping - * policy - that is, TASK_SIZE must still evaluate to - * that which is appropriate to the execing application. 
- * This is because exit_mmap() needs to have TASK_SIZE - * evaluate to the size of the old image. - * - * So if (say) a 64-bit application is execing a 32-bit - * application it is the architecture's responsibility - * to defer changing the value of TASK_SIZE until the - * switch really is going to happen - do this in - * flush_thread(). - akpm - */ - SET_PERSONALITY(loc->elf_ex); - interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) @@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; - } else { - /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) current->flags |= PF_RANDOMIZE; - arch_pick_mmap_layout(current->mm); + + setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c57d9ce5ff7e..18d77297ccc8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -321,6 +321,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, set_personality(PER_LINUX_FDPIC); if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index d4a00ea1054c..42c6b4a54445 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); } /* diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 2a9b5330cc5e..cc8560f6c9b0 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; + setup_new_exec(bprm); /* Set the task size for HP-UX processes such that * the gateway page is outside the address space. 
diff --git a/fs/exec.c b/fs/exec.c index 632b02e34ec7..675c3f44c2ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -941,9 +941,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -963,6 +961,20 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; bprm->mm = NULL; /* We're using it now */ + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -1019,14 +1031,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index cd4349bdc34e..89c6249fc561 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -109,6 +109,7 @@ extern int prepare_binprm(struct linux_binprm *); extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern void setup_new_exec(struct linux_binprm * bprm); extern int suid_dumpable; #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7bba93929b..abdfacc58653 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1369,7 +1369,7 @@ struct task_struct { char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) - - initialized normally by flush_old_exec */ + - initialized normally by setup_new_exec */ /* file system info */ int link_count, total_link_count; #ifdef CONFIG_SYSVIPC -- cgit v1.2.3 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. 
Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hw_breakpoint.h | 2 ++ kernel/hw_breakpoint.c | 52 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. 
+ */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 41235c93e4e9..070ba0621738 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -75,6 +75,8 @@ extern int __register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); +extern int dbg_reserve_bp_slot(struct perf_event *bp); +extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..8a5c7d55ac9f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if 
(slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { -- cgit v1.2.3 From ef7995f4e46b1677f3eaaf547316e1a910b38dcb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 29 Jan 2010 23:59:12 -0800 Subject: Input: implement input filters Sometimes it is desirable to suppress certain events from reaching input handlers and thus user space. One such example is Mac mouse button emulation code which catches certain key presses and converts them into button clicks as if they were emitted by a virtual mouse. The original key press events should be completely suppressed, otherwise user space will be confused, and while keyboard driver does it on its own evdev is blissfully unaware of this arrangement. This patch adds notion of 'filter' to the standard input handlers, which may flag event as filtered thus preventing it from reaching other input handlers. 
Filters don't (nor will they ever) have a notion of priority relative to each other, input core will run all of them first and any one of them may mark event as filtered. This patch is inspired by similar patch by Matthew Garret but the implementation and intended usage are quite different. Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 41 ++++++++++++++++++++++++++++++++++------- include/linux/input.h | 8 ++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index 6c161e220868..7080a9d4b840 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -86,12 +86,14 @@ static int input_defuzz_abs_event(int value, int old_val, int fuzz) } /* - * Pass event through all open handles. This function is called with + * Pass event first through all filters and then, if event has not been + * filtered out, through all open handles. This function is called with * dev->event_lock held and interrupts disabled. 
*/ static void input_pass_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { + struct input_handler *handler; struct input_handle *handle; rcu_read_lock(); @@ -99,11 +101,25 @@ static void input_pass_event(struct input_dev *dev, handle = rcu_dereference(dev->grab); if (handle) handle->handler->event(handle, type, code, value); - else - list_for_each_entry_rcu(handle, &dev->h_list, d_node) - if (handle->open) - handle->handler->event(handle, - type, code, value); + else { + bool filtered = false; + + list_for_each_entry_rcu(handle, &dev->h_list, d_node) { + if (!handle->open) + continue; + + handler = handle->handler; + if (!handler->filter) { + if (filtered) + break; + + handler->event(handle, type, code, value); + + } else if (handler->filter(handle, type, code, value)) + filtered = true; + } + } + rcu_read_unlock(); } @@ -990,6 +1006,8 @@ static int input_handlers_seq_show(struct seq_file *seq, void *v) union input_seq_state *state = (union input_seq_state *)&seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); + if (handler->filter) + seq_puts(seq, " (filter)"); if (handler->fops) seq_printf(seq, " Minor=%d", handler->minor); seq_putc(seq, '\n'); @@ -1803,7 +1821,16 @@ int input_register_handle(struct input_handle *handle) error = mutex_lock_interruptible(&dev->mutex); if (error) return error; - list_add_tail_rcu(&handle->d_node, &dev->h_list); + + /* + * Filters go to the head of the list, normal handlers + * to the tail. + */ + if (handler->filter) + list_add_rcu(&handle->d_node, &dev->h_list); + else + list_add_tail_rcu(&handle->d_node, &dev->h_list); + mutex_unlock(&dev->mutex); /* diff --git a/include/linux/input.h b/include/linux/input.h index 7be8a6537b57..6c9d3d49fa91 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1198,6 +1198,8 @@ struct input_handle; * @event: event handler. 
This method is being called by input core with * interrupts disabled and dev->event_lock spinlock held and so * it may not sleep + * @filter: similar to @event; separates normal event handlers from + * "filters". * @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. This function is called by @@ -1219,6 +1221,11 @@ struct input_handle; * same time. All of them will get their copy of input event generated by * the device. * + * The very same structure is used to implement input filters. Input core + * allows filters to run first and will not pass event to regular handlers + * if any of the filters indicate that the event should be filtered (by + * returning %true from their filter() method). + * * Note that input core serializes calls to connect() and disconnect() * methods. */ @@ -1227,6 +1234,7 @@ struct input_handler { void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); + bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); -- cgit v1.2.3 From 99b089c3c38a83ebaeb1cc4584ddcde841626467 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 30 Jan 2010 00:53:29 -0800 Subject: Input: Mac button emulation - implement as an input filter Current implementation of Mac mouse button emulation plugs into legacy keyboard driver, converts certain keys into button events on a separate device, and suppresses the real events from reaching tty. This worked well enough until user space started using evdev which was completely unaware of this arrangement and kept sending original key presses to its users. 
Change the implementation to use newly added input filter framework so that original key presses are not transmitted to any handlers. As a bonus remove SYSCTL dependencies from the code and use Kconfig instead; also do not create the emulated mouse device until user activates emulation. Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 5 - drivers/macintosh/Kconfig | 1 + drivers/macintosh/mac_hid.c | 257 ++++++++++++++++++++++++++++++++------------ include/linux/kbd_kern.h | 3 - 4 files changed, 188 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index f706b1dffdb3..cbf64b985ef4 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -1185,11 +1185,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) rep = (down == 2); -#ifdef CONFIG_MAC_EMUMOUSEBTN - if (mac_hid_mouse_emulate_buttons(1, keycode, down)) - return; -#endif /* CONFIG_MAC_EMUMOUSEBTN */ - if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw) if (emulate_raw(vc, keycode, !down << 7)) if (keycode < BTN_MISC && printk_ratelimit()) diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index 3d906833948d..aa3c27e5255d 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -172,6 +172,7 @@ config INPUT_ADBHID config MAC_EMUMOUSEBTN bool "Support for mouse button 2+3 emulation" + depends on SYSCTL select INPUT help This provides generic support for emulating the 2nd and 3rd mouse diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7b4ef5bb556b..0b210a90aef5 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -13,17 +13,195 @@ #include #include #include -#include - -static struct input_dev *emumousebtn; -static int emumousebtn_input_register(void); static int mouse_emulate_buttons; static int mouse_button2_keycode = KEY_RIGHTCTRL; /* right control key */ static int mouse_button3_keycode = KEY_RIGHTALT; /* right 
option key */ -static int mouse_last_keycode; -#if defined(CONFIG_SYSCTL) +static struct input_dev *mac_hid_emumouse_dev; + +static int mac_hid_create_emumouse(void) +{ + static struct lock_class_key mac_hid_emumouse_dev_event_class; + static struct lock_class_key mac_hid_emumouse_dev_mutex_class; + int err; + + mac_hid_emumouse_dev = input_allocate_device(); + if (!mac_hid_emumouse_dev) + return -ENOMEM; + + lockdep_set_class(&mac_hid_emumouse_dev->event_lock, + &mac_hid_emumouse_dev_event_class); + lockdep_set_class(&mac_hid_emumouse_dev->mutex, + &mac_hid_emumouse_dev_mutex_class); + + mac_hid_emumouse_dev->name = "Macintosh mouse button emulation"; + mac_hid_emumouse_dev->id.bustype = BUS_ADB; + mac_hid_emumouse_dev->id.vendor = 0x0001; + mac_hid_emumouse_dev->id.product = 0x0001; + mac_hid_emumouse_dev->id.version = 0x0100; + + mac_hid_emumouse_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + mac_hid_emumouse_dev->keybit[BIT_WORD(BTN_MOUSE)] = + BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + mac_hid_emumouse_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + + err = input_register_device(mac_hid_emumouse_dev); + if (err) { + input_free_device(mac_hid_emumouse_dev); + mac_hid_emumouse_dev = NULL; + return err; + } + + return 0; +} + +static void mac_hid_destroy_emumouse(void) +{ + input_unregister_device(mac_hid_emumouse_dev); + mac_hid_emumouse_dev = NULL; +} + +static bool mac_hid_emumouse_filter(struct input_handle *handle, + unsigned int type, unsigned int code, + int value) +{ + unsigned int btn; + + if (type != EV_KEY) + return false; + + if (code == mouse_button2_keycode) + btn = BTN_MIDDLE; + else if (code == mouse_button3_keycode) + btn = BTN_RIGHT; + else + return false; + + input_report_key(mac_hid_emumouse_dev, btn, value); + input_sync(mac_hid_emumouse_dev); + + return true; +} + +static int mac_hid_emumouse_connect(struct input_handler *handler, + struct input_dev *dev, + const struct input_device_id *id) +{ + struct 
input_handle *handle; + int error; + + /* Don't bind to ourselves */ + if (dev == mac_hid_emumouse_dev) + return -ENODEV; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "mac-button-emul"; + + error = input_register_handle(handle); + if (error) { + printk(KERN_ERR + "mac_hid: Failed to register button emulation handle, " + "error %d\n", error); + goto err_free; + } + + error = input_open_device(handle); + if (error) { + printk(KERN_ERR + "mac_hid: Failed to open input device, error %d\n", + error); + goto err_unregister; + } + + return 0; + + err_unregister: + input_unregister_handle(handle); + err_free: + kfree(handle); + return error; +} + +static void mac_hid_emumouse_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +static const struct input_device_id mac_hid_emumouse_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT, + .evbit = { BIT_MASK(EV_KEY) }, + }, + { }, +}; + +MODULE_DEVICE_TABLE(input, mac_hid_emumouse_ids); + +static struct input_handler mac_hid_emumouse_handler = { + .filter = mac_hid_emumouse_filter, + .connect = mac_hid_emumouse_connect, + .disconnect = mac_hid_emumouse_disconnect, + .name = "mac-button-emul", + .id_table = mac_hid_emumouse_ids, +}; + +static int mac_hid_start_emulation(void) +{ + int err; + + err = mac_hid_create_emumouse(); + if (err) + return err; + + err = input_register_handler(&mac_hid_emumouse_handler); + if (err) { + mac_hid_destroy_emumouse(); + return err; + } + + return 0; +} + +static void mac_hid_stop_emulation(void) +{ + input_unregister_handler(&mac_hid_emumouse_handler); + mac_hid_destroy_emumouse(); +} + +static int mac_hid_toggle_emumouse(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int *valp = table->data; + int old_val = *valp; + int rc; + + rc = proc_dointvec(table, 
write, buffer, lenp, ppos); + + if (rc == 0 && write && *valp != old_val) { + if (*valp == 1) + rc = mac_hid_start_emulation(); + else if (*valp == 0) + mac_hid_stop_emulation(); + else + rc = -EINVAL; + } + + /* Restore the old value in case of error */ + if (rc) + *valp = old_val; + + return rc; +} + /* file(s) in /proc/sys/dev/mac_hid */ static ctl_table mac_hid_files[] = { { @@ -31,7 +209,7 @@ static ctl_table mac_hid_files[] = { .data = &mouse_emulate_buttons, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = mac_hid_toggle_emumouse, }, { .procname = "mouse_button2_keycode", @@ -74,73 +252,12 @@ static ctl_table mac_hid_root_dir[] = { static struct ctl_table_header *mac_hid_sysctl_header; -#endif /* endif CONFIG_SYSCTL */ - -int mac_hid_mouse_emulate_buttons(int caller, unsigned int keycode, int down) -{ - switch (caller) { - case 1: - /* Called from keyboard.c */ - if (mouse_emulate_buttons - && (keycode == mouse_button2_keycode - || keycode == mouse_button3_keycode)) { - if (mouse_emulate_buttons == 1) { - input_report_key(emumousebtn, - keycode == mouse_button2_keycode ? BTN_MIDDLE : BTN_RIGHT, - down); - input_sync(emumousebtn); - return 1; - } - mouse_last_keycode = down ? 
keycode : 0; - } - break; - } - return 0; -} - -static struct lock_class_key emumousebtn_event_class; -static struct lock_class_key emumousebtn_mutex_class; - -static int emumousebtn_input_register(void) -{ - int ret; - - emumousebtn = input_allocate_device(); - if (!emumousebtn) - return -ENOMEM; - - lockdep_set_class(&emumousebtn->event_lock, &emumousebtn_event_class); - lockdep_set_class(&emumousebtn->mutex, &emumousebtn_mutex_class); - - emumousebtn->name = "Macintosh mouse button emulation"; - emumousebtn->id.bustype = BUS_ADB; - emumousebtn->id.vendor = 0x0001; - emumousebtn->id.product = 0x0001; - emumousebtn->id.version = 0x0100; - - emumousebtn->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); - emumousebtn->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | - BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); - emumousebtn->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); - - ret = input_register_device(emumousebtn); - if (ret) - input_free_device(emumousebtn); - - return ret; -} static int __init mac_hid_init(void) { - int err; - - err = emumousebtn_input_register(); - if (err) - return err; - -#if defined(CONFIG_SYSCTL) mac_hid_sysctl_header = register_sysctl_table(mac_hid_root_dir); -#endif /* CONFIG_SYSCTL */ + if (!mac_hid_sysctl_header) + return -ENOMEM; return 0; } diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index 8bdb16bfe5fb..506ad20c18f8 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -161,7 +161,4 @@ static inline void con_schedule_flip(struct tty_struct *t) schedule_delayed_work(&t->buf.work, 0); } -/* mac_hid.c */ -extern int mac_hid_mouse_emulate_buttons(int, unsigned int, int); - #endif -- cgit v1.2.3 From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 27 Jan 2010 16:25:22 -0600 Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the 
TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. 
This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kgdb.c | 6 +++--- kernel/softlockup.c | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7bba93929b..89232151a9d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -310,6 +310,7 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); +extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void __user *buffer, @@ -323,6 +324,9 @@ static inline void softlockup_tick(void) static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) +{ +} static inline void touch_all_softlockup_watchdogs(void) { } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..87f2cc557553 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1450,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - 
touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1550,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..0d4c7898ab80 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } -- cgit v1.2.3 From 61ef2489dbf587258526cfd4ebf4bba3b079f401 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 22 Jan 2010 16:16:19 +0800 Subject: resources: introduce generic page_is_ram() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's based on walk_system_ram_range(), for archs that don't have their own page_is_ram(). The static versions in MIPS and SCORE are also made global.
v4: prefer plain 1 instead of PAGE_IS_RAM (H. Peter Anvin) v3: add comment (KAMEZAWA Hiroyuki) "AFAIK, this "System RAM" information has been used for kdump to grab valid memory area and seems good for the kernel itself." v2: add PAGE_IS_RAM macro (Américo Wang) Cc: Chen Liqin Cc: Lennox Wu Cc: Américo Wang Cc: linux-mips@linux-mips.org Cc: Yinghai Lu Acked-by: Ralf Baechle Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang LKML-Reference: <20100122081619.GA6431@localhost> Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/mips/mm/init.c | 2 +- arch/score/mm/init.c | 2 +- include/linux/ioport.h | 2 ++ kernel/resource.c | 13 +++++++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 15aa1902a788..4d72aabe8352 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -294,7 +294,7 @@ void __init fixrange_init(unsigned long start, unsigned long end, } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 4e3dcd0c4716..f684a590c21d 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -59,7 +59,7 @@ static unsigned long setup_zero_page(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { if (pagenr >= min_low_pfn && pagenr < max_low_pfn) return 1; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 83aa81297ea3..11ef7952b63a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,5 +188,7 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int page_is_ram(unsigned long pfn); + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 
fb11a58b9594..b4d637a55256 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -297,6 +297,19 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, #endif +static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + return 1; +} +/* + * This generic page_is_ram() returns true if specified address is + * registered as "System RAM" in iomem_resource list. + */ +int __attribute__((weak)) page_is_ram(unsigned long pfn) +{ + return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; +} + /* * Find empty slot in the resource tree given range and alignment. */ -- cgit v1.2.3 From 53df8fdc15fb646b0219e43c989c2cdab1ab100c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Jan 2010 11:06:39 +0800 Subject: Move page_is_ram() declaration to mm.h Move page_is_ram() declaration to mm.h, it makes no sense in <linux/ioport.h>. Signed-off-by: Wu Fengguang LKML-Reference: <20100127030639.GD8132@localhost> Signed-off-by: H. Peter Anvin --- include/linux/ioport.h | 2 -- include/linux/mm.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 11ef7952b63a..83aa81297ea3 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -188,7 +188,5 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); -extern int page_is_ram(unsigned long pfn); - #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 24c395694f4d..bad433fdbfce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -265,6 +265,8 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } +extern int page_is_ram(unsigned long pfn); + /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); -- cgit v1.2.3
From b79c7adf82e8b8a6d6ad1dadf7e687a4a030cf8c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 2 Feb 2010 13:01:25 +0900 Subject: mtd: trivial sh_flctl changes This patch contains a few changes for the sh_flctl driver: - not sh7723-only driver - get rid of kconfig dependency - use dev_err() instead of printk() - use __devinit and __devexit for probe()/remove() - fix probe() return values Signed-off-by: Magnus Damm Acked-by: Yoshihiro Shimoda Signed-off-by: Paul Mundt --- drivers/mtd/nand/Kconfig | 4 ++-- drivers/mtd/nand/sh_flctl.c | 42 +++++++++++++++++++++++------------------- include/linux/mtd/sh_flctl.h | 1 + 3 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 677cd53f18c3..bb6465604235 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -457,10 +457,10 @@ config MTD_NAND_NOMADIK config MTD_NAND_SH_FLCTL tristate "Support for NAND on Renesas SuperH FLCTL" - depends on MTD_NAND && SUPERH && CPU_SUBTYPE_SH7723 + depends on MTD_NAND && SUPERH help Several Renesas SuperH CPU has FLCTL. This option enables support - for NAND Flash using FLCTL. This driver support SH7723. + for NAND Flash using FLCTL. config MTD_NAND_DAVINCI tristate "Support NAND on DaVinci SoC" diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index 02bef21f2e4b..ab068a503b29 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -1,10 +1,10 @@ /* * SuperH FLCTL nand controller * - * Copyright © 2008 Renesas Solutions Corp. - * Copyright © 2008 Atom Create Engineering Co., Ltd. + * Copyright (c) 2008 Renesas Solutions Corp. + * Copyright (c) 2008 Atom Create Engineering Co., Ltd. 
* - * Based on fsl_elbc_nand.c, Copyright © 2006-2007 Freescale Semiconductor + * Based on fsl_elbc_nand.c, Copyright (c) 2006-2007 Freescale Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -75,6 +75,11 @@ static void start_translation(struct sh_flctl *flctl) writeb(TRSTRT, FLTRCR(flctl)); } +static void timeout_error(struct sh_flctl *flctl, const char *str) +{ + dev_err(&flctl->pdev->dev, "Timeout occured in %s\n", str); +} + static void wait_completion(struct sh_flctl *flctl) { uint32_t timeout = LOOP_TIMEOUT_MAX; @@ -87,7 +92,7 @@ static void wait_completion(struct sh_flctl *flctl) udelay(1); } - printk(KERN_ERR "wait_completion(): Timeout occured \n"); + timeout_error(flctl, __func__); writeb(0x0, FLTRCR(flctl)); } @@ -132,7 +137,7 @@ static void wait_rfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_rfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static void wait_wfifo_ready(struct sh_flctl *flctl) @@ -146,7 +151,7 @@ static void wait_wfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static int wait_recfifo_ready(struct sh_flctl *flctl, int sector_number) @@ -198,7 +203,7 @@ static int wait_recfifo_ready(struct sh_flctl *flctl, int sector_number) writel(0, FL4ECCCR(flctl)); } - printk(KERN_ERR "wait_recfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); return 1; /* timeout */ } @@ -214,7 +219,7 @@ static void wait_wecfifo_ready(struct sh_flctl *flctl) return; udelay(1); } - printk(KERN_ERR "wait_wecfifo_ready(): Timeout occured \n"); + timeout_error(flctl, __func__); } static void read_datareg(struct sh_flctl *flctl, int offset) @@ -769,38 +774,36 @@ static int flctl_chip_init_tail(struct mtd_info *mtd) return 0; } -static int __init flctl_probe(struct platform_device 
*pdev) +static int __devinit flctl_probe(struct platform_device *pdev) { struct resource *res; struct sh_flctl *flctl; struct mtd_info *flctl_mtd; struct nand_chip *nand; struct sh_flctl_platform_data *pdata; - int ret; + int ret = -ENXIO; pdata = pdev->dev.platform_data; if (pdata == NULL) { - printk(KERN_ERR "sh_flctl platform_data not found.\n"); - return -ENODEV; + dev_err(&pdev->dev, "no platform data defined\n"); + return -EINVAL; } flctl = kzalloc(sizeof(struct sh_flctl), GFP_KERNEL); if (!flctl) { - printk(KERN_ERR "Unable to allocate NAND MTD dev structure.\n"); + dev_err(&pdev->dev, "failed to allocate driver data\n"); return -ENOMEM; } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { - printk(KERN_ERR "%s: resource not found.\n", __func__); - ret = -ENODEV; + dev_err(&pdev->dev, "failed to get I/O memory\n"); goto err; } - flctl->reg = ioremap(res->start, res->end - res->start + 1); + flctl->reg = ioremap(res->start, resource_size(res)); if (flctl->reg == NULL) { - printk(KERN_ERR "%s: ioremap error.\n", __func__); - ret = -ENOMEM; + dev_err(&pdev->dev, "failed to remap I/O memory\n"); goto err; } @@ -808,6 +811,7 @@ static int __init flctl_probe(struct platform_device *pdev) flctl_mtd = &flctl->mtd; nand = &flctl->chip; flctl_mtd->priv = nand; + flctl->pdev = pdev; flctl->hwecc = pdata->has_hwecc; flctl_register_init(flctl, pdata->flcmncr_val); @@ -846,7 +850,7 @@ err: return ret; } -static int __exit flctl_remove(struct platform_device *pdev) +static int __devexit flctl_remove(struct platform_device *pdev) { struct sh_flctl *flctl = platform_get_drvdata(pdev); diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h index e77c1cea404d..164c9d4013c0 100644 --- a/include/linux/mtd/sh_flctl.h +++ b/include/linux/mtd/sh_flctl.h @@ -96,6 +96,7 @@ struct sh_flctl { struct mtd_info mtd; struct nand_chip chip; + struct platform_device *pdev; void __iomem *reg; uint8_t done_buff[2048 + 64]; /* max size 2048 + 64 */ -- cgit 
v1.2.3 From 010ab820582d03bcd3648416b5837107e8a9c5f3 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 27 Jan 2010 09:17:21 +0000 Subject: mtd: sh_flctl SHBUSSEL and SEL_16BIT support This patch extends the sh_flctl driver with support for 16-bit bus configuration using SEL_16BIT and support for multiplexed pins using SHBUSSEL. Signed-off-by: Magnus Damm Acked-by: Yoshihiro Shimoda Signed-off-by: Paul Mundt --- drivers/mtd/nand/sh_flctl.c | 27 ++++++++++++++++++++++++++- include/linux/mtd/sh_flctl.h | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index ab068a503b29..1842df8bdd93 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -105,6 +105,8 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr) addr = page_addr; /* ERASE1 */ } else if (page_addr != -1) { /* SEQIN, READ0, etc.. */ + if (flctl->chip.options & NAND_BUSWIDTH_16) + column >>= 1; if (flctl->page_size) { addr = column & 0x0FFF; addr |= (page_addr & 0xff) << 16; @@ -280,7 +282,7 @@ static void write_fiforeg(struct sh_flctl *flctl, int rlen, int offset) static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_val) { struct sh_flctl *flctl = mtd_to_flctl(mtd); - uint32_t flcmncr_val = readl(FLCMNCR(flctl)); + uint32_t flcmncr_val = readl(FLCMNCR(flctl)) & ~SEL_16BIT; uint32_t flcmdcr_val, addr_len_bytes = 0; /* Set SNAND bit if page size is 2048byte */ @@ -302,6 +304,8 @@ static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_va case NAND_CMD_READOOB: addr_len_bytes = flctl->rw_ADRCNT; flcmdcr_val |= CDSRC_E; + if (flctl->chip.options & NAND_BUSWIDTH_16) + flcmncr_val |= SEL_16BIT; break; case NAND_CMD_SEQIN: /* This case is that cmd is READ0 or READ1 or READ00 */ @@ -310,6 +314,8 @@ static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_va case NAND_CMD_PAGEPROG: addr_len_bytes = 
flctl->rw_ADRCNT; flcmdcr_val |= DOCMD2_E | CDSRC_E | SELRW; + if (flctl->chip.options & NAND_BUSWIDTH_16) + flcmncr_val |= SEL_16BIT; break; case NAND_CMD_READID: flcmncr_val &= ~SNAND_E; @@ -528,6 +534,8 @@ static void flctl_cmdfunc(struct mtd_info *mtd, unsigned int command, set_addr(mtd, 0, page_addr); flctl->read_bytes = mtd->writesize + mtd->oobsize; + if (flctl->chip.options & NAND_BUSWIDTH_16) + column >>= 1; flctl->index += column; goto read_normal_exit; @@ -691,6 +699,18 @@ static uint8_t flctl_read_byte(struct mtd_info *mtd) return data; } +static uint16_t flctl_read_word(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int index = flctl->index; + uint16_t data; + uint16_t *buf = (uint16_t *)&flctl->done_buff[index]; + + data = *buf; + flctl->index += 2; + return data; +} + static void flctl_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) { int i; @@ -829,6 +849,11 @@ static int __devinit flctl_probe(struct platform_device *pdev) nand->select_chip = flctl_select_chip; nand->cmdfunc = flctl_cmdfunc; + if (pdata->flcmncr_val & SEL_16BIT) { + nand->options |= NAND_BUSWIDTH_16; + nand->read_word = flctl_read_word; + } + ret = nand_scan_ident(flctl_mtd, 1); if (ret) goto err; diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h index 164c9d4013c0..ab77609ec337 100644 --- a/include/linux/mtd/sh_flctl.h +++ b/include/linux/mtd/sh_flctl.h @@ -51,6 +51,8 @@ #define _4ECCCNTEN (0x1 << 24) #define _4ECCEN (0x1 << 23) #define _4ECCCORRECT (0x1 << 22) +#define SHBUSSEL (0x1 << 20) +#define SEL_16BIT (0x1 << 19) #define SNAND_E (0x1 << 18) /* SNAND (0=512 1=2048)*/ #define QTSEL_E (0x1 << 17) #define ENDIAN (0x1 << 16) /* 1 = little endian */ -- cgit v1.2.3 From c30f540b63047437ffa894b5353216410c480d1a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 2 Feb 2010 15:03:24 +0100 Subject: netfilter: xtables: CONFIG_COMPAT redux Ifdef out struct nf_sockopt_ops::compat_set struct nf_sockopt_ops::compat_get struct 
xt_match::compat_from_user struct xt_match::compat_to_user struct xt_match::compatsize to make structures smaller on COMPAT=n kernels. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 9 ++++++--- include/linux/netfilter/x_tables.h | 12 ++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 48c54960773c..78f33d223680 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -114,15 +114,17 @@ struct nf_sockopt_ops { int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, void __user *user, unsigned int len); +#ifdef CONFIG_COMPAT int (*compat_set)(struct sock *sk, int optval, void __user *user, unsigned int len); - +#endif int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); +#ifdef CONFIG_COMPAT int (*compat_get)(struct sock *sk, int optval, void __user *user, int *len); - +#endif /* Use the module struct to lock set/get code in place */ struct module *owner; }; @@ -222,11 +224,12 @@ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); - +#ifdef CONFIG_COMPAT int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); +#endif /* Call this before modifying an existing packet: ensures it is modifiable and linear to the point you care about (writable_len). diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 3caf5e151102..026eb78ee83c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -283,11 +283,11 @@ struct xt_match { /* Called when entry of this type deleted. 
*/ void (*destroy)(const struct xt_mtdtor_param *); - +#ifdef CONFIG_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); int (*compat_to_user)(void __user *dst, void *src); - +#endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -296,7 +296,9 @@ struct xt_match { const char *table; unsigned int matchsize; +#ifdef CONFIG_COMPAT unsigned int compatsize; +#endif unsigned int hooks; unsigned short proto; @@ -323,17 +325,19 @@ struct xt_target { /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); - +#ifdef CONFIG_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); int (*compat_to_user)(void __user *dst, void *src); - +#endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; +#ifdef CONFIG_COMPAT unsigned int compatsize; +#endif unsigned int hooks; unsigned short proto; -- cgit v1.2.3 From c85bb41e93184bf5494dde6d8fe5a81b564c84c8 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 2 Feb 2010 07:32:29 -0800 Subject: igmp: fix ip_mc_sf_allow race [v5] Almost all igmp functions accessing inet->mc_list are protected by rtnl_lock(), but there is one exception which is ip_mc_sf_allow(), so there is a chance of either ip_mc_drop_socket or ip_mc_leave_group remove an entry while ip_mc_sf_allow is running causing a crash. Signed-off-by: Flavio Leitner Signed-off-by: David S. 
Miller --- include/linux/igmp.h | 2 ++ net/ipv4/igmp.c | 83 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 724c27e5d173..93fc2449af10 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -153,6 +153,7 @@ extern int sysctl_igmp_max_msf; struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; + struct rcu_head rcu; __be32 sl_addr[0]; }; @@ -170,6 +171,7 @@ struct ip_mc_socklist { struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist *sflist; + struct rcu_head rcu; }; struct ip_sf_list { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8f5468393f01..d28363998743 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1799,7 +1799,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - inet->mc_list = iml; + rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1807,24 +1807,46 @@ done: return err; } +static void ip_sf_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_sf_socklist *psf; + + psf = container_of(rp, struct ip_sf_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(psf); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { + struct ip_sf_socklist *psf = iml->sflist; int err; - if (iml->sflist == NULL) { + if (psf == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, - iml->sfmode, iml->sflist->sl_count, - iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + 
rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + call_rcu(&psf->rcu, ip_sf_socklist_reclaim); return err; } + +static void ip_mc_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_mc_socklist *iml; + + iml = container_of(rp, struct ip_mc_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(iml); +} + + /* * Ask a socket to leave a group. */ @@ -1854,12 +1876,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) (void) ip_mc_leave_src(sk, iml, in_dev); - *imlp = iml->next; + rcu_assign_pointer(*imlp, iml->next); if (in_dev) ip_mc_dec_group(in_dev, group); rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); return 0; } if (!in_dev) @@ -1974,9 +1998,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) { for (i=0; i<psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } - pmc->sflist = psl = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { @@ -2072,11 +2099,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); -
pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: @@ -2209,30 +2238,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; + int ret; + ret = 1; if (!ipv4_is_multicast(loc_addr)) - return 1; + goto out; - for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + rcu_read_lock(); + for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && pmc->multi.imr_ifindex == dif) break; } + ret = inet->mc_all; if (!pmc) - return inet->mc_all; + goto unlock; psl = pmc->sflist; + ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) - return pmc->sfmode == MCAST_EXCLUDE; + goto unlock; for (i=0; i<psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } + ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - return 0; + goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - return 0; - return 1; + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; } /* @@ -2251,7 +2290,7 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; - inet->mc_list = iml->next; + rcu_assign_pointer(inet->mc_list, iml->next); in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); @@ -2259,7 +2298,9 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); } rtnl_unlock(); } -- cgit v1.2.3 From f9bfbebf34eab707b065116cdc9699d25ba4252a Mon Sep 17 00:00:00 2001 From: Shirley Ma Date: Fri, 29 Jan 2010 03:19:05 +0000 Subject: virtio: Add ability to detach unused buffers from vrings There's
currently no way for a virtio driver to ask for unused buffers, so it has to keep a list itself to reclaim them at shutdown. This is redundant, since virtio_ring stores that information. So add a new hook to do this. Signed-off-by: Shirley Ma Signed-off-by: Amit Shah Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/virtio/virtio_ring.c | 25 +++++++++++++++++++++++++ include/linux/virtio.h | 4 ++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fbd2ecde93e4..71929ee00d69 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -334,6 +334,30 @@ static bool vring_enable_cb(struct virtqueue *_vq) return true; } +static void *vring_detach_unused_buf(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->vring.num; i++) { + if (!vq->data[i]) + continue; + /* detach_buf clears data, so grab it now. */ + buf = vq->data[i]; + detach_buf(vq, i); + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->num_free != vq->vring.num); + + END_USE(vq); + return NULL; +} + irqreturn_t vring_interrupt(int irq, void *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -360,6 +384,7 @@ static struct virtqueue_ops vring_vq_ops = { .kick = vring_kick, .disable_cb = vring_disable_cb, .enable_cb = vring_enable_cb, + .detach_unused_buf = vring_detach_unused_buf, }; struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 057a2e010758..f508c651e53d 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -51,6 +51,9 @@ struct virtqueue { * This re-enables callbacks; it returns "false" if there are pending * buffers in the queue, to detect a possible race between the driver * checking for more work, and enabling callbacks. 
+ * @detach_unused_buf: detach first unused buffer + * vq: the struct virtqueue we're talking about. + * Returns NULL or the "data" token handed to add_buf * * Locking rules are straightforward: the driver is responsible for * locking. No two operations may be invoked simultaneously, with the exception @@ -71,6 +74,7 @@ struct virtqueue_ops { void (*disable_cb)(struct virtqueue *vq); bool (*enable_cb)(struct virtqueue *vq); + void *(*detach_unused_buf)(struct virtqueue *vq); }; /** -- cgit v1.2.3 From f98bfbd78c37c5946cc53089da32a5f741efdeb7 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Tue, 2 Feb 2010 15:58:48 -0800 Subject: connector: Delete buggy notification code. On Tue, Feb 02, 2010 at 02:57:14PM -0800, Greg KH (gregkh@suse.de) wrote: > > There are at least two ways to fix it: using a big cannon and a small > > one. The former way is to disable notification registration, since it is > > not used by anyone at all. Second way is to check whether calling > > process is root and its destination group is -1 (kind of privileged > > one) before command is dispatched to workqueue. > > Well if no one is using it, removing it makes the most sense, right? > > No objection from me, care to make up a patch either way for this? Since it is not used, let's drop support for notifications about (un)registered events from connector. Another option was to check credentials on receiving, but we can always restore it without bugs if needed, but genetlink has a wider code base and no one complained that userspace cannot get notification when some other clients were (un)registered. Kudos to Sebastian Krahmer, who found a bug in the code. Signed-off-by: Evgeniy Polyakov Acked-by: Greg Kroah-Hartman Signed-off-by: David S.
Miller --- drivers/connector/connector.c | 175 ------------------------------------------ include/linux/connector.h | 32 -------- 2 files changed, 207 deletions(-) (limited to 'include/linux') diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index f06024668f99..537c29ac4487 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -36,17 +36,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); -static u32 cn_idx = CN_IDX_CONNECTOR; -static u32 cn_val = CN_VAL_CONNECTOR; - -module_param(cn_idx, uint, 0); -module_param(cn_val, uint, 0); -MODULE_PARM_DESC(cn_idx, "Connector's main device idx."); -MODULE_PARM_DESC(cn_val, "Connector's main device val."); - -static DEFINE_MUTEX(notify_lock); -static LIST_HEAD(notify_list); - static struct cn_dev cdev; static int cn_already_initialized; @@ -209,54 +198,6 @@ static void cn_rx_skb(struct sk_buff *__skb) } } -/* - * Notification routing. - * - * Gets id and checks if there are notification request for it's idx - * and val. If there are such requests notify the listeners with the - * given notify event. 
- * - */ -static void cn_notify(struct cb_id *id, u32 notify_event) -{ - struct cn_ctl_entry *ent; - - mutex_lock(&notify_lock); - list_for_each_entry(ent, &notify_list, notify_entry) { - int i; - struct cn_notify_req *req; - struct cn_ctl_msg *ctl = ent->msg; - int idx_found, val_found; - - idx_found = val_found = 0; - - req = (struct cn_notify_req *)ctl->data; - for (i = 0; i < ctl->idx_notify_num; ++i, ++req) { - if (id->idx >= req->first && - id->idx < req->first + req->range) { - idx_found = 1; - break; - } - } - - for (i = 0; i < ctl->val_notify_num; ++i, ++req) { - if (id->val >= req->first && - id->val < req->first + req->range) { - val_found = 1; - break; - } - } - - if (idx_found && val_found) { - struct cn_msg m = { .ack = notify_event, }; - - memcpy(&m.id, id, sizeof(m.id)); - cn_netlink_send(&m, ctl->group, GFP_KERNEL); - } - } - mutex_unlock(&notify_lock); -} - /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. @@ -276,8 +217,6 @@ int cn_add_callback(struct cb_id *id, char *name, if (err) return err; - cn_notify(id, 0); - return 0; } EXPORT_SYMBOL_GPL(cn_add_callback); @@ -295,111 +234,9 @@ void cn_del_callback(struct cb_id *id) struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); - cn_notify(id, 1); } EXPORT_SYMBOL_GPL(cn_del_callback); -/* - * Checks two connector's control messages to be the same. - * Returns 1 if they are the same or if the first one is corrupted.
- */ -static int cn_ctl_msg_equals(struct cn_ctl_msg *m1, struct cn_ctl_msg *m2) -{ - int i; - struct cn_notify_req *req1, *req2; - - if (m1->idx_notify_num != m2->idx_notify_num) - return 0; - - if (m1->val_notify_num != m2->val_notify_num) - return 0; - - if (m1->len != m2->len) - return 0; - - if ((m1->idx_notify_num + m1->val_notify_num) * sizeof(*req1) != - m1->len) - return 1; - - req1 = (struct cn_notify_req *)m1->data; - req2 = (struct cn_notify_req *)m2->data; - - for (i = 0; i < m1->idx_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - for (i = 0; i < m1->val_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - return 1; -} - -/* - * Main connector device's callback. - * - * Used for notification of a request's processing. - */ -static void cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) -{ - struct cn_ctl_msg *ctl; - struct cn_ctl_entry *ent; - u32 size; - - if (msg->len < sizeof(*ctl)) - return; - - ctl = (struct cn_ctl_msg *)msg->data; - - size = (sizeof(*ctl) + ((ctl->idx_notify_num + - ctl->val_notify_num) * - sizeof(struct cn_notify_req))); - - if (msg->len != size) - return; - - if (ctl->len + sizeof(*ctl) != msg->len) - return; - - /* - * Remove notification. 
- */ - if (ctl->group == 0) { - struct cn_ctl_entry *n; - - mutex_lock(&notify_lock); - list_for_each_entry_safe(ent, n, &notify_list, notify_entry) { - if (cn_ctl_msg_equals(ent->msg, ctl)) { - list_del(&ent->notify_entry); - kfree(ent); - } - } - mutex_unlock(&notify_lock); - - return; - } - - size += sizeof(*ent); - - ent = kzalloc(size, GFP_KERNEL); - if (!ent) - return; - - ent->msg = (struct cn_ctl_msg *)(ent + 1); - - memcpy(ent->msg, ctl, size - sizeof(*ent)); - - mutex_lock(&notify_lock); - list_add(&ent->notify_entry, &notify_list); - mutex_unlock(&notify_lock); -} - static int cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; @@ -437,11 +274,8 @@ static const struct file_operations cn_file_ops = { static int __devinit cn_init(void) { struct cn_dev *dev = &cdev; - int err; dev->input = cn_rx_skb; - dev->id.idx = cn_idx; - dev->id.val = cn_val; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, CN_NETLINK_USERS + 0xf, @@ -457,14 +291,6 @@ static int __devinit cn_init(void) cn_already_initialized = 1; - err = cn_add_callback(&dev->id, "connector", &cn_callback); - if (err) { - cn_already_initialized = 0; - cn_queue_free_dev(dev->cbdev); - netlink_kernel_release(dev->nls); - return -EINVAL; - } - proc_net_fops_create(&init_net, "connector", S_IRUGO, &cn_file_ops); return 0; @@ -478,7 +304,6 @@ static void __devexit cn_fini(void) proc_net_remove(&init_net, "connector"); - cn_del_callback(&dev->id); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } diff --git a/include/linux/connector.h b/include/linux/connector.h index 72ba63eb83c5..3a779ffba60b 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -24,9 +24,6 @@ #include -#define CN_IDX_CONNECTOR 0xffffffff -#define CN_VAL_CONNECTOR 0xffffffff - /* * Process Events connector unique ids -- used for message routing */ @@ -75,30 +72,6 @@ struct cn_msg { __u8 data[0]; }; -/* - * Notify structure - requests notification about - * 
registering/unregistering idx/val in range [first, first+range]. - */ -struct cn_notify_req { - __u32 first; - __u32 range; -}; - -/* - * Main notification control message - * *_notify_num - number of appropriate cn_notify_req structures after - * this struct. - * group - notification receiver's idx. - * len - total length of the attached data. - */ -struct cn_ctl_msg { - __u32 idx_notify_num; - __u32 val_notify_num; - __u32 group; - __u32 len; - __u8 data[0]; -}; - #ifdef __KERNEL__ #include @@ -151,11 +124,6 @@ struct cn_callback_entry { u32 seq, group; }; -struct cn_ctl_entry { - struct list_head notify_entry; - struct cn_ctl_msg *msg; -}; - struct cn_dev { struct cb_id id; -- cgit v1.2.3 From 24551f64d47af9539a7f324343bffeea09d9dcfa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 12 Jan 2010 21:25:24 +0000 Subject: lmb: Add lmb_free() We can free memory allocated with lmb_alloc() by removing it from the list of reserved LMBs. Rework lmb_remove() to allow that possibility and add lmb_free() which exploits it. 
BenH: Removed some useless parenthesis Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- include/linux/lmb.h | 1 + lib/lmb.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lmb.h b/include/linux/lmb.h index ef82b8fcbddb..f3d14333ebed 100644 --- a/include/linux/lmb.h +++ b/include/linux/lmb.h @@ -42,6 +42,7 @@ extern void __init lmb_init(void); extern void __init lmb_analyze(void); extern long lmb_add(u64 base, u64 size); extern long lmb_remove(u64 base, u64 size); +extern long __init lmb_free(u64 base, u64 size); extern long __init lmb_reserve(u64 base, u64 size); extern u64 __init lmb_alloc_nid(u64 size, u64 align, int nid, u64 (*nid_range)(u64, u64, int *)); diff --git a/lib/lmb.c b/lib/lmb.c index 9cee17142b2c..b1fc52606524 100644 --- a/lib/lmb.c +++ b/lib/lmb.c @@ -205,9 +205,8 @@ long lmb_add(u64 base, u64 size) } -long lmb_remove(u64 base, u64 size) +static long __lmb_remove(struct lmb_region *rgn, u64 base, u64 size) { - struct lmb_region *rgn = &(lmb.memory); u64 rgnbegin, rgnend; u64 end = base + size; int i; @@ -254,6 +253,16 @@ long lmb_remove(u64 base, u64 size) return lmb_add_region(rgn, end, rgnend - end); } +long lmb_remove(u64 base, u64 size) +{ + return __lmb_remove(&lmb.memory, base, size); +} + +long __init lmb_free(u64 base, u64 size) +{ + return __lmb_remove(&lmb.reserved, base, size); +} + long __init lmb_reserve(u64 base, u64 size) { struct lmb_region *_rgn = &lmb.reserved; -- cgit v1.2.3 From add67461240c1dadc7c8d97e66f8f92b556ca523 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 13:45:12 +0100 Subject: netfilter: add struct net * to target parameters Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 2 ++ net/bridge/netfilter/ebtables.c | 10 ++++++---- net/ipv4/netfilter/ip_tables.c | 8 +++++--- net/ipv6/netfilter/ip6_tables.c | 8 +++++--- 4 files changed, 18 insertions(+), 10 deletions(-) (limited 
to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 026eb78ee83c..365fabe1b16e 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -249,6 +249,7 @@ struct xt_target_param { * Other fields see above. */ struct xt_tgchk_param { + struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; @@ -259,6 +260,7 @@ struct xt_tgchk_param { /* Target destructor parameters */ struct xt_tgdtor_param { + struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 1aa0e4c1f52d..12beb580aa21 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -579,13 +579,14 @@ ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) } static inline int -ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) +ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i) { struct xt_tgdtor_param par; if (i && (*i)-- == 0) return 1; + par.net = net; par.target = w->u.watcher; par.targinfo = w->data; par.family = NFPROTO_BRIDGE; @@ -606,10 +607,11 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) /* we're done */ if (cnt && (*cnt)-- == 0) return 1; - EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + par.net = net; par.target = t->u.target; par.targinfo = t->data; par.family = NFPROTO_BRIDGE; @@ -674,7 +676,7 @@ ebt_check_entry(struct ebt_entry *e, } i = 0; - mtpar.net = net; + mtpar.net = tgpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; mtpar.hook_mask = tgpar.hook_mask = hookmask; @@ -730,7 +732,7 @@ ebt_check_entry(struct ebt_entry *e, 
(*cnt)++; return 0; cleanup_watchers: - EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j); + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j); cleanup_matches: EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cfaba0e2e6fc..7fde8f6950d8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -638,10 +638,11 @@ err: return ret; } -static int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct ipt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -697,7 +698,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; @@ -788,6 +789,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net, unsigned int *i) IPT_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ipt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; @@ -1675,7 +1677,7 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) goto cleanup_matches; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 9f1d45f2ba8f..0376ed6d5594 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -669,10 +669,11 @@ err: return ret; } -static int check_target(struct ip6t_entry *e, const char *name) +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct ip6t_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ 
-729,7 +730,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; @@ -820,6 +821,7 @@ cleanup_entry(struct ip6t_entry *e, struct net *net, unsigned int *i) IP6T_MATCH_ITERATE(e, cleanup_match, net, NULL); t = ip6t_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; @@ -1710,7 +1712,7 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, if (ret) goto cleanup_matches; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; -- cgit v1.2.3 From 0cebe4b4163b6373c9d24c1a192939777bc27e55 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 13:51:51 +0100 Subject: netfilter: ctnetlink: support selective event delivery Add two masks for conntrack and expectation events to struct nf_conntrack_ecache and use them to filter events. Their default value is "all events" when the event sysctl is on and "no events" when it is off. A following patch will add specific initializations. Expectation events depend on the ecache struct of their master conntrack. 
Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 18 ++++++++ include/net/netfilter/nf_conntrack_ecache.h | 59 +++++++++++++-------------- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- 4 files changed, 48 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a374787ed9b0..ebfed90733f7 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -74,6 +74,24 @@ enum ip_conntrack_status { IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), }; +/* Connection tracking event types */ +enum ip_conntrack_events { + IPCT_NEW, /* new conntrack */ + IPCT_RELATED, /* related conntrack */ + IPCT_DESTROY, /* destroyed conntrack */ + IPCT_REPLY, /* connection has seen two-way traffic */ + IPCT_ASSURED, /* connection status has changed to assured */ + IPCT_PROTOINFO, /* protocol information has changed */ + IPCT_HELPER, /* new helper has been set */ + IPCT_MARK, /* new mark has been set */ + IPCT_NATSEQADJ, /* NAT is doing sequence adjustment */ + IPCT_SECMARK, /* new security mark has been set */ +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW, /* new expectation */ +}; + #ifdef __KERNEL__ struct ip_conntrack_stat { unsigned int searched; diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 5e05fb883ab1..96ba5f7dcab6 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,28 +12,12 @@ #include #include -/* Connection tracking event types */ -enum ip_conntrack_events { - IPCT_NEW, /* new conntrack */ - IPCT_RELATED, /* related conntrack */ - IPCT_DESTROY, /* destroyed conntrack */ - IPCT_REPLY, /* connection has seen two-way traffic */ - IPCT_ASSURED, /* connection status has changed to assured */ - IPCT_PROTOINFO, /* 
protocol information has changed */ - IPCT_HELPER, /* new helper has been set */ - IPCT_MARK, /* new mark has been set */ - IPCT_NATSEQADJ, /* NAT is doing sequence adjustment */ - IPCT_SECMARK, /* new security mark has been set */ -}; - -enum ip_conntrack_expect_events { - IPEXP_NEW, /* new expectation */ -}; - struct nf_conntrack_ecache { - unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ - u32 pid; /* netlink pid of destroyer */ + unsigned long cache; /* bitops want long */ + unsigned long missed; /* missed events */ + u16 ctmask; /* bitmask of ct events to be delivered */ + u16 expmask; /* bitmask of expect events to be delivered */ + u32 pid; /* netlink pid of destroyer */ }; static inline struct nf_conntrack_ecache * @@ -43,14 +27,24 @@ nf_ct_ecache_find(const struct nf_conn *ct) } static inline struct nf_conntrack_ecache * -nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp) +nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; - if (!net->ct.sysctl_events) + if (!ctmask && !expmask && net->ct.sysctl_events) { + ctmask = ~0; + expmask = ~0; + } + if (!ctmask && !expmask) return NULL; - return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + if (e) { + e->ctmask = ctmask; + e->expmask = expmask; + } + return e; }; #ifdef CONFIG_NF_CONNTRACK_EVENTS @@ -83,6 +77,9 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; + if (!(e->ctmask & (1 << event))) + return; + set_bit(event, &e->cache); } @@ -93,7 +90,6 @@ nf_conntrack_eventmask_report(unsigned int eventmask, int report) { int ret = 0; - struct net *net = nf_ct_net(ct); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; @@ -102,9 +98,6 @@ nf_conntrack_eventmask_report(unsigned int eventmask, if (notify == NULL) goto out_unlock; - if (!net->ct.sysctl_events) - goto 
out_unlock; - e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; @@ -118,6 +111,9 @@ nf_conntrack_eventmask_report(unsigned int eventmask, /* This is a resent of a destroy event? If so, skip missed */ unsigned long missed = e->pid ? 0 : e->missed; + if (!((eventmask | missed) & e->ctmask)) + goto out_unlock; + ret = notify->fcn(eventmask | missed, &item); if (unlikely(ret < 0 || missed)) { spin_lock_bh(&ct->lock); @@ -173,18 +169,19 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event, u32 pid, int report) { - struct net *net = nf_ct_exp_net(exp); struct nf_exp_event_notifier *notify; + struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(nf_expect_event_cb); if (notify == NULL) goto out_unlock; - if (!net->ct.sysctl_events) + e = nf_ct_ecache_find(exp->master); + if (e == NULL) goto out_unlock; - { + if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .pid = pid, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 091ff770eb7b..53b8da6ad6b7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -648,7 +648,7 @@ init_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ff594eb138c1..f5c0b09e12f1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1281,7 +1281,7 @@ ctnetlink_create_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) -- cgit v1.2.3 From b2a15a604d379af323645e330638e2cfcc696aff Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 14:13:03 
+0100 Subject: netfilter: nf_conntrack: support conntrack templates Support initializing selected parameters of new conntrack entries from a "conntrack template", which is a specially marked conntrack entry attached to the skb. Currently the helper and the event delivery masks can be initialized this way. Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 4 +++ include/net/netfilter/nf_conntrack.h | 5 +++ include/net/netfilter/nf_conntrack_helper.h | 3 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 50 +++++++++++++++++--------- net/netfilter/nf_conntrack_helper.c | 17 ++++++--- net/netfilter/nf_conntrack_netlink.c | 2 +- 8 files changed, 61 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index ebfed90733f7..c608677dda60 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -72,6 +72,10 @@ enum ip_conntrack_status { /* Connection has fixed timeout. */ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), + + /* Conntrack is a template */ + IPS_TEMPLATE_BIT = 11, + IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), }; /* Connection tracking event types */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a0904adfb8f7..5043d61c99a7 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -272,6 +272,11 @@ nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp); +static inline int nf_ct_is_template(const struct nf_conn *ct) +{ + return test_bit(IPS_TEMPLATE_BIT, &ct->status); +} + /* It's confirmed if it is, or has been in the hash table. 
*/ static inline int nf_ct_is_confirmed(struct nf_conn *ct) { diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 86be7c4816d6..e17aaa3e19fd 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -47,7 +47,8 @@ extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); -extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags); +extern int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags); extern void nf_ct_helper_destroy(struct nf_conn *ct); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 331ead3ebd1b..77627fa80561 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -59,7 +59,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; #endif #endif diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 0956ebabbff2..55ce22e5de49 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -212,7 +212,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum, struct sk_buff *reasm; /* Previously seen (loopback)? 
*/ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 53b8da6ad6b7..471e2a79d26f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -618,7 +618,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * -init_conntrack(struct net *net, +init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, @@ -628,6 +628,7 @@ init_conntrack(struct net *net, struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -648,7 +649,11 @@ init_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); - nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? 
ecache->expmask : 0, + GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); @@ -673,7 +678,7 @@ init_conntrack(struct net *net, nf_conntrack_get(&ct->master->ct_general); NF_CT_STAT_INC(net, expect_new); } else { - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } @@ -694,7 +699,7 @@ init_conntrack(struct net *net, /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct net *net, +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, @@ -718,7 +723,8 @@ resolve_normal_ct(struct net *net, /* look for tuple match */ h = nf_conntrack_find_get(net, &tuple); if (!h) { - h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -755,7 +761,7 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - struct nf_conn *ct; + struct nf_conn *ct, *tmpl = NULL; enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -764,10 +770,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, int set_reply = 0; int ret; - /* Previously seen (loopback or untracked)? Ignore. */ if (skb->nfct) { - NF_CT_STAT_INC_ATOMIC(net, ignore); - return NF_ACCEPT; + /* Previously seen (loopback or untracked)? Ignore. 
*/ + tmpl = (struct nf_conn *)skb->nfct; + if (!nf_ct_is_template(tmpl)) { + NF_CT_STAT_INC_ATOMIC(net, ignore); + return NF_ACCEPT; + } + skb->nfct = NULL; } /* rcu_read_lock()ed by nf_hook_slow */ @@ -778,7 +788,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, pr_debug("not prepared to track yet or error occured\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } l4proto = __nf_ct_l4proto_find(pf, protonum); @@ -791,22 +802,25 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, if (ret <= 0) { NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } } - ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, + ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(net, invalid); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } if (IS_ERR(ct)) { /* Too stressed to deal. 
*/ NF_CT_STAT_INC_ATOMIC(net, drop); - return NF_DROP; + ret = NF_DROP; + goto out; } NF_CT_ASSERT(skb->nfct); @@ -821,11 +835,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, NF_CT_STAT_INC_ATOMIC(net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(net, drop); - return -ret; + ret = -ret; + goto out; } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); +out: + if (tmpl) + nf_ct_put(tmpl); return ret; } @@ -864,7 +882,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, return; rcu_read_lock(); - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index c0e461f466ae..8144b0da5515 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -96,13 +96,22 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags) +int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags) { + struct nf_conntrack_helper *helper = NULL; + struct nf_conn_help *help; int ret = 0; - struct nf_conntrack_helper *helper; - struct nf_conn_help *help = nfct_help(ct); - helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (tmpl != NULL) { + help = nfct_help(tmpl); + if (help != NULL) + helper = help->helper; + } + + help = nfct_help(ct); + if (helper == NULL) + helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); if (helper == NULL) { if (help) rcu_assign_pointer(help->helper, NULL); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f5c0b09e12f1..09044f9f4b2e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c 
@@ -1249,7 +1249,7 @@ ctnetlink_create_conntrack(struct net *net, } } else { /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); if (err < 0) goto err2; } -- cgit v1.2.3 From d4bfa033ed84e0ae446eff445d107ffd5ee78df3 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 29 Jan 2010 15:03:36 +0100 Subject: HID: make raw reports possible for both feature and output reports In commit 2da31939a42 ("Bluetooth: Implement raw output support for HIDP layer"), support for Bluetooth hid_output_raw_report was added, but it pushes the data to the intr socket instead of the ctrl one. This has been fixed by 6bf8268f9a91f1 ("Bluetooth: Use the control channel for raw HID reports") Still, it is necessary to distinguish whether the report in question should be either FEATURE or OUTPUT. For this, we have to extend the generic HID API, so that hid_output_raw_report() callback provides means to specify this value so that it can be passed down to lower level hardware drivers (currently Bluetooth and USB). 
Based on original patch by Bastien Nocera Acked-by: Marcel Holtmann Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- drivers/hid/usbhid/hid-core.c | 5 +++-- include/linux/hid.h | 2 +- net/bluetooth/hidp/core.c | 17 ++++++++++++++--- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index cdd136942bca..d04476700b7b 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -134,7 +134,7 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t goto out; } - ret = dev->hid_output_raw_report(dev, buf, count); + ret = dev->hid_output_raw_report(dev, buf, count, HID_OUTPUT_REPORT); out: kfree(buf); return ret; diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index e2997a8d5e1b..caa16c057ce2 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -774,7 +774,8 @@ static int hid_alloc_buffers(struct usb_device *dev, struct hid_device *hid) return 0; } -static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t count) +static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t count, + unsigned char report_type) { struct usbhid_device *usbhid = hid->driver_data; struct usb_device *dev = hid_to_usb_dev(hid); @@ -785,7 +786,7 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), HID_REQ_SET_REPORT, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, - ((HID_OUTPUT_REPORT + 1) << 8) | *buf, + ((report_type + 1) << 8) | *buf, interface->desc.bInterfaceNumber, buf + 1, count - 1, USB_CTRL_SET_TIMEOUT); diff --git a/include/linux/hid.h b/include/linux/hid.h index 87093652dda8..3661a626941d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -501,7 +501,7 @@ struct hid_device { /* device report descriptor */ void (*hiddev_report_event) (struct hid_device *, struct hid_report 
*); /* handler for raw output data, used by hidraw */ - int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t); + int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t, unsigned char); /* debugging support via debugfs */ unsigned short debug; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6cf526d06e21..37ba153c4cd4 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -313,10 +313,21 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep return hidp_queue_report(session, buf, rsize); } -static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count) +static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count, + unsigned char report_type) { - if (hidp_send_ctrl_message(hid->driver_data, - HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE, + switch (report_type) { + case HID_FEATURE_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; + break; + case HID_OUTPUT_REPORT: + report_type = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + break; + default: + return -EINVAL; + } + + if (hidp_send_ctrl_message(hid->driver_data, report_type, data, count)) return -ENOMEM; return count; -- cgit v1.2.3 From 84f3bb9ae9db90f7fb15d98b55279a58ab1b2363 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 17:17:06 +0100 Subject: netfilter: xtables: add CT target Add a new target for the raw table, which can be used to specify conntrack parameters for specific connections, f.i. the conntrack helper. The target attaches a "template" connection tracking entry to the skb, which is used by the conntrack core when initializing a new conntrack. 
Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_CT.h | 17 +++ include/net/netfilter/nf_conntrack_helper.h | 3 + net/netfilter/Kconfig | 12 +++ net/netfilter/Makefile | 1 + net/netfilter/nf_conntrack_helper.c | 19 ++++ net/netfilter/xt_CT.c | 158 ++++++++++++++++++++++++++++ 7 files changed, 211 insertions(+) create mode 100644 include/linux/netfilter/xt_CT.h create mode 100644 net/netfilter/xt_CT.c (limited to 'include/linux') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 2aea50399c0b..a5a63e41b8af 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -6,6 +6,7 @@ header-y += nfnetlink_queue.h header-y += xt_CLASSIFY.h header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h +header-y += xt_CT.h header-y += xt_DSCP.h header-y += xt_LED.h header-y += xt_MARK.h diff --git a/include/linux/netfilter/xt_CT.h b/include/linux/netfilter/xt_CT.h new file mode 100644 index 000000000000..7fd0effe1316 --- /dev/null +++ b/include/linux/netfilter/xt_CT.h @@ -0,0 +1,17 @@ +#ifndef _XT_CT_H +#define _XT_CT_H + +#define XT_CT_NOTRACK 0x1 + +struct xt_ct_target_info { + u_int16_t flags; + u_int16_t __unused; + u_int32_t ct_events; + u_int32_t exp_events; + char helper[16]; + + /* Used internally by the kernel */ + struct nf_conn *ct __attribute__((aligned(8))); +}; + +#endif /* _XT_CT_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index e17aaa3e19fd..32c305dbdab6 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -42,6 +42,9 @@ struct nf_conntrack_helper { extern struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); +extern struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); + extern int nf_conntrack_helper_register(struct nf_conntrack_helper *); 
extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 634d14affc8d..4469d45261f4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -341,6 +341,18 @@ config NETFILTER_XT_TARGET_CONNSECMARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_CT + tristate '"CT" target support' + depends on NF_CONNTRACK + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + This options adds a `CT' target, which allows to specify initial + connection tracking parameters like events to be delivered and + the helper to be used. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 49f62ee4e9ff..f873644f02f6 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8144b0da5515..a74a5769877b 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -83,6 +83,25 @@ __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); +struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + + h = 
__nf_conntrack_helper_find(name, l3num, protonum); +#ifdef CONFIG_MODULES + if (h == NULL) { + if (request_module("nfct-helper-%s", name) == 0) + h = __nf_conntrack_helper_find(name, l3num, protonum); + } +#endif + if (h != NULL && !try_module_get(h->me)) + h = NULL; + + return h; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); + struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c new file mode 100644 index 000000000000..8183a054256f --- /dev/null +++ b/net/netfilter/xt_CT.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2010 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int xt_ct_target(struct sk_buff *skb, + const struct xt_target_param *par) +{ + const struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + + /* Previously seen (loopback)? Ignore. 
*/ + if (skb->nfct != NULL) + return XT_CONTINUE; + + atomic_inc(&ct->ct_general.use); + skb->nfct = &ct->ct_general; + skb->nfctinfo = IP_CT_NEW; + + return XT_CONTINUE; +} + +static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) +{ + if (par->family == AF_INET) { + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.invflags & IPT_INV_PROTO) + return 0; + return e->ip.proto; + } else if (par->family == AF_INET6) { + const struct ip6t_entry *e = par->entryinfo; + + if (e->ipv6.invflags & IP6T_INV_PROTO) + return 0; + return e->ipv6.proto; + } else + return 0; +} + +static bool xt_ct_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conntrack_tuple t; + struct nf_conn_help *help; + struct nf_conn *ct; + u8 proto; + + if (info->flags & ~XT_CT_NOTRACK) + return false; + + if (info->flags & XT_CT_NOTRACK) { + ct = &nf_conntrack_untracked; + atomic_inc(&ct->ct_general.use); + goto out; + } + + if (nf_ct_l3proto_try_module_get(par->family) < 0) + goto err1; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(par->net, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) + goto err2; + + if ((info->ct_events || info->exp_events) && + !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, + GFP_KERNEL)) + goto err3; + + if (info->helper[0]) { + proto = xt_ct_find_proto(par); + if (!proto) + goto err3; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (help == NULL) + goto err3; + + help->helper = nf_conntrack_helper_try_module_get(info->helper, + par->family, + proto); + if (help->helper == NULL) + goto err3; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); +out: + info->ct = ct; + return true; + +err3: + nf_conntrack_free(ct); +err2: + nf_ct_l3proto_module_put(par->family); +err1: + return false; +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + 
struct nf_conn_help *help; + + if (ct != &nf_conntrack_untracked) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + } + nf_ct_put(info->ct); +} + +static struct xt_target xt_ct_tg __read_mostly = { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = XT_ALIGN(sizeof(struct xt_ct_target_info)), + .checkentry = xt_ct_tg_check, + .destroy = xt_ct_tg_destroy, + .target = xt_ct_target, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init xt_ct_tg_init(void) +{ + return xt_register_target(&xt_ct_tg); +} + +static void __exit xt_ct_tg_exit(void) +{ + xt_unregister_target(&xt_ct_tg); +} + +module_init(xt_ct_tg_init); +module_exit(xt_ct_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: connection tracking target"); +MODULE_ALIAS("ipt_CT"); +MODULE_ALIAS("ip6t_CT"); -- cgit v1.2.3 From cd757645fbdc34a8343c04bb0e74e06fccc2cb10 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Sat, 30 Jan 2010 10:25:18 +0530 Subject: perf: Make bp_len type to u64 generic across the arch Change 'bp_len' type to __u64 to make it work across archs as the s390 architecture watch point length can be up to 2^64. reference: http://lkml.org/lkml/2010/1/25/212 This is an ABI change that is not backward compatible with the previous hardware breakpoint info layout integrated in this development cycle; a rebuild of perf tools is necessary for versions based on 2.6.33-rc1 - 2.6.33-rc6 to work with a kernel based on this patch. Signed-off-by: Mahesh Salgaonkar Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: "K. 
Prasad" Cc: Maneesh Soni Cc: Heiko Carstens Cc: Martin LKML-Reference: <20100130045518.GA20776@in.ibm.com> Signed-off-by: Frederic Weisbecker --- include/linux/hw_breakpoint.h | 2 +- include/linux/perf_event.h | 6 ++---- kernel/hw_breakpoint.c | 2 +- kernel/perf_event.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 070ba0621738..5977b724f7c6 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -44,7 +44,7 @@ static inline int hw_breakpoint_type(struct perf_event *bp) return bp->attr.bp_type; } -static inline int hw_breakpoint_len(struct perf_event *bp) +static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8fa71874113f..a177698d95e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -211,11 +211,9 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - __u32 __reserved_2; - - __u64 bp_addr; __u32 bp_type; - __u32 bp_len; + __u64 bp_addr; + __u64 bp_len; }; /* diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8a5c7d55ac9f..967e66143e11 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -360,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746bd3a06..2b19297742cb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4580,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || 
attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit v1.2.3 From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:36:43 -0800 Subject: syslog: distinguish between /proc/kmsg and syscalls This allows the LSM to distinguish between syslog functions originating from /proc/kmsg access and direct syscalls. By default, the commoncaps will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg file descriptor. For example the kernel syslog reader can now drop privileges after opening /proc/kmsg, instead of staying privileged with CAP_SYS_ADMIN. MAC systems that implement security_syslog have unchanged behavior. Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: John Johansen Signed-off-by: James Morris --- fs/proc/kmsg.c | 14 +++++++------- include/linux/security.h | 11 ++++++----- include/linux/syslog.h | 29 +++++++++++++++++++++++++++++ kernel/printk.c | 7 ++++--- security/commoncap.c | 7 ++++++- security/security.c | 4 ++-- security/selinux/hooks.c | 5 +++-- security/smack/smack_lsm.c | 4 ++-- 8 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 include/linux/syslog.h (limited to 'include/linux') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 7ca78346d3f0..6a3d843a1088 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -12,37 +12,37 @@ #include #include #include +#include #include #include extern wait_queue_head_t log_wait; -extern int do_syslog(int type, char __user *bug, int count); - static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1,NULL,0); + return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0,NULL,0); + (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); return 0; } static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - if ((file->f_flags & O_NONBLOCK) 
&& !do_syslog(9, NULL, 0)) + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count); + return do_syslog(2, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0)) + if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/security.h b/include/linux/security.h index 26eca85b2417..a4dc74d86ac6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -76,7 +76,7 @@ extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); -extern int cap_syslog(int type); +extern int cap_syslog(int type, bool from_file); extern int cap_vm_enough_memory(struct mm_struct *mm, long pages); struct msghdr; @@ -1349,6 +1349,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * logging to the console. * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). * Return 0 if permission is granted. * @settime: * Check permission to change the system time. 
@@ -1463,7 +1464,7 @@ struct security_operations { int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); + int (*syslog) (int type, bool from_file); int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); @@ -1762,7 +1763,7 @@ int security_acct(struct file *file); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); -int security_syslog(int type); +int security_syslog(int type, bool from_file); int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); @@ -2008,9 +2009,9 @@ static inline int security_quota_on(struct dentry *dentry) return 0; } -static inline int security_syslog(int type) +static inline int security_syslog(int type, bool from_file) { - return cap_syslog(type); + return cap_syslog(type, from_file); } static inline int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/include/linux/syslog.h b/include/linux/syslog.h new file mode 100644 index 000000000000..5f02b1817be1 --- /dev/null +++ b/include/linux/syslog.h @@ -0,0 +1,29 @@ +/* Syslog internals + * + * Copyright 2010 Canonical, Ltd. + * Author: Kees Cook + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _LINUX_SYSLOG_H +#define _LINUX_SYSLOG_H + +#define SYSLOG_FROM_CALL 0 +#define SYSLOG_FROM_FILE 1 + +int do_syslog(int type, char __user *buf, int count, bool from_file); + +#endif /* _LINUX_SYSLOG_H */ diff --git a/kernel/printk.c b/kernel/printk.c index 17463ca2e229..809cf9a258a0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -273,14 +274,14 @@ static inline void boot_delay_msec(void) * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user *buf, int len) +int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; - error = security_syslog(type); + error = security_syslog(type, from_file); if (error) return error; @@ -417,7 +418,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len); + return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } /* diff --git a/security/commoncap.c b/security/commoncap.c index f800fdb3de94..677fad9d5cba 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in @@ -888,12 +889,16 @@ error: /** * cap_syslog - Determine whether syslog function is permitted * @type: Function requested + * @from_file: Whether this request came from an open file (i.e. /proc) * * Determine whether the current process is permitted to use a particular * syslog function, returning 0 if permission is granted, -ve if not. 
*/ -int cap_syslog(int type) +int cap_syslog(int type, bool from_file) { + /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ + if (type != 1 && from_file) + return 0; if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; diff --git a/security/security.c b/security/security.c index 440afe5eb54c..971092c06f31 100644 --- a/security/security.c +++ b/security/security.c @@ -203,9 +203,9 @@ int security_quota_on(struct dentry *dentry) return security_ops->quota_on(dentry); } -int security_syslog(int type) +int security_syslog(int type, bool from_file) { - return security_ops->syslog(type); + return security_ops->syslog(type, from_file); } int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a2ee845e9d4..a4862a0730fa 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -76,6 +76,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -2049,11 +2050,11 @@ static int selinux_quota_on(struct dentry *dentry) return dentry_has_perm(cred, NULL, dentry, FILE__QUOTAON); } -static int selinux_syslog(int type) +static int selinux_syslog(int type, bool from_file) { int rc; - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc) return rc; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 529c9ca65878..a5721b373f53 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -157,12 +157,12 @@ static int smack_ptrace_traceme(struct task_struct *ptp) * * Returns 0 on success, error code otherwise. 
*/ -static int smack_syslog(int type) +static int smack_syslog(int type, bool from_file) { int rc; char *sp = current_security(); - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc != 0) return rc; -- cgit v1.2.3 From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:37:13 -0800 Subject: syslog: use defined constants instead of raw numbers Right now the syslog "type" action are just raw numbers which makes the source difficult to follow. This patch replaces the raw numbers with defined constants for some level of sanity. Signed-off-by: Kees Cook Acked-by: John Johansen Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/kmsg.c | 10 +++++----- include/linux/syslog.h | 23 +++++++++++++++++++++++ kernel/printk.c | 45 +++++++++++++++++++-------------------------- security/commoncap.c | 5 +++-- security/selinux/hooks.c | 21 +++++++++++---------- 5 files changed, 61 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 6a3d843a1088..cfe90a48a6e8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); } static unsigned 
int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 5f02b1817be1..38911391a139 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -21,6 +21,29 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +/* Close the log. Currently a NOP. */ +#define SYSLOG_ACTION_CLOSE 0 +/* Open the log. Currently a NOP. */ +#define SYSLOG_ACTION_OPEN 1 +/* Read from the log. */ +#define SYSLOG_ACTION_READ 2 +/* Read all messages remaining in the ring buffer. */ +#define SYSLOG_ACTION_READ_ALL 3 +/* Read and clear all messages remaining in the ring buffer */ +#define SYSLOG_ACTION_READ_CLEAR 4 +/* Clear ring buffer. */ +#define SYSLOG_ACTION_CLEAR 5 +/* Disable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_OFF 6 +/* Enable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_ON 7 +/* Set level of messages printed to console */ +#define SYSLOG_ACTION_CONSOLE_LEVEL 8 +/* Return number of unread characters in the log buffer */ +#define SYSLOG_ACTION_SIZE_UNREAD 9 +/* Return size of the log buffer */ +#define SYSLOG_ACTION_SIZE_BUFFER 10 + #define SYSLOG_FROM_CALL 0 #define SYSLOG_FROM_FILE 1 diff --git a/kernel/printk.c b/kernel/printk.c index 809cf9a258a0..3e162d867098 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -259,21 +259,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. 
- * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 
|| len > 8) goto out; @@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: diff --git a/security/commoncap.c b/security/commoncap.c index 677fad9d5cba..cf01b2eebb60 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -897,9 +897,10 @@ error: int cap_syslog(int type, bool from_file) { /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ - if (type != 1 && from_file) + if (type != SYSLOG_ACTION_OPEN && from_file) return 0; - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a4862a0730fa..6b36ce2eef2e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2059,20 +2059,21 @@ static int selinux_syslog(int type, bool from_file) return rc; switch (type) { - case 3: /* Read last kernel messages */ - case 10: /* Return size of the log buffer */ + case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ + case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ rc = task_has_system(current, SYSTEM__SYSLOG_READ); break; - case 6: /* Disable logging to console */ - case 7: /* Enable logging to console */ - case 8: /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: rc = task_has_system(current, SYSTEM__SYSLOG_CONSOLE); break; - case 0: 
/* Close log */ - case 1: /* Open log */ - case 2: /* Read from log */ - case 4: /* Read/clear last kernel messages */ - case 5: /* Clear ring buffer */ + case SYSLOG_ACTION_CLOSE: /* Close log */ + case SYSLOG_ACTION_OPEN: /* Open log */ + case SYSLOG_ACTION_READ: /* Read from log */ + case SYSLOG_ACTION_READ_CLEAR: /* Read/clear last kernel messages */ + case SYSLOG_ACTION_CLEAR: /* Clear ring buffer */ default: rc = task_has_system(current, SYSTEM__SYSLOG_MOD); break; -- cgit v1.2.3 From 8a83a00b0735190384a348156837918271034144 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:23:03 +0000 Subject: net: maintain namespace isolation between vlan and real device In the vlan and macvlan drivers, the start_xmit function forwards data to the dev_queue_xmit function for another device, which may potentially belong to a different namespace. To make sure that classification stays within a single namespace, this resets the potentially critical fields. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 2 +- include/linux/netdevice.h | 9 +++++++++ net/8021q/vlan_dev.c | 2 +- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index fa0dc514dbaf..d32e0bdfc5e9 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) } xmit_world: - skb->dev = vlan->lowerdev; + skb_set_dev(skb, vlan->lowerdev); return dev_queue_xmit(skb); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93a32a5ca74f..622ba5aa93c4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev) return 0; } +#ifndef CONFIG_NET_NS +static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb->dev = dev; +} +#else /* CONFIG_NET_NS */ +void skb_set_dev(struct sk_buff *skb, struct net_device *dev); +#endif + static inline bool netdev_uses_trailer_tags(struct net_device *dev) { #ifdef CONFIG_NET_DSA_TAG_TRAILER diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a9e1f1785614..9e83272fc5b0 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, } - skb->dev = vlan_dev_info(dev)->real_dev; + skb_set_dev(skb, vlan_dev_info(dev)->real_dev); len = skb->len; ret = dev_queue_xmit(skb); diff --git a/net/core/dev.c b/net/core/dev.c index 2cba5c521e56..94c1eeed25e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) if (skb->len > (dev->mtu + dev->hard_header_len)) return NET_RX_DROP; - skb_dst_drop(skb); + skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); 
- skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) return false; } +/** + * skb_set_dev -- assign a new device to a buffer + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif + } + skb->dev = dev; +} +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. -- cgit v1.2.3 From fc0663d6b5e6d8e9b57f872a644c0aafd82361b7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:23:40 +0000 Subject: macvlan: allow multiple driver backends This makes it possible to hook into the macvlan driver from another kernel module. In particular, the goal is to extend it with the macvtap backend that provides a tun/tap compatible interface directly on the macvlan device. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 113 ++++++++++++++++++++------------------------- include/linux/if_macvlan.h | 70 ++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d32e0bdfc5e9..40faa368b07a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,31 +39,6 @@ struct macvlan_port { struct list_head vlans; }; -/** - * struct macvlan_rx_stats - MACVLAN percpu rx stats - * @rx_packets: number of received packets - * @rx_bytes: number of received bytes - * @multicast: number of received multicast packets - * @rx_errors: number of errors - */ -struct macvlan_rx_stats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long multicast; - unsigned long rx_errors; -}; - -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; - struct macvlan_rx_stats *rx_stats; - enum macvlan_mode mode; -}; - - static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { @@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port, return 0; } -static inline void macvlan_count_rx(const struct macvlan_dev *vlan, - unsigned int len, bool success, - bool multicast) -{ - struct macvlan_rx_stats *rx_stats; - - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); - if (likely(success)) { - rx_stats->rx_packets++;; - rx_stats->rx_bytes += len; - if (multicast) - rx_stats->multicast++; - } else { - rx_stats->rx_errors++; - } -} -static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, +static int macvlan_broadcast_one(struct sk_buff *skb, + const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { + struct net_device *dev = vlan->dev; if (!skb) return NET_RX_DROP; if (local) - return dev_forward_skb(dev, skb); + return 
vlan->forward(dev, skb); skb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, @@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, else skb->pkt_type = PACKET_MULTICAST; - return netif_rx(skb); + return vlan->receive(skb); } static void macvlan_broadcast(struct sk_buff *skb, @@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb, continue; nskb = skb_clone(skb, GFP_ATOMIC); - err = macvlan_broadcast_one(nskb, vlan->dev, eth, + err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, 1); @@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) skb->dev = dev; skb->pkt_type = PACKET_HOST; - netif_rx(skb); + vlan->receive(skb); return NULL; } @@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { unsigned int length = skb->len + ETH_HLEN; - int ret = dev_forward_skb(dest->dev, skb); + int ret = dest->forward(dest->dev, skb); macvlan_count_rx(dest, length, ret == NET_RX_SUCCESS, 0); @@ -273,8 +234,8 @@ xmit_world: return dev_queue_xmit(skb); } -static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, - struct net_device *dev) +netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev) { int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, return ret; } +EXPORT_SYMBOL_GPL(macvlan_start_xmit); static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, @@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net, return 0; } -static int macvlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +int 
macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; + vlan->receive = receive; + vlan->forward = forward; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) @@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, netif_stacked_transfer_operstate(lowerdev, dev); return 0; } +EXPORT_SYMBOL_GPL(macvlan_common_newlink); -static void macvlan_dellink(struct net_device *dev, struct list_head *head) +static int macvlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + return macvlan_common_newlink(src_net, dev, tb, data, + netif_rx, + dev_forward_skb); +} + +void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; @@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head) if (list_empty(&port->vlans)) macvlan_port_destroy(port->dev); } +EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) @@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, }; -static struct rtnl_link_ops macvlan_link_ops __read_mostly = { +int macvlan_link_register(struct rtnl_link_ops *ops) +{ + /* common fields */ + ops->priv_size = sizeof(struct macvlan_dev); + ops->get_tx_queues = macvlan_get_tx_queues; + ops->setup = macvlan_setup; + ops->validate = macvlan_validate; + ops->maxtype = IFLA_MACVLAN_MAX; + 
ops->policy = macvlan_policy; + ops->changelink = macvlan_changelink; + ops->get_size = macvlan_get_size; + ops->fill_info = macvlan_fill_info; + + return rtnl_link_register(ops); +}; +EXPORT_SYMBOL_GPL(macvlan_link_register); + +static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", - .priv_size = sizeof(struct macvlan_dev), - .get_tx_queues = macvlan_get_tx_queues, - .setup = macvlan_setup, - .validate = macvlan_validate, .newlink = macvlan_newlink, .dellink = macvlan_dellink, - .maxtype = IFLA_MACVLAN_MAX, - .policy = macvlan_policy, - .changelink = macvlan_changelink, - .get_size = macvlan_get_size, - .fill_info = macvlan_fill_info, }; static int macvlan_device_event(struct notifier_block *unused, @@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_UNREGISTER: list_for_each_entry_safe(vlan, next, &port->vlans, list) - macvlan_dellink(vlan->dev, NULL); + vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL); break; } return NOTIFY_DONE; @@ -778,7 +763,7 @@ static int __init macvlan_init_module(void) register_netdevice_notifier(&macvlan_notifier_block); macvlan_handle_frame_hook = macvlan_handle_frame; - err = rtnl_link_register(&macvlan_link_ops); + err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 5f200bac3749..9a11544bb0b1 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -1,6 +1,76 @@ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H +#include +#include +#include +#include +#include + +struct macvlan_port; +struct macvtap_queue; + +/** + * struct macvlan_rx_stats - MACVLAN percpu rx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @multicast: number of received multicast packets + * @rx_errors: number of errors + */ +struct macvlan_rx_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long 
multicast; + unsigned long rx_errors; +}; + +struct macvlan_dev { + struct net_device *dev; + struct list_head list; + struct hlist_node hlist; + struct macvlan_port *port; + struct net_device *lowerdev; + struct macvlan_rx_stats *rx_stats; + enum macvlan_mode mode; + int (*receive)(struct sk_buff *skb); + int (*forward)(struct net_device *dev, struct sk_buff *skb); +}; + +static inline void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast) +{ + struct macvlan_rx_stats *rx_stats; + + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + if (likely(success)) { + rx_stats->rx_packets++;; + rx_stats->rx_bytes += len; + if (multicast) + rx_stats->multicast++; + } else { + rx_stats->rx_errors++; + } +} + +extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)); + +extern void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast); + +extern void macvlan_dellink(struct net_device *dev, struct list_head *head); + +extern int macvlan_link_register(struct rtnl_link_ops *ops); + +extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev); + + extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *); #endif /* _LINUX_IF_MACVLAN_H */ -- cgit v1.2.3 From 20d29d7a916a47bf533b5709437fe735b6b5b79e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 30 Jan 2010 12:24:26 +0000 Subject: net: macvtap driver In order to use macvlan with qemu and other tools that require a tap file descriptor, the macvtap driver adds a small backend with a character device with the same interface as the tun driver, with a minimum set of features. 
Macvtap interfaces are created in the same way as macvlan interfaces using ip link, but the netif is just used as a handle for configuration and accounting, while the data goes through the chardev. Each macvtap interface has its own character device, simplifying permission management significantly over the generic tun/tap driver. Cc: Patrick McHardy Cc: Stephen Hemminger Cc: "David S. Miller" Cc: "Michael S. Tsirkin" Cc: Herbert Xu Cc: Or Gerlitz Cc: netdev@vger.kernel.org Cc: bridge@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/Kconfig | 12 + drivers/net/Makefile | 1 + drivers/net/macvtap.c | 581 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_macvlan.h | 1 + 4 files changed, 595 insertions(+) create mode 100644 drivers/net/macvtap.c (limited to 'include/linux') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index cb0e534418e3..411e20703110 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -90,6 +90,18 @@ config MACVLAN To compile this driver as a module, choose M here: the module will be called macvlan. +config MACVTAP + tristate "MAC-VLAN based tap driver (EXPERIMENTAL)" + depends on MACVLAN + help + This adds a specialized tap character device driver that is based + on the MAC-VLAN network interface, called macvtap. A macvtap device + can be added in the same way as a macvlan device, using 'type + macvtap', and then be accessed through the tap user space interface. + + To compile this driver as a module, choose M here: the module + will be called macvtap. 
+ config EQUALIZER tristate "EQL (serial line load balancing) support" ---help--- diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 0b763cbe9b1f..95958032cd31 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACVLAN) += macvlan.o +obj-$(CONFIG_MACVTAP) += macvtap.o obj-$(CONFIG_DE600) += de600.o obj-$(CONFIG_DE620) += de620.o obj-$(CONFIG_LANCE) += lance.o diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 000000000000..ad1f6ef89308 --- /dev/null +++ b/drivers/net/macvtap.c @@ -0,0 +1,581 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * A macvtap queue is the central object of this driver, it connects + * an open character device to a macvlan interface. There can be + * multiple queues on one interface, which map back to queues + * implemented in hardware on the underlying device. + * + * macvtap_proto is used to allocate queues through the sock allocation + * mechanism. + * + * TODO: multiqueue support is currently not implemented, even though + * macvtap is basically prepared for that. We will need to add this + * here as well as in virtio-net and qemu to get line rate on 10gbit + * adapters from a guest. + */ +struct macvtap_queue { + struct sock sk; + struct socket sock; + struct macvlan_dev *vlan; + struct file *file; +}; + +static struct proto macvtap_proto = { + .name = "macvtap", + .owner = THIS_MODULE, + .obj_size = sizeof (struct macvtap_queue), +}; + +/* + * Minor number matches netdev->ifindex, so need a potentially + * large value. This also makes it possible to split the + * tap functionality out again in the future by offering it + * from other drivers besides macvtap. 
As long as every device + * only has one tap, the interface numbers assure that the + * device nodes are unique. + */ +static unsigned int macvtap_major; +#define MACVTAP_NUM_DEVS 65536 +static struct class *macvtap_class; +static struct cdev macvtap_cdev; + +/* + * RCU usage: + * The macvtap_queue is referenced both from the chardev struct file + * and from the struct macvlan_dev using rcu_read_lock. + * + * We never actually update the contents of a macvtap_queue atomically + * with RCU but it is used for race-free destruction of a queue when + * either the file or the macvlan_dev goes away. Pointers back to + * the dev and the file are implicitly valid as long as the queue + * exists. + * + * The callbacks from macvlan are always done with rcu_read_lock held + * already, while in the file_operations, we get it ourselves. + * + * When destroying a queue, we remove the pointers from the file and + * from the dev and then synchronize_rcu to make sure no thread is + * still using the queue. There may still be references to the struct + * sock inside of the queue from outbound SKBs, but these never + * reference back to the file or the dev. The data structure is freed + * through __sk_free when both our references and any pending SKBs + * are gone. + * + * macvtap_lock is only used to prevent multiple concurrent open() + * calls to assign a new vlan->tap pointer. It could be moved into + * the macvlan_dev itself but is extremely rarely used. 
+ */ +static DEFINE_SPINLOCK(macvtap_lock); + +/* + * Choose the next free queue, for now there is only one + */ +static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + int err = -EBUSY; + + spin_lock(&macvtap_lock); + if (rcu_dereference(vlan->tap)) + goto out; + + err = 0; + q->vlan = vlan; + rcu_assign_pointer(vlan->tap, q); + + q->file = file; + rcu_assign_pointer(file->private_data, q); + +out: + spin_unlock(&macvtap_lock); + return err; +} + +/* + * We must destroy each queue exactly once, when either + * the netdev or the file go away. + * + * Using the spinlock makes sure that we don't get + * to the queue again after destroying it. + * + * synchronize_rcu serializes with the packet flow + * that uses rcu_read_lock. + */ +static void macvtap_del_queue(struct macvtap_queue **qp) +{ + struct macvtap_queue *q; + + spin_lock(&macvtap_lock); + q = rcu_dereference(*qp); + if (!q) { + spin_unlock(&macvtap_lock); + return; + } + + rcu_assign_pointer(q->vlan->tap, NULL); + rcu_assign_pointer(q->file->private_data, NULL); + spin_unlock(&macvtap_lock); + + synchronize_rcu(); + sock_put(&q->sk); +} + +/* + * Since we only support one queue, just dereference the pointer. + */ +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return rcu_dereference(vlan->tap); +} + +static void macvtap_del_queues(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + macvtap_del_queue(&vlan->tap); +} + +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) +{ + rcu_read_lock_bh(); + return rcu_dereference(file->private_data); +} + +static inline void macvtap_file_put_queue(void) +{ + rcu_read_unlock_bh(); +} + +/* + * Forward happens for data that gets sent from one macvlan + * endpoint to another one in bridge mode. 
We just take + * the skb and put it into the receive queue. + */ +static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) +{ + struct macvtap_queue *q = macvtap_get_queue(dev, skb); + if (!q) + return -ENOLINK; + + skb_queue_tail(&q->sk.sk_receive_queue, skb); + wake_up(q->sk.sk_sleep); + return 0; +} + +/* + * Receive is for data from the external interface (lowerdev), + * in case of macvtap, we can treat that the same way as + * forward, which macvlan cannot. + */ +static int macvtap_receive(struct sk_buff *skb) +{ + skb_push(skb, ETH_HLEN); + return macvtap_forward(skb->dev, skb); +} + +static int macvtap_newlink(struct net *src_net, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + struct device *classdev; + dev_t devt; + int err; + + err = macvlan_common_newlink(src_net, dev, tb, data, + macvtap_receive, macvtap_forward); + if (err) + goto out; + + devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); + + classdev = device_create(macvtap_class, &dev->dev, devt, + dev, "tap%d", dev->ifindex); + if (IS_ERR(classdev)) { + err = PTR_ERR(classdev); + macvtap_del_queues(dev); + } + +out: + return err; +} + +static void macvtap_dellink(struct net_device *dev, + struct list_head *head) +{ + device_destroy(macvtap_class, + MKDEV(MAJOR(macvtap_major), dev->ifindex)); + + macvtap_del_queues(dev); + macvlan_dellink(dev, head); +} + +static struct rtnl_link_ops macvtap_link_ops __read_mostly = { + .kind = "macvtap", + .newlink = macvtap_newlink, + .dellink = macvtap_dellink, +}; + + +static void macvtap_sock_write_space(struct sock *sk) +{ + if (!sock_writeable(sk) || + !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + return; + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_sync(sk->sk_sleep); +} + +static int macvtap_open(struct inode *inode, struct file *file) +{ + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = dev_get_by_index(net, iminor(inode)); + struct 
macvtap_queue *q; + int err; + + err = -ENODEV; + if (!dev) + goto out; + + /* check if this is a macvtap device */ + err = -EINVAL; + if (dev->rtnl_link_ops != &macvtap_link_ops) + goto out; + + err = -ENOMEM; + q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &macvtap_proto); + if (!q) + goto out; + + init_waitqueue_head(&q->sock.wait); + q->sock.type = SOCK_RAW; + q->sock.state = SS_CONNECTED; + sock_init_data(&q->sock, &q->sk); + q->sk.sk_allocation = GFP_ATOMIC; /* for now */ + q->sk.sk_write_space = macvtap_sock_write_space; + + err = macvtap_set_queue(dev, file, q); + if (err) + sock_put(&q->sk); + +out: + if (dev) + dev_put(dev); + + return err; +} + +static int macvtap_release(struct inode *inode, struct file *file) +{ + macvtap_del_queue((struct macvtap_queue **)&file->private_data); + return 0; +} + +static unsigned int macvtap_poll(struct file *file, poll_table * wait) +{ + struct macvtap_queue *q = macvtap_file_get_queue(file); + unsigned int mask = POLLERR; + + if (!q) + goto out; + + mask = 0; + poll_wait(file, &q->sock.wait, wait); + + if (!skb_queue_empty(&q->sk.sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + if (sock_writeable(&q->sk) || + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + sock_writeable(&q->sk))) + mask |= POLLOUT | POLLWRNORM; + +out: + macvtap_file_put_queue(); + return mask; +} + +/* Get packet from user space buffer */ +static ssize_t macvtap_get_user(struct macvtap_queue *q, + const struct iovec *iv, size_t count, + int noblock) +{ + struct sk_buff *skb; + size_t len = count; + int err; + + if (unlikely(len < ETH_HLEN)) + return -EINVAL; + + skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); + + if (!skb) { + macvlan_count_rx(q->vlan, 0, false, false); + return err; + } + + skb_reserve(skb, NET_IP_ALIGN); + skb_put(skb, count); + + if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { + macvlan_count_rx(q->vlan, 0, false, false); + kfree_skb(skb); + return -EFAULT; + } + 
+ skb_set_network_header(skb, ETH_HLEN); + + macvlan_start_xmit(skb, q->vlan->dev); + + return count; +} + +static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t result = -ENOLINK; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + if (!q) + goto out; + + result = macvtap_get_user(q, iv, iov_length(iv, count), + file->f_flags & O_NONBLOCK); +out: + macvtap_file_put_queue(); + return result; +} + +/* Put packet to the user space buffer */ +static ssize_t macvtap_put_user(struct macvtap_queue *q, + const struct sk_buff *skb, + const struct iovec *iv, int len) +{ + struct macvlan_dev *vlan = q->vlan; + int ret; + + len = min_t(int, skb->len, len); + + ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); + + macvlan_count_rx(vlan, len, ret == 0, 0); + + return ret ? ret : len; +} + +static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb; + ssize_t len, ret = 0; + + if (!q) { + ret = -ENOLINK; + goto out; + } + + len = iov_length(iv, count); + if (len < 0) { + ret = -EINVAL; + goto out; + } + + add_wait_queue(q->sk.sk_sleep, &wait); + while (len) { + current->state = TASK_INTERRUPTIBLE; + + /* Read frames from the queue */ + skb = skb_dequeue(&q->sk.sk_receive_queue); + if (!skb) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + /* Nothing to read, let's sleep */ + schedule(); + continue; + } + ret = macvtap_put_user(q, skb, iv, len); + kfree_skb(skb); + break; + } + + current->state = TASK_RUNNING; + remove_wait_queue(q->sk.sk_sleep, &wait); + +out: + macvtap_file_put_queue(); + return ret; +} + +/* + * provide compatibility with generic tun/tap 
interface + */ +static long macvtap_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct macvtap_queue *q; + void __user *argp = (void __user *)arg; + struct ifreq __user *ifr = argp; + unsigned int __user *up = argp; + unsigned int u; + char devname[IFNAMSIZ]; + + switch (cmd) { + case TUNSETIFF: + /* ignore the name, just look at flags */ + if (get_user(u, &ifr->ifr_flags)) + return -EFAULT; + if (u != (IFF_TAP | IFF_NO_PI)) + return -EINVAL; + return 0; + + case TUNGETIFF: + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + memcpy(devname, q->vlan->dev->name, sizeof(devname)); + macvtap_file_put_queue(); + + if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || + put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) + return -EFAULT; + return 0; + + case TUNGETFEATURES: + if (put_user((IFF_TAP | IFF_NO_PI), up)) + return -EFAULT; + return 0; + + case TUNSETSNDBUF: + if (get_user(u, up)) + return -EFAULT; + + q = macvtap_file_get_queue(file); + q->sk.sk_sndbuf = u; + macvtap_file_put_queue(); + return 0; + + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + /* TODO: add support for these, so far we don't + support any offload */ + if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + return 0; + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations macvtap_fops = { + .owner = THIS_MODULE, + .open = macvtap_open, + .release = macvtap_release, + .aio_read = macvtap_aio_read, + .aio_write = macvtap_aio_write, + .poll = macvtap_poll, + .llseek = no_llseek, + .unlocked_ioctl = macvtap_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = 
macvtap_compat_ioctl, +#endif +}; + +static int macvtap_init(void) +{ + int err; + + err = alloc_chrdev_region(&macvtap_major, 0, + MACVTAP_NUM_DEVS, "macvtap"); + if (err) + goto out1; + + cdev_init(&macvtap_cdev, &macvtap_fops); + err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); + if (err) + goto out2; + + macvtap_class = class_create(THIS_MODULE, "macvtap"); + if (IS_ERR(macvtap_class)) { + err = PTR_ERR(macvtap_class); + goto out3; + } + + err = macvlan_link_register(&macvtap_link_ops); + if (err) + goto out4; + + return 0; + +out4: + class_unregister(macvtap_class); +out3: + cdev_del(&macvtap_cdev); +out2: + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +out1: + return err; +} +module_init(macvtap_init); + +static void macvtap_exit(void) +{ + rtnl_link_unregister(&macvtap_link_ops); + class_unregister(macvtap_class); + cdev_del(&macvtap_cdev); + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +} +module_exit(macvtap_exit); + +MODULE_ALIAS_RTNL_LINK("macvtap"); +MODULE_AUTHOR("Arnd Bergmann "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9a11544bb0b1..51f1512045e9 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -34,6 +34,7 @@ struct macvlan_dev { enum macvlan_mode mode; int (*receive)(struct sk_buff *skb); int (*forward)(struct net_device *dev, struct sk_buff *skb); + struct macvtap_queue *tap; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- cgit v1.2.3 From 1621e0940294c20e302faf401f41204de7252e22 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 1 Feb 2010 09:44:19 +0000 Subject: net: CONFIG_COMPAT redux Ifdef out struct proto_ops::compat_ioctl struct proto_ops::compat_setsockopt struct proto_ops::compat_getsockopt to make structures smaller on COMPAT=n kernels. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- include/linux/net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 5e8083cacc8b..4157b5d42bd6 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -174,18 +174,22 @@ struct proto_ops { struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); +#endif int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); +#ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*compat_getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); +#endif int (*sendmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len); int (*recvmsg) (struct kiocb *iocb, struct socket *sock, -- cgit v1.2.3 From f7acede65d6b65919aee5b6a360a17cedb11f2f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2010 13:30:11 +0100 Subject: libata: fix ata_id_logical_per_physical_sectors The value we get from the low byte of the ATA_ID_SECTOR_SIZE word is not a plain multiple, but the log of it, so fix the helper to give the correct answer. Without this we'll get an incorrect minimal I/O size in the block limits VPD page for 4k sector drives. Also change the return value of ata_id_logical_per_physical_sectors to u16 for the unlikely case of very large logical sectors. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jeff Garzik --- include/linux/ata.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 38a6948ce0c2..20f31567ccee 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -647,9 +647,9 @@ static inline int ata_id_has_large_logical_sectors(const u16 *id) return id[ATA_ID_SECTOR_SIZE] & (1 << 13); } -static inline u8 ata_id_logical_per_physical_sectors(const u16 *id) +static inline u16 ata_id_logical_per_physical_sectors(const u16 *id) { - return id[ATA_ID_SECTOR_SIZE] & 0xf; + return 1 << (id[ATA_ID_SECTOR_SIZE] & 0xf); } static inline int ata_id_has_lba48(const u16 *id) -- cgit v1.2.3 From 0b7024ac4df5821347141c18e680b7166bc1cb20 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 2 Feb 2010 21:08:26 -0800 Subject: Input: add match() method to input handlers Get rid of blacklist in input handler structure and instead allow handlers to define their own match() method to perform fine-grained filtering of supported devices. 
Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 24 ++++++++++++++++-------- drivers/input/input.c | 13 ++++++------- drivers/input/joydev.c | 32 +++++++++++++++----------------- include/linux/input.h | 6 +++--- 4 files changed, 40 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index cbf64b985ef4..ada25bb8941e 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -1323,6 +1323,21 @@ static void kbd_event(struct input_handle *handle, unsigned int event_type, schedule_console_callback(); } +static bool kbd_match(struct input_handler *handler, struct input_dev *dev) +{ + int i; + + if (test_bit(EV_SND, dev->evbit)) + return true; + + if (test_bit(EV_KEY, dev->evbit)) + for (i = KEY_RESERVED; i < BTN_MISC; i++) + if (test_bit(i, dev->keybit)) + return true; + + return false; +} + /* * When a keyboard (or other input device) is found, the kbd_connect * function is called. The function then looks at the device, and if it @@ -1334,14 +1349,6 @@ static int kbd_connect(struct input_handler *handler, struct input_dev *dev, { struct input_handle *handle; int error; - int i; - - for (i = KEY_RESERVED; i < BTN_MISC; i++) - if (test_bit(i, dev->keybit)) - break; - - if (i == BTN_MISC && !test_bit(EV_SND, dev->evbit)) - return -ENODEV; handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); if (!handle) @@ -1407,6 +1414,7 @@ MODULE_DEVICE_TABLE(input, kbd_ids); static struct input_handler kbd_handler = { .event = kbd_event, + .match = kbd_match, .connect = kbd_connect, .disconnect = kbd_disconnect, .start = kbd_start, diff --git a/drivers/input/input.c b/drivers/input/input.c index 7080a9d4b840..dae49eba6ccd 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -723,12 +723,13 @@ EXPORT_SYMBOL(input_set_keycode); if (i != BITS_TO_LONGS(max)) \ continue; -static const struct input_device_id *input_match_device(const struct input_device_id *id, +static const struct 
input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { + const struct input_device_id *id; int i; - for (; id->flags || id->driver_info; id++) { + for (id = handler->id_table; id->flags || id->driver_info; id++) { if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) if (id->bustype != dev->id.bustype) @@ -756,7 +757,8 @@ static const struct input_device_id *input_match_device(const struct input_devic MATCH_BIT(ffbit, FF_MAX); MATCH_BIT(swbit, SW_MAX); - return id; + if (!handler->match || handler->match(handler, dev)) + return id; } return NULL; @@ -767,10 +769,7 @@ static int input_attach_handler(struct input_dev *dev, struct input_handler *han const struct input_device_id *id; int error; - if (handler->blacklist && input_match_device(handler->blacklist, dev)) - return -ENODEV; - - id = input_match_device(handler->id_table, dev); + id = input_match_device(handler, dev); if (!id) return -ENODEV; diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index b1bd6dd32286..63e71f2a7acc 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -775,6 +775,20 @@ static void joydev_cleanup(struct joydev *joydev) input_close_device(handle); } + +static bool joydev_match(struct input_handler *handler, struct input_dev *dev) +{ + /* Avoid touchpads and touchscreens */ + if (test_bit(EV_KEY, dev->evbit) && test_bit(BTN_TOUCH, dev->keybit)) + return false; + + /* Avoid tablets, digitisers and similar devices */ + if (test_bit(EV_KEY, dev->evbit) && test_bit(BTN_DIGI, dev->keybit)) + return false; + + return true; +} + static int joydev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { @@ -894,22 +908,6 @@ static void joydev_disconnect(struct input_handle *handle) put_device(&joydev->dev); } -static const struct input_device_id joydev_blacklist[] = { - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | - INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { 
[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, - }, /* Avoid itouchpads and touchscreens */ - { - .flags = INPUT_DEVICE_ID_MATCH_EVBIT | - INPUT_DEVICE_ID_MATCH_KEYBIT, - .evbit = { BIT_MASK(EV_KEY) }, - .keybit = { [BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_DIGI) }, - }, /* Avoid tablets, digitisers and similar devices */ - { } /* Terminating entry */ -}; - static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | @@ -936,13 +934,13 @@ MODULE_DEVICE_TABLE(input, joydev_ids); static struct input_handler joydev_handler = { .event = joydev_event, + .match = joydev_match, .connect = joydev_connect, .disconnect = joydev_disconnect, .fops = &joydev_fops, .minor = JOYDEV_MINOR_BASE, .name = "joydev", .id_table = joydev_ids, - .blacklist = joydev_blacklist, }; static int __init joydev_init(void) diff --git a/include/linux/input.h b/include/linux/input.h index 6c9d3d49fa91..8dc5d724c703 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1200,6 +1200,8 @@ struct input_handle; * it may not sleep * @filter: similar to @event; separates normal event handlers from * "filters". + * @match: called after comparing device's id with handler's id_table + * to perform fine-grained matching between device and handler * @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. 
This function is called by @@ -1211,8 +1213,6 @@ struct input_handle; * @name: name of the handler, to be shown in /proc/bus/input/handlers * @id_table: pointer to a table of input_device_ids this driver can * handle - * @blacklist: pointer to a table of input_device_ids this driver should - * ignore even if they match @id_table * @h_list: list of input handles associated with the handler * @node: for placing the driver onto input_handler_list * @@ -1235,6 +1235,7 @@ struct input_handler { void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); + bool (*match)(struct input_handler *handler, struct input_dev *dev); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); @@ -1244,7 +1245,6 @@ struct input_handler { const char *name; const struct input_device_id *id_table; - const struct input_device_id *blacklist; struct list_head h_list; struct list_head node; -- cgit v1.2.3 From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:11 -0500 Subject: ftrace/alternatives: Introducing *_text_reserved functions Introducing *_text_reserved functions for checking the text address range is partially reserved or not. This patch provides checking routines for x86 smp alternatives and dynamic ftrace. Since both functions modify fixed pieces of kernel text, they should reserve and protect those from other dynamic text modifier, like kprobes. This will also be extended when introducing other subsystems which modify fixed pieces of kernel text. Dynamic text modifiers should avoid those. 
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 5 +++++ arch/x86/kernel/alternative.c | 16 ++++++++++++++++ include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 15 +++++++++++++++ 4 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877f..ac80b7d70014 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..3c13284ff86d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -390,6 +390,22 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > end 
|| mod->text_end < start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (start <= *ptr && end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b4f97d24d7f..9d127efed43c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -134,6 +134,8 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); +extern int ftrace_text_reserved(void *start, void *end); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -250,6 +252,10 @@ static inline int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } +static inline int ftrace_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f80454..3d90661a5f40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1025,6 +1025,21 @@ static void ftrace_bug(int failed, unsigned long ip) } +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit v1.2.3 From f24bb999d2b9f2950e5cac5b69bffedf73c24ea4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:25 -0500 Subject: ftrace: Remove record freezing Remove record freezing. Because kprobes never puts probe on ftrace's mcount call anymore, it doesn't need ftrace to check whether kprobes on it. 
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker LKML-Reference: <20100202214925.4694.73469.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 39 --------------------------------------- 2 files changed, 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9d127efed43c..eb054ae95605 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -143,7 +143,6 @@ enum { FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), FTRACE_FL_CONVERTED = (1 << 5), - FTRACE_FL_FROZEN = (1 << 6), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3d90661a5f40..1904797f4a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_KPROBES - -static int frozen_record_count; - -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1091,14 +1060,6 @@ static void ftrace_replace_code(int enable) !(rec->flags & FTRACE_FL_CONVERTED)) continue; - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - 
freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; -- cgit v1.2.3 From 9e3af04f8787315f63f55b191bb9a06741dbf183 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 4 Feb 2010 00:48:00 -0800 Subject: Input: gpio-keys - add support for disabling gpios through sysfs Now gpio-keys input driver exports 4 new attributes to userland through sysfs: /sys/devices/platform/gpio-keys/keys [ro] /sys/devices/platform/gpio-keys/switches [ro] /sys/devices/platform/gpio-keys/disabled_keys [rw] /sys/devices/platform/gpio-keys/disabled_switches [rw] With these attributes, userland program can read which keys and switches can be disabled and then disable/enable them as needed. Keys and switches are exported as stringified bitmap of codes (keycodes or switch codes). For example keys 15, 89, 100, 101, 102 are exported as: '15,89,100-102'. Description of the attributes: keys - bitmap of keys which can be disabled switches - bitmap of switches which can be disabled disabled_keys - bitmap of currently disabled keys (bit 1 means disabled, 0 enabled) disabled_switches - bitmap of currently disabled switches (bit 1 means disabled, 0 enabled) Signed-off-by: Mika Westerberg Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 318 +++++++++++++++++++++++++++++++++++-- include/linux/gpio_keys.h | 1 + 2 files changed, 308 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 1aff3b76effd..2b708aa85553 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -30,13 +30,289 @@ struct gpio_button_data { struct input_dev *input; struct timer_list timer; struct work_struct work; + bool disabled; }; struct gpio_keys_drvdata { struct input_dev *input; + struct mutex disable_lock; + unsigned int n_buttons; struct gpio_button_data data[0]; 
}; +/* + * SYSFS interface for enabling/disabling keys and switches: + * + * There are 4 attributes under /sys/devices/platform/gpio-keys/ + * keys [ro] - bitmap of keys (EV_KEY) which can be + * disabled + * switches [ro] - bitmap of switches (EV_SW) which can be + * disabled + * disabled_keys [rw] - bitmap of keys currently disabled + * disabled_switches [rw] - bitmap of switches currently disabled + * + * Userland can change these values and hence disable event generation + * for each key (or switch). Disabling a key means its interrupt line + * is disabled. + * + * For example, if we have following switches set up as gpio-keys: + * SW_DOCK = 5 + * SW_CAMERA_LENS_COVER = 9 + * SW_KEYPAD_SLIDE = 10 + * SW_FRONT_PROXIMITY = 11 + * This is read from switches: + * 11-9,5 + * Next we want to disable proximity (11) and dock (5), we write: + * 11,5 + * to file disabled_switches. Now proximity and dock IRQs are disabled. + * This can be verified by reading the file disabled_switches: + * 11,5 + * If we now want to enable proximity (11) switch we write: + * 5 + * to disabled_switches. + * + * We can disable only those keys which don't allow sharing the irq. + */ + +/** + * get_n_events_by_type() - returns maximum number of events per @type + * @type: type of button (%EV_KEY, %EV_SW) + * + * Return value of this function can be used to allocate bitmap + * large enough to hold all bits for given type. + */ +static inline int get_n_events_by_type(int type) +{ + BUG_ON(type != EV_SW && type != EV_KEY); + + return (type == EV_KEY) ? KEY_CNT : SW_CNT; +} + +/** + * gpio_keys_disable_button() - disables given GPIO button + * @bdata: button data for button to be disabled + * + * Disables button pointed by @bdata. This is done by masking + * IRQ line. After this function is called, button won't generate + * input events anymore. Note that one can only disable buttons + * that don't share IRQs. 
+ * + * Make sure that @bdata->disable_lock is locked when entering + * this function to avoid races when concurrent threads are + * disabling buttons at the same time. + */ +static void gpio_keys_disable_button(struct gpio_button_data *bdata) +{ + if (!bdata->disabled) { + /* + * Disable IRQ and possible debouncing timer. + */ + disable_irq(gpio_to_irq(bdata->button->gpio)); + if (bdata->button->debounce_interval) + del_timer_sync(&bdata->timer); + + bdata->disabled = true; + } +} + +/** + * gpio_keys_enable_button() - enables given GPIO button + * @bdata: button data for button to be disabled + * + * Enables given button pointed by @bdata. + * + * Make sure that @bdata->disable_lock is locked when entering + * this function to avoid races with concurrent threads trying + * to enable the same button at the same time. + */ +static void gpio_keys_enable_button(struct gpio_button_data *bdata) +{ + if (bdata->disabled) { + enable_irq(gpio_to_irq(bdata->button->gpio)); + bdata->disabled = false; + } +} + +/** + * gpio_keys_attr_show_helper() - fill in stringified bitmap of buttons + * @ddata: pointer to drvdata + * @buf: buffer where stringified bitmap is written + * @type: button type (%EV_KEY, %EV_SW) + * @only_disabled: does caller want only those buttons that are + * currently disabled or all buttons that can be + * disabled + * + * This function writes buttons that can be disabled to @buf. If + * @only_disabled is true, then @buf contains only those buttons + * that are currently disabled. Returns 0 on success or negative + * errno on failure. 
+ */ +static ssize_t gpio_keys_attr_show_helper(struct gpio_keys_drvdata *ddata, + char *buf, unsigned int type, + bool only_disabled) +{ + int n_events = get_n_events_by_type(type); + unsigned long *bits; + ssize_t ret; + int i; + + bits = kcalloc(BITS_TO_LONGS(n_events), sizeof(*bits), GFP_KERNEL); + if (!bits) + return -ENOMEM; + + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (only_disabled && !bdata->disabled) + continue; + + __set_bit(bdata->button->code, bits); + } + + ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2, bits, n_events); + buf[ret++] = '\n'; + buf[ret] = '\0'; + + kfree(bits); + + return ret; +} + +/** + * gpio_keys_attr_store_helper() - enable/disable buttons based on given bitmap + * @ddata: pointer to drvdata + * @buf: buffer from userspace that contains stringified bitmap + * @type: button type (%EV_KEY, %EV_SW) + * + * This function parses stringified bitmap from @buf and disables/enables + * GPIO buttons accordinly. Returns 0 on success and negative error + * on failure. 
+ */ +static ssize_t gpio_keys_attr_store_helper(struct gpio_keys_drvdata *ddata, + const char *buf, unsigned int type) +{ + int n_events = get_n_events_by_type(type); + unsigned long *bits; + ssize_t error; + int i; + + bits = kcalloc(BITS_TO_LONGS(n_events), sizeof(*bits), GFP_KERNEL); + if (!bits) + return -ENOMEM; + + error = bitmap_parselist(buf, bits, n_events); + if (error) + goto out; + + /* First validate */ + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (test_bit(bdata->button->code, bits) && + !bdata->button->can_disable) { + error = -EINVAL; + goto out; + } + } + + mutex_lock(&ddata->disable_lock); + + for (i = 0; i < ddata->n_buttons; i++) { + struct gpio_button_data *bdata = &ddata->data[i]; + + if (bdata->button->type != type) + continue; + + if (test_bit(bdata->button->code, bits)) + gpio_keys_disable_button(bdata); + else + gpio_keys_enable_button(bdata); + } + + mutex_unlock(&ddata->disable_lock); + +out: + kfree(bits); + return error; +} + +#define ATTR_SHOW_FN(name, type, only_disabled) \ +static ssize_t gpio_keys_show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct platform_device *pdev = to_platform_device(dev); \ + struct gpio_keys_drvdata *ddata = platform_get_drvdata(pdev); \ + \ + return gpio_keys_attr_show_helper(ddata, buf, \ + type, only_disabled); \ +} + +ATTR_SHOW_FN(keys, EV_KEY, false); +ATTR_SHOW_FN(switches, EV_SW, false); +ATTR_SHOW_FN(disabled_keys, EV_KEY, true); +ATTR_SHOW_FN(disabled_switches, EV_SW, true); + +/* + * ATTRIBUTES: + * + * /sys/devices/platform/gpio-keys/keys [ro] + * /sys/devices/platform/gpio-keys/switches [ro] + */ +static DEVICE_ATTR(keys, S_IRUGO, gpio_keys_show_keys, NULL); +static DEVICE_ATTR(switches, S_IRUGO, gpio_keys_show_switches, NULL); + +#define ATTR_STORE_FN(name, type) \ +static ssize_t gpio_keys_store_##name(struct device *dev, \ + struct 
device_attribute *attr, \ + const char *buf, \ + size_t count) \ +{ \ + struct platform_device *pdev = to_platform_device(dev); \ + struct gpio_keys_drvdata *ddata = platform_get_drvdata(pdev); \ + ssize_t error; \ + \ + error = gpio_keys_attr_store_helper(ddata, buf, type); \ + if (error) \ + return error; \ + \ + return count; \ +} + +ATTR_STORE_FN(disabled_keys, EV_KEY); +ATTR_STORE_FN(disabled_switches, EV_SW); + +/* + * ATTRIBUTES: + * + * /sys/devices/platform/gpio-keys/disabled_keys [rw] + * /sys/devices/platform/gpio-keys/disables_switches [rw] + */ +static DEVICE_ATTR(disabled_keys, S_IWUSR | S_IRUGO, + gpio_keys_show_disabled_keys, + gpio_keys_store_disabled_keys); +static DEVICE_ATTR(disabled_switches, S_IWUSR | S_IRUGO, + gpio_keys_show_disabled_switches, + gpio_keys_store_disabled_switches); + +static struct attribute *gpio_keys_attrs[] = { + &dev_attr_keys.attr, + &dev_attr_switches.attr, + &dev_attr_disabled_keys.attr, + &dev_attr_disabled_switches.attr, + NULL, +}; + +static struct attribute_group gpio_keys_attr_group = { + .attrs = gpio_keys_attrs, +}; + static void gpio_keys_report_event(struct gpio_button_data *bdata) { struct gpio_keys_button *button = bdata->button; @@ -79,11 +355,13 @@ static irqreturn_t gpio_keys_isr(int irq, void *dev_id) return IRQ_HANDLED; } -static int __devinit gpio_keys_setup_key(struct device *dev, +static int __devinit gpio_keys_setup_key(struct platform_device *pdev, struct gpio_button_data *bdata, struct gpio_keys_button *button) { char *desc = button->desc ? 
button->desc : "gpio_keys"; + struct device *dev = &pdev->dev; + unsigned long irqflags; int irq, error; setup_timer(&bdata->timer, gpio_keys_timer, (unsigned long)bdata); @@ -112,10 +390,15 @@ static int __devinit gpio_keys_setup_key(struct device *dev, goto fail3; } - error = request_irq(irq, gpio_keys_isr, - IRQF_SHARED | - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - desc, bdata); + irqflags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING; + /* + * If platform has specified that the button can be disabled, + * we don't want it to share the interrupt line. + */ + if (!button->can_disable) + irqflags |= IRQF_SHARED; + + error = request_irq(irq, gpio_keys_isr, irqflags, desc, bdata); if (error) { dev_err(dev, "Unable to claim irq %d; error %d\n", irq, error); @@ -149,6 +432,10 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) goto fail1; } + ddata->input = input; + ddata->n_buttons = pdata->nbuttons; + mutex_init(&ddata->disable_lock); + platform_set_drvdata(pdev, ddata); input->name = pdev->name; @@ -164,8 +451,6 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) if (pdata->rep) __set_bit(EV_REP, input->evbit); - ddata->input = input; - for (i = 0; i < pdata->nbuttons; i++) { struct gpio_keys_button *button = &pdata->buttons[i]; struct gpio_button_data *bdata = &ddata->data[i]; @@ -174,7 +459,7 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) bdata->input = input; bdata->button = button; - error = gpio_keys_setup_key(dev, bdata, button); + error = gpio_keys_setup_key(pdev, bdata, button); if (error) goto fail2; @@ -184,13 +469,20 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) input_set_capability(input, type, button->code); } - error = input_register_device(input); + error = sysfs_create_group(&pdev->dev.kobj, &gpio_keys_attr_group); if (error) { - dev_err(dev, "Unable to register input device, " - "error: %d\n", error); + dev_err(dev, "Unable to export keys/switches, error: %d\n", + 
error); goto fail2; } + error = input_register_device(input); + if (error) { + dev_err(dev, "Unable to register input device, error: %d\n", + error); + goto fail3; + } + /* get current state of buttons */ for (i = 0; i < pdata->nbuttons; i++) gpio_keys_report_event(&ddata->data[i]); @@ -200,6 +492,8 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev) return 0; + fail3: + sysfs_remove_group(&pdev->dev.kobj, &gpio_keys_attr_group); fail2: while (--i >= 0) { free_irq(gpio_to_irq(pdata->buttons[i].gpio), &ddata->data[i]); @@ -224,6 +518,8 @@ static int __devexit gpio_keys_remove(struct platform_device *pdev) struct input_dev *input = ddata->input; int i; + sysfs_remove_group(&pdev->dev.kobj, &gpio_keys_attr_group); + device_init_wakeup(&pdev->dev, 0); for (i = 0; i < pdata->nbuttons; i++) { diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h index 1289fa7623ca..cd0b3f30f48e 100644 --- a/include/linux/gpio_keys.h +++ b/include/linux/gpio_keys.h @@ -10,6 +10,7 @@ struct gpio_keys_button { int type; /* input event type (EV_KEY, EV_SW) */ int wakeup; /* configure the button as a wake-up source */ int debounce_interval; /* debounce ticks interval in msecs */ + bool can_disable; }; struct gpio_keys_platform_data { -- cgit v1.2.3 From fce877e3a429940a986e085a41e8b57f2d922e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 13:25:12 +0100 Subject: bitops: Ensure the compile time HWEIGHT is only used for such Avoid accidental misuse by failing to compile things Suggested-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++++--- include/linux/bitops.h | 33 ++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b91992b6b25..96cfc1a4fe9f 100644 --- 
a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -93,13 +93,16 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; -#define EVENT_CONSTRAINT(c, n, m) { \ +#define __EVENT_CONSTRAINT(c, n, m, w) {\ { .idxmsk64[0] = (n) }, \ .code = (c), \ .cmask = (m), \ - .weight = HWEIGHT64((u64)(n)), \ + .weight = (w), \ } +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) @@ -2622,7 +2625,8 @@ void __init init_hw_perf_events(void) register_die_notifier(&perf_event_nmi_notifier); unconstrained = (struct event_constraint) - EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0); + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index ba0fd1eb4af7..25b8b2f33ae9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -45,19 +45,30 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } -#define HWEIGHT8(w) \ - ( (!!((w) & (1ULL << 0))) + \ - (!!((w) & (1ULL << 1))) + \ - (!!((w) & (1ULL << 2))) + \ - (!!((w) & (1ULL << 3))) + \ - (!!((w) & (1ULL << 4))) + \ - (!!((w) & (1ULL << 5))) + \ - (!!((w) & (1ULL << 6))) + \ +/* + * Clearly slow versions of the hweightN() functions, their benefit is + * of course compile time evaluation of constant arguments. 
+ */ +#define HWEIGHT8(w) \ + ( BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + \ + (!!((w) & (1ULL << 0))) + \ + (!!((w) & (1ULL << 1))) + \ + (!!((w) & (1ULL << 2))) + \ + (!!((w) & (1ULL << 3))) + \ + (!!((w) & (1ULL << 4))) + \ + (!!((w) & (1ULL << 5))) + \ + (!!((w) & (1ULL << 6))) + \ (!!((w) & (1ULL << 7))) ) -#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8(w >> 8)) -#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16(w >> 16)) -#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32(w >> 32)) +#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8((w) >> 8)) +#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16((w) >> 16)) +#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32((w) >> 32)) + +/* + * Type invariant version that simply casts things to the + * largest type. + */ +#define HWEIGHT(w) HWEIGHT64((u64)(w)) /** * rol32 - rotate a 32-bit value left -- cgit v1.2.3 From 447a194b393f32699607fd99617a40abd6a95114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 1 Feb 2010 14:50:01 +0200 Subject: perf_events, x86: Fix bug in hw_perf_enable() We cannot assume that because hwc->idx == assign[i], we can avoid reprogramming the counter in hw_perf_enable(). The event may have been scheduled out and another event may have been programmed into this counter. Thus, we need a more robust way of verifying if the counter still contains config/data related to an event. This patch adds a generation number to each counter on each cpu. Using this mechanism we can verify reliably whether the content of a counter corresponds to an event. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4b66dc67.0b38560a.1635.ffffae18@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 34 ++++++++++++++++++++++++++++------ include/linux/perf_event.h | 2 ++ 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 96cfc1a4fe9f..a920f173a220 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -90,6 +90,7 @@ struct cpu_hw_events { int n_events; int n_added; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ }; @@ -1142,6 +1143,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. 
@@ -1457,11 +1460,14 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, return n; } - static inline void x86_assign_hw_event(struct perf_event *event, - struct hw_perf_event *hwc, int idx) + struct cpu_hw_events *cpuc, int i) { - hwc->idx = idx; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; @@ -1480,6 +1486,15 @@ static inline void x86_assign_hw_event(struct perf_event *event, } } +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + static void __x86_pmu_disable(struct perf_event *event, struct cpu_hw_events *cpuc); void hw_perf_enable(void) @@ -1508,7 +1523,14 @@ void hw_perf_enable(void) event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1 || hwc->idx == cpuc->assign[i]) + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) continue; __x86_pmu_disable(event, cpuc); @@ -1522,12 +1544,12 @@ void hw_perf_enable(void) hwc = &event->hw; if (hwc->idx == -1) { - x86_assign_hw_event(event, hwc, cpuc->assign[i]); + x86_assign_hw_event(event, cpuc, i); x86_perf_event_set_period(event, hwc, hwc->idx); } /* * need to mark as active because x86_pmu_disable() - * clear active_mask and eventsp[] yet it preserves + * clear active_mask and events[] yet it preserves * idx */ set_bit(hwc->idx, cpuc->active_mask); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 556b0f4a668e..071a7db52549 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -478,9 +478,11 @@ struct 
hw_perf_event { union { struct { /* hardware */ u64 config; + u64 last_tag; unsigned long config_base; unsigned long event_base; int idx; + int last_cpu; }; struct { /* software */ s64 remaining; -- cgit v1.2.3 From 2a61aa401638529cd4231f6106980d307fba98fa Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:40 -0500 Subject: Fix misspellings of "invocation" in comments. Some comments misspell "invocation"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- fs/buffer.c | 2 +- fs/mpage.c | 2 +- include/linux/mmzone.h | 2 +- kernel/sched_cpupri.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 6fa530256bfd..1d920bab5e70 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped + * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/fs/mpage.c b/fs/mpage.c index 42381bd6543b..598d54e200eb 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -561,7 +561,7 @@ page_is_mapped: if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each - * and every writepage invokation because it may be mmapped. + * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. 
For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c2542..e60a340fe890 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -349,7 +349,7 @@ struct zone { * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim * target at the previous try_to_free_pages() or balance_pgdat() - * invokation. + * invocation. * * We use prev_priority as a measure of how much stress page reclaim is * under - it drives the swappiness decision: whether to unmap mapped diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..3db4b1a0e921 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -58,7 +58,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current -- cgit v1.2.3 From 6683ece36e3531fc8c75f69e7165c5f20930be88 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Feb 2010 10:22:25 -0800 Subject: net: use helpers to access mc list V2 This patch introduces the similar helpers as those already done for uc list. However multicast lists are not list_head lists but "made manually". 
The three macros added by this patch will make the transition of mc_list to list_head smooth in two steps: 1) convert all drivers to use these macros (with the original iterator of type "struct dev_mc_list") 2) once all drivers are converted, convert list type and iterators to "struct netdev_hw_addr" in one patch. From now on, drivers can (and should) use "netdev_for_each_mc_addr" to iterate over the addresses with iterator of type "struct netdev_hw_addr". Also macros "netdev_mc_count" and "netdev_mc_empty" to read list's length. This is the state which should be reached in all drivers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 622ba5aa93c4..e535700a3b72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -268,6 +268,12 @@ struct netdev_hw_addr_list { #define netdev_for_each_uc_addr(ha, dev) \ list_for_each_entry(ha, &dev->uc.list, list) +#define netdev_mc_count(dev) ((dev)->mc_count) +#define netdev_mc_empty(dev) (netdev_mc_count(dev) == 0) + +#define netdev_for_each_mc_addr(mclist, dev) \ + for (mclist = dev->mc_list; mclist; mclist = mclist->next) + struct hh_cache { struct hh_cache *hh_next; /* Next entry */ atomic_t hh_refcnt; /* number of users */ -- cgit v1.2.3 From f8f76db1db369f3a130ac3fd33e2eee5f1610d9c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Feb 2010 10:23:02 -0800 Subject: libphy: add phy_find_first function Many drivers do this manually. Now they can use this function. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 16 ++++++++++++++++ include/linux/phy.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index adbc0fded130..db1794546c56 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -276,6 +276,22 @@ int phy_device_register(struct phy_device *phydev) } EXPORT_SYMBOL(phy_device_register); +/** + * phy_find_first - finds the first PHY device on the bus + * @bus: the target MII bus + */ +struct phy_device *phy_find_first(struct mii_bus *bus) +{ + int addr; + + for (addr = 0; addr < PHY_MAX_ADDR; addr++) { + if (bus->phy_map[addr]) + return bus->phy_map[addr]; + } + return NULL; +} +EXPORT_SYMBOL(phy_find_first); + /** * phy_prepare_link - prepares the PHY layer to monitor link status * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 6a7eb402165d..14d7fdf6a90a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -452,6 +452,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); struct phy_device * phy_attach(struct net_device *dev, const char *bus_id, u32 flags, phy_interface_t interface); +struct phy_device *phy_find_first(struct mii_bus *bus); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), u32 flags, phy_interface_t interface); -- cgit v1.2.3 From 8ee2bf9ab792d0c02b13ca3acbd036debb7745d9 Mon Sep 17 00:00:00 2001 From: Sriramakrishnan Date: Thu, 19 Nov 2009 15:58:25 +0530 Subject: TI Davinci EMAC : Re-use driver for other platforms. The davinci EMAC peripheral is also available on other TI platforms - notably the TI AM3517 SoC. This patch modifies the config option and the platform structure header files so that the driver can be reused on non-davinci platforms as well.
Signed-off-by: Sriramakrishnan Acked-by: Chaithrika U S Acked-by: David S. Miller Signed-off-by: Kevin Hilman --- arch/arm/mach-davinci/common.c | 2 +- arch/arm/mach-davinci/include/mach/da8xx.h | 2 +- arch/arm/mach-davinci/include/mach/dm365.h | 2 +- arch/arm/mach-davinci/include/mach/dm644x.h | 2 +- arch/arm/mach-davinci/include/mach/dm646x.h | 2 +- arch/arm/mach-davinci/include/mach/emac.h | 36 ----------------------------- drivers/net/Kconfig | 2 +- drivers/net/davinci_emac.c | 3 +-- include/linux/davinci_emac.h | 36 +++++++++++++++++++++++++++++ 9 files changed, 43 insertions(+), 44 deletions(-) delete mode 100644 arch/arm/mach-davinci/include/mach/emac.h create mode 100644 include/linux/davinci_emac.h (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/common.c b/arch/arm/mach-davinci/common.c index c2de94cde56a..94f27cbcd55a 100644 --- a/arch/arm/mach-davinci/common.c +++ b/arch/arm/mach-davinci/common.c @@ -11,13 +11,13 @@ #include #include #include +#include #include #include #include #include -#include #include "clock.h" diff --git a/arch/arm/mach-davinci/include/mach/da8xx.h b/arch/arm/mach-davinci/include/mach/da8xx.h index d43a4b6b6d76..d9a7f11894c4 100644 --- a/arch/arm/mach-davinci/include/mach/da8xx.h +++ b/arch/arm/mach-davinci/include/mach/da8xx.h @@ -13,10 +13,10 @@ #include