diff options
434 files changed, 11468 insertions, 6315 deletions
@@ -2365,8 +2365,6 @@ E: acme@redhat.com W: http://oops.ghostprotocols.net:81/blog/ P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01 D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks -S: R. Brasílio Itiberê, 4270/1010 - Água Verde -S: 80240-060 - Curitiba - Paraná S: Brazil N: Karsten Merker diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index a851118775d8..6a8c73f55b80 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -1,18 +1,22 @@ CONFIG_RCU_TRACE debugfs Files and Formats -The rcutree implementation of RCU provides debugfs trace output that -summarizes counters and state. This information is useful for debugging -RCU itself, and can sometimes also help to debug abuses of RCU. -The following sections describe the debugfs files and formats. +The rcutree and rcutiny implementations of RCU provide debugfs trace +output that summarizes counters and state. This information is useful for +debugging RCU itself, and can sometimes also help to debug abuses of RCU. +The following sections describe the debugfs files and formats, first +for rcutree and next for rcutiny. -Hierarchical RCU debugfs Files and Formats +CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats -This implementation of RCU provides three debugfs files under the +These implementations of RCU provides five debugfs files under the top-level directory RCU: rcu/rcudata (which displays fields in struct -rcu_data), rcu/rcugp (which displays grace-period counters), and -rcu/rcuhier (which displays the struct rcu_node hierarchy). +rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of +rcu/rcudata), rcu/rcugp (which displays grace-period counters), +rcu/rcuhier (which displays the struct rcu_node hierarchy), and +rcu/rcu_pending (which displays counts of the reasons that the +rcu_pending() function decided that there was core RCU work to do). The output of "cat rcu/rcudata" looks as follows: @@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for been registered in absence of CPU-hotplug activity. o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. + this CPU going offline. These orphaned callbacks have been moved + to an arbitrarily chosen online CPU. o "ca" is the number of RCU callbacks that have been adopted due to other CPUs going offline. Note that ci+co-ca+ql is the number of @@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 1/1 .>. 0:127 ^0 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 rcu_bh: -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 0/1 .>. 0:127 ^0 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 @@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that exited immediately (without even being counted in nfqs above) due to contention on ->fqslock. -o "oqlen" is the number of callbacks on the "orphan" callback - list. RCU callbacks are placed on this list by CPUs going - offline, and are "adopted" either by the CPU helping the outgoing - CPU or by the next rcu_barrier*() call, whichever comes first. - o Each element of the form "1/1 0:127 ^0" represents one struct rcu_node. Each line represents one level of the hierarchy, from root to leaves. It is best to think of the rcu_data structures @@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert readers will note that the rcu "nn" number for a given CPU very closely matches the rcu_bh "np" number for that same CPU. This is due to short-circuit evaluation in rcu_pending(). + + +CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats + +These implementations of RCU provides a single debugfs file under the +top-level directory RCU, namely rcu/rcudata, which displays fields in +rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU, +rcu_preempt_ctrlblk. + +The output of "cat rcu/rcudata" is as follows: + +rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=... + ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274 + normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0 + exp balk: bt=0 nos=0 +rcu_sched: qlen: 0 +rcu_bh: qlen: 0 + +This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the +rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds. +The last three lines of the rcu_preempt section appear only in +CONFIG_RCU_BOOST kernel builds. The fields are as follows: + +o "qlen" is the number of RCU callbacks currently waiting either + for an RCU grace period or waiting to be invoked. This is the + only field present for rcu_sched and rcu_bh, due to the + short-circuiting of grace period in those two cases. + +o "gp" is the number of grace periods that have completed. + +o "g197/p197/c197" displays the grace-period state, with the + "g" number being the number of grace periods that have started + (mod 256), the "p" number being the number of grace periods + that the CPU has responded to (also mod 256), and the "c" + number being the number of grace periods that have completed + (once again mode 256). + + Why have both "gp" and "g"? Because the data flowing into + "gp" is only present in a CONFIG_RCU_TRACE kernel. + +o "tasks" is a set of bits. The first bit is "T" if there are + currently tasks that have recently blocked within an RCU + read-side critical section, the second bit is "N" if any of the + aforementioned tasks are blocking the current RCU grace period, + and the third bit is "E" if any of the aforementioned tasks are + blocking the current expedited grace period. Each bit is "." + if the corresponding condition does not hold. + +o "ttb" is a single bit. It is "B" if any of the blocked tasks + need to be priority boosted and "." otherwise. + +o "btg" indicates whether boosting has been carried out during + the current grace period, with "exp" indicating that boosting + is in progress for an expedited grace period, "no" indicating + that boosting has not yet started for a normal grace period, + "begun" indicating that boosting has bebug for a normal grace + period, and "done" indicating that boosting has completed for + a normal grace period. + +o "ntb" is the total number of tasks subjected to RCU priority boosting + periods since boot. + +o "neb" is the number of expedited grace periods that have had + to resort to RCU priority boosting since boot. + +o "nnb" is the number of normal grace periods that have had + to resort to RCU priority boosting since boot. + +o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. + +o "bt" is the low-order 12 bits of the value that the jiffies counter + will have at the next time that boosting is scheduled to begin. + +o In the line beginning with "normal balk", the fields are as follows: + + o "nt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + Note that the system will balk from boosting even if the + grace period is overdue when the currently running task + is looping within an RCU read-side critical section. + There is no point in boosting in this case, because + boosting a running task won't make it run any faster. + + o "gt" is the number of times that the system balked + from boosting because, although there were blocked tasks, + none of them were preventing the current grace period + from completing. + + o "bt" is the number of times that the system balked + from boosting because boosting was already in progress. + + o "b" is the number of times that the system balked from + boosting because boosting had already completed for + the grace period in question. + + o "ny" is the number of times that the system balked from + boosting because it was not yet time to start boosting + the grace period in question. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. This can actually happen due to races involving + increments of the jiffies counter. + +o In the line beginning with "exp balk", the fields are as follows: + + o "bt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. diff --git a/Documentation/dontdiff b/Documentation/dontdiff index d9bcffd59433..470d3dba1a69 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -62,6 +62,10 @@ aic7*reg_print.c* aic7*seq.h* aicasm aicdb.h* +altivec1.c +altivec2.c +altivec4.c +altivec8.c asm-offsets.h asm_offsets.h autoconf.h* @@ -76,6 +80,7 @@ btfixupprep build bvmlinux bzImage* +capflags.c classlist.h* comp*.log compile.h* @@ -94,6 +99,7 @@ devlist.h* docproc elf2ecoff elfconfig.h* +evergreen_reg_safe.h fixdep flask.h fore200e_mkfirm @@ -108,9 +114,16 @@ genksyms *_gray256.c ihex2fw ikconfig.h* +inat-tables.c initramfs_data.cpio initramfs_data.cpio.gz initramfs_list +int16.c +int1.c +int2.c +int32.c +int4.c +int8.c kallsyms kconfig keywords.c @@ -140,6 +153,7 @@ mkprep mktables mktree modpost +modules.builtin modules.order modversions.h* ncscope.* @@ -153,14 +167,23 @@ pca200e.bin pca200e_ecd.bin2 piggy.gz piggyback +piggy.S pnmtologo ppc_defs.h* pss_boot.h qconf +r100_reg_safe.h +r200_reg_safe.h +r300_reg_safe.h +r420_reg_safe.h +r600_reg_safe.h raid6altivec*.c raid6int*.c raid6tables.c relocs +rn50_reg_safe.h +rs600_reg_safe.h +rv515_reg_safe.h series setup setup.bin @@ -169,6 +192,7 @@ sImage sm_tbl* split-include syscalltab.h +tables.c tags tftpboot.img timeconst.h @@ -190,6 +214,7 @@ vmlinux vmlinux-* vmlinux.aout vmlinux.lds +voffset.h vsyscall.lds vsyscall_32.lds wanxlfw.inc @@ -200,3 +225,4 @@ wakeup.elf wakeup.lds zImage* zconf.hash.c +zoffset.h diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b6426f15b4ae..33fa3e5d38fd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -18,7 +18,6 @@ prototypes: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); locking rules: - none have BKL dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes d_hash no no no yes @@ -42,18 +41,23 @@ ata *); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); - int (*follow_link) (struct dentry *, struct nameidata *); + void * (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); + int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); + void (*truncate_range)(struct inode *, loff_t, loff_t); + long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); locking rules: - all may block, none have BKL + all may block i_mutex(inode) lookup: yes create: yes @@ -66,19 +70,24 @@ rmdir: yes (both) (see below) rename: yes (all) (see below) readlink: no follow_link: no +put_link: no truncate: yes (see below) setattr: yes permission: no +check_acl: no getattr: no setxattr: yes getxattr: no listxattr: no removexattr: yes +truncate_range: yes +fallocate: no +fiemap: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. ->truncate() is never called directly - it's a callback, not a -method. It's called by vmtruncate() - library function normally used by +method. It's called by vmtruncate() - deprecated library function used by ->setattr(). Locking information above applies to that call (i.e. is inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been passed). @@ -91,7 +100,7 @@ prototypes: struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -105,10 +114,10 @@ prototypes: int (*show_options)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); locking rules: All may block [not true, see below] - None have BKL s_umount alloc_inode: destroy_inode: @@ -127,6 +136,7 @@ umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) +bdev_try_to_free_page: no (see below) ->statfs() has s_umount (shared) when called by ustat(2) (native or compat), but that's an accident of bad API; s_umount is used to pin @@ -139,19 +149,25 @@ be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and writes to quota files with quotas on). For other details about locking see also dquot_operations section. +->bdev_try_to_free_page is called from the ->releasepage handler of +the block device inode. See there for more details. --------------------------- file_system_type --------------------------- prototypes: int (*get_sb) (struct file_system_type *, int, const char *, void *, struct vfsmount *); + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); void (*kill_sb) (struct super_block *); locking rules: - may block BKL -get_sb yes no -kill_sb yes no + may block +get_sb yes +mount yes +kill_sb yes ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). +->mount() returns ERR_PTR or the root dentry. ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. @@ -176,27 +192,35 @@ prototypes: void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); - int (*launder_page) (struct page *); + int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, + unsigned long *); + int (*migratepage)(struct address_space *, struct page *, struct page *); + int (*launder_page)(struct page *); + int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); + int (*error_remove_page)(struct address_space *, struct page *); locking rules: All except set_page_dirty and freepage may block - BKL PageLocked(page) i_mutex -writepage: no yes, unlocks (see below) -readpage: no yes, unlocks -sync_page: no maybe -writepages: no -set_page_dirty no no -readpages: no -write_begin: no locks the page yes -write_end: no yes, unlocks yes -perform_write: no n/a yes -bmap: no -invalidatepage: no yes -releasepage: no yes -freepage: no yes -direct_IO: no -launder_page: no yes + PageLocked(page) i_mutex +writepage: yes, unlocks (see below) +readpage: yes, unlocks +sync_page: maybe +writepages: +set_page_dirty no +readpages: +write_begin: locks the page yes +write_end: yes, unlocks yes +bmap: +invalidatepage: yes +releasepage: yes +freepage: yes +direct_IO: +get_xip_mem: maybe +migratepage: yes (both) +launder_page: yes +is_partially_uptodate: yes +error_remove_page: yes ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -276,9 +300,8 @@ under spinlock (it cannot block) and is sometimes called with the page not locked. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some -filesystems and by the swapper. The latter will eventually go away. All -instances do not actually need the BKL. Please, keep it that way and don't -breed new callers. +filesystems and by the swapper. The latter will eventually go away. Please, +keep it that way and don't breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop some or all of the buffers from the page when it is being truncated. It @@ -299,47 +322,37 @@ cleaned, or an error value if not. Note that in order to prevent the page getting mapped back in and redirtied, it needs to be kept locked across the entire operation. - Note: currently almost all instances of address_space methods are -using BKL for internal serialization and that's one of the worst sources -of contention. Normally they are calling library functions (in fs/buffer.c) -and pass foo_get_block() as a callback (on local block-based filesystems, -indeed). BKL is not needed for library stuff and is usually taken by -foo_get_block(). It's an overkill, since block bitmaps can be protected by -internal fs locking and real critical areas are much smaller than the areas -filesystems protect now. - ----------------------- file_lock_operations ------------------------------ prototypes: - void (*fl_insert)(struct file_lock *); /* lock insertion callback */ - void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); locking rules: - BKL may block -fl_insert: yes no -fl_remove: yes no -fl_copy_lock: yes no -fl_release_private: yes yes + file_lock_lock may block +fl_copy_lock: yes no +fl_release_private: maybe no ----------------------- lock_manager_operations --------------------------- prototypes: int (*fl_compare_owner)(struct file_lock *, struct file_lock *); void (*fl_notify)(struct file_lock *); /* unblock callback */ + int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); /* break_lease callback */ + int (*fl_mylease)(struct file_lock *, struct file_lock *); + int (*fl_change)(struct file_lock **, int); locking rules: - BKL may block -fl_compare_owner: yes no -fl_notify: yes no -fl_release_private: yes yes -fl_break: yes no - - Currently only NFSD and NLM provide instances of this class. None of the -them block. If you have out-of-tree instances - please, show up. Locking -in that area will change. + file_lock_lock may block +fl_compare_owner: yes no +fl_notify: yes no +fl_grant: no no +fl_release_private: maybe no +fl_break: yes no +fl_mylease: yes no +fl_change yes no + --------------------------- buffer_head ----------------------------------- prototypes: void (*b_end_io)(struct buffer_head *bh, int uptodate); @@ -364,17 +377,17 @@ prototypes: void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_mutex -open: no yes -release: no yes -ioctl: no no -compat_ioctl: no no -direct_access: no no -media_changed: no no -unlock_native_capacity: no no -revalidate_disk: no no -getgeo: no no -swap_slot_free_notify: no no (see below) + bd_mutex +open: yes +release: yes +ioctl: no +compat_ioctl: no +direct_access: no +media_changed: no +unlock_native_capacity: no +revalidate_disk: no +getgeo: no +swap_slot_free_notify: no (see below) media_changed, unlock_native_capacity and revalidate_disk are called only from check_disk_change(). @@ -413,34 +426,21 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, + size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **); }; locking rules: - All may block. - BKL -llseek: no (see below) -read: no -aio_read: no -write: no -aio_write: no -readdir: no -poll: no -unlocked_ioctl: no -compat_ioctl: no -mmap: no -open: no -flush: no -release: no -fsync: no (see below) -aio_fsync: no -fasync: no -lock: yes -readv: no -writev: no -sendfile: no -sendpage: no -get_unmapped_area: no -check_flags: no + All may block except for ->setlease. + No VFS locks held on entry except for ->fsync and ->setlease. + +->fsync() has i_mutex on inode. + +->setlease has the file_list_lock held and must not sleep. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -450,17 +450,10 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -Note: ext2_release() was *the* source of contention on fs-intensive -loads and dropping BKL on ->release() helps to get rid of that (we still -grab BKL for cases when we close a file that had been opened r/w, but that -can and should be done using the internal locking with smaller critical areas). -Current worst offender is ext2_get_block()... - -->fasync() is called without BKL protection, and is responsible for -maintaining the FASYNC bit in filp->f_flags. Most instances call -fasync_helper(), which does that maintenance, so it's not normally -something one needs to worry about. Return values > 0 will be mapped to -zero in the VFS layer. +->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. +Most instances call fasync_helper(), which does that maintenance, so it's +not normally something one needs to worry about. Return values > 0 will be +mapped to zero in the VFS layer. ->readdir() and ->ioctl() on directories must be changed. Ideally we would move ->readdir() to inode_operations and use a separate method for directory @@ -471,8 +464,6 @@ components. And there are other reasons why the current interface is a mess... ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. -->fsync() has i_mutex on inode. - --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); @@ -507,12 +498,12 @@ prototypes: int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: - BKL mmap_sem PageLocked(page) -open: no yes -close: no yes -fault: no yes can return with page locked -page_mkwrite: no yes can return with page locked -access: no yes + mmap_sem PageLocked(page) +open: yes +close: yes +fault: yes can return with page locked +page_mkwrite: yes can return with page locked +access: yes ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated @@ -539,6 +530,3 @@ VM_IO | VM_PFNMAP VMAs. (if you break something or notice that it is broken and do not fix it yourself - at least put it here) - -ipc/shm.c::shm_delete() - may need BKL. -->read() and ->write() in many drivers are (probably) missing BKL. diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 715eaaf1519d..9a8674629a07 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -537,7 +537,7 @@ Notes: Further information in http://www.oreilly.com/catalog/linuxdrive2/ - * Title: "Linux Device Drivers, 3nd Edition" + * Title: "Linux Device Drivers, 3rd Edition" Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman Publisher: O'Reilly & Associates. Date: 2005. @@ -592,14 +592,6 @@ Pages: 600. ISBN: 0-13-101908-2 - * Title: "The Design and Implementation of the 4.4 BSD UNIX - Operating System" - Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels, - John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1996. - ISBN: 0-201-54979-4 - * Title: "Programming for the real world - POSIX.4" Author: Bill O. Gallmeister. Publisher: O'Reilly & Associates, Inc.. @@ -610,28 +602,13 @@ POSIX. Good reference. * Title: "UNIX Systems for Modern Architectures: Symmetric - Multiprocesssing and Caching for Kernel Programmers" + Multiprocessing and Caching for Kernel Programmers" Author: Curt Schimmel. Publisher: Addison Wesley. Date: June, 1994. Pages: 432. ISBN: 0-201-63338-8 - * Title: "The Design and Implementation of the 4.3 BSD UNIX - Operating System" - Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J. - Karels, John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1989 (reprinted with corrections on October, 1990). - ISBN: 0-201-06196-1 - - * Title: "The Design of the UNIX Operating System" - Author: Maurice J. Bach. - Publisher: Prentice Hall. - Date: 1986. - Pages: 471. - ISBN: 0-13-201757-1 - MISCELLANEOUS: * Name: linux/Documentation diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8b61c9360999..f3dc951e949f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1579,20 +1579,12 @@ and is between 256 and 4096 characters. It is defined in the file nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels Format: [panic,][num] - Valid num: 0,1,2 + Valid num: 0 0 - turn nmi_watchdog off - 1 - use the IO-APIC timer for the NMI watchdog - 2 - use the local APIC for the NMI watchdog using - a performance counter. Note: This will use one - performance counter and the local APIC's performance - vector. When panic is specified, panic when an NMI watchdog timeout occurs. This is useful when you use a panic=... timeout and need the box quickly up again. - Instead of 1 and 2 it is possible to use the following - symbolic names: lapic and ioapic - Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic netpoll.carrier_timeout= [NET] Specifies amount of time (in seconds) that @@ -1622,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. + noautogroup Disable scheduler automatic task group creation. + nobats [PPC] Do not use BATs for mapping kernel lowmem on "Classic" PPC cores. @@ -1759,7 +1753,7 @@ and is between 256 and 4096 characters. It is defined in the file nousb [USB] Disable the USB subsystem - nowatchdog [KNL] Disable the lockup detector. + nowatchdog [KNL] Disable the lockup detector (NMI watchdog). nowb [ARM] @@ -2467,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file to facilitate early boot debugging. See also Documentation/trace/events.txt - tsc= Disable clocksource-must-verify flag for TSC. + tsc= Disable clocksource stability checks for TSC. Format: <string> [x86] reliable: mark tsc clocksource as reliable, this - disables clocksource verification at runtime. - Used to enable high-resolution timer mode on older - hardware, and in virtualized environment. + disables clocksource verification at runtime, as well + as the stability checks done at bootup. Used to enable + high-resolution timer mode on older hardware, and in + virtualized environment. [x86] noirqtime: Do not use TSC to do irq accounting. Used to run time disable IRQ_TIME_ACCOUNTING on any platforms where RDTSC is slow and this accounting diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt new file mode 100644 index 000000000000..96d87b67fe37 --- /dev/null +++ b/Documentation/trace/events-power.txt @@ -0,0 +1,90 @@ + + Subsystem Trace Points: power + +The power tracing system captures events related to power transitions +within the kernel. Broadly speaking there are three major subheadings: + + o Power state switch which reports events related to suspend (S-states), + cpuidle (C-states) and cpufreq (P-states) + o System clock related changes + o Power domains related changes and transitions + +This document describes what each of the tracepoints is and why they +might be useful. + +Cf. include/trace/events/power.h for the events definitions. + +1. Power state switch events +============================ + +1.1 New trace API +----------------- + +A 'cpu' event class gathers the CPU-related events: cpuidle and +cpufreq. + +cpu_idle "state=%lu cpu_id=%lu" +cpu_frequency "state=%lu cpu_id=%lu" + +A suspend event is used to indicate the system going in and out of the +suspend mode: + +machine_suspend "state=%lu" + + +Note: the value of '-1' or '4294967295' for state means an exit from the current state, +i.e. trace_cpu_idle(4, smp_processor_id()) means that the system +enters the idle state 4, while trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()) +means that the system exits the previous idle state. + +The event which has 'state=4294967295' in the trace is very important to the user +space tools which are using it to detect the end of the current state, and so to +correctly draw the states diagrams and to calculate accurate statistics etc. + +1.2 DEPRECATED trace API +------------------------ + +A new Kconfig option CONFIG_EVENT_POWER_TRACING_DEPRECATED with the default value of +'y' has been created. This allows the legacy trace power API to be used conjointly +with the new trace API. +The Kconfig option, the old trace API (in include/trace/events/power.h) and the +old trace points will disappear in a future release (namely 2.6.41). + +power_start "type=%lu state=%lu cpu_id=%lu" +power_frequency "type=%lu state=%lu cpu_id=%lu" +power_end "cpu_id=%lu" + +The 'type' parameter takes one of those macros: + . POWER_NONE = 0, + . POWER_CSTATE = 1, /* C-State */ + . POWER_PSTATE = 2, /* Fequency change or DVFS */ + +The 'state' parameter is set depending on the type: + . Target C-state for type=POWER_CSTATE, + . Target frequency for type=POWER_PSTATE, + +power_end is used to indicate the exit of a state, corresponding to the latest +power_start event. + +2. Clocks events +================ +The clock events are used for clock enable/disable and for +clock rate change. + +clock_enable "%s state=%lu cpu_id=%lu" +clock_disable "%s state=%lu cpu_id=%lu" +clock_set_rate "%s state=%lu cpu_id=%lu" + +The first parameter gives the clock name (e.g. "gpio1_iclk"). +The second parameter is '1' for enable, '0' for disable, the target +clock rate for set_rate. + +3. Power domains events +======================= +The power domain events are used for power domains transitions + +power_domain_target "%s state=%lu cpu_id=%lu" + +The first parameter gives the power domain name (e.g. "mpu_pwrdm"). +The second parameter is the power domain target state. + diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 30b43e1b2697..bdeb81ccb5f6 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -600,6 +600,7 @@ Protocol: 2.07+ 0x00000001 lguest 0x00000002 Xen 0x00000003 Moorestown MID + 0x00000004 CE4100 TV Platform Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/MAINTAINERS b/MAINTAINERS index 2424699364fe..db1c2b665a42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -791,11 +791,14 @@ S: Maintained ARM/NOMADIK ARCHITECTURE M: Alessandro Rubini <rubini@unipv.it> +M: Linus Walleij <linus.walleij@stericsson.com> M: STEricsson <STEricsson_nomadik_linux@list.st.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-nomadik/ F: arch/arm/plat-nomadik/ +F: drivers/i2c/busses/i2c-nomadik.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT M: Nelson Castillo <arhuaco@freaks-unidos.net> @@ -997,12 +1000,24 @@ F: drivers/i2c/busses/i2c-stu300.c F: drivers/rtc/rtc-coh901331.c F: drivers/watchdog/coh901327_wdt.c F: drivers/dma/coh901318* +F: drivers/mfd/ab3100* +F: drivers/rtc/rtc-ab3100.c +F: drivers/rtc/rtc-coh901331.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git -ARM/U8500 ARM ARCHITECTURE +ARM/Ux500 ARM ARCHITECTURE M: Srinidhi Kasagar <srinidhi.kasagar@stericsson.com> +M: Linus Walleij <linus.walleij@stericsson.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ux500/ +F: drivers/dma/ste_dma40* +F: drivers/mfd/ab3550* +F: drivers/mfd/abx500* +F: drivers/mfd/ab8500* +F: drivers/mfd/stmpe* +F: drivers/rtc/rtc-ab8500.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git ARM/VFP SUPPORT M: Russell King <linux@arm.linux.org.uk> @@ -2811,6 +2826,10 @@ M: Thomas Gleixner <tglx@linutronix.de> S: Maintained F: Documentation/timers/ F: kernel/hrtimer.c +F: kernel/time/clockevents.c +F: kernel/time/tick*.* +F: kernel/time/timer_*.c +F include/linux/clockevents.h F: include/linux/hrtimer.h HIGH-SPEED SCC DRIVER FOR AX.25 @@ -4628,7 +4647,7 @@ PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra <a.p.zijlstra@chello.nl> M: Paul Mackerras <paulus@samba.org> M: Ingo Molnar <mingo@elte.hu> -M: Arnaldo Carvalho de Melo <acme@redhat.com> +M: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> S: Supported F: kernel/perf_event*.c F: include/linux/perf_event.h @@ -5153,6 +5172,18 @@ L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported F: sound/soc/s3c24xx +TIMEKEEPING, NTP +M: John Stultz <johnstul@us.ibm.com> +M: Thomas Gleixner <tglx@linutronix.de> +S: Supported +F: include/linux/clocksource.h +F: include/linux/time.h +F: include/linux/timex.h +F: include/linux/timekeeping.h +F: kernel/time/clocksource.c +F: kernel/time/time*.c +F: kernel/time/ntp.c + TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie <shijie8@gmail.com> M: Kang Yong <kangyong@telegent.com> @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 37 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index 8bf0fa652eb6..f78c2be4242b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI config HAVE_ARCH_JUMP_LABEL bool +config HAVE_ARCH_MUTEX_CPU_RELAX + bool + source "kernel/gcov/Kconfig" diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h index fe792ca818f6..5996e7a6757e 100644 --- a/arch/alpha/include/asm/perf_event.h +++ b/arch/alpha/include/asm/perf_event.h @@ -1,10 +1,4 @@ #ifndef __ASM_ALPHA_PERF_EVENT_H #define __ASM_ALPHA_PERF_EVENT_H -#ifdef CONFIG_PERF_EVENTS -extern void init_hw_perf_events(void); -#else -static inline void init_hw_perf_events(void) { } -#endif - #endif /* __ASM_ALPHA_PERF_EVENT_H */ diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c index 5f77afb88e89..4c8bb374eb0a 100644 --- a/arch/alpha/kernel/irq_alpha.c +++ b/arch/alpha/kernel/irq_alpha.c @@ -112,8 +112,6 @@ init_IRQ(void) wrent(entInt, 0); alpha_mv.init_irq(); - - init_hw_perf_events(); } /* diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 1cc49683fb69..90561c45e7d8 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -14,6 +14,7 @@ #include <linux/kernel.h> #include <linux/kdebug.h> #include <linux/mutex.h> +#include <linux/init.h> #include <asm/hwrpb.h> #include <asm/atomic.h> @@ -863,13 +864,13 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, /* * Init call to initialise performance events at kernel startup. */ -void __init init_hw_perf_events(void) +int __init init_hw_perf_events(void) { pr_info("Performance events: "); if (!supported_cpu()) { pr_cont("No support for your CPU.\n"); - return; + return 0; } pr_cont("Supported CPU type!\n"); @@ -881,6 +882,8 @@ void __init init_hw_perf_events(void) /* And set up PMU specification */ alpha_pmu = &ev67_pmu; - perf_pmu_register(&pmu); -} + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + return 0; +} +early_initcall(init_hw_perf_events); diff --git a/arch/arm/common/it8152.c b/arch/arm/common/it8152.c index 1bec96e85196..42ff90b46dfb 100644 --- a/arch/arm/common/it8152.c +++ b/arch/arm/common/it8152.c @@ -352,3 +352,4 @@ struct pci_bus * __init it8152_pci_scan_bus(int nr, struct pci_sys_data *sys) return pci_scan_bus(nr, &it8152_ops, sys); } +EXPORT_SYMBOL(dma_set_coherent_mask); diff --git a/arch/arm/include/asm/hardware/it8152.h b/arch/arm/include/asm/hardware/it8152.h index 21fa272301f8..b2f95c72287c 100644 --- a/arch/arm/include/asm/hardware/it8152.h +++ b/arch/arm/include/asm/hardware/it8152.h @@ -76,6 +76,7 @@ extern unsigned long it8152_base_address; IT8152_PD_IRQ(0) Audio controller (ACR) */ #define IT8152_IRQ(x) (IRQ_BOARD_START + (x)) +#define IT8152_LAST_IRQ (IRQ_BOARD_START + 40) /* IRQ-sources in 3 groups - local devices, LPC (serial), and external PCI */ #define IT8152_LD_IRQ_COUNT 9 diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 1fc684e70ab6..7080e2c8fa62 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -25,9 +25,6 @@ extern void *kmap_high(struct page *page); extern void *kmap_high_get(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte); -extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte); - /* * The following functions are already defined by <linux/highmem.h> * when CONFIG_HIGHMEM is not set. diff --git a/arch/arm/include/asm/sizes.h b/arch/arm/include/asm/sizes.h index 4fc1565e4f93..316bb2b2be3d 100644 --- a/arch/arm/include/asm/sizes.h +++ b/arch/arm/include/asm/sizes.h @@ -13,9 +13,6 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* DO NOT EDIT!! - this file automatically generated - * from .s file by awk -f s2h.awk - */ /* Size definitions * Copyright (C) ARM Limited 1998. All rights reserved. */ @@ -25,6 +22,9 @@ /* handy sizes */ #define SZ_16 0x00000010 +#define SZ_32 0x00000020 +#define SZ_64 0x00000040 +#define SZ_128 0x00000080 #define SZ_256 0x00000100 #define SZ_512 0x00000200 diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 1120f18a6b17..80025948b8ad 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -150,6 +150,7 @@ extern unsigned int user_debug; #define rmb() dmb() #define wmb() mb() #else +#include <asm/memory.h> #define mb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0) #define rmb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0) #define wmb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8bfa98757cd2..80bf8cd88d7c 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -29,6 +29,9 @@ ret_fast_syscall: ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne fast_work_pending +#if defined(CONFIG_IRQSOFF_TRACER) + asm_trace_hardirqs_on +#endif /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr @@ -65,6 +68,9 @@ ret_slow_syscall: tst r1, #_TIF_WORK_MASK bne work_pending no_work_pending: +#if defined(CONFIG_IRQSOFF_TRACER) + asm_trace_hardirqs_on +#endif /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 07a50357492a..fdfa4976b0bf 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -3034,11 +3034,11 @@ init_hw_perf_events(void) pr_info("no hardware support available\n"); } - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); return 0; } -arch_initcall(init_hw_perf_events); +early_initcall(init_hw_perf_events); /* * Callchain handling code. diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8c1959590252..9066473c0ebc 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -310,7 +310,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) * All kernel threads share the same mm context; grab a * reference and switch to it. */ - atomic_inc(&mm->mm_users); atomic_inc(&mm->mm_count); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/arm/mach-at91/include/mach/at91_mci.h b/arch/arm/mach-at91/include/mach/at91_mci.h index 57f8ee154943..27ac6f550fe3 100644 --- a/arch/arm/mach-at91/include/mach/at91_mci.h +++ b/arch/arm/mach-at91/include/mach/at91_mci.h @@ -74,6 +74,8 @@ #define AT91_MCI_TRTYP_BLOCK (0 << 19) #define AT91_MCI_TRTYP_MULTIPLE (1 << 19) #define AT91_MCI_TRTYP_STREAM (2 << 19) +#define AT91_MCI_TRTYP_SDIO_BYTE (4 << 19) +#define AT91_MCI_TRTYP_SDIO_BLOCK (5 << 19) #define AT91_MCI_BLKR 0x18 /* Block Register */ #define AT91_MCI_BLKR_BCNT(n) ((0xffff & (n)) << 0) /* Block count */ diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index 24498a932ba6..a54b3db80366 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -513,4 +513,4 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) EXPORT_SYMBOL(ixp4xx_pci_read); EXPORT_SYMBOL(ixp4xx_pci_write); - +EXPORT_SYMBOL(dma_set_coherent_mask); diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig index dd235ecc9d6c..c93e73d54dd1 100644 --- a/arch/arm/mach-pxa/Kconfig +++ b/arch/arm/mach-pxa/Kconfig @@ -540,6 +540,7 @@ config MACH_ICONTROL config ARCH_PXA_ESERIES bool "PXA based Toshiba e-series PDAs" select PXA25x + select FB_W100 config MACH_E330 bool "Toshiba e330" diff --git a/arch/arm/mach-pxa/sleep.S b/arch/arm/mach-pxa/sleep.S index 52c30b01a671..ae008110db4e 100644 --- a/arch/arm/mach-pxa/sleep.S +++ b/arch/arm/mach-pxa/sleep.S @@ -353,8 +353,8 @@ resume_turn_on_mmu: @ Let us ensure we jump to resume_after_mmu only when the mcr above @ actually took effect. They call it the "cpwait" operation. - mrc p15, 0, r1, c2, c0, 0 @ queue a dependency on CP15 - sub pc, r2, r1, lsr #32 @ jump to virtual addr + mrc p15, 0, r0, c2, c0, 0 @ queue a dependency on CP15 + sub pc, r2, r0, lsr #32 @ jump to virtual addr nop nop nop diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c index 6e77c042d8e9..e0b0e7a4ec68 100644 --- a/arch/arm/mm/cache-feroceon-l2.c +++ b/arch/arm/mm/cache-feroceon-l2.c @@ -13,13 +13,9 @@ */ #include <linux/init.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> -#include <asm/kmap_types.h> -#include <asm/fixmap.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> #include <plat/cache-feroceon-l2.h> -#include "mm.h" /* * Low-level cache maintenance operations. @@ -39,27 +35,30 @@ * between which we don't want to be preempted. */ -static inline unsigned long l2_start_va(unsigned long paddr) +static inline unsigned long l2_get_va(unsigned long paddr) { #ifdef CONFIG_HIGHMEM /* - * Let's do our own fixmap stuff in a minimal way here. * Because range ops can't be done on physical addresses, * we simply install a virtual mapping for it only for the * TLB lookup to occur, hence no need to flush the untouched - * memory mapping. This is protected with the disabling of - * interrupts by the caller. + * memory mapping afterwards (note: a cache flush may happen + * in some circumstances depending on the path taken in kunmap_atomic). */ - unsigned long idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id(); - unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte_ext(TOP_PTE(vaddr), pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL), 0); - local_flush_tlb_kernel_page(vaddr); - return vaddr + (paddr & ~PAGE_MASK); + void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT); + return (unsigned long)vaddr + (paddr & ~PAGE_MASK); #else return __phys_to_virt(paddr); #endif } +static inline void l2_put_va(unsigned long vaddr) +{ +#ifdef CONFIG_HIGHMEM + kunmap_atomic((void *)vaddr); +#endif +} + static inline void l2_clean_pa(unsigned long addr) { __asm__("mcr p15, 1, %0, c15, c9, 3" : : "r" (addr)); @@ -76,13 +75,14 @@ static inline void l2_clean_pa_range(unsigned long start, unsigned long end) */ BUG_ON((start ^ end) >> PAGE_SHIFT); - raw_local_irq_save(flags); - va_start = l2_start_va(start); + va_start = l2_get_va(start); va_end = va_start + (end - start); + raw_local_irq_save(flags); __asm__("mcr p15, 1, %0, c15, c9, 4\n\t" "mcr p15, 1, %1, c15, c9, 5" : : "r" (va_start), "r" (va_end)); raw_local_irq_restore(flags); + l2_put_va(va_start); } static inline void l2_clean_inv_pa(unsigned long addr) @@ -106,13 +106,14 @@ static inline void l2_inv_pa_range(unsigned long start, unsigned long end) */ BUG_ON((start ^ end) >> PAGE_SHIFT); - raw_local_irq_save(flags); - va_start = l2_start_va(start); + va_start = l2_get_va(start); va_end = va_start + (end - start); + raw_local_irq_save(flags); __asm__("mcr p15, 1, %0, c15, c11, 4\n\t" "mcr p15, 1, %1, c15, c11, 5" : : "r" (va_start), "r" (va_end)); raw_local_irq_restore(flags); + l2_put_va(va_start); } static inline void l2_inv_all(void) diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c index c3154928bccd..5a32020471e3 100644 --- a/arch/arm/mm/cache-xsc3l2.c +++ b/arch/arm/mm/cache-xsc3l2.c @@ -17,14 +17,10 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/init.h> +#include <linux/highmem.h> #include <asm/system.h> #include <asm/cputype.h> #include <asm/cacheflush.h> -#include <asm/kmap_types.h> -#include <asm/fixmap.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include "mm.h" #define CR_L2 (1 << 26) @@ -71,16 +67,15 @@ static inline void xsc3_l2_inv_all(void) dsb(); } +static inline void l2_unmap_va(unsigned long va) +{ #ifdef CONFIG_HIGHMEM -#define l2_map_save_flags(x) raw_local_save_flags(x) -#define l2_map_restore_flags(x) raw_local_irq_restore(x) -#else -#define l2_map_save_flags(x) ((x) = 0) -#define l2_map_restore_flags(x) ((void)(x)) + if (va != -1) + kunmap_atomic((void *)va); #endif +} -static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va, - unsigned long flags) +static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va) { #ifdef CONFIG_HIGHMEM unsigned long va = prev_va & PAGE_MASK; @@ -89,17 +84,10 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va, /* * Switching to a new page. Because cache ops are * using virtual addresses only, we must put a mapping - * in place for it. We also enable interrupts for a - * short while and disable them again to protect this - * mapping. + * in place for it. */ - unsigned long idx; - raw_local_irq_restore(flags); - idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id(); - va = __fix_to_virt(FIX_KMAP_BEGIN + idx); - raw_local_irq_restore(flags | PSR_I_BIT); - set_pte_ext(TOP_PTE(va), pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL), 0); - local_flush_tlb_kernel_page(va); + l2_unmap_va(prev_va); + va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT); } return va + (pa_offset >> (32 - PAGE_SHIFT)); #else @@ -109,7 +97,7 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va, static void xsc3_l2_inv_range(unsigned long start, unsigned long end) { - unsigned long vaddr, flags; + unsigned long vaddr; if (start == 0 && end == -1ul) { xsc3_l2_inv_all(); @@ -117,13 +105,12 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end) } vaddr = -1; /* to force the first mapping */ - l2_map_save_flags(flags); /* * Clean and invalidate partial first cache line. */ if (start & (CACHE_LINE_SIZE - 1)) { - vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr, flags); + vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr); xsc3_l2_clean_mva(vaddr); xsc3_l2_inv_mva(vaddr); start = (start | (CACHE_LINE_SIZE - 1)) + 1; @@ -133,7 +120,7 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end) * Invalidate all full cache lines between 'start' and 'end'. */ while (start < (end & ~(CACHE_LINE_SIZE - 1))) { - vaddr = l2_map_va(start, vaddr, flags); + vaddr = l2_map_va(start, vaddr); xsc3_l2_inv_mva(vaddr); start += CACHE_LINE_SIZE; } @@ -142,31 +129,30 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end) * Clean and invalidate partial last cache line. */ if (start < end) { - vaddr = l2_map_va(start, vaddr, flags); + vaddr = l2_map_va(start, vaddr); xsc3_l2_clean_mva(vaddr); xsc3_l2_inv_mva(vaddr); } - l2_map_restore_flags(flags); + l2_unmap_va(vaddr); dsb(); } static void xsc3_l2_clean_range(unsigned long start, unsigned long end) { - unsigned long vaddr, flags; + unsigned long vaddr; vaddr = -1; /* to force the first mapping */ - l2_map_save_flags(flags); start &= ~(CACHE_LINE_SIZE - 1); while (start < end) { - vaddr = l2_map_va(start, vaddr, flags); + vaddr = l2_map_va(start, vaddr); xsc3_l2_clean_mva(vaddr); start += CACHE_LINE_SIZE; } - l2_map_restore_flags(flags); + l2_unmap_va(vaddr); dsb(); } @@ -193,7 +179,7 @@ static inline void xsc3_l2_flush_all(void) static void xsc3_l2_flush_range(unsigned long start, unsigned long end) { - unsigned long vaddr, flags; + unsigned long vaddr; if (start == 0 && end == -1ul) { xsc3_l2_flush_all(); @@ -201,17 +187,16 @@ static void xsc3_l2_flush_range(unsigned long start, unsigned long end) } vaddr = -1; /* to force the first mapping */ - l2_map_save_flags(flags); start &= ~(CACHE_LINE_SIZE - 1); while (start < end) { - vaddr = l2_map_va(start, vaddr, flags); + vaddr = l2_map_va(start, vaddr); xsc3_l2_clean_mva(vaddr); xsc3_l2_inv_mva(vaddr); start += CACHE_LINE_SIZE; } - l2_map_restore_flags(flags); + l2_unmap_va(vaddr); dsb(); } diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ac6a36142fcd..809f1bf9fa29 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/dma-mapping.h> +#include <linux/highmem.h> #include <asm/memory.h> #include <asm/highmem.h> @@ -480,10 +481,10 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset, op(vaddr, len, dir); kunmap_high(page); } else if (cache_is_vipt()) { - pte_t saved_pte; - vaddr = kmap_high_l1_vipt(page, &saved_pte); + /* unmapped pages might still be cached */ + vaddr = kmap_atomic(page); op(vaddr + offset, len, dir); - kunmap_high_l1_vipt(page, saved_pte); + kunmap_atomic(vaddr); } } else { vaddr = page_address(page) + offset; diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 391ffae75098..c29f2839f1d2 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/cachetype.h> @@ -180,10 +181,10 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) __cpuc_flush_dcache_area(addr, PAGE_SIZE); kunmap_high(page); } else if (cache_is_vipt()) { - pte_t saved_pte; - addr = kmap_high_l1_vipt(page, &saved_pte); + /* unmapped pages might still be cached */ + addr = kmap_atomic(page); __cpuc_flush_dcache_area(addr, PAGE_SIZE); - kunmap_high_l1_vipt(page, saved_pte); + kunmap_atomic(addr); } } diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index c435fd9e1da9..807c0573abbe 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -140,90 +140,3 @@ struct page *kmap_atomic_to_page(const void *ptr) pte = TOP_PTE(vaddr); return pte_page(*pte); } - -#ifdef CONFIG_CPU_CACHE_VIPT - -#include <linux/percpu.h> - -/* - * The VIVT cache of a highmem page is always flushed before the page - * is unmapped. Hence unmapped highmem pages need no cache maintenance - * in that case. - * - * However unmapped pages may still be cached with a VIPT cache, and - * it is not possible to perform cache maintenance on them using physical - * addresses unfortunately. So we have no choice but to set up a temporary - * virtual mapping for that purpose. - * - * Yet this VIPT cache maintenance may be triggered from DMA support - * functions which are possibly called from interrupt context. As we don't - * want to keep interrupt disabled all the time when such maintenance is - * taking place, we therefore allow for some reentrancy by preserving and - * restoring the previous fixmap entry before the interrupted context is - * resumed. If the reentrancy depth is 0 then there is no need to restore - * the previous fixmap, and leaving the current one in place allow it to - * be reused the next time without a TLB flush (common with DMA). - */ - -static DEFINE_PER_CPU(int, kmap_high_l1_vipt_depth); - -void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte) -{ - unsigned int idx, cpu; - int *depth; - unsigned long vaddr, flags; - pte_t pte, *ptep; - - if (!in_interrupt()) - preempt_disable(); - - cpu = smp_processor_id(); - depth = &per_cpu(kmap_high_l1_vipt_depth, cpu); - - idx = KM_L1_CACHE + KM_TYPE_NR * cpu; - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - ptep = TOP_PTE(vaddr); - pte = mk_pte(page, kmap_prot); - - raw_local_irq_save(flags); - (*depth)++; - if (pte_val(*ptep) == pte_val(pte)) { - *saved_pte = pte; - } else { - *saved_pte = *ptep; - set_pte_ext(ptep, pte, 0); - local_flush_tlb_kernel_page(vaddr); - } - raw_local_irq_restore(flags); - - return (void *)vaddr; -} - -void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte) -{ - unsigned int idx, cpu = smp_processor_id(); - int *depth = &per_cpu(kmap_high_l1_vipt_depth, cpu); - unsigned long vaddr, flags; - pte_t pte, *ptep; - - idx = KM_L1_CACHE + KM_TYPE_NR * cpu; - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - ptep = TOP_PTE(vaddr); - pte = mk_pte(page, kmap_prot); - - BUG_ON(pte_val(*ptep) != pte_val(pte)); - BUG_ON(*depth <= 0); - - raw_local_irq_save(flags); - (*depth)--; - if (*depth != 0 && pte_val(pte) != pte_val(saved_pte)) { - set_pte_ext(ptep, saved_pte, 0); - local_flush_tlb_kernel_page(vaddr); - } - raw_local_irq_restore(flags); - - if (!in_interrupt()) - preempt_enable(); -} - -#endif /* CONFIG_CPU_CACHE_VIPT */ diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 5c7c6fc07565..183e0d226669 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -1047,6 +1047,6 @@ init_hw_perf_events(void) return 0; } -arch_initcall(init_hw_perf_events); +early_initcall(init_hw_perf_events); #endif /* defined(CONFIG_CPU_MIPS32)... */ diff --git a/arch/mn10300/kernel/irq.c b/arch/mn10300/kernel/irq.c index c2e44597c22b..ac11754ecec5 100644 --- a/arch/mn10300/kernel/irq.c +++ b/arch/mn10300/kernel/irq.c @@ -459,7 +459,7 @@ void migrate_irqs(void) tmp = CROSS_GxICR(irq, new); x &= GxICR_LEVEL | GxICR_ENABLE; - if (GxICR(irq) & GxICR_REQUEST) { + if (GxICR(irq) & GxICR_REQUEST) x |= GxICR_REQUEST | GxICR_DETECT; CROSS_GxICR(irq, new) = x; tmp = CROSS_GxICR(irq, new); diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c index 7c07de0d8943..b150b510510f 100644 --- a/arch/powerpc/kernel/e500-pmu.c +++ b/arch/powerpc/kernel/e500-pmu.c @@ -126,4 +126,4 @@ static int init_e500_pmu(void) return register_fsl_emb_pmu(&e500_pmu); } -arch_initcall(init_e500_pmu); +early_initcall(init_e500_pmu); diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c index 09d72028f317..2cc5e0301d0b 100644 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ b/arch/powerpc/kernel/mpc7450-pmu.c @@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void) return register_power_pmu(&mpc7450_pmu); } -arch_initcall(init_mpc7450_pmu); +early_initcall(init_mpc7450_pmu); diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 3129c855933c..567480705789 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1379,7 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu) freeze_events_kernel = MMCR0_FCHV; #endif /* CONFIG_PPC64 */ - perf_pmu_register(&power_pmu); + perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(power_pmu_notifier); return 0; diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index 7ecca59ddf77..4dcf5f831e9d 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -681,7 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) pr_info("%s performance monitor hardware support registered\n", pmu->name); - perf_pmu_register(&fsl_emb_pmu); + perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW); return 0; } diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 2a361cdda635..ead8b3c2649e 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -613,4 +613,4 @@ static int init_power4_pmu(void) return register_power_pmu(&power4_pmu); } -arch_initcall(init_power4_pmu); +early_initcall(init_power4_pmu); diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 199de527d411..eca0ac595cb6 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -682,4 +682,4 @@ static int init_power5p_pmu(void) return register_power_pmu(&power5p_pmu); } -arch_initcall(init_power5p_pmu); +early_initcall(init_power5p_pmu); diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 98b6a729a9dd..d5ff0f64a5e6 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -621,4 +621,4 @@ static int init_power5_pmu(void) return register_power_pmu(&power5_pmu); } -arch_initcall(init_power5_pmu); +early_initcall(init_power5_pmu); diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 84a607bda8fb..31603927e376 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -544,4 +544,4 @@ static int init_power6_pmu(void) return register_power_pmu(&power6_pmu); } -arch_initcall(init_power6_pmu); +early_initcall(init_power6_pmu); diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 852f7b7f6b40..593740fcb799 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -369,4 +369,4 @@ static int init_power7_pmu(void) return register_power_pmu(&power7_pmu); } -arch_initcall(init_power7_pmu); +early_initcall(init_power7_pmu); diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 3fee685de4df..9a6e093858fe 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -494,4 +494,4 @@ static int init_ppc970_pmu(void) return register_power_pmu(&ppc970_pmu); } -arch_initcall(init_ppc970_pmu); +early_initcall(init_ppc970_pmu); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e0b98e71ff47..6c6d7b339aae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -99,6 +99,7 @@ config S390 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_GET_USER_PAGES_FAST + select HAVE_ARCH_MUTEX_CPU_RELAX select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h index 458c1f7fbc18..688271f5f2e4 100644 --- a/arch/s390/include/asm/mutex.h +++ b/arch/s390/include/asm/mutex.h @@ -7,3 +7,5 @@ */ #include <asm-generic/mutex-dec.h> + +#define arch_mutex_cpu_relax() barrier() diff --git a/arch/sh/boards/mach-se/7206/irq.c b/arch/sh/boards/mach-se/7206/irq.c index d961949600fd..9070d7e60704 100644 --- a/arch/sh/boards/mach-se/7206/irq.c +++ b/arch/sh/boards/mach-se/7206/irq.c @@ -140,7 +140,7 @@ void __init init_se7206_IRQ(void) make_se7206_irq(IRQ1_IRQ); /* ATA */ make_se7206_irq(IRQ3_IRQ); /* SLOT / PCM */ - __raw_writew(__raw_readw(INTC_ICR1) | 0x000b, INTC_ICR); /* ICR1 */ + __raw_writew(__raw_readw(INTC_ICR1) | 0x000b, INTC_ICR1); /* ICR1 */ /* FPGA System register setup*/ __raw_writew(0x0000,INTSTS0); /* Clear INTSTS0 */ diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c index b26264dc2aef..c509c40cba4b 100644 --- a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c @@ -34,7 +34,7 @@ static const int pfc_divisors[]={1,2,3,4,6,8,12}; static void master_clk_init(struct clk *clk) { - return 10000000 * PLL2 * pll1rate[(__raw_readw(FREQCR) >> 8) & 0x0007]; + clk->rate = 10000000 * PLL2 * pll1rate[(__raw_readw(FREQCR) >> 8) & 0x0007]; } static struct clk_ops sh7201_master_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index b601fa3978d1..6282a839e08e 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -81,8 +81,7 @@ static void shoc_clk_init(struct clk *clk) for (i = 0; i < ARRAY_SIZE(frqcr3_divisors); i++) { int divisor = frqcr3_divisors[i]; - if (clk->ops->set_rate(clk, clk->parent->rate / - divisor, 0) == 0) + if (clk->ops->set_rate(clk, clk->parent->rate / divisor) == 0) break; } diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c index dbf3b4bb71fe..748955df018d 100644 --- a/arch/sh/kernel/cpu/sh4/perf_event.c +++ b/arch/sh/kernel/cpu/sh4/perf_event.c @@ -250,4 +250,4 @@ static int __init sh7750_pmu_init(void) return register_sh_pmu(&sh7750_pmu); } -arch_initcall(sh7750_pmu_init); +early_initcall(sh7750_pmu_init); diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c index 580276525731..17e6bebfede0 100644 --- a/arch/sh/kernel/cpu/sh4a/perf_event.c +++ b/arch/sh/kernel/cpu/sh4a/perf_event.c @@ -284,4 +284,4 @@ static int __init sh4a_pmu_init(void) return register_sh_pmu(&sh4a_pmu); } -arch_initcall(sh4a_pmu_init); +early_initcall(sh4a_pmu_init); diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 5a4b33435650..2ee21a47b5af 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -389,7 +389,7 @@ int __cpuinit register_sh_pmu(struct sh_pmu *_pmu) WARN_ON(_pmu->num_events > MAX_HWEVENTS); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(sh_pmu_notifier); return 0; } diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 6e8bfa1786da..4d3dbe3703e9 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -4,8 +4,6 @@ #ifdef CONFIG_PERF_EVENTS #include <asm/ptrace.h> -extern void init_hw_perf_events(void); - #define perf_arch_fetch_caller_regs(regs, ip) \ do { \ unsigned long _pstate, _asi, _pil, _i7, _fp; \ @@ -26,8 +24,6 @@ do { \ (regs)->u_regs[UREG_I6] = _fp; \ (regs)->u_regs[UREG_I7] = _i7; \ } while (0) -#else -static inline void init_hw_perf_events(void) { } #endif #endif diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index a4bd7ba74c89..300f810142f5 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -270,8 +270,6 @@ int __init nmi_init(void) atomic_set(&nmi_active, -1); } } - if (!err) - init_hw_perf_events(); return err; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0d6deb55a2ae..760578687e7c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1307,20 +1307,23 @@ static bool __init supported_pmu(void) return false; } -void __init init_hw_perf_events(void) +int __init init_hw_perf_events(void) { pr_info("Performance events: "); if (!supported_pmu()) { pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); - return; + return 0; } pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); register_die_notifier(&perf_event_nmi_notifier); + + return 0; } +early_initcall(init_hw_perf_events); void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e330da21b84f..b6fccb07123e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -377,6 +377,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_INTEL_CE + bool "CE4100 TV platform" + depends on PCI + depends on PCI_GODIRECT + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + select X86_REBOOTFIXUPS + ---help--- + Select for the Intel CE media processor (CE4100) SOC. + This option compiles in support for the CE4100 SOC for settop + boxes and media devices. + config X86_MRST bool "Moorestown MID platform" depends on PCI @@ -385,6 +397,10 @@ config X86_MRST depends on X86_EXTENDED_PLATFORM depends on X86_IO_APIC select APB_TIMER + select I2C + select SPI + select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -466,6 +482,19 @@ config X86_ES7000 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is supposed to run on an IA32-based Unisys ES7000 system. +config X86_32_IRIS + tristate "Eurobraille/Iris poweroff module" + depends on X86_32 + ---help--- + The Iris machines from EuroBraille do not have APM or ACPI support + to shut themselves down properly. A special I/O sequence is + needed to do so, which is what this module does at + kernel shutdown. + + This is only for Iris machines from EuroBraille. + + If unused, say N. + config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" @@ -1141,16 +1170,16 @@ config NUMA comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config K8_NUMA +config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" depends on X86_64 && NUMA && PCI ---help--- - Enable K8 NUMA node topology detection. You should say Y here if - you have a multi processor AMD K8 system. This uses an old - method to read the NUMA configuration directly from the builtin - Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA - instead, which also takes priority if both are compiled in. + Enable AMD NUMA node topology detection. You should say Y here if + you have a multi processor AMD system. This uses an old method to + read the NUMA configuration directly from the builtin Northbridge + of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, + which also takes priority if both are compiled in. config X86_64_ACPI_NUMA def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b59ee765414e..45143bbcfe5e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_SET_MODULE_RONX + bool "Set loadable kernel module data as NX and text as RO" + depends on MODULES + ---help--- + This option helps catch unintended modifications to loadable + kernel module's text and read-only data. It also prevents execution + of module data. Such protection may interfere with run-time code + patching and dynamic kernel tracing - and they might also protect + against certain classes of kernel exploits. + If in doubt, say "N". + config DEBUG_NX_TEST tristate "Testcase for the NX non-executable stack feature" depends on DEBUG_KERNEL && m diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 52f85a196fa0..35af09d13dc1 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -182,7 +182,7 @@ no_longmode: hlt jmp 1b -#include "../../kernel/verify_cpu_64.S" +#include "../../kernel/verify_cpu.S" /* * Be careful here startup_64 needs to be at a predictable diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 76561d20ea2f..13009d1af99a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); extern int alternatives_text_reserved(void *start, void *end); +extern bool skip_smp_alternatives; #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ +struct text_poke_param { + void *addr; + const void *opcode; + size_t len; +}; + extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); +extern void text_poke_smp_batch(struct text_poke_param *params, int n); #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c8517f81b21e..6aee50d655d1 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,36 +3,53 @@ #include <linux/pci.h> -extern struct pci_device_id k8_nb_ids[]; +extern struct pci_device_id amd_nb_misc_ids[]; struct bootnode; -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); +extern int early_is_amd_nb(u32 value); +extern int amd_cache_northbridges(void); +extern void amd_flush_garts(void); +extern int amd_get_nodes(struct bootnode *nodes); +extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int amd_scan_nodes(void); -struct k8_northbridge_info { +struct amd_northbridge { + struct pci_dev *misc; +}; + +struct amd_northbridge_info { u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; + u64 flags; + struct amd_northbridge *nb; }; -extern struct k8_northbridge_info k8_northbridges; +extern struct amd_northbridge_info amd_northbridges; + +#define AMD_NB_GART 0x1 +#define AMD_NB_L3_INDEX_DISABLE 0x2 #ifdef CONFIG_AMD_NB -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline int amd_nb_num(void) { - return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; + return amd_northbridges.num; } -#else +static inline int amd_nb_has_feature(int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline struct amd_northbridge *node_to_amd_nb(int node) { - return NULL; + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +#define node_to_amd_nb(x) NULL + #endif diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f6ce0bda3b98..cf12007796db 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern void enable_NMI_through_LVT0(void); +extern int apic_force_enable(void); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index a859ca461fb0..47a30ff8e517 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -145,6 +145,7 @@ #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 +# define MAX_LOCAL_APIC 256 #else # define MAX_IO_APICS 128 # define MAX_LOCAL_APIC 32768 diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 8e6218550e77..c8bfe63a06de 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -124,6 +124,7 @@ enum { X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, X86_SUBARCH_MRST, + X86_SUBARCH_CE4100, X86_NR_SUBARCHS, }; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 9479a037419f..0141b234406f 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,6 +117,10 @@ enum fixed_addresses { FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, + +#ifdef CONFIG_X86_MRST + FIX_LNW_VRTC, +#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 4aa2bb3b242a..ef328901c802 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) int err; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif return err; } @@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) return -EFAULT; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) : [fx] "R" (fx), "0" (0)); +#endif if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a6b28d017c2f..0c5ca4e30d7b 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -159,7 +159,7 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); -extern void ioapic_init_mappings(void); +extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); @@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void probe_nr_irqs_gsi(void); extern int get_nr_irqs_gsi(void); - extern void setup_ioapic_ids_from_mpc(void); +extern void setup_ioapic_ids_from_mpc_nocheck(void); struct mp_ioapic_gsi{ u32 gsi_base; @@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void); #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_and_gsi_init(void) { } static inline void ioapic_insert_resources(void) { } -static inline void probe_nr_irqs_gsi(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 13b0ebaa512f..ba870bb6dd8e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -15,10 +15,6 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC -# define ARCH_HAS_NMI_WATCHDOG -#endif - #ifdef CONFIG_X86_32 extern void irq_ctx_init(int cpu); #else diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 5bdfca86581b..f23eb2528464 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); + unsigned long *sp); extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c62c13cb9788..eb16e94ae04f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); +/* Interrupt Handler for core thermal thresholds */ +extern int (*platform_thermal_notify)(__u64 msr_val); + #ifdef CONFIG_X86_THERMAL_VECTOR extern void mcheck_intel_therm_init(void); #else diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b501e22a..24215072d0e1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); + +static inline void get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); +} + #else static inline struct microcode_ops * __init init_amd_microcode(void) { diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c82868e9f905..0c90dd9f0505 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -5,8 +5,9 @@ #include <asm/mpspec_def.h> #include <asm/x86_init.h> +#include <asm/apicdef.h> -extern int apic_version[MAX_APICS]; +extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) +#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) struct physid_mask { unsigned long mask[PHYSID_ARRAY_SIZE]; @@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t; test_and_set_bit(physid, (map).mask) #define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_clear(map) \ - bitmap_zero((map).mask, MAX_APICS) + bitmap_zero((map).mask, MAX_LOCAL_APIC) #define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_APICS) + bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) #define physids_empty(map) \ - bitmap_empty((map).mask, MAX_APICS) + bitmap_empty((map).mask, MAX_LOCAL_APIC) #define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_APICS) + bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) #define physids_weight(map) \ - bitmap_weight((map).mask, MAX_APICS) + bitmap_weight((map).mask, MAX_LOCAL_APIC) #define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) #define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) static inline unsigned long physids_coerce(physid_mask_t *map) { @@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map) map->mask[0] = physids; } -/* Note: will create very large stack frames if physid_mask_t is big */ -#define physid_mask_of_physid(physid) \ - ({ \ - physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ - physid_set(physid, __physid_mask); \ - __physid_mask; \ - }) - static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) { physids_clear(*map); diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index 4a7f96d7c188..c0a955a9a087 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -15,13 +15,6 @@ #ifdef CONFIG_X86_32 # define MAX_MPC_ENTRY 1024 -# define MAX_APICS 256 -#else -# if NR_CPUS <= 255 -# define MAX_APICS 255 -# else -# define MAX_APICS 32768 -# endif #endif /* Intel MP Floating Pointer Structure */ diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h new file mode 100644 index 000000000000..73668abdbedf --- /dev/null +++ b/arch/x86/include/asm/mrst-vrtc.h @@ -0,0 +1,9 @@ +#ifndef _MRST_VRTC_H +#define _MRST_VRTC_H + +extern unsigned char vrtc_cmos_read(unsigned char reg); +extern void vrtc_cmos_write(unsigned char val, unsigned char reg); +extern unsigned long vrtc_get_time(void); +extern int vrtc_set_mmss(unsigned long nowtime); + +#endif diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 4a711a684b17..719f00b28ff5 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -14,7 +14,9 @@ #include <linux/sfi.h> extern int pci_mrst_init(void); -int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int sfi_mrtc_num; +extern struct sfi_rtc_table_entry sfi_mrtc_array[]; /* * Medfield is the follow-up of Moorestown, it combines two chip solution into @@ -50,4 +52,14 @@ extern void mrst_early_console_init(void); extern struct console early_hsu_console; extern void hsu_early_console_init(void); + +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + +/* VRTC timer */ +#define MRST_VRTC_MAP_SZ (1024) +/*#define MRST_VRTC_PGOFFSET (0xc00) */ + +extern void mrst_rtc_init(void); + #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6b89f5e86021..4d0dfa0d998e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -123,6 +123,10 @@ #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +/* Fam 15h MSRs */ +#define MSR_F15H_PERF_CTL 0xc0010200 +#define MSR_F15H_PERF_CTR 0xc0010201 + /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 #define FAM10H_MMIO_CONF_ENABLE (1<<0) @@ -253,6 +257,18 @@ #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) +/* Thermal Thresholds Support */ +#define THERM_INT_THRESHOLD0_ENABLE (1 << 15) +#define THERM_SHIFT_THRESHOLD0 8 +#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0) +#define THERM_INT_THRESHOLD1_ENABLE (1 << 23) +#define THERM_SHIFT_THRESHOLD1 16 +#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1) +#define THERM_STATUS_THRESHOLD0 (1 << 6) +#define THERM_LOG_THRESHOLD0 (1 << 7) +#define THERM_STATUS_THRESHOLD1 (1 << 8) +#define THERM_LOG_THRESHOLD1 (1 << 9) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 932f0f86b4b7..c4021b953510 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -5,41 +5,15 @@ #include <asm/irq.h> #include <asm/io.h> -#ifdef ARCH_HAS_NMI_WATCHDOG - -/** - * do_nmi_callback - * - * Check to see if a callback exists and execute it. Return 1 - * if the handler exists and was handled successfully. - */ -int do_nmi_callback(struct pt_regs *regs, int cpu); +#ifdef CONFIG_X86_LOCAL_APIC extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); -extern int check_nmi_watchdog(void); -#if !defined(CONFIG_LOCKUP_DETECTOR) -extern int nmi_watchdog_enabled; -#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern void setup_apic_nmi_watchdog(void *); -extern void stop_apic_nmi_watchdog(void *); -extern void disable_timer_nmi_watchdog(void); -extern void enable_timer_nmi_watchdog(void); -extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); -extern void cpu_nmi_set_wd_enabled(void); - -extern atomic_t nmi_active; -extern unsigned int nmi_watchdog; -#define NMI_NONE 0 -#define NMI_IO_APIC 1 -#define NMI_LOCAL_APIC 2 -#define NMI_INVALID 3 - struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); @@ -47,33 +21,8 @@ extern int unknown_nmi_panic; void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace - -static inline void localise_nmi_watchdog(void) -{ - if (nmi_watchdog == NMI_IO_APIC) - nmi_watchdog = NMI_LOCAL_APIC; -} - -/* check if nmi_watchdog is active (ie was specified at boot) */ -static inline int nmi_watchdog_active(void) -{ - /* - * actually it should be: - * return (nmi_watchdog == NMI_LOCAL_APIC || - * nmi_watchdog == NMI_IO_APIC) - * but since they are power of two we could use a - * cheaper way --cvg - */ - return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); -} #endif -void lapic_watchdog_stop(void); -int lapic_watchdog_init(unsigned nmi_hz); -int lapic_wd_event(unsigned nmi_hz); -unsigned lapic_adjust_nmi_hz(unsigned hz); -void disable_lapic_nmi_watchdog(void); -void enable_lapic_nmi_watchdog(void); void stop_nmi(void); void restart_nmi(void); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ef9975812c77..7709c12431b8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -112,7 +112,7 @@ static inline void arch_safe_halt(void) static inline void halt(void) { - PVOP_VCALL0(pv_irq_ops.safe_halt); + PVOP_VCALL0(pv_irq_ops.halt); } static inline void wbinvd(void) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ca0437c714b2..676129229630 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -65,6 +65,7 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 +extern int pcibios_enabled; void pcibios_config_init(void); struct pci_bus *pcibios_scan_root(int bus); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 550e26b1dbb3..d9d4dae305f6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -125,7 +125,6 @@ union cpuid10_edx { #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #ifdef CONFIG_PERF_EVENTS -extern void init_hw_perf_events(void); extern void perf_events_lapic_init(void); #define PERF_EVENT_INDEX_OFFSET 0 @@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); } #else -static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index a70cd216be5d..295e2ff18a6a 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS { }; /* - * P4 PEBS specifics (Replay Event only) - * - * Format (bits): - * 0-6: metric from P4_PEBS_METRIC enum - * 7 : reserved - * 8 : reserved - * 9-11 : reserved - * * Note we have UOP and PEBS bits reserved for now * just in case if we will need them once */ @@ -788,5 +780,60 @@ enum P4_PEBS_METRIC { P4_PEBS_METRIC__max }; +/* + * Notes on internal configuration of ESCR+CCCR tuples + * + * Since P4 has quite the different architecture of + * performance registers in compare with "architectural" + * once and we have on 64 bits to keep configuration + * of performance event, the following trick is used. + * + * 1) Since both ESCR and CCCR registers have only low + * 32 bits valuable, we pack them into a single 64 bit + * configuration. Low 32 bits of such config correspond + * to low 32 bits of CCCR register and high 32 bits + * correspond to low 32 bits of ESCR register. + * + * 2) The meaning of every bit of such config field can + * be found in Intel SDM but it should be noted that + * we "borrow" some reserved bits for own usage and + * clean them or set to a proper value when we do + * a real write to hardware registers. + * + * 3) The format of bits of config is the following + * and should be either 0 or set to some predefined + * values: + * + * Low 32 bits + * ----------- + * 0-6: P4_PEBS_METRIC enum + * 7-11: reserved + * 12: reserved (Enable) + * 13-15: reserved (ESCR select) + * 16-17: Active Thread + * 18: Compare + * 19: Complement + * 20-23: Threshold + * 24: Edge + * 25: reserved (FORCE_OVF) + * 26: reserved (OVF_PMI_T0) + * 27: reserved (OVF_PMI_T1) + * 28-29: reserved + * 30: reserved (Cascade) + * 31: reserved (OVF) + * + * High 32 bits + * ------------ + * 0: reserved (T1_USR) + * 1: reserved (T1_OS) + * 2: reserved (T0_USR) + * 3: reserved (T0_OS) + * 4: Tag Enable + * 5-8: Tag Value + * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper) + * 25-30: enum P4_EVENTS + * 31: reserved (HT thread) + */ + #endif /* PERF_EVENT_P4_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index d6763b139a84..db8aa19a08a2 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void); static inline void x86_mrst_early_setup(void) { } #endif +#ifdef CONFIG_X86_INTEL_CE +extern void x86_ce4100_early_setup(void); +#else +static inline void x86_ce4100_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 1def60114906..6c22bf353f26 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } #endif } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 2b16a2ad23dc..52b5c7ed3608 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -7,6 +7,7 @@ #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> +#include <linux/ptrace.h> extern int kstack_depth_to_print; @@ -46,7 +47,7 @@ struct stacktrace_ops { }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, + unsigned long *stack, const struct stacktrace_ops *ops, void *data); #ifdef CONFIG_X86_32 @@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#ifdef CONFIG_FRAME_POINTER +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + unsigned long bp; + + if (regs) + return regs->bp; + + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + return bp; + } + + /* bp is the last reg pushed by switch_to */ + return *(unsigned long *)task->thread.sp; +} +#else +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + return 0; +} +#endif + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); + unsigned long *stack, char *log_lvl); extern void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); + unsigned long *sp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 5469630b27f5..fa7b9176b76c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -10,12 +10,6 @@ unsigned long long native_sched_clock(void); extern int recalibrate_cpu_khz(void); -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -extern int timer_ack; -#else -# define timer_ack (0) -#endif - extern int no_timer_check; /* Accelerators for sched_clock() diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 42d412fd8b02..ce1d54c8a433 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -26,20 +26,22 @@ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. * - * We will use 31 sets, one for sending BAU messages from each of the 32 + * We will use one set for sending BAU messages from each of the * cpu's on the uvhub. * * TLB shootdown will use the first of the 8 descriptors of each set. * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ +#define MAX_CPUS_PER_UVHUB 64 +#define MAX_CPUS_PER_SOCKET 32 +#define UV_ADP_SIZE 64 /* hardware-provided max. */ +#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ #define UV_ITEMS_PER_DESCRIPTOR 8 /* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 -#define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 @@ -100,7 +102,6 @@ * number of destination side software ack resources */ #define DEST_NUM_RESOURCES 8 -#define MAX_CPUS_PER_NODE 32 /* * completion statuses for sending a TLB flush message */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1e994754d323..34244b2cd880 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 71232b941b6c..17c8090fabd4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; + if (id >= (MAX_LOCAL_APIC-1)) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return; + } + if (!enabled) { ++disabled_cpus; return; @@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_register_lapic_address(acpi_lapic_addr); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, - acpi_parse_sapic, MAX_APICS); + acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_APICS); + acpi_parse_x2apic, MAX_LOCAL_APIC); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_APICS); + acpi_parse_lapic, MAX_LOCAL_APIC); } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5079f24c955a..123608531c8f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) mutex_unlock(&smp_alt); } +bool skip_smp_alternatives; void alternatives_smp_switch(int smp) { struct smp_alt_module *mod; @@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp) printk("lockdep: fixing up alternatives.\n"); #endif - if (noreplace_smp || smp_alt_once) + if (noreplace_smp || smp_alt_once || skip_smp_alternatives) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -591,17 +592,21 @@ static atomic_t stop_machine_first; static int wrote_text; struct text_poke_params { - void *addr; - const void *opcode; - size_t len; + struct text_poke_param *params; + int nparams; }; static int __kprobes stop_machine_text_poke(void *data) { struct text_poke_params *tpp = data; + struct text_poke_param *p; + int i; if (atomic_dec_and_test(&stop_machine_first)) { - text_poke(tpp->addr, tpp->opcode, tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + text_poke(p->addr, p->opcode, p->len); + } smp_wmb(); /* Make sure other cpus see that this has run */ wrote_text = 1; } else { @@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data) smp_mb(); /* Load wrote_text before following execution */ } - flush_icache_range((unsigned long)tpp->addr, - (unsigned long)tpp->addr + tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + flush_icache_range((unsigned long)p->addr, + (unsigned long)p->addr + p->len); + } + return 0; } @@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data) void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) { struct text_poke_params tpp; + struct text_poke_param p; - tpp.addr = addr; - tpp.opcode = opcode; - tpp.len = len; + p.addr = addr; + p.opcode = opcode; + p.len = len; + tpp.params = &p; + tpp.nparams = 1; atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ @@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) return addr; } +/** + * text_poke_smp_batch - Update instructions on a live kernel on SMP + * @params: an array of text_poke parameters + * @n: the number of elements in params. + * + * Modify multi-byte instruction by using stop_machine() on SMP. Since the + * stop_machine() is heavy task, it is better to aggregate text_poke requests + * and do it once if possible. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +{ + struct text_poke_params tpp = {.params = params, .nparams = n}; + + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); +} + #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0d..affacb5e0065 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,116 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_k8_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + int i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc; - if (k8_northbridges.num) + if (amd_nb_num()) return 0; - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + if (i == 0) + return 0; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) return -ENOMEM; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } + amd_northbridges.nb = nb; + amd_northbridges.num = i; - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } + misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + } + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ -int __init early_is_k8_nb(u32 device) +int __init early_is_amd_nb(u32 device) { struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } -void k8_flush_garts(void) +int amd_cache_gart(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +130,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +150,23 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 92543c73cf8e..7c9ab59653e8 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (system_state == SYSTEM_BOOTING) { irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f0703..dcd7c83e1659 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. @@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -518,7 +518,7 @@ out: dev_base = bus_dev_ranges[i].dev_base; dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 910f20b457c4..3966b564ea47 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,7 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) -obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -endif -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o +obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 78218135b48e..879999a5230f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -31,7 +31,6 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/dmi.h> -#include <linux/nmi.h> #include <linux/smp.h> #include <linux/mm.h> @@ -432,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) reserved = reserve_eilvt_offset(offset, new); if (reserved != new) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " - "vector 0x%x was already reserved by another core, " - "APIC%lX=0x%x\n", - smp_processor_id(), new, reserved, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); return -EINVAL; } if (!eilvt_entry_is_changeable(old, new)) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " - "register already in use, APIC%lX=0x%x\n", - smp_processor_id(), new, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); return -EBUSY; } @@ -799,11 +799,7 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@ -1387,7 +1383,6 @@ void __cpuinit end_local_APIC_setup(void) } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); /* @@ -1538,13 +1533,60 @@ static int __init detect_init_APIC(void) return 0; } #else + +static int apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int apic_force_enable(void) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + return apic_verify(); +} + /* * Detect and initialize APIC */ static int __init detect_init_APIC(void) { - u32 h, l, features; - /* Disabled by kernel option? */ if (disable_apic) return -1; @@ -1574,38 +1616,12 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); - return -1; + if (apic_force_enable()) + return -1; + } else { + if (apic_verify()) + return -1; } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); @@ -1693,7 +1709,7 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { @@ -1758,17 +1774,10 @@ int __init APIC_init_uniprocessor(void) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 62f6e1e55b90..72ec29e1ae06 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,20 +17,31 @@ #include <linux/nmi.h> #include <linux/module.h> +#ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; } +#endif -#ifdef ARCH_HAS_NMI_WATCHDOG - +#ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; + void arch_trigger_all_cpu_backtrace(void) { int i; + if (test_and_set_bit(0, &backtrace_flag)) + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + return; + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); @@ -42,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void) break; mdelay(1); } + + clear_bit(0, &backtrace_flag); + smp_mb__after_clear_bit(); } static int __kprobes @@ -50,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int cpu = smp_processor_id(); + int cpu; switch (cmd) { case DIE_NMI: @@ -62,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, } regs = args->regs; + cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -91,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void) } early_initcall(register_trigger_all_cpu_backtrace); #endif - -/* STUB calls to mimic old nmi_watchdog behaviour */ -#if defined(CONFIG_X86_LOCAL_APIC) -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } -#endif -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); -int unknown_nmi_panic; -void cpu_nmi_set_wd_enabled(void) { return; } -void stop_apic_nmi_watchdog(void *unused) { return; } -void setup_apic_nmi_watchdog(void *unused) { return; } -int __init check_nmi_watchdog(void) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fadcd743a74f..f6cd5b410770 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -54,7 +54,6 @@ #include <asm/dma.h> #include <asm/timer.h> #include <asm/i8259.h> -#include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> #include <asm/setup.h> @@ -1934,8 +1933,7 @@ void disable_IO_APIC(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ - -void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -1944,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (acpi_ioapic) - return; - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -2006,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void) physids_or(phys_id_present_map, phys_id_present_map, tmp); } - /* * We need to adjust the IRQ routing table * if the ID changed. @@ -2042,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} #endif int no_timer_check __initdata; @@ -2642,24 +2645,6 @@ static void lapic_register_intr(int irq) "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. ICR does @@ -2765,15 +2750,6 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2821,10 +2797,6 @@ static inline void __init check_timer(void) unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@ -2850,11 +2822,6 @@ static inline void __init check_timer(void) if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@ -2866,15 +2833,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -3639,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries + 1; } -void __init probe_nr_irqs_gsi(void) +static void __init probe_nr_irqs_gsi(void) { int nr; @@ -3956,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_init_mappings(void) +void __init ioapic_and_gsi_init(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -3994,6 +3952,8 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } + + probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -4103,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void) printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); #endif /* Make sure the irq descriptor is set up */ cfg = alloc_irq_and_cfg_at(0, 0); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c deleted file mode 100644 index c90041ccb742..000000000000 --- a/arch/x86/kernel/apic/nmi.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <asm/apic.h> - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kernel_stat.h> -#include <linux/kdebug.h> -#include <linux/smp.h> - -#include <asm/i8259.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/timer.h> - -#include <asm/mce.h> - -#include <asm/mach_traps.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); - -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); - -static int panic_on_timeout; - -static unsigned int nmi_hz = HZ; -static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata; - -static inline unsigned int get_nmi_count(int cpu) -{ - return per_cpu(irq_stat, cpu).__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -#ifdef CONFIG_SMP -/* - * The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* - * Intentionally don't use cpu_relax here. This is - * to make sure that the performance counter really ticks, - * even if there is a simulator or similar that catches the - * pause instruction. On a real HT machine this is fine because - * all other CPUs are busy with "useless" delay loops and don't - * care if they get somewhat less cycles. - */ - while (endflag == 0) - mb(); -} -#endif - -static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) -{ - printk(KERN_CONT "\n"); - - printk(KERN_WARNING - "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); - - printk(KERN_WARNING - "Please report this to bugzilla.kernel.org,\n"); - printk(KERN_WARNING - "and attach the output of the 'dmesg' command.\n"); - - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) - report_broken_nmi(cpu, prev_nmi_count); - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* - * now that we know it works we can reduce NMI frequency to - * something more reasonable; makes a difference in some configs - */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - if (nmi_watchdog == NMI_IO_APIC) { - if (!timer_through_8259) - legacy_pic->mask(0); - on_each_cpu(__acpi_nmi_disable, NULL, 1); - } - -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - unsigned int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - if (!strncmp(str, "lapic", 5)) - nmi_watchdog = NMI_LOCAL_APIC; - else if (!strncmp(str, "ioapic", 6)) - nmi_watchdog = NMI_IO_APIC; - else { - get_option(&str, &nmi); - if (nmi >= NMI_INVALID) - return 0; - nmi_watchdog = nmi; - } - - return 1; -} -__setup("nmi_watchdog=", setup_nmi_watchdog); - -/* - * Suspend/resume support - */ -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* - * should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} - -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 1); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 1); -} - -/* - * This function is called as soon the LAPIC NMI watchdog driver has everything - * in place and it's ready to check if the NMIs belong to the NMI watchdog - */ -void cpu_nmi_set_wd_enabled(void) -{ - __get_cpu_var(wd_enabled) = 1; -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if (!nmi_watchdog_active()) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - else - __acpi_nmi_disable(NULL); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(long, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog_active()) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - - raw_spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - raw_spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - - rc = 1; - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(alert_counter, 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* - * don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static void enable_ioapic_nmi_watchdog_single(void *unused) -{ - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - __acpi_nmi_enable(NULL); -} - -static void enable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); - touch_nmi_watchdog(); -} - -static void disable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); -} - -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { - printk(KERN_WARNING - "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - if (nmi_watchdog_enabled) - enable_ioapic_nmi_watchdog(); - else - disable_ioapic_nmi_watchdog(); - } else { - printk(KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void arch_trigger_all_cpu_backtrace(void) -{ - int i; - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - } -} diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c1c52c341f40..2a3f2a7db243 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + return val; +} + static inline bool is_GRU_range(u64 start, u64 end) { return start >= gru_start_paddr && end <= gru_end_paddr; @@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int early_get_nodeid(void) +static int __init early_get_pnodeid(void) { union uvh_node_id_u node_id; - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); - node_id.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; - return node_id.s.node_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; } static void __init early_get_apic_pnode_shift(void) { - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); - uvh_apicid.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); if (!uvh_apicid.v) /* * Old bios, use default value @@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void) static void __init uv_set_apicid_hibit(void) { union uvh_lb_target_physical_apic_id_mask_u apicid_mask; - unsigned long *mmr; - mmr = early_ioremap(UV_LOCAL_MMR_BASE | - UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr)); - apicid_mask.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int nodeid; + int pnodeid; if (!strcmp(oem_id, "SGI")) { - nodeid = early_get_nodeid(); + pnodeid = early_get_pnodeid(); early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (uvh_apicid.s.pnode_shift - 1); + pnodeid << uvh_apicid.s.pnode_shift; uv_system_type = UV_NON_UNIQUE_APIC; uv_set_apicid_hibit(); return 1; @@ -682,27 +684,32 @@ void uv_nmi_init(void) void __init uv_system_init(void) { union uvh_rh_gam_config_mmr_u m_n_config; + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; + unsigned short pnode_mask, pnode_io_mask; map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + n_io = mmioh.s.n_io; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; pnode_mask = (1 << n_val) - 1; + pnode_io_mask = (1 << n_io) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", - n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", + n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); @@ -735,7 +742,7 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - pnode = (i * 64 + j); + pnode = (i * 64 + j) & pnode_mask; uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; @@ -756,6 +763,7 @@ void __init uv_system_init(void) /* * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); @@ -772,7 +780,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; @@ -796,7 +803,7 @@ void __init uv_system_init(void) map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode); + map_mmioh_high(max_pnode & pnode_io_mask); uv_cpu_init(); uv_scir_register_cpu_notifier(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda30938..1d59834396bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -894,7 +894,6 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad03366211..9ecf81f9b90f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -311,14 +310,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); #else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) -{ -}; +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int @@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -983,30 +944,48 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5c..5bf2fac52aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -31,8 +31,6 @@ #include <asm/mce.h> #include <asm/msr.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" #define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -59,12 +57,6 @@ struct threshold_block { struct list_head miscj; }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; @@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + /* must be called with correct cpu affinity */ /* Called via smp_call_function_single() */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (hi &= ~MASK_INT_TYPE_HI); - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - int lvt_off = -1; - u8 offset; + int offset = -1; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); + offset = setup_APIC_mce(offset, + (high & MASK_LVTOFF_HI) >> 20); - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca5..e12246ff5aa6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -53,8 +53,13 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level) return 0; } +static int thresh_event_valid(int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + return 1; +} + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, @@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device); #define PACKAGE_THROTTLED ((__u64)2 << 62) #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) + platform_thermal_notify(msr_val); +} + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6d75b9145b13..0a360d146596 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void) { int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; @@ -355,9 +352,6 @@ perfctr_fail: for (i--; i >= 0; i--) release_perfctr_nmi(x86_pmu.perfctr + i); - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - return false; } @@ -369,9 +363,6 @@ static void release_pmc_hardware(void) release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); } #else @@ -384,15 +375,53 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { u64 val, val_new = 0; - int ret = 0; + int i, reg, ret = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu.eventsel + i; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + goto bios_fail; + } + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) + goto bios_fail; + } + } + + /* + * Now write a value and read it back to see if it matches, + * this is needed to detect certain hardware emulators (qemu/kvm) + * that don't trap on the MSR access and always return 0s. + */ val = 0xabcdUL; - ret |= checking_wrmsrl(x86_pmu.perfctr, val); + ret = checking_wrmsrl(x86_pmu.perfctr, val); ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); if (ret || val != val_new) - return false; + goto msr_fail; return true; + +bios_fail: + printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); + return false; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; } static void reserve_ds_buffers(void); @@ -451,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 config; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); @@ -1362,7 +1391,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -void __init init_hw_perf_events(void) +int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1377,20 +1406,18 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + return 0; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) { - pr_cont("Broken PMU hardware detected, software events only.\n"); - return; - } + if (!check_hw_exists()) + return 0; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -1438,9 +1465,12 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { @@ -1686,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index e421b8cd6944..67e2202a6039 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,7 +1,5 @@ #ifdef CONFIG_CPU_SUP_AMD -static DEFINE_RAW_SPINLOCK(amd_nb_lock); - static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -275,7 +273,7 @@ done: return &emptyconstraint; } -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +static struct amd_nb *amd_alloc_nb(int cpu) { struct amd_nb *nb; int i; @@ -285,7 +283,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) if (!nb) return NULL; - nb->nb_id = nb_id; + nb->nb_id = -1; /* * initialize all possible NB constraints @@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu) if (boot_cpu_data.x86_max_cores < 2) return NOTIFY_OK; - cpuc->amd_nb = amd_alloc_nb(cpu, -1); + cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) return NOTIFY_BAD; @@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu) nb_id = amd_get_nb_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); - raw_spin_lock(&amd_nb_lock); - for_each_online_cpu(i) { nb = per_cpu(cpu_hw_events, i).amd_nb; if (WARN_ON_ONCE(!nb)) @@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - raw_spin_unlock(&amd_nb_lock); } static void amd_pmu_cpu_dead(int cpu) @@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw = &per_cpu(cpu_hw_events, cpu); - raw_spin_lock(&amd_nb_lock); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - raw_spin_unlock(&amd_nb_lock); } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad1..24e390e40f2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.precise_ip && + (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd69..d5a236615501 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -16,32 +16,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <linux/kprobes.h> #include <asm/apic.h> #include <asm/perf_event.h> -struct nmi_watchdog_ctlblk { - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; - -/* Interface defining a CPU specific perfctr watchdog */ -struct wd_ops { - int (*reserve)(void); - void (*unreserve)(void); - int (*setup)(unsigned nmi_hz); - void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); - void (*stop)(void); - unsigned perfctr; - unsigned evntsel; - u64 checkbit; -}; - -static const struct wd_ops *wd_ops; - /* * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. @@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops; static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { @@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } EXPORT_SYMBOL(release_evntsel_nmi); - -void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); - - if (wd_ops) - wd_ops->unreserve(); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (!wd_ops) - return; - if (!wd_ops->reserve()) { - printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); - return; - } - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); - touch_nmi_watchdog(); -} - -/* - * Activate the NMI watchdog via the local APIC. - */ - -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - -static void write_watchdog_counter(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* - * AMD K7/K8/Family10h/Family11h support. - * AMD keeps this interface nicely stable so there is not much variety - */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void single_msr_stop_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int single_msr_reserve(void) -{ - if (!reserve_perfctr_nmi(wd_ops->perfctr)) - return 0; - - if (!reserve_evntsel_nmi(wd_ops->evntsel)) { - release_perfctr_nmi(wd_ops->perfctr); - return 0; - } - return 1; -} - -static void single_msr_unreserve(void) -{ - release_evntsel_nmi(wd_ops->evntsel); - release_perfctr_nmi(wd_ops->perfctr); -} - -static void __kprobes -single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL << 47, -}; - -/* - * Intel Model 6 (PPro+,P2,P3,P-M,Core1) - */ -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - /* KVM doesn't implement this MSR */ - if (wrmsr_safe(perfctr_msr, 0, 0) < 0) - return 0; - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* - * P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL << 39, -}; - -/* - * Intel P4 performance counters. - * By far the most complicated of all. - */ -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) -#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) -#define P4_ESCR_OS (1 << 3) -#define P4_ESCR_USR (1 << 2) -#define P4_CCCR_OVF_PMI0 (1 << 26) -#define P4_CCCR_OVF_PMI1 (1 << 27) -#define P4_CCCR_THRESHOLD(N) ((N) << 20) -#define P4_CCCR_COMPLEMENT (1 << 19) -#define P4_CCCR_COMPARE (1 << 18) -#define P4_CCCR_REQUIRED (3 << 16) -#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) -#define P4_CCCR_ENABLE (1 << 12) -#define P4_CCCR_OVF (1 << 31) - -#define P4_CONTROLS 18 -static unsigned int p4_controls[18] = { - MSR_P4_BPU_CCCR0, - MSR_P4_BPU_CCCR1, - MSR_P4_BPU_CCCR2, - MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR0, - MSR_P4_MS_CCCR1, - MSR_P4_MS_CCCR2, - MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR0, - MSR_P4_FLAME_CCCR1, - MSR_P4_FLAME_CCCR2, - MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, - MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, - MSR_P4_IQ_CCCR3, - MSR_P4_IQ_CCCR4, - MSR_P4_IQ_CCCR5, -}; -/* - * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - * CRU_ESCR0 (with any non-null event selector) through a complemented - * max threshold. [IA32-Vol3, Section 14.9.9] - */ -static int setup_p4_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* - * performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - - /* - * If we're on the kdump kernel or other situation, we may - * still have other performance counter registers set to - * interrupt and they'll keep interrupting forever because - * of the P4_CCCR_OVF quirk. So we need to ACK all the - * pending interrupts and disable all the registers here, - * before reenabling the NMI delivery. Refer to p4_rearm() - * about the P4_CCCR_OVF quirk. - */ - if (reset_devices) { - unsigned int low, high; - int i; - - for (i = 0; i < P4_CONTROLS; i++) { - rdmsr(p4_controls[i], low, high); - low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); - wrmsr(p4_controls[i], low, high); - } - } - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - - /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ - if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) - cccr_val = P4_CCCR_OVF_PMI0; - else - cccr_val = P4_CCCR_OVF_PMI1; - cccr_val |= P4_CCCR_ESCR_SELECT(4); - } - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - return 1; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int p4_reserve(void) -{ - if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) - return 0; -#ifdef CONFIG_SMP - if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) - goto fail1; -#endif - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) - goto fail2; - /* RED-PEN why is ESCR1 not reserved here? */ - return 1; - fail2: -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); - fail1: -#endif - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); - return 0; -} - -static void p4_unreserve(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); -#endif - release_evntsel_nmi(MSR_P4_CRU_ESCR0); - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); -} - -static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - unsigned dummy; - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, - /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL << 39, -}; - -/* - * Watchdog using the Intel architected PerfMon. - * Used for Core2 and hopefully all future Intel CPUs. - */ -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static struct wd_ops intel_arch_wd_ops; - -static int setup_intel_arch_watchdog(unsigned nmi_hz) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < - (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); - return 1; -} - -static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, -}; - -static void probe_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 == 6 || - (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) - wd_ops = &k7_wd_ops; - return; - case X86_VENDOR_INTEL: - /* Work around where perfctr1 doesn't have a working enable - * bit as described in the following errata: - * AE49 Core Duo and Intel Core Solo 65 nm - * AN49 Intel Pentium Dual-Core - * AF49 Dual-Core Intel Xeon Processor LV - */ - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || - ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && - boot_cpu_data.x86_mask == 4))) { - intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; - intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; - } - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - wd_ops = &intel_arch_wd_ops; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 13) - return; - - wd_ops = &p6_wd_ops; - break; - case 15: - wd_ops = &p4_wd_ops; - break; - default: - return; - } - break; - } -} - -/* Interface to nmi.c */ - -int lapic_watchdog_init(unsigned nmi_hz) -{ - if (!wd_ops) { - probe_nmi_watchdog(); - if (!wd_ops) { - printk(KERN_INFO "NMI watchdog: CPU not supported\n"); - return -1; - } - - if (!wd_ops->reserve()) { - printk(KERN_ERR - "NMI watchdog: cannot reserve perfctrs\n"); - return -1; - } - } - - if (!(wd_ops->setup(nmi_hz))) { - printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", - raw_smp_processor_id()); - return -1; - } - - return 0; -} - -void lapic_watchdog_stop(void) -{ - if (wd_ops) - wd_ops->stop(); -} - -unsigned lapic_adjust_nmi_hz(unsigned hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - hz = adjust_for_32bit_ctr(hz); - return hz; -} - -int __kprobes lapic_wd_event(unsigned nmi_hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 ctr; - - rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) /* perfctr still running? */ - return 0; - - wd_ops->rearm(wd, nmi_hz); - return 1; -} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd52..8474c998cbd4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = { void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, char *log_lvl) { printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) + unsigned long *stack) { - show_trace_log_lvl(task, regs, stack, bp, ""); + show_trace_log_lvl(task, regs, stack, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - show_stack_log_lvl(task, NULL, sp, 0, ""); + show_stack_log_lvl(task, NULL, sp, ""); } /* @@ -210,7 +210,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); + show_trace(NULL, NULL, &stack); } EXPORT_SYMBOL(dump_stack); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 1bc7f75a5bda..74cc1eda384b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,11 +17,12 @@ #include <asm/stacktrace.h> -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { int graph = 0; + unsigned long bp; if (!task) task = current; @@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); for (;;) { struct thread_info *context; @@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } @@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, - 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6a340485249a..64101335de19 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; int graph = 0; + unsigned long bp; if (!task) task = current; @@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, preempt_enable(); printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } void show_registers(struct pt_regs *regs) @@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, KERN_EMERG); + KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 4572f25f9325..cd28a350f7f9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_X86_MRST_EARLY_PRINTK +#ifdef CONFIG_EARLY_PRINTK_MRST if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); @@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf) hsu_early_console_init(); early_console_register(&early_hsu_console, keep); } - #endif buf++; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f14d2d..298448656b60 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/module.h> #include <trace/syscall.h> @@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code); int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + set_all_modules_text_rw(); modifying_code = 1; return 0; } @@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void) int ftrace_arch_code_modify_post_process(void) { modifying_code = 0; + set_all_modules_text_ro(); set_kernel_text_ro(); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 763310165fa0..7f138b3c3c52 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -61,6 +61,9 @@ void __init i386_start_kernel(void) case X86_SUBARCH_MRST: x86_mrst_early_setup(); break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c0dbd9ac24f0..9f54b209c378 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -139,39 +139,6 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb default_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae bad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -bad_subarch: -WEAK(lguest_entry) -WEAK(xen_entry) - /* Unknown implementation; there's really - nothing we can do at this point. */ - ud2a - - __INITDATA - -subarch_entries: - .long default_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ - .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#endif /* CONFIG_PARAVIRT */ - /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -default_entry: #ifdef CONFIG_X86_PAE /* @@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(initial_page_table+0xffc) #endif - jmp 3f + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -282,7 +283,7 @@ ENTRY(startup_32_smp) movl %eax,%fs movl %eax,%gs #endif /* CONFIG_SMP */ -3: +default_entry: /* * New page tables may be in 4Mbyte page mode and may @@ -316,6 +317,10 @@ ENTRY(startup_32_smp) subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax ja 6f + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ @@ -611,6 +616,8 @@ ignore_int: #endif iret +#include "verify_cpu.S" + __REFDATA .align 4 ENTRY(initial_code) @@ -622,13 +629,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(initial_pg_pmd) +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -ENTRY(initial_pg_fixmap) +initial_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c0df99..5940282bd2f9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); @@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -/* Replace a breakpoint (int3) with a relative jump. */ -int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) { - unsigned char jmp_code[RELATIVEJUMP_SIZE]; s32 rel = (s32)((long)op->optinsn.insn - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); @@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - jmp_code[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&jmp_code[1]) = rel; + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } /* * text_poke_smp doesn't support NMI/MCE code modifying. * However, since kprobes itself also doesn't support NMI/MCE * code probing, it's not a problem. */ - text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); - return 0; + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); } /* Replace a relative jump with a breakpoint (int3). */ @@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p, } return 0; } + +static int __kprobes init_poke_params(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} +#else /* !CONFIG_OPTPROBES */ +static int __kprobes init_poke_params(void) +{ + return 0; +} #endif int __init arch_init_kprobes(void) { - return 0; + return init_poke_params(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ce0cb4721c9a..0fe6d1a66c38 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static int get_ucode_data(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static void * get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { @@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; - if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) - return NULL; + get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); if (section_hdr[0] != UCODE_UCODE_TYPE) { pr_err("error: invalid type field in container file section header\n"); @@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; } - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + mc = vzalloc(UCODE_MAX_SIZE); + if (!mc) + return NULL; + + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + return mc; } @@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf) unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; - if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) - return 0; + get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); size = buf_pos[2]; @@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf) } buf += UCODE_CONTAINER_HEADER_SIZE; - if (get_ucode_data(equiv_cpu_table, buf, size)) { - vfree(equiv_cpu_table); - return 0; - } + get_ucode_data(equiv_cpu_table, buf, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index dcb65cc0a053..1a1b606d3e92 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - if (mc) - vfree(mc); + vfree(mc); mc = vmalloc(mc_size); if (!mc) break; @@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { - vfree(mc); break; } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; mc = NULL; /* trigger new vmalloc */ @@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (mc) - vfree(mc); + vfree(mc); if (leftover) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); state = UCODE_ERROR; goto out; } @@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, goto out; } - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca9f280..c01ffa5b9b87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -143,7 +143,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -561,17 +561,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next @@ -644,7 +644,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +749,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +765,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868a86aa..c852041bfc3d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -91,8 +91,7 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), - regs->bp); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); } void show_regs_common(void) @@ -374,6 +373,7 @@ void default_idle(void) { if (hlt_use_halt()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); + trace_cpu_idle((ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -460,6 +461,7 @@ static void mwait_idle(void) { if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -481,10 +483,12 @@ static void mwait_idle(void) static void poll_idle(void) { trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(0); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 96586c3cbbbf..4b9befa0e347 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -113,8 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3d7a3a04f38..4c818a738396 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -142,6 +142,8 @@ void cpu_idle(void) start_critical_timings(); trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, + smp_processor_id()); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index fda313ebbb03..c8e41e90f59c 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev) outb(1, 0x92); } +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + struct device_fixup { unsigned int vendor; unsigned int device; void (*reboot_fixup)(struct pci_dev *); }; +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, }; /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a0f52af256a0..d3cfe26c0252 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { int acpi = 0; - int k8 = 0; + int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p) acpi = acpi_numa_init(); #endif -#ifdef CONFIG_K8_NUMA +#ifdef CONFIG_AMD_NUMA if (!acpi) - k8 = !k8_numa_init(0, max_pfn); + amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, k8); + initmem_init(0, max_pfn, acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); @@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p) #endif init_apic_mappings(); - ioapic_init_mappings(); - - /* need to wait for io_apic is mapped */ - probe_nr_irqs_gsi(); + ioapic_and_gsi_init(); kvm_guest_init(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 083e99d1b7df..ee886fe10ef4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + /* + * This must be done before setting cpu_online_mask + * or calling notify_cpu_starting. + */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + notify_cpu_starting(cpuid); /* @@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ check_tsc_sync_target(); - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - enable_NMI_through_LVT0(); - legacy_pic->unmask(0); - } - - /* This must be done before setting cpu_online_mask */ - set_cpu_sibling_map(raw_smp_processor_id()); - wmb(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus) printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); - localise_nmi_watchdog(); - connect_bsp_APIC(); setup_local_APIC(); end_local_APIC_setup(); @@ -1166,6 +1161,20 @@ out: preempt_enable(); } +void arch_disable_nonboot_cpus_begin(void) +{ + /* + * Avoid the smp alternatives switch during the disable_nonboot_cpus(). + * In the suspend path, we will be back in the SMP mode shortly anyways. + */ + skip_smp_alternatives = true; +} + +void arch_disable_nonboot_cpus_end(void) +{ + skip_smp_alternatives = false; +} + void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); @@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - check_nmi_watchdog(); mtrr_aps_init(); } @@ -1341,8 +1349,6 @@ int native_cpu_disable(void) if (cpu == 0) return -EBUSY; - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); clear_local_APIC(); cpu_disable_common(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b53c525368a7..938c8e10a19a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + dump_trace(current, regs, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fb5cc5e14cfa..25a28a245937 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -22,10 +22,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - #ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - raw_spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - raw_spin_unlock(&i8259A_lock); - } - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff58b21..075d130efcf9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -127,7 +127,7 @@ startup_64: no_longmode: hlt jmp no_longmode -#include "verify_cpu_64.S" +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb838ca42c96..c76aaca5694d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; +int unknown_nmi_panic; + static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -300,6 +302,13 @@ gp_in_kernel: die("general protection fault", regs, error_code); } +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { @@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason = (reason & 0xf) | 8; outb(reason, 0x61); - i = 2000; - while (--i) - udelay(1000); + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } reason &= ~8; outb(reason, 0x61); @@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) + if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); @@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; - -#ifndef CONFIG_LOCKUP_DETECTOR - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_LOCKUP_DETECTOR */ - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); #endif + unknown_nmi_error(reason, regs); return; } @@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code) void stop_nmi(void) { - acpi_nmi_disable(); ignore_nmis++; } void restart_nmi(void) { ignore_nmis--; - acpi_nmi_enable(); } /* May run on IST stack. */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b72416..356a0d455cf9 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; + + if (tsc_clocksource_reliable) + return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) - tsc_unstable = 1; + return 1; } - return tsc_unstable; + return 0; +} + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomolies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref_start && !ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + printk(KERN_INFO "Refined TSC clocksource calibration: " + "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); } -static void __init init_tsc_clocksource(void) + +static int __init init_tsc_clocksource(void) { + if (!cpu_has_tsc || tsc_disabled > 0) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register_khz(&clocksource_tsc, tsc_khz); + schedule_delayed_work(&tsc_irqwork, 0); + return 0; } +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource); void __init tsc_init(void) { @@ -949,6 +1036,5 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); - init_tsc_clocksource(); } diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a867d9..0edefc19a113 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu.S @@ -7,6 +7,7 @@ * Copyright (c) 2007 Andi Kleen (ak@suse.de) * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -14,18 +15,17 @@ * This is a common code for verification whether CPU supports * long mode and SSE or not. It is not called directly instead this * file is included at various places and compiled in that context. - * Following are the current usage. + * This file is expected to run in 32bit code. Currently: * - * This file is included by both 16bit and 32bit code. + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verfication + * arch/x86/kernel/head_32.S: processor startup * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. + * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * * The caller needs to check for the error code and take the action * appropriately. Either display a message or halt. */ @@ -62,8 +62,41 @@ verify_cpu: cmpl $0x444d4163,%ecx jnz verify_cpu_noamd mov $1,%di # cpu is from AMD + jmp verify_cpu_check verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e03530aebfd0..bf4700755184 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ + data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP @@ -116,6 +116,10 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 +#if defined(CONFIG_DEBUG_RODATA) + /* .text should occupy whole number of pages */ + . = ALIGN(PAGE_SIZE); +#endif X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) X64_ALIGN_DEBUG_RODATA_END @@ -335,7 +339,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) - . = ALIGN(4); + . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index f628234fbeca..3cece05e4ac4 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fb8b376bf28c..fbb04aee8301 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2394,7 +2394,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, NULL); root = __pa(sp->spt); diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index e7d5382ef263..4f420c2f2d55 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,7 +4,6 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -38,113 +37,9 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - call init_pagetables - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - * - * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they - * don't have a stack at this point, so we can't just use call and ret. - */ -init_pagetables: -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif -#define pa(X) ((X) - __PAGE_OFFSET) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_page_table+0xffc) -#endif - ret - /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 55543397a8a7..09df2f9a3d69 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c index 804a3b6c6e14..51fae9cfdecb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -1,8 +1,8 @@ /* - * AMD K8 NUMA support. + * AMD NUMA support. * Discover the memory map and associated nodes. * - * This version reads it directly from the K8 northbridge. + * This version reads it directly from the AMD northbridge. * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ @@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void) { /* * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in k8_scan_nodes() + * create apicid_to_node in amd_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_get_nodes(struct bootnode *physnodes) +int __init amd_get_nodes(struct bootnode *physnodes) { int i; int ret = 0; @@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes) return ret; } -int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); unsigned long end = PFN_PHYS(end_pfn); @@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } -int __init k8_scan_nodes(void) +int __init amd_scan_nodes(void) { unsigned int bits; unsigned int cores; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7d..947f42abe820 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that - * writeable first. + * writeable and non-executable first. */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401b..f89b5bb4e93f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) static inline int is_kernel_text(unsigned long addr) { - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; return 0; } @@ -912,6 +912,23 @@ void set_kernel_text_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); } +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext sould be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_kernel_text upper limit. Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); @@ -946,6 +963,7 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif + mark_nxdata_nx(); } #endif diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436f..704a37cedddb 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_bp(&e->trace, regs->bp); + save_stack_trace_regs(&e->trace, regs); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b727efd..7762a517d69d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int k8) + int acpi, int amd) { int nr_nodes = 0; int ret = 0; @@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, if (acpi) nr_nodes = acpi_get_nodes(physnodes); #endif -#ifdef CONFIG_K8_NUMA - if (k8) - nr_nodes = k8_get_nodes(physnodes); +#ifdef CONFIG_AMD_NUMA + if (amd) + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or K8 incorrectly reported the topology or the mem= + * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ for (i = 0; i < nr_nodes; i++) { @@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * numa=fake command-line option. */ static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int k8) + unsigned long last_pfn, int acpi, int amd) { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; @@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn, int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int k8) + int acpi, int amd) { int i; @@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && k8 && !k8_scan_nodes()) +#ifdef CONFIG_AMD_NUMA + if (!numa_off && amd && !amd_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d606..8b830ca14ac4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -13,6 +13,7 @@ #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> +#include <linux/pci.h> #include <asm/e820.h> #include <asm/processor.h> @@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); + pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. */ - if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_NX; +#endif /* * The kernel text needs to be executable for obvious reasons @@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * .data and .bss should always be writable. + */ + if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || + within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) + pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } @@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, { unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot; + pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; unsigned int level; @@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = pte_pgprot(old_pte); + old_prot = new_prot = req_prot = pte_pgprot(old_pte); - pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); - pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * old_pte points to the large page base address. So we need @@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(new_prot, address, pfn); + new_prot = static_protections(req_prot, address, pfn); /* * We need to check the full range, whether * static_protection() requires a different pgprot for one of * the pages in the range we try to preserve: */ - addr = address + PAGE_SIZE; - pfn++; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + addr = address & pmask; + pfn = pte_pfn(old_pte); + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) goto out_unlock; @@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { /* * The address is aligned and the number of pages * covers the full page. diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -41,7 +41,7 @@ void __init x86_report_nx(void) { if (!cpu_has_nx) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU or disabled in BIOS!\n"); + "missing in CPU!\n"); } else { #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) if (disable_nx) { diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd136c1..f16434568a51 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..171a0aacb99a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e19a36..72cbec14d783 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, + dump_trace(NULL, regs, (unsigned long *)stack, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4e8baad36d37..358c8b9c96a7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -732,6 +732,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) case 0x14: cpu_type = "x86-64/family14h"; break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; default: return -ENODEV; } diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index e3ecb71b5790..0636dd93cef8 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c @@ -58,9 +58,6 @@ static void timer_stop(void) int __init op_nmi_timer_init(struct oprofile_operations *ops) { - if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) - return -ENODEV; - ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a011bcc0f943..c3b8e24f2b16 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -29,11 +29,12 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 +#define NUM_COUNTERS 4 +#define NUM_COUNTERS_F15H 6 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_COUNTERS 32 #else -#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_COUNTERS 0 #endif #define OP_EVENT_MASK 0x0FFF @@ -41,7 +42,8 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_VIRT_COUNTERS]; +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 @@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, int i; /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->counters[i].addr) continue; release_perfctr_nmi(MSR_K7_PERFCTR0 + i); @@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) goto fail; if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { @@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + if (num_counters == NUM_COUNTERS_F15H) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } continue; fail: if (!counter_config[i].enabled) @@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, int i; /* setup reset_value */ - for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + for (i = 0; i < OP_MAX_COUNTER; ++i) { if (counter_config[i].enabled && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; @@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->controls[i].addr) continue; rdmsrl(msrs->controls[i].addr, val); @@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs) u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void) ret = setup_ibs_ctl(i); if (ret) return ret; + pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); return 0; } @@ -630,21 +638,29 @@ static int __init_ibs_nmi(void) return 0; } -/* initialize the APIC for the IBS interrupts if available */ +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + * + * init_ibs() preforms implicitly cpu-local operations, so pin this + * thread to its current CPU + */ + static void init_ibs(void) { - ibs_caps = get_ibs_caps(); + preempt_disable(); + ibs_caps = get_ibs_caps(); if (!ibs_caps) - return; + goto out; - if (__init_ibs_nmi()) { + if (__init_ibs_nmi() < 0) ibs_caps = 0; - return; - } + else + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", - (unsigned)ibs_caps); +out: + preempt_enable(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) return 0; } +struct op_x86_model_spec op_amd_spec; + static int op_amd_init(struct oprofile_operations *ops) { init_ibs(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; + + if (boot_cpu_data.x86 == 0x15) { + num_counters = NUM_COUNTERS_F15H; + } else { + num_counters = NUM_COUNTERS; + } + + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); + return 0; } struct op_x86_model_spec op_amd_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_COUNTERS, - .num_virt_counters = NUM_VIRT_COUNTERS, + /* num_counters/num_controls filled in at runtime */ .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 182558dd5515..9fadec074142 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -11,7 +11,7 @@ #include <linux/oprofile.h> #include <linux/smp.h> #include <linux/ptrace.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <asm/msr.h> #include <asm/fixmap.h> #include <asm/apic.h> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index effd96e33f16..6b8759f7634e 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c new file mode 100644 index 000000000000..85b68ef5e809 --- /dev/null +++ b/arch/x86/pci/ce4100.c @@ -0,0 +1,315 @@ +/* + * GPL LICENSE SUMMARY + * + * Copyright(c) 2010 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * The full GNU General Public License is included in this distribution + * in the file called LICENSE.GPL. + * + * Contact Information: + * Intel Corporation + * 2200 Mission College Blvd. + * Santa Clara, CA 97052 + * + * This provides access methods for PCI registers that mis-behave on + * the CE4100. Each register can be assigned a private init, read and + * write routine. The exception to this is the bridge device. The + * bridge device is the only device on bus zero (0) that requires any + * fixup so it is a special case ATM + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> + +#include <asm/pci_x86.h> + +struct sim_reg { + u32 value; + u32 mask; +}; + +struct sim_dev_reg { + int dev_func; + int reg; + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 *value); + void (*write)(struct sim_dev_reg *reg, u32 value); + struct sim_reg sim_reg; +}; + +struct sim_reg_op { + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 value); + void (*write)(struct sim_dev_reg *reg, u32 value); +}; + +#define MB (1024 * 1024) +#define KB (1024) +#define SIZE_TO_MASK(size) (~(size - 1)) + +#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\ +{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\ + {0, SIZE_TO_MASK(size)} }, + +static void reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4, + ®->sim_reg.value); +} + +static void reg_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + *value = reg->sim_reg.value; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void reg_write(struct sim_dev_reg *reg, u32 value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + reg->sim_reg.value = (value & reg->sim_reg.mask) | + (reg->sim_reg.value & ~reg->sim_reg.mask); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void sata_reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4, + ®->sim_reg.value); + reg->sim_reg.value += 0x400; +} + +static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); + if (*value != reg->sim_reg.mask) + *value |= 0x100; +} + +void sata_revid_init(struct sim_dev_reg *reg) +{ + reg->sim_reg.value = 0x01060100; + reg->sim_reg.mask = 0; +} + +static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); +} + +static struct sim_dev_reg bus1_fixups[] = { + DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) + DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) + DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0) + DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) +}; + +static void __init init_sim_regs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].init) + bus1_fixups[i].init(&bus1_fixups[i]); + } +} + +static inline void extract_bytes(u32 *value, int reg, int len) +{ + uint32_t mask; + + *value >>= ((reg & 3) * 8); + mask = 0xFFFFFFFF >> ((4 - len) * 8); + *value &= mask; +} + +int bridge_read(unsigned int devfn, int reg, int len, u32 *value) +{ + u32 av_bridge_base, av_bridge_limit; + int retval = 0; + + switch (reg) { + /* Make BARs appear to not request any memory. */ + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_0 + 1: + case PCI_BASE_ADDRESS_0 + 2: + case PCI_BASE_ADDRESS_0 + 3: + *value = 0; + break; + + /* Since subordinate bus number register is hardwired + * to zero and read only, so do the simulation. + */ + case PCI_PRIMARY_BUS: + if (len == 4) + *value = 0x00010100; + break; + + case PCI_SUBORDINATE_BUS: + *value = 1; + break; + + case PCI_MEMORY_BASE: + case PCI_MEMORY_LIMIT: + /* Get the A/V bridge base address. */ + pci_direct_conf1.read(0, 0, devfn, + PCI_BASE_ADDRESS_0, 4, &av_bridge_base); + + av_bridge_limit = av_bridge_base + (512*MB - 1); + av_bridge_limit >>= 16; + av_bridge_limit &= 0xFFF0; + + av_bridge_base >>= 16; + av_bridge_base &= 0xFFF0; + + if (reg == PCI_MEMORY_LIMIT) + *value = av_bridge_limit; + else if (len == 2) + *value = av_bridge_base; + else + *value = (av_bridge_limit << 16) | av_bridge_base; + break; + /* Make prefetchable memory limit smaller than prefetchable + * memory base, so not claim prefetchable memory space. + */ + case PCI_PREF_MEMORY_BASE: + *value = 0xFFF0; + break; + case PCI_PREF_MEMORY_LIMIT: + *value = 0x0; + break; + /* Make IO limit smaller than IO base, so not claim IO space. */ + case PCI_IO_BASE: + *value = 0xF0; + break; + case PCI_IO_LIMIT: + *value = 0; + break; + default: + retval = 1; + } + return retval; +} + +static int ce4100_conf_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + int i, retval = 1; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].read) { + bus1_fixups[i].read(&(bus1_fixups[i]), + value); + extract_bytes(value, reg, len); + return 0; + } + } + } + + if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) && + !bridge_read(devfn, reg, len, value)) + return 0; + + return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); +} + +static int ce4100_conf_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + int i; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].write) { + bus1_fixups[i].write(&(bus1_fixups[i]), + value); + return 0; + } + } + } + + /* Discard writes to A/V bridge BAR. */ + if (bus == 0 && PCI_DEVFN(1, 0) == devfn && + ((reg & ~3) == PCI_BASE_ADDRESS_0)) + return 0; + + return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); +} + +struct pci_raw_ops ce4100_pci_conf = { + .read = ce4100_conf_read, + .write = ce4100_conf_write, +}; + +static int __init ce4100_pci_init(void) +{ + init_sim_regs(); + raw_pci_ops = &ce4100_pci_conf; + return 0; +} +subsys_initcall(ce4100_pci_init); diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 2492d165096a..a5f7d0d63de0 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <asm/pci_x86.h> #include <asm/pci-functions.h> +#include <asm/cacheflush.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) @@ -25,6 +26,27 @@ #define PCIBIOS_HW_TYPE1_SPEC 0x10 #define PCIBIOS_HW_TYPE2_SPEC 0x20 +int pcibios_enabled; + +/* According to the BIOS specification at: + * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could + * restrict the x zone to some pages and make it ro. But this may be + * broken on some bios, complex to handle with static_protections. + * We could make the 0xe0000-0x100000 range rox, but this can break + * some ISA mapping. + * + * So we let's an rw and x hole when pcibios is used. This shouldn't + * happen for modern system with mmconfig, and if you don't want it + * you could disable pcibios... + */ +static inline void set_bios_x(void) +{ + pcibios_enabled = 1; + set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); +} + /* * This is the standard structure used to identify the entry point * to the BIOS32 Service Directory, as documented in @@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); bios32_indirect.address = bios32_entry + PAGE_OFFSET; + set_bios_x(); if (check_pcibios()) return &pci_bios_access; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 7bf70b812fa2..021eee91c056 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,5 +1,7 @@ # Platform specific code goes here +obj-y += ce4100/ obj-y += efi/ +obj-y += iris/ obj-y += mrst/ obj-y += olpc/ obj-y += scx200/ diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile new file mode 100644 index 000000000000..91fc92971d94 --- /dev/null +++ b/arch/x86/platform/ce4100/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c new file mode 100644 index 000000000000..d2c0d51a7178 --- /dev/null +++ b/arch/x86/platform/ce4100/ce4100.c @@ -0,0 +1,132 @@ +/* + * Intel CE4100 platform specific setup code + * + * (C) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/serial_reg.h> +#include <linux/serial_8250.h> + +#include <asm/setup.h> +#include <asm/io.h> + +static int ce4100_i8042_detect(void) +{ + return 0; +} + +static void __init sdv_find_smp_config(void) +{ +} + +#ifdef CONFIG_SERIAL_8250 + + +static unsigned int mem_serial_in(struct uart_port *p, int offset) +{ + offset = offset << p->regshift; + return readl(p->membase + offset); +} + +/* + * The UART Tx interrupts are not set under some conditions and therefore serial + * transmission hangs. This is a silicon issue and has not been root caused. The + * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT + * bit of LSR register in interrupt handler to see whether at least one of these + * two bits is set, if so then process the transmit request. If this workaround + * is not applied, then the serial transmission may hang. This workaround is for + * errata number 9 in Errata - B step. +*/ + +static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset) +{ + unsigned int ret, ier, lsr; + + if (offset == UART_IIR) { + offset = offset << p->regshift; + ret = readl(p->membase + offset); + if (ret & UART_IIR_NO_INT) { + /* see if the TX interrupt should have really set */ + ier = mem_serial_in(p, UART_IER); + /* see if the UART's XMIT interrupt is enabled */ + if (ier & UART_IER_THRI) { + lsr = mem_serial_in(p, UART_LSR); + /* now check to see if the UART should be + generating an interrupt (but isn't) */ + if (lsr & (UART_LSR_THRE | UART_LSR_TEMT)) + ret &= ~UART_IIR_NO_INT; + } + } + } else + ret = mem_serial_in(p, offset); + return ret; +} + +static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + writel(value, p->membase + offset); +} + +static void ce4100_serial_fixup(int port, struct uart_port *up, + unsigned short *capabilites) +{ +#ifdef CONFIG_EARLY_PRINTK + /* + * Over ride the legacy port configuration that comes from + * asm/serial.h. Using the ioport driver then switching to the + * PCI memmaped driver hangs the IOAPIC + */ + if (up->iotype != UPIO_MEM32) { + up->uartclk = 14745600; + up->mapbase = 0xdffe0200; + set_fixmap_nocache(FIX_EARLYCON_MEM_BASE, + up->mapbase & PAGE_MASK); + up->membase = + (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); + up->membase += up->mapbase & ~PAGE_MASK; + up->iotype = UPIO_MEM32; + up->regshift = 2; + } +#endif + up->iobase = 0; + up->serial_in = ce4100_mem_serial_in; + up->serial_out = ce4100_mem_serial_out; + + *capabilites |= (1 << 12); +} + +static __init void sdv_serial_fixup(void) +{ + serial8250_set_isa_configurator(ce4100_serial_fixup); +} + +#else +static inline void sdv_serial_fixup(void); +#endif + +static void __init sdv_arch_setup(void) +{ + sdv_serial_fixup(); +} + +/* + * CE4100 specific x86_init function overrides and early setup + * calls. + */ +void __init x86_ce4100_early_setup(void) +{ + x86_init.oem.arch_setup = sdv_arch_setup; + x86_platform.i8042_detect = ce4100_i8042_detect; + x86_init.resources.probe_roms = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_smp_config = sdv_find_smp_config; +} diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile new file mode 100644 index 000000000000..db921983a102 --- /dev/null +++ b/arch/x86/platform/iris/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_32_IRIS) += iris.o diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c new file mode 100644 index 000000000000..1ba7f5ed8c9b --- /dev/null +++ b/arch/x86/platform/iris/iris.c @@ -0,0 +1,91 @@ +/* + * Eurobraille/Iris power off support. + * + * Eurobraille's Iris machine is a PC with no APM or ACPI support. + * It is shutdown by a special I/O sequence which this module provides. + * + * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org> + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/moduleparam.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <asm/io.h> + +#define IRIS_GIO_BASE 0x340 +#define IRIS_GIO_INPUT IRIS_GIO_BASE +#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1) +#define IRIS_GIO_PULSE 0x80 /* First byte to send */ +#define IRIS_GIO_REST 0x00 /* Second byte to send */ +#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); +MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); +MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); + +static int force; + +module_param(force, bool, 0); +MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); + +static void (*old_pm_power_off)(void); + +static void iris_power_off(void) +{ + outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT); + msleep(850); + outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT); +} + +/* + * Before installing the power_off handler, try to make sure the OS is + * running on an Iris. Since Iris does not support DMI, this is done + * by reading its input port and seeing whether the read value is + * meaningful. + */ +static int iris_init(void) +{ + unsigned char status; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + status = inb(IRIS_GIO_INPUT); + if (status == IRIS_GIO_NODEV) { + printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + return -ENODEV; + } + old_pm_power_off = pm_power_off; + pm_power_off = &iris_power_off; + printk(KERN_INFO "Iris power_off handler installed.\n"); + + return 0; +} + +static void iris_exit(void) +{ + pm_power_off = old_pm_power_off; + printk(KERN_INFO "Iris power_off handler uninstalled.\n"); +} + +module_init(iris_init); +module_exit(iris_exit); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index efbbc552fa95..f61ccdd49341 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1 +1,3 @@ obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_MRST) += vrtc.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 65df603622b2..65df603622b2 100644 --- a/arch/x86/kernel/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 79ae68154e87..fee0b4914e07 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -9,9 +9,19 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +#define pr_fmt(fmt) "mrst: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/sfi.h> +#include <linux/intel_pmic_gpio.h> +#include <linux/spi/spi.h> +#include <linux/i2c.h> +#include <linux/i2c/pca953x.h> +#include <linux/gpio_keys.h> +#include <linux/input.h> +#include <linux/platform_device.h> #include <linux/irq.h> #include <linux/module.h> @@ -23,7 +33,9 @@ #include <asm/mrst.h> #include <asm/io.h> #include <asm/i8259.h> +#include <asm/intel_scu_ipc.h> #include <asm/apb_timer.h> +#include <asm/reboot.h> /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, @@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) memcpy(sfi_mtimer_array, pentry, totallen); } - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); pentry = sfi_mtimer_array; for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz," " irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); if (!pentry->irq) @@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) memcpy(sfi_mrtc_array, pentry, totallen); } - printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); pentry = sfi_mrtc_array; for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); mp_irq.type = MP_IOAPIC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; + mp_irq.irqflag = 0xf; /* level trigger and active low */ mp_irq.srcbus = 0; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; @@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { case MRST_TIMER_APBT_ONLY: break; @@ -224,16 +237,10 @@ void __init mrst_time_init(void) return; } /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); } -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) @@ -256,6 +263,17 @@ static int mrst_i8042_detect(void) return 0; } +/* Reboot and power off are handled by the SCU on a MID device */ +static void mrst_power_off(void) +{ + intel_scu_ipc_simple_command(0xf1, 1); +} + +static void mrst_reboot(void) +{ + intel_scu_ipc_simple_command(0xf1, 0); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void) legacy_pic = &null_legacy_pic; + /* Moorestown specific power_off/restart method */ + pm_power_off = mrst_power_off; + machine_ops.emergency_restart = mrst_reboot; + /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; @@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg) return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * Parsing GPIO table first, since the DEVS table will need this table + * to map the pin name to the actual pin. + */ +static struct sfi_gpio_table_entry *gpio_table; +static int gpio_num_entry; + +static int __init sfi_parse_gpio(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_gpio_table_entry *pentry; + int num, i; + + if (gpio_table) + return 0; + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); + pentry = (struct sfi_gpio_table_entry *)sb->pentry; + + gpio_table = (struct sfi_gpio_table_entry *) + kmalloc(num * sizeof(*pentry), GFP_KERNEL); + if (!gpio_table) + return -1; + memcpy(gpio_table, pentry, num * sizeof(*pentry)); + gpio_num_entry = num; + + pr_debug("GPIO pin info:\n"); + for (i = 0; i < num; i++, pentry++) + pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," + " pin = %d\n", i, + pentry->controller_name, + pentry->pin_name, + pentry->pin_no); + return 0; +} + +static int get_gpio_by_name(const char *name) +{ + struct sfi_gpio_table_entry *pentry = gpio_table; + int i; + + if (!pentry) + return -1; + for (i = 0; i < gpio_num_entry; i++, pentry++) { + if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) + return pentry->pin_no; + } + return -1; +} + +/* + * Here defines the array of devices platform data that IAFW would export + * through SFI "DEVS" table, we use name and type to match the device and + * its platform data. + */ +struct devs_id { + char name[SFI_NAME_LEN + 1]; + u8 type; + u8 delay; + void *(*get_platform_data)(void *info); +}; + +/* the offset for the mapping of global gpio pin to irq */ +#define MRST_IRQ_OFFSET 0x100 + +static void __init *pmic_gpio_platform_data(void *info) +{ + static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; + int gpio_base = get_gpio_by_name("pmic_gpio_base"); + + if (gpio_base == -1) + gpio_base = 64; + pmic_gpio_pdata.gpio_base = gpio_base; + pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET; + pmic_gpio_pdata.gpiointr = 0xffffeff8; + + return &pmic_gpio_pdata; +} + +static void __init *max3111_platform_data(void *info) +{ + struct spi_board_info *spi_info = info; + int intr = get_gpio_by_name("max3111_int"); + + if (intr == -1) + return NULL; + spi_info->irq = intr + MRST_IRQ_OFFSET; + return NULL; +} + +/* we have multiple max7315 on the board ... */ +#define MAX7315_NUM 2 +static void __init *max7315_platform_data(void *info) +{ + static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; + static int nr; + struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; + struct i2c_board_info *i2c_info = info; + int gpio_base, intr; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + + if (nr == MAX7315_NUM) { + pr_err("too many max7315s, we only support %d\n", + MAX7315_NUM); + return NULL; + } + /* we have several max7315 on the board, we only need load several + * instances of the same pca953x driver to cover them + */ + strcpy(i2c_info->type, "max7315"); + if (nr++) { + sprintf(base_pin_name, "max7315_%d_base", nr); + sprintf(intr_pin_name, "max7315_%d_int", nr); + } else { + strcpy(base_pin_name, "max7315_base"); + strcpy(intr_pin_name, "max7315_int"); + } + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + if (gpio_base == -1) + return NULL; + max7315->gpio_base = gpio_base; + if (intr != -1) { + i2c_info->irq = intr + MRST_IRQ_OFFSET; + max7315->irq_base = gpio_base + MRST_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + max7315->irq_base = -1; + } + return max7315; +} + +static void __init *emc1403_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("thermal_int"); + int intr2nd = get_gpio_by_name("thermal_alert"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static void __init *lis331dl_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("accel_int"); + int intr2nd = get_gpio_by_name("accel_2"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static void __init *no_platform_data(void *info) +{ + return NULL; +} + +static const struct devs_id __initconst device_ids[] = { + {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, + {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, + {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, + {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, + {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + {}, +}; + +#define MAX_IPCDEVS 24 +static struct platform_device *ipc_devs[MAX_IPCDEVS]; +static int ipc_next_dev; + +#define MAX_SCU_SPI 24 +static struct spi_board_info *spi_devs[MAX_SCU_SPI]; +static int spi_next_dev; + +#define MAX_SCU_I2C 24 +static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; +static int i2c_bus[MAX_SCU_I2C]; +static int i2c_next_dev; + +static void __init intel_scu_device_register(struct platform_device *pdev) +{ + if(ipc_next_dev == MAX_IPCDEVS) + pr_err("too many SCU IPC devices"); + else + ipc_devs[ipc_next_dev++] = pdev; +} + +static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) +{ + struct spi_board_info *new_dev; + + if (spi_next_dev == MAX_SCU_SPI) { + pr_err("too many SCU SPI devices"); + return; + } + + new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed spi dev %s\n", + sdev->modalias); + return; + } + memcpy(new_dev, sdev, sizeof(*sdev)); + + spi_devs[spi_next_dev++] = new_dev; +} + +static void __init intel_scu_i2c_device_register(int bus, + struct i2c_board_info *idev) +{ + struct i2c_board_info *new_dev; + + if (i2c_next_dev == MAX_SCU_I2C) { + pr_err("too many SCU I2C devices"); + return; + } + + new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed i2c dev %s\n", + idev->type); + return; + } + memcpy(new_dev, idev, sizeof(*idev)); + + i2c_bus[i2c_next_dev] = bus; + i2c_devs[i2c_next_dev++] = new_dev; +} + +/* Called by IPC driver */ +void intel_scu_devices_create(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_add(ipc_devs[i]); + + for (i = 0; i < spi_next_dev; i++) + spi_register_board_info(spi_devs[i], 1); + + for (i = 0; i < i2c_next_dev; i++) { + struct i2c_adapter *adapter; + struct i2c_client *client; + + adapter = i2c_get_adapter(i2c_bus[i]); + if (adapter) { + client = i2c_new_device(adapter, i2c_devs[i]); + if (!client) + pr_err("can't create i2c device %s\n", + i2c_devs[i]->type); + } else + i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); + } +} +EXPORT_SYMBOL_GPL(intel_scu_devices_create); + +/* Called by IPC driver */ +void intel_scu_devices_destroy(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_del(ipc_devs[i]); +} +EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); + +static void __init install_irq_resource(struct platform_device *pdev, int irq) +{ + /* Single threaded */ + static struct resource __initdata res = { + .name = "IRQ", + .flags = IORESOURCE_IRQ, + }; + res.start = irq; + platform_device_add_resources(pdev, &res, 1); +} + +static void __init sfi_handle_ipc_dev(struct platform_device *pdev) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_IPC && + !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(pdev); + break; + } + dev++; + } + pdev->dev.platform_data = pdata; + intel_scu_device_register(pdev); +} + +static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_SPI && + !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(spi_info); + break; + } + dev++; + } + spi_info->platform_data = pdata; + if (dev->delay) + intel_scu_spi_device_register(spi_info); + else + spi_register_board_info(spi_info, 1); +} + +static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_I2C && + !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(i2c_info); + break; + } + dev++; + } + i2c_info->platform_data = pdata; + + if (dev->delay) + intel_scu_i2c_device_register(bus, i2c_info); + else + i2c_register_board_info(bus, i2c_info, 1); + } + + +static int __init sfi_parse_devs(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_device_table_entry *pentry; + struct spi_board_info spi_info; + struct i2c_board_info i2c_info; + struct platform_device *pdev; + int num, i, bus; + int ioapic; + struct io_apic_irq_attr irq_attr; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); + pentry = (struct sfi_device_table_entry *)sb->pentry; + + for (i = 0; i < num; i++, pentry++) { + if (pentry->irq != (u8)0xff) { /* native RTE case */ + /* these SPI2 devices are not exposed to system as PCI + * devices, but they have separate RTE entry in IOAPIC + * so we have to enable them one by one here + */ + ioapic = mp_find_ioapic(pentry->irq); + irq_attr.ioapic = ioapic; + irq_attr.ioapic_pin = pentry->irq; + irq_attr.trigger = 1; + irq_attr.polarity = 1; + io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); + } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + /* ID as IRQ is a hack that will go away */ + pdev = platform_device_alloc(pentry->name, pentry->irq); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + pentry->name); + continue; + } + install_irq_resource(pdev, pentry->irq); + pr_debug("info[%2d]: IPC bus, name = %16.16s, " + "irq = 0x%2x\n", i, pentry->name, pentry->irq); + sfi_handle_ipc_dev(pdev); + break; + case SFI_DEV_TYPE_SPI: + memset(&spi_info, 0, sizeof(spi_info)); + strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); + spi_info.irq = pentry->irq; + spi_info.bus_num = pentry->host_num; + spi_info.chip_select = pentry->addr; + spi_info.max_speed_hz = pentry->max_freq; + pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, " + "irq = 0x%2x, max_freq = %d, cs = %d\n", i, + spi_info.bus_num, + spi_info.modalias, + spi_info.irq, + spi_info.max_speed_hz, + spi_info.chip_select); + sfi_handle_spi_dev(&spi_info); + break; + case SFI_DEV_TYPE_I2C: + memset(&i2c_info, 0, sizeof(i2c_info)); + bus = pentry->host_num; + strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); + i2c_info.irq = pentry->irq; + i2c_info.addr = pentry->addr; + pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " + "irq = 0x%2x, addr = 0x%x\n", i, bus, + i2c_info.type, + i2c_info.irq, + i2c_info.addr); + sfi_handle_i2c_dev(bus, &i2c_info); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + ; + } + } + return 0; +} + +static int __init mrst_platform_init(void) +{ + sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); + sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); + return 0; +} +arch_initcall(mrst_platform_init); + +/* + * we will search these buttons in SFI GPIO table (by name) + * and register them dynamically. Please add all possible + * buttons here, we will shrink them if no GPIO found. + */ +static struct gpio_keys_button gpio_button[] = { + {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, + {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, + {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, + {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, + {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, +}; + +static struct gpio_keys_platform_data mrst_gpio_keys = { + .buttons = gpio_button, + .rep = 1, + .nbuttons = -1, /* will fill it after search */ +}; + +static struct platform_device pb_device = { + .name = "gpio-keys", + .id = -1, + .dev = { + .platform_data = &mrst_gpio_keys, + }, +}; + +/* + * Shrink the non-existent buttons, register the gpio button + * device if there is some + */ +static int __init pb_keys_init(void) +{ + struct gpio_keys_button *gb = gpio_button; + int i, num, good = 0; + + num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); + for (i = 0; i < num; i++) { + gb[i].gpio = get_gpio_by_name(gb[i].desc); + if (gb[i].gpio == -1) + continue; + + if (i != good) + gb[good] = gb[i]; + good++; + } + + if (good) { + mrst_gpio_keys.nbuttons = good; + return platform_device_register(&pb_device); + } + return 0; +} +late_initcall(pb_keys_init); diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c new file mode 100644 index 000000000000..32cd7edd71a0 --- /dev/null +++ b/arch/x86/platform/mrst/vrtc.c @@ -0,0 +1,165 @@ +/* + * vrtc.c: Driver for virtual RTC device on Intel MID platform + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based on RTC CMOS driver. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/platform_device.h> + +#include <asm/mrst.h> +#include <asm/mrst-vrtc.h> +#include <asm/time.h> +#include <asm/fixmap.h> + +static unsigned char __iomem *vrtc_virt_base; + +unsigned char vrtc_cmos_read(unsigned char reg) +{ + unsigned char retval; + + /* vRTC's registers range from 0x0 to 0xD */ + if (reg > 0xd || !vrtc_virt_base) + return 0xff; + + lock_cmos_prefix(reg); + retval = __raw_readb(vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); + return retval; +} +EXPORT_SYMBOL_GPL(vrtc_cmos_read); + +void vrtc_cmos_write(unsigned char val, unsigned char reg) +{ + if (reg > 0xd || !vrtc_virt_base) + return; + + lock_cmos_prefix(reg); + __raw_writeb(val, vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); +} +EXPORT_SYMBOL_GPL(vrtc_cmos_write); + +unsigned long vrtc_get_time(void) +{ + u8 sec, min, hour, mday, mon; + u32 year; + + while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = vrtc_cmos_read(RTC_SECONDS); + min = vrtc_cmos_read(RTC_MINUTES); + hour = vrtc_cmos_read(RTC_HOURS); + mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + mon = vrtc_cmos_read(RTC_MONTH); + year = vrtc_cmos_read(RTC_YEAR); + + /* vRTC YEAR reg contains the offset to 1960 */ + year += 1960; + + printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " + "mon: %d year: %d\n", sec, min, hour, mday, mon, year); + + return mktime(year, mon, mday, hour, min, sec); +} + +/* Only care about the minutes and seconds */ +int vrtc_set_mmss(unsigned long nowtime) +{ + int real_sec, real_min; + int vrtc_min; + + vrtc_min = vrtc_cmos_read(RTC_MINUTES); + + real_sec = nowtime % 60; + real_min = nowtime / 60; + if (((abs(real_min - vrtc_min) + 15)/30) & 1) + real_min += 30; + real_min %= 60; + + vrtc_cmos_write(real_sec, RTC_SECONDS); + vrtc_cmos_write(real_min, RTC_MINUTES); + return 0; +} + +void __init mrst_rtc_init(void) +{ + unsigned long rtc_paddr; + void __iomem *virt_base; + + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); + if (!sfi_mrtc_num) + return; + + rtc_paddr = sfi_mrtc_array[0].phys_addr; + + /* vRTC's register address may not be page aligned */ + set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); + + virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); + virt_base += rtc_paddr & ~PAGE_MASK; + vrtc_virt_base = virt_base; + + x86_platform.get_wallclock = vrtc_get_time; + x86_platform.set_wallclock = vrtc_set_mmss; +} + +/* + * The Moorestown platform has a memory mapped virtual RTC device that emulates + * the programming interface of the RTC. + */ + +static struct resource vrtc_resources[] = { + [0] = { + .flags = IORESOURCE_MEM, + }, + [1] = { + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device vrtc_device = { + .name = "rtc_mrst", + .id = -1, + .resource = vrtc_resources, + .num_resources = ARRAY_SIZE(vrtc_resources), +}; + +/* Register the RTC device if appropriate */ +static int __init mrst_device_create(void) +{ + /* No Moorestown, no device */ + if (!mrst_identify_cpu()) + return -ENODEV; + /* No timer, no device */ + if (!sfi_mrtc_num) + return -ENODEV; + + /* iomem resource */ + vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr; + vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr + + MRST_VRTC_MAP_SZ; + /* irq resource */ + vrtc_resources[1].start = sfi_mrtc_array[0].irq; + vrtc_resources[1].end = sfi_mrtc_array[0].irq; + + return platform_device_register(&vrtc_device); +} + +module_init(mrst_device_create); diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index dd4c281ffe57..ca54875ac795 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address) /* All CPUs enumerated by SFI must be present and enabled */ static void __cpuinit mp_sfi_register_lapic(u8 id) { - if (MAX_APICS - id <= 0) { + if (MAX_LOCAL_APIC - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); + id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ba9caa808a9c..df58e9cad96a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode) /* * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub + * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) */ bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); @@ -1490,7 +1490,7 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; @@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs) struct bau_control *smaster = NULL; struct socket_desc { short num_cpus; - short cpu_number[16]; + short cpu_number[MAX_CPUS_PER_SOCKET]; }; struct uvhub_desc { unsigned short socket_mask; @@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs) sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; + if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { + printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); + return 1; + } } for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) @@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs) bcp->uvhub_master = hmaster; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> blade_processor_id; + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { + printk(KERN_EMERG + "%d cpus per uvhub invalid\n", + bcp->uvhub_cpu); + return 1; + } } nextsocket: socket++; @@ -1595,6 +1605,7 @@ nextsocket: bcp->congested_reps = congested_reps; bcp->congested_period = congested_period; } + return 0; } /* @@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - uv_init_per_cpu(nuvhubs); + if (uv_init_per_cpu(nuvhubs)) { + nobau = 1; + return 0; + } uv_partition_base_pnode = 0x7fffffff; for (uvhub = 0; uvhub < nuvhubs; uvhub++) diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd053b89..632037671746 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) ver = m->apicver; if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); + m->apicid, MAX_LOCAL_APIC); return; } diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 2c7def95f721..4c8dea513b66 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -408,6 +408,9 @@ acpi_ev_match_gpe_method(acpi_handle obj_handle, return_ACPI_STATUS(AE_OK); } + /* Disable the GPE in case it's been enabled already. */ + (void)acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); + /* * Add the GPE information from above to the gpe_event_info block for * use during dispatch of this GPE. diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index 660a2728908d..0cac7ec0d2ec 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -577,9 +577,7 @@ acpi_ns_init_one_device(acpi_handle obj_handle, * as possible (without an NMI being received in the middle of * this) - so disable NMIs and initialize the device: */ - acpi_nmi_disable(); status = acpi_ns_evaluate(info); - acpi_nmi_enable(); if (ACPI_SUCCESS(status)) { walk_info->num_INI++; diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 9fb9d5ac939d..95649d373071 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -130,8 +130,6 @@ struct acpi_battery { unsigned long flags; }; -static int acpi_battery_update(struct acpi_battery *battery); - #define to_acpi_battery(x) container_of(x, struct acpi_battery, bat); inline int acpi_battery_present(struct acpi_battery *battery) @@ -186,9 +184,6 @@ static int acpi_battery_get_property(struct power_supply *psy, int ret = 0; struct acpi_battery *battery = to_acpi_battery(psy); - if (acpi_battery_update(battery)) - return -ENODEV; - if (acpi_battery_present(battery)) { /* run battery update only if it is present */ acpi_battery_get_state(battery); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5718566e00f9..d9926afec110 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { int ret = 0; + int nr_cpu_entries = nr_cpu_ids; + +#ifdef CONFIG_X86 + /* + * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= + * SRAT cpu entries could have different order with that in MADT. + * So go over all cpu entries in SRAT to get apicid to node mapping. + */ + nr_cpu_entries = MAX_LOCAL_APIC; +#endif /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, nr_cpu_ids); + acpi_parse_x2apic_affinity, nr_cpu_entries); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, nr_cpu_ids); + acpi_parse_processor_affinity, nr_cpu_entries); ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 2b6c21d86b98..29ef505c487b 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -705,54 +705,85 @@ static int acpi_bus_get_perf_flags(struct acpi_device *device) } static acpi_status -acpi_bus_extract_wakeup_device_power_package(struct acpi_device *device, - union acpi_object *package) +acpi_bus_extract_wakeup_device_power_package(acpi_handle handle, + struct acpi_device_wakeup *wakeup) { - int i = 0; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *package = NULL; union acpi_object *element = NULL; + acpi_status status; + int i = 0; - if (!device || !package || (package->package.count < 2)) + if (!wakeup) return AE_BAD_PARAMETER; + /* _PRW */ + status = acpi_evaluate_object(handle, "_PRW", NULL, &buffer); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PRW")); + return status; + } + + package = (union acpi_object *)buffer.pointer; + + if (!package || (package->package.count < 2)) { + status = AE_BAD_DATA; + goto out; + } + element = &(package->package.elements[0]); - if (!element) - return AE_BAD_PARAMETER; + if (!element) { + status = AE_BAD_DATA; + goto out; + } if (element->type == ACPI_TYPE_PACKAGE) { if ((element->package.count < 2) || (element->package.elements[0].type != ACPI_TYPE_LOCAL_REFERENCE) - || (element->package.elements[1].type != ACPI_TYPE_INTEGER)) - return AE_BAD_DATA; - device->wakeup.gpe_device = + || (element->package.elements[1].type != ACPI_TYPE_INTEGER)) { + status = AE_BAD_DATA; + goto out; + } + wakeup->gpe_device = element->package.elements[0].reference.handle; - device->wakeup.gpe_number = + wakeup->gpe_number = (u32) element->package.elements[1].integer.value; } else if (element->type == ACPI_TYPE_INTEGER) { - device->wakeup.gpe_number = element->integer.value; - } else - return AE_BAD_DATA; + wakeup->gpe_device = NULL; + wakeup->gpe_number = element->integer.value; + } else { + status = AE_BAD_DATA; + goto out; + } element = &(package->package.elements[1]); if (element->type != ACPI_TYPE_INTEGER) { - return AE_BAD_DATA; + status = AE_BAD_DATA; + goto out; } - device->wakeup.sleep_state = element->integer.value; + wakeup->sleep_state = element->integer.value; if ((package->package.count - 2) > ACPI_MAX_HANDLES) { - return AE_NO_MEMORY; + status = AE_NO_MEMORY; + goto out; } - device->wakeup.resources.count = package->package.count - 2; - for (i = 0; i < device->wakeup.resources.count; i++) { + wakeup->resources.count = package->package.count - 2; + for (i = 0; i < wakeup->resources.count; i++) { element = &(package->package.elements[i + 2]); - if (element->type != ACPI_TYPE_LOCAL_REFERENCE) - return AE_BAD_DATA; + if (element->type != ACPI_TYPE_LOCAL_REFERENCE) { + status = AE_BAD_DATA; + goto out; + } - device->wakeup.resources.handles[i] = element->reference.handle; + wakeup->resources.handles[i] = element->reference.handle; } - acpi_gpe_can_wake(device->wakeup.gpe_device, device->wakeup.gpe_number); + acpi_gpe_can_wake(wakeup->gpe_device, wakeup->gpe_number); - return AE_OK; + out: + kfree(buffer.pointer); + + return status; } static void acpi_bus_set_run_wake_flags(struct acpi_device *device) @@ -787,26 +818,15 @@ static void acpi_bus_set_run_wake_flags(struct acpi_device *device) static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device) { acpi_status status = 0; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *package = NULL; int psw_error; - /* _PRW */ - status = acpi_evaluate_object(device->handle, "_PRW", NULL, &buffer); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PRW")); - goto end; - } - - package = (union acpi_object *)buffer.pointer; - status = acpi_bus_extract_wakeup_device_power_package(device, package); + status = acpi_bus_extract_wakeup_device_power_package(device->handle, + &device->wakeup); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "Extracting _PRW package")); goto end; } - kfree(buffer.pointer); - device->wakeup.flags.valid = 1; device->wakeup.prepare_count = 0; acpi_bus_set_run_wake_flags(device); @@ -1351,6 +1371,7 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl, struct acpi_bus_ops *ops = context; int type; unsigned long long sta; + struct acpi_device_wakeup wakeup; struct acpi_device *device; acpi_status status; int result; @@ -1360,8 +1381,10 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl, return AE_OK; if (!(sta & ACPI_STA_DEVICE_PRESENT) && - !(sta & ACPI_STA_DEVICE_FUNCTIONING)) + !(sta & ACPI_STA_DEVICE_FUNCTIONING)) { + acpi_bus_extract_wakeup_device_power_package(handle, &wakeup); return AE_CTRL_DEPTH; + } /* * We may already have an acpi_device from a previous enumeration. If diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c index a6e6c963b6ae..628c8fae5937 100644 --- a/drivers/ata/pata_cs5536.c +++ b/drivers/ata/pata_cs5536.c @@ -44,6 +44,8 @@ static int use_msr; module_param_named(msr, use_msr, int, 0644); MODULE_PARM_DESC(msr, "Force using MSR to configure IDE function (Default: 0)"); #else +#undef rdmsr /* avoid accidental MSR usage on, e.g. x86-64 */ +#undef wrmsr #define rdmsr(x, y, z) do { } while (0) #define wrmsr(x, y, z) do { } while (0) #define use_msr 0 diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 42396df55556..9252e85706ef 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -38,7 +38,7 @@ static int agp_bridges_found; static void amd64_tlbflush(struct agp_memory *temp) { - k8_flush_garts(); + amd_flush_garts(); } static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) @@ -124,7 +124,7 @@ static int amd64_fetch_size(void) u32 temp; struct aper_size_info_32 *values; - dev = k8_northbridges.nb_misc[0]; + dev = node_to_amd_nb(0)->misc; if (dev==NULL) return 0; @@ -181,16 +181,15 @@ static int amd_8151_configure(void) unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; /* Configure AGP regs in each x86-64 host bridge. */ - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { agp_bridge->gart_bus_addr = - amd64_configure(k8_northbridges.nb_misc[i], - gatt_bus); + amd64_configure(node_to_amd_nb(i)->misc, gatt_bus); } - k8_flush_garts(); + amd_flush_garts(); return 0; } @@ -200,11 +199,11 @@ static void amd64_cleanup(void) u32 tmp; int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~GARTEN; @@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr) { int i; - if (cache_k8_northbridges() < 0) + if (amd_cache_northbridges() < 0) return -ENODEV; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return -ENODEV; i = 0; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; if (fix_northbridge(dev, pdev, cap_ptr) < 0) { dev_err(&dev->dev, "no usable aperture found\n"); #ifdef __x86_64__ @@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev) } /* shadow x86-64 registers into ULi registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &httfea); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev) pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); /* shadow x86-64 registers into NVIDIA registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &apbase); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -778,7 +777,7 @@ int __init agp_amd64_init(void) } /* First check that we have at least one AMD64 NB */ - if (!pci_dev_present(k8_nb_ids)) + if (!pci_dev_present(amd_nb_misc_ids)) return -ENODEV; /* Look for any AGP bridge */ diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c index 73dcb0ee41fd..d3d63be2cd37 100644 --- a/drivers/char/ramoops.c +++ b/drivers/char/ramoops.c @@ -29,7 +29,6 @@ #include <linux/ramoops.h> #define RAMOOPS_KERNMSG_HDR "====" -#define RAMOOPS_HEADER_SIZE (5 + sizeof(struct timeval)) #define RECORD_SIZE 4096 @@ -65,8 +64,8 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, struct ramoops_context, dump); unsigned long s1_start, s2_start; unsigned long l1_cpy, l2_cpy; - int res; - char *buf; + int res, hdr_size; + char *buf, *buf_orig; struct timeval timestamp; /* Only dump oopses if dump_oops is set */ @@ -74,6 +73,8 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, return; buf = (char *)(cxt->virt_addr + (cxt->count * RECORD_SIZE)); + buf_orig = buf; + memset(buf, '\0', RECORD_SIZE); res = sprintf(buf, "%s", RAMOOPS_KERNMSG_HDR); buf += res; @@ -81,8 +82,9 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper, res = sprintf(buf, "%lu.%lu\n", (long)timestamp.tv_sec, (long)timestamp.tv_usec); buf += res; - l2_cpy = min(l2, (unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE)); - l1_cpy = min(l1, (unsigned long)(RECORD_SIZE - RAMOOPS_HEADER_SIZE) - l2_cpy); + hdr_size = buf - buf_orig; + l2_cpy = min(l2, (unsigned long)(RECORD_SIZE - hdr_size)); + l1_cpy = min(l1, (unsigned long)(RECORD_SIZE - hdr_size) - l2_cpy); s2_start = l2 - l2_cpy; s1_start = l1 - l1_cpy; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c63a43823744..1109f6848a43 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, (unsigned long)freqs->cpu); trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu); + trace_cpu_frequency(freqs->new, freqs->cpu); srcu_notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_POSTCHANGE, freqs); if (likely(policy) && likely(policy->cpu == freqs->cpu)) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index a50710843378..08d5f05378d9 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -107,6 +107,7 @@ static void cpuidle_idle_call(void) if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev); trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /** diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 411d5bf50fc4..a25f5f61e0e0 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -449,7 +449,7 @@ mv_xor_slot_cleanup(struct mv_xor_chan *mv_chan) static void mv_xor_tasklet(unsigned long data) { struct mv_xor_chan *chan = (struct mv_xor_chan *) data; - __mv_xor_slot_cleanup(chan); + mv_xor_slot_cleanup(chan); } static struct mv_xor_desc_slot * diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index eca9ba193e94..df211181fca4 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void) opstate_init(); - if (cache_k8_northbridges() < 0) + if (amd_cache_northbridges() < 0) goto err_ret; msrs = msrs_alloc(); @@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void) * to finish initialization of the MC instances. */ err = -ENODEV; - for (nb = 0; nb < k8_northbridges.num; nb++) { + for (nb = 0; nb < amd_nb_num(); nb++) { if (!pvt_lookup[nb]) continue; diff --git a/drivers/gpu/drm/i915/dvo_ch7017.c b/drivers/gpu/drm/i915/dvo_ch7017.c index af70337567ce..d3e8c540f778 100644 --- a/drivers/gpu/drm/i915/dvo_ch7017.c +++ b/drivers/gpu/drm/i915/dvo_ch7017.c @@ -242,7 +242,7 @@ fail: static enum drm_connector_status ch7017_detect(struct intel_dvo_device *dvo) { - return connector_status_unknown; + return connector_status_connected; } static enum drm_mode_status ch7017_mode_valid(struct intel_dvo_device *dvo, diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index e6800819bca8..cb900dc83d95 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -34,6 +34,7 @@ #include "i915_drm.h" #include "i915_drv.h" #include "i915_trace.h" +#include "../../../platform/x86/intel_ips.h" #include <linux/pci.h> #include <linux/vgaarb.h> #include <linux/acpi.h> @@ -1871,6 +1872,26 @@ out_unlock: EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable); /** + * Tells the intel_ips driver that the i915 driver is now loaded, if + * IPS got loaded first. + * + * This awkward dance is so that neither module has to depend on the + * other in order for IPS to do the appropriate communication of + * GPU turbo limits to i915. + */ +static void +ips_ping_for_i915_load(void) +{ + void (*link)(void); + + link = symbol_get(ips_link_to_i915_driver); + if (link) { + link(); + symbol_put(ips_link_to_i915_driver); + } +} + +/** * i915_driver_load - setup chip and create an initial config * @dev: DRM device * @flags: startup flags @@ -2075,6 +2096,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) dev_priv->mchdev_lock = &mchdev_lock; spin_unlock(&mchdev_lock); + ips_ping_for_i915_load(); + return 0; out_workqueue_free: diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 878fc766a12c..cb8f43429279 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2471,6 +2471,9 @@ # define MARIUNIT_CLOCK_GATE_DISABLE (1 << 18) # define SVSMUNIT_CLOCK_GATE_DISABLE (1 << 1) +#define PCH_3DCGDIS1 0x46024 +# define VFMUNIT_CLOCK_GATE_DISABLE (1 << 11) + #define FDI_PLL_FREQ_CTL 0x46030 #define FDI_PLL_FREQ_CHANGE_REQUEST (1<<24) #define FDI_PLL_FREQ_LOCK_LIMIT_MASK 0xfff00 @@ -2588,6 +2591,13 @@ #define ILK_DISPLAY_CHICKEN2 0x42004 #define ILK_DPARB_GATE (1<<22) #define ILK_VSDPFD_FULL (1<<21) +#define ILK_DISPLAY_CHICKEN_FUSES 0x42014 +#define ILK_INTERNAL_GRAPHICS_DISABLE (1<<31) +#define ILK_INTERNAL_DISPLAY_DISABLE (1<<30) +#define ILK_DISPLAY_DEBUG_DISABLE (1<<29) +#define ILK_HDCP_DISABLE (1<<25) +#define ILK_eDP_A_DISABLE (1<<24) +#define ILK_DESKTOP (1<<23) #define ILK_DSPCLK_GATE 0x42020 #define ILK_DPARB_CLK_GATE (1<<5) /* According to spec this bit 7/8/9 of 0x42020 should be set to enable FBC */ diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 2b2078695d2a..b0b1200ed650 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -270,7 +270,7 @@ parse_general_features(struct drm_i915_private *dev_priv, general->ssc_freq ? 66 : 48; else if (IS_GEN5(dev) || IS_GEN6(dev)) dev_priv->lvds_ssc_freq = - general->ssc_freq ? 120 : 100; + general->ssc_freq ? 100 : 120; else dev_priv->lvds_ssc_freq = general->ssc_freq ? 100 : 96; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index d9b7092439ef..fca523288aca 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5379,6 +5379,23 @@ static int intel_encoder_clones(struct drm_device *dev, int type_mask) return index_mask; } +static bool has_edp_a(struct drm_device *dev) +{ + struct drm_i915_private *dev_priv = dev->dev_private; + + if (!IS_MOBILE(dev)) + return false; + + if ((I915_READ(DP_A) & DP_DETECTED) == 0) + return false; + + if (IS_GEN5(dev) && + (I915_READ(ILK_DISPLAY_CHICKEN_FUSES) & ILK_eDP_A_DISABLE)) + return false; + + return true; +} + static void intel_setup_outputs(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; @@ -5396,7 +5413,7 @@ static void intel_setup_outputs(struct drm_device *dev) if (HAS_PCH_SPLIT(dev)) { dpd_is_edp = intel_dpd_is_edp(dev); - if (IS_MOBILE(dev) && (I915_READ(DP_A) & DP_DETECTED)) + if (has_edp_a(dev)) intel_dp_init(dev, DP_A); if (dpd_is_edp && (I915_READ(PCH_DP_D) & DP_DETECTED)) @@ -5825,6 +5842,8 @@ void intel_init_clock_gating(struct drm_device *dev) I915_WRITE(PCH_3DCGDIS0, MARIUNIT_CLOCK_GATE_DISABLE | SVSMUNIT_CLOCK_GATE_DISABLE); + I915_WRITE(PCH_3DCGDIS1, + VFMUNIT_CLOCK_GATE_DISABLE); } I915_WRITE(PCH_DSPCLK_GATE_D, dspclk_gate); diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index 27e63abf2a73..6bc42fa2a6ec 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -2040,13 +2040,14 @@ intel_sdvo_dvi_init(struct intel_sdvo *intel_sdvo, int device) SDVO_COLORIMETRY_RGB256); connector->connector_type = DRM_MODE_CONNECTOR_HDMIA; - intel_sdvo_add_hdmi_properties(intel_sdvo_connector); intel_sdvo->is_hdmi = true; } intel_sdvo->base.clone_mask = ((1 << INTEL_SDVO_NON_TV_CLONE_BIT) | (1 << INTEL_ANALOG_CLONE_BIT)); intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo); + if (intel_sdvo->is_hdmi) + intel_sdvo_add_hdmi_properties(intel_sdvo_connector); return true; } diff --git a/drivers/hwmon/s3c-hwmon.c b/drivers/hwmon/s3c-hwmon.c index 05248f2d7581..92b42db43bcf 100644 --- a/drivers/hwmon/s3c-hwmon.c +++ b/drivers/hwmon/s3c-hwmon.c @@ -234,7 +234,6 @@ static int s3c_hwmon_create_attr(struct device *dev, attr->index = channel; attr->dev_attr.attr.name = attrs->in_name; attr->dev_attr.attr.mode = S_IRUGO; - attr->dev_attr.attr.owner = THIS_MODULE; attr->dev_attr.show = s3c_hwmon_ch_show; ret = device_create_file(dev, &attr->dev_attr); @@ -252,7 +251,6 @@ static int s3c_hwmon_create_attr(struct device *dev, attr->index = channel; attr->dev_attr.attr.name = attrs->label_name; attr->dev_attr.attr.mode = S_IRUGO; - attr->dev_attr.attr.owner = THIS_MODULE; attr->dev_attr.show = s3c_hwmon_label_show; ret = device_create_file(dev, &attr->dev_attr); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c131d58bcb50..56ac09d6c930 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -220,9 +220,8 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state) kt_before = ktime_get_real(); stop_critical_timings(); -#ifndef MODULE trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu); -#endif + trace_cpu_idle((eax >> 4) + 1, cpu); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index dfb198d0415b..f16461844c5c 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -1989,8 +1989,23 @@ static int cx25840_probe(struct i2c_client *client, v4l2_ctrl_new_std(&state->hdl, &cx25840_ctrl_ops, V4L2_CID_HUE, -128, 127, 1, 0); if (!is_cx2583x(state)) { - default_volume = 228 - cx25840_read(client, 0x8d4); - default_volume = ((default_volume / 2) + 23) << 9; + default_volume = cx25840_read(client, 0x8d4); + /* + * Enforce the legacy PVR-350/MSP3400 to PVR-150/CX25843 volume + * scale mapping limits to avoid -ERANGE errors when + * initializing the volume control + */ + if (default_volume > 228) { + /* Bottom out at -96 dB, v4l2 vol range 0x2e00-0x2fff */ + default_volume = 228; + cx25840_write(client, 0x8d4, 228); + } + else if (default_volume < 20) { + /* Top out at + 8 dB, v4l2 vol range 0xfe00-0xffff */ + default_volume = 20; + cx25840_write(client, 0x8d4, 20); + } + default_volume = (((228 - default_volume) >> 1) + 23) << 9; state->volume = v4l2_ctrl_new_std(&state->hdl, &cx25840_audio_ctrl_ops, V4L2_CID_AUDIO_VOLUME, diff --git a/drivers/media/video/cx88/cx88-alsa.c b/drivers/media/video/cx88/cx88-alsa.c index 4aaa47c0eabf..54b7fcd469a8 100644 --- a/drivers/media/video/cx88/cx88-alsa.c +++ b/drivers/media/video/cx88/cx88-alsa.c @@ -40,7 +40,6 @@ #include <sound/control.h> #include <sound/initval.h> #include <sound/tlv.h> -#include <media/wm8775.h> #include "cx88.h" #include "cx88-reg.h" @@ -587,47 +586,26 @@ static int snd_cx88_volume_put(struct snd_kcontrol *kcontrol, int left, right, v, b; int changed = 0; u32 old; - struct v4l2_control client_ctl; - - /* Pass volume & balance onto any WM8775 */ - if (value->value.integer.value[0] >= value->value.integer.value[1]) { - v = value->value.integer.value[0] << 10; - b = value->value.integer.value[0] ? - (0x8000 * value->value.integer.value[1]) / value->value.integer.value[0] : - 0x8000; - } else { - v = value->value.integer.value[1] << 10; - b = value->value.integer.value[1] ? - 0xffff - (0x8000 * value->value.integer.value[0]) / value->value.integer.value[1] : - 0x8000; - } - client_ctl.value = v; - client_ctl.id = V4L2_CID_AUDIO_VOLUME; - call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl); - - client_ctl.value = b; - client_ctl.id = V4L2_CID_AUDIO_BALANCE; - call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl); left = value->value.integer.value[0] & 0x3f; right = value->value.integer.value[1] & 0x3f; b = right - left; if (b < 0) { - v = 0x3f - left; - b = (-b) | 0x40; + v = 0x3f - left; + b = (-b) | 0x40; } else { - v = 0x3f - right; + v = 0x3f - right; } /* Do we really know this will always be called with IRQs on? */ spin_lock_irq(&chip->reg_lock); old = cx_read(AUD_VOL_CTL); if (v != (old & 0x3f)) { - cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, (old & ~0x3f) | v); - changed = 1; + cx_write(AUD_VOL_CTL, (old & ~0x3f) | v); + changed = 1; } - if ((cx_read(AUD_BAL_CTL) & 0x7f) != b) { - cx_write(AUD_BAL_CTL, b); - changed = 1; + if (cx_read(AUD_BAL_CTL) != b) { + cx_write(AUD_BAL_CTL, b); + changed = 1; } spin_unlock_irq(&chip->reg_lock); @@ -640,7 +618,7 @@ static const struct snd_kcontrol_new snd_cx88_volume = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_TLV_READ, - .name = "Analog-TV Volume", + .name = "Playback Volume", .info = snd_cx88_volume_info, .get = snd_cx88_volume_get, .put = snd_cx88_volume_put, @@ -671,14 +649,7 @@ static int snd_cx88_switch_put(struct snd_kcontrol *kcontrol, vol = cx_read(AUD_VOL_CTL); if (value->value.integer.value[0] != !(vol & bit)) { vol ^= bit; - cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, vol); - /* Pass mute onto any WM8775 */ - if ((1<<6) == bit) { - struct v4l2_control client_ctl; - client_ctl.value = 0 != (vol & bit); - client_ctl.id = V4L2_CID_AUDIO_MUTE; - call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl); - } + cx_write(AUD_VOL_CTL, vol); ret = 1; } spin_unlock_irq(&chip->reg_lock); @@ -687,7 +658,7 @@ static int snd_cx88_switch_put(struct snd_kcontrol *kcontrol, static const struct snd_kcontrol_new snd_cx88_dac_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, - .name = "Audio-Out Switch", + .name = "Playback Switch", .info = snd_ctl_boolean_mono_info, .get = snd_cx88_switch_get, .put = snd_cx88_switch_put, @@ -696,49 +667,13 @@ static const struct snd_kcontrol_new snd_cx88_dac_switch = { static const struct snd_kcontrol_new snd_cx88_source_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, - .name = "Analog-TV Switch", + .name = "Capture Switch", .info = snd_ctl_boolean_mono_info, .get = snd_cx88_switch_get, .put = snd_cx88_switch_put, .private_value = (1<<6), }; -static int snd_cx88_alc_get(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *value) -{ - snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol); - struct cx88_core *core = chip->core; - struct v4l2_control client_ctl; - - client_ctl.id = V4L2_CID_AUDIO_LOUDNESS; - call_hw(core, WM8775_GID, core, g_ctrl, &client_ctl); - value->value.integer.value[0] = client_ctl.value ? 1 : 0; - - return 0; -} - -static int snd_cx88_alc_put(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *value) -{ - snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol); - struct cx88_core *core = chip->core; - struct v4l2_control client_ctl; - - client_ctl.value = 0 != value->value.integer.value[0]; - client_ctl.id = V4L2_CID_AUDIO_LOUDNESS; - call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl); - - return 0; -} - -static struct snd_kcontrol_new snd_cx88_alc_switch = { - .iface = SNDRV_CTL_ELEM_IFACE_MIXER, - .name = "Line-In ALC Switch", - .info = snd_ctl_boolean_mono_info, - .get = snd_cx88_alc_get, - .put = snd_cx88_alc_put, -}; - /**************************************************************************** Basic Flow for Sound Devices ****************************************************************************/ @@ -860,7 +795,6 @@ static int __devinit cx88_audio_initdev(struct pci_dev *pci, { struct snd_card *card; snd_cx88_card_t *chip; - struct v4l2_subdev *sd; int err; if (devno >= SNDRV_CARDS) @@ -896,15 +830,6 @@ static int __devinit cx88_audio_initdev(struct pci_dev *pci, if (err < 0) goto error; - /* If there's a wm8775 then add a Line-In ALC switch */ - list_for_each_entry(sd, &chip->core->v4l2_dev.subdevs, list) { - if (WM8775_GID == sd->grp_id) { - snd_ctl_add(card, snd_ctl_new1(&snd_cx88_alc_switch, - chip)); - break; - } - } - strcpy (card->driver, "CX88x"); sprintf(card->shortname, "Conexant CX%x", pci->device); sprintf(card->longname, "%s at %#llx", diff --git a/drivers/media/video/cx88/cx88-cards.c b/drivers/media/video/cx88/cx88-cards.c index 9b9e169cce90..0ccc2afd7266 100644 --- a/drivers/media/video/cx88/cx88-cards.c +++ b/drivers/media/video/cx88/cx88-cards.c @@ -1007,15 +1007,22 @@ static const struct cx88_board cx88_boards[] = { .radio_type = UNSET, .tuner_addr = ADDR_UNSET, .radio_addr = ADDR_UNSET, + .audio_chip = V4L2_IDENT_WM8775, .input = {{ .type = CX88_VMUX_DVB, .vmux = 0, + /* 2: Line-In */ + .audioroute = 2, },{ .type = CX88_VMUX_COMPOSITE1, .vmux = 1, + /* 2: Line-In */ + .audioroute = 2, },{ .type = CX88_VMUX_SVIDEO, .vmux = 2, + /* 2: Line-In */ + .audioroute = 2, }}, .mpeg = CX88_MPEG_DVB, }, diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index 62cea9549404..d9249e5a04c9 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -40,7 +40,6 @@ #include "cx88.h" #include <media/v4l2-common.h> #include <media/v4l2-ioctl.h> -#include <media/wm8775.h> MODULE_DESCRIPTION("v4l2 driver module for cx2388x based TV cards"); MODULE_AUTHOR("Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]"); @@ -977,7 +976,6 @@ int cx88_set_control(struct cx88_core *core, struct v4l2_control *ctl) const struct cx88_ctrl *c = NULL; u32 value,mask; int i; - struct v4l2_control client_ctl; for (i = 0; i < CX8800_CTLS; i++) { if (cx8800_ctls[i].v.id == ctl->id) { @@ -991,27 +989,6 @@ int cx88_set_control(struct cx88_core *core, struct v4l2_control *ctl) ctl->value = c->v.minimum; if (ctl->value > c->v.maximum) ctl->value = c->v.maximum; - - /* Pass changes onto any WM8775 */ - client_ctl.id = ctl->id; - switch (ctl->id) { - case V4L2_CID_AUDIO_MUTE: - client_ctl.value = ctl->value; - break; - case V4L2_CID_AUDIO_VOLUME: - client_ctl.value = (ctl->value) ? - (0x90 + ctl->value) << 8 : 0; - break; - case V4L2_CID_AUDIO_BALANCE: - client_ctl.value = ctl->value << 9; - break; - default: - client_ctl.id = 0; - break; - } - if (client_ctl.id) - call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl); - mask=c->mask; switch (ctl->id) { case V4L2_CID_AUDIO_BALANCE: @@ -1558,9 +1535,7 @@ static int radio_queryctrl (struct file *file, void *priv, if (c->id < V4L2_CID_BASE || c->id >= V4L2_CID_LASTP1) return -EINVAL; - if (c->id == V4L2_CID_AUDIO_MUTE || - c->id == V4L2_CID_AUDIO_VOLUME || - c->id == V4L2_CID_AUDIO_BALANCE) { + if (c->id == V4L2_CID_AUDIO_MUTE) { for (i = 0; i < CX8800_CTLS; i++) { if (cx8800_ctls[i].v.id == c->id) break; diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h index e8c732e7ae4f..c9981e77416a 100644 --- a/drivers/media/video/cx88/cx88.h +++ b/drivers/media/video/cx88/cx88.h @@ -398,19 +398,17 @@ static inline struct cx88_core *to_core(struct v4l2_device *v4l2_dev) return container_of(v4l2_dev, struct cx88_core, v4l2_dev); } -#define call_hw(core, grpid, o, f, args...) \ +#define call_all(core, o, f, args...) \ do { \ if (!core->i2c_rc) { \ if (core->gate_ctrl) \ core->gate_ctrl(core, 1); \ - v4l2_device_call_all(&core->v4l2_dev, grpid, o, f, ##args); \ + v4l2_device_call_all(&core->v4l2_dev, 0, o, f, ##args); \ if (core->gate_ctrl) \ core->gate_ctrl(core, 0); \ } \ } while (0) -#define call_all(core, o, f, args...) call_hw(core, 0, o, f, ##args) - struct cx8800_dev; struct cx8802_dev; diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c index 908e3bc88303..2c3007280032 100644 --- a/drivers/media/video/em28xx/em28xx-video.c +++ b/drivers/media/video/em28xx/em28xx-video.c @@ -2377,7 +2377,7 @@ static const struct v4l2_file_operations radio_fops = { .owner = THIS_MODULE, .open = em28xx_v4l2_open, .release = em28xx_v4l2_close, - .ioctl = video_ioctl2, + .unlocked_ioctl = video_ioctl2, }; static const struct v4l2_ioctl_ops radio_ioctl_ops = { diff --git a/drivers/media/video/soc_camera.c b/drivers/media/video/soc_camera.c index 335120c2021b..052bd6dfa5a7 100644 --- a/drivers/media/video/soc_camera.c +++ b/drivers/media/video/soc_camera.c @@ -405,13 +405,13 @@ static int soc_camera_open(struct file *file) ret = soc_camera_set_fmt(icd, &f); if (ret < 0) goto esfmt; + + ici->ops->init_videobuf(&icd->vb_vidq, icd); } file->private_data = icd; dev_dbg(&icd->dev, "camera device open\n"); - ici->ops->init_videobuf(&icd->vb_vidq, icd); - mutex_unlock(&icd->video_lock); return 0; diff --git a/drivers/media/video/wm8775.c b/drivers/media/video/wm8775.c index 135525649086..fe8ef6419f83 100644 --- a/drivers/media/video/wm8775.c +++ b/drivers/media/video/wm8775.c @@ -35,7 +35,6 @@ #include <media/v4l2-device.h> #include <media/v4l2-chip-ident.h> #include <media/v4l2-ctrls.h> -#include <media/wm8775.h> MODULE_DESCRIPTION("wm8775 driver"); MODULE_AUTHOR("Ulf Eklund, Hans Verkuil"); @@ -51,16 +50,10 @@ enum { TOT_REGS }; -#define ALC_HOLD 0x85 /* R17: use zero cross detection, ALC hold time 42.6 ms */ -#define ALC_EN 0x100 /* R17: ALC enable */ - struct wm8775_state { struct v4l2_subdev sd; struct v4l2_ctrl_handler hdl; struct v4l2_ctrl *mute; - struct v4l2_ctrl *vol; - struct v4l2_ctrl *bal; - struct v4l2_ctrl *loud; u8 input; /* Last selected input (0-0xf) */ }; @@ -92,30 +85,6 @@ static int wm8775_write(struct v4l2_subdev *sd, int reg, u16 val) return -1; } -static void wm8775_set_audio(struct v4l2_subdev *sd, int quietly) -{ - struct wm8775_state *state = to_state(sd); - u8 vol_l, vol_r; - int muted = 0 != state->mute->val; - u16 volume = (u16)state->vol->val; - u16 balance = (u16)state->bal->val; - - /* normalize ( 65535 to 0 -> 255 to 0 (+24dB to -103dB) ) */ - vol_l = (min(65536 - balance, 32768) * volume) >> 23; - vol_r = (min(balance, (u16)32768) * volume) >> 23; - - /* Mute */ - if (muted || quietly) - wm8775_write(sd, R21, 0x0c0 | state->input); - - wm8775_write(sd, R14, vol_l | 0x100); /* 0x100= Left channel ADC zero cross enable */ - wm8775_write(sd, R15, vol_r | 0x100); /* 0x100= Right channel ADC zero cross enable */ - - /* Un-mute */ - if (!muted) - wm8775_write(sd, R21, state->input); -} - static int wm8775_s_routing(struct v4l2_subdev *sd, u32 input, u32 output, u32 config) { @@ -133,26 +102,25 @@ static int wm8775_s_routing(struct v4l2_subdev *sd, state->input = input; if (!v4l2_ctrl_g_ctrl(state->mute)) return 0; - if (!v4l2_ctrl_g_ctrl(state->vol)) - return 0; - if (!v4l2_ctrl_g_ctrl(state->bal)) - return 0; - wm8775_set_audio(sd, 1); + wm8775_write(sd, R21, 0x0c0); + wm8775_write(sd, R14, 0x1d4); + wm8775_write(sd, R15, 0x1d4); + wm8775_write(sd, R21, 0x100 + state->input); return 0; } static int wm8775_s_ctrl(struct v4l2_ctrl *ctrl) { struct v4l2_subdev *sd = to_sd(ctrl); + struct wm8775_state *state = to_state(sd); switch (ctrl->id) { case V4L2_CID_AUDIO_MUTE: - case V4L2_CID_AUDIO_VOLUME: - case V4L2_CID_AUDIO_BALANCE: - wm8775_set_audio(sd, 0); - return 0; - case V4L2_CID_AUDIO_LOUDNESS: - wm8775_write(sd, R17, (ctrl->val ? ALC_EN : 0) | ALC_HOLD); + wm8775_write(sd, R21, 0x0c0); + wm8775_write(sd, R14, 0x1d4); + wm8775_write(sd, R15, 0x1d4); + if (!ctrl->val) + wm8775_write(sd, R21, 0x100 + state->input); return 0; } return -EINVAL; @@ -176,7 +144,16 @@ static int wm8775_log_status(struct v4l2_subdev *sd) static int wm8775_s_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *freq) { - wm8775_set_audio(sd, 0); + struct wm8775_state *state = to_state(sd); + + /* If I remove this, then it can happen that I have no + sound the first time I tune from static to a valid channel. + It's difficult to reproduce and is almost certainly related + to the zero cross detect circuit. */ + wm8775_write(sd, R21, 0x0c0); + wm8775_write(sd, R14, 0x1d4); + wm8775_write(sd, R15, 0x1d4); + wm8775_write(sd, R21, 0x100 + state->input); return 0; } @@ -226,7 +203,6 @@ static int wm8775_probe(struct i2c_client *client, { struct wm8775_state *state; struct v4l2_subdev *sd; - int err; /* Check if the adapter supports the needed features */ if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) @@ -240,21 +216,15 @@ static int wm8775_probe(struct i2c_client *client, return -ENOMEM; sd = &state->sd; v4l2_i2c_subdev_init(sd, client, &wm8775_ops); - sd->grp_id = WM8775_GID; /* subdev group id */ state->input = 2; - v4l2_ctrl_handler_init(&state->hdl, 4); + v4l2_ctrl_handler_init(&state->hdl, 1); state->mute = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0); - state->vol = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops, - V4L2_CID_AUDIO_VOLUME, 0, 65535, (65535+99)/100, 0xCF00); /* 0dB*/ - state->bal = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops, - V4L2_CID_AUDIO_BALANCE, 0, 65535, (65535+99)/100, 32768); - state->loud = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops, - V4L2_CID_AUDIO_LOUDNESS, 0, 1, 1, 1); sd->ctrl_handler = &state->hdl; - err = state->hdl.error; - if (err) { + if (state->hdl.error) { + int err = state->hdl.error; + v4l2_ctrl_handler_free(&state->hdl); kfree(state); return err; @@ -266,25 +236,29 @@ static int wm8775_probe(struct i2c_client *client, wm8775_write(sd, R23, 0x000); /* Disable zero cross detect timeout */ wm8775_write(sd, R7, 0x000); - /* HPF enable, I2S mode, 24-bit */ - wm8775_write(sd, R11, 0x022); + /* Left justified, 24-bit mode */ + wm8775_write(sd, R11, 0x021); /* Master mode, clock ratio 256fs */ wm8775_write(sd, R12, 0x102); /* Powered up */ wm8775_write(sd, R13, 0x000); - /* ALC stereo, ALC target level -5dB FS, ALC max gain +8dB */ - wm8775_write(sd, R16, 0x1bb); - /* Set ALC mode and hold time */ - wm8775_write(sd, R17, (state->loud->val ? ALC_EN : 0) | ALC_HOLD); + /* ADC gain +2.5dB, enable zero cross */ + wm8775_write(sd, R14, 0x1d4); + /* ADC gain +2.5dB, enable zero cross */ + wm8775_write(sd, R15, 0x1d4); + /* ALC Stereo, ALC target level -1dB FS max gain +8dB */ + wm8775_write(sd, R16, 0x1bf); + /* Enable gain control, use zero cross detection, + ALC hold time 42.6 ms */ + wm8775_write(sd, R17, 0x185); /* ALC gain ramp up delay 34 s, ALC gain ramp down delay 33 ms */ wm8775_write(sd, R18, 0x0a2); /* Enable noise gate, threshold -72dBfs */ wm8775_write(sd, R19, 0x005); - /* Transient window 4ms, ALC min gain -5dB */ - wm8775_write(sd, R20, 0x0fb); - - wm8775_set_audio(sd, 1); /* set volume/mute/mux */ - + /* Transient window 4ms, lower PGA gain limit -1dB */ + wm8775_write(sd, R20, 0x07a); + /* LRBOTH = 1, use input 2. */ + wm8775_write(sd, R21, 0x102); return 0; } diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 31ae07a36576..57dcf8fa774a 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1773,6 +1773,7 @@ int mmc_pm_notify(struct notifier_block *notify_block, case PM_POST_SUSPEND: case PM_POST_HIBERNATION: + case PM_POST_RESTORE: spin_lock_irqsave(&host->lock, flags); host->rescan_disable = 0; diff --git a/drivers/mmc/host/at91_mci.c b/drivers/mmc/host/at91_mci.c index 591ab540b407..d3e6a962f423 100644 --- a/drivers/mmc/host/at91_mci.c +++ b/drivers/mmc/host/at91_mci.c @@ -69,6 +69,7 @@ #include <linux/highmem.h> #include <linux/mmc/host.h> +#include <linux/mmc/sdio.h> #include <asm/io.h> #include <asm/irq.h> @@ -493,10 +494,14 @@ static void at91_mci_send_command(struct at91mci_host *host, struct mmc_command else if (data->flags & MMC_DATA_WRITE) cmdr |= AT91_MCI_TRCMD_START; - if (data->flags & MMC_DATA_STREAM) - cmdr |= AT91_MCI_TRTYP_STREAM; - if (data->blocks > 1) - cmdr |= AT91_MCI_TRTYP_MULTIPLE; + if (cmd->opcode == SD_IO_RW_EXTENDED) { + cmdr |= AT91_MCI_TRTYP_SDIO_BLOCK; + } else { + if (data->flags & MMC_DATA_STREAM) + cmdr |= AT91_MCI_TRTYP_STREAM; + if (data->blocks > 1) + cmdr |= AT91_MCI_TRTYP_MULTIPLE; + } } else { block_length = 0; diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 301351a5d838..ad2a7a032cdf 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -26,6 +26,7 @@ #include <linux/stat.h> #include <linux/mmc/host.h> +#include <linux/mmc/sdio.h> #include <mach/atmel-mci.h> #include <linux/atmel-mci.h> @@ -532,12 +533,17 @@ static u32 atmci_prepare_command(struct mmc_host *mmc, data = cmd->data; if (data) { cmdr |= MCI_CMDR_START_XFER; - if (data->flags & MMC_DATA_STREAM) - cmdr |= MCI_CMDR_STREAM; - else if (data->blocks > 1) - cmdr |= MCI_CMDR_MULTI_BLOCK; - else - cmdr |= MCI_CMDR_BLOCK; + + if (cmd->opcode == SD_IO_RW_EXTENDED) { + cmdr |= MCI_CMDR_SDIO_BLOCK; + } else { + if (data->flags & MMC_DATA_STREAM) + cmdr |= MCI_CMDR_STREAM; + else if (data->blocks > 1) + cmdr |= MCI_CMDR_MULTI_BLOCK; + else + cmdr |= MCI_CMDR_BLOCK; + } if (data->flags & MMC_DATA_READ) cmdr |= MCI_CMDR_TRDIR_READ; diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index 2574700db461..5f7226223a62 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -115,7 +115,8 @@ static struct pcie_port_service_driver __initdata dummy_driver = { static int __init select_detection_mode(void) { struct dummy_slot *slot, *tmp; - pcie_port_service_register(&dummy_driver); + if (pcie_port_service_register(&dummy_driver)) + return PCIEHP_DETECT_ACPI; pcie_port_service_unregister(&dummy_driver); list_for_each_entry_safe(slot, tmp, &dummy_slots, list) { list_del(&slot->list); diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c index c44a5e8b8b82..f0b3ad13c273 100644 --- a/drivers/platform/x86/intel_ips.c +++ b/drivers/platform/x86/intel_ips.c @@ -75,6 +75,7 @@ #include <drm/i915_drm.h> #include <asm/msr.h> #include <asm/processor.h> +#include "intel_ips.h" #define PCI_DEVICE_ID_INTEL_THERMAL_SENSOR 0x3b32 @@ -245,6 +246,7 @@ #define thm_writel(off, val) writel((val), ips->regmap + (off)) static const int IPS_ADJUST_PERIOD = 5000; /* ms */ +static bool late_i915_load = false; /* For initial average collection */ static const int IPS_SAMPLE_PERIOD = 200; /* ms */ @@ -339,6 +341,9 @@ struct ips_driver { u64 orig_turbo_ratios; }; +static bool +ips_gpu_turbo_enabled(struct ips_driver *ips); + /** * ips_cpu_busy - is CPU busy? * @ips: IPS driver struct @@ -517,7 +522,7 @@ static void ips_disable_cpu_turbo(struct ips_driver *ips) */ static bool ips_gpu_busy(struct ips_driver *ips) { - if (!ips->gpu_turbo_enabled) + if (!ips_gpu_turbo_enabled(ips)) return false; return ips->gpu_busy(); @@ -532,7 +537,7 @@ static bool ips_gpu_busy(struct ips_driver *ips) */ static void ips_gpu_raise(struct ips_driver *ips) { - if (!ips->gpu_turbo_enabled) + if (!ips_gpu_turbo_enabled(ips)) return; if (!ips->gpu_raise()) @@ -549,7 +554,7 @@ static void ips_gpu_raise(struct ips_driver *ips) */ static void ips_gpu_lower(struct ips_driver *ips) { - if (!ips->gpu_turbo_enabled) + if (!ips_gpu_turbo_enabled(ips)) return; if (!ips->gpu_lower()) @@ -1454,6 +1459,31 @@ out_err: return false; } +static bool +ips_gpu_turbo_enabled(struct ips_driver *ips) +{ + if (!ips->gpu_busy && late_i915_load) { + if (ips_get_i915_syms(ips)) { + dev_info(&ips->dev->dev, + "i915 driver attached, reenabling gpu turbo\n"); + ips->gpu_turbo_enabled = !(thm_readl(THM_HTS) & HTS_GTD_DIS); + } + } + + return ips->gpu_turbo_enabled; +} + +void +ips_link_to_i915_driver() +{ + /* We can't cleanly get at the various ips_driver structs from + * this caller (the i915 driver), so just set a flag saying + * that it's time to try getting the symbols again. + */ + late_i915_load = true; +} +EXPORT_SYMBOL_GPL(ips_link_to_i915_driver); + static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_THERMAL_SENSOR), }, diff --git a/drivers/platform/x86/intel_ips.h b/drivers/platform/x86/intel_ips.h new file mode 100644 index 000000000000..73299beff5b3 --- /dev/null +++ b/drivers/platform/x86/intel_ips.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + */ + +void ips_link_to_i915_driver(void); diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 41a9e34899ac..ca35b0ce944a 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -26,6 +26,7 @@ #include <linux/sfi.h> #include <asm/mrst.h> #include <asm/intel_scu_ipc.h> +#include <asm/mrst.h> /* IPC defines the following message types */ #define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */ @@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id) iounmap(ipcdev.ipc_base); return -ENOMEM; } + + intel_scu_devices_create(); + return 0; } @@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev) iounmap(ipcdev.ipc_base); iounmap(ipcdev.i2c_base); ipcdev.pdev = NULL; + intel_scu_devices_destroy(); } static const struct pci_device_id pci_ids[] = { diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 2883428d5ac8..4941cade319f 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -463,6 +463,18 @@ config RTC_DRV_CMOS This driver can also be built as a module. If so, the module will be called rtc-cmos. +config RTC_DRV_VRTC + tristate "Virtual RTC for Moorestown platforms" + depends on X86_MRST + default y if X86_MRST + + help + Say "yes" here to get direct support for the real time clock + found on Moorestown platforms. The VRTC is a emulated RTC that + derives its clock source from a real RTC in the PMIC. The MC146818 + style programming interface is mostly conserved, but any + updates are done via IPC calls to the system controller FW. + config RTC_DRV_DS1216 tristate "Dallas DS1216" depends on SNI_RM diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4c2832df4697..2afdaf3ff986 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o +obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c new file mode 100644 index 000000000000..bcd0cf63eb16 --- /dev/null +++ b/drivers/rtc/rtc-mrst.c @@ -0,0 +1,582 @@ +/* + * rtc-mrst.c: Driver for Moorestown virtual RTC + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * Feng Tang (feng.tang@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based upon drivers/rtc/rtc-cmos.c + */ + +/* + * Note: + * * vRTC only supports binary mode and 24H mode + * * vRTC only support PIE and AIE, no UIE, and its PIE only happens + * at 23:59:59pm everyday, no support for adjustable frequency + * * Alarm function is also limited to hr/min/sec. + */ + +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sfi.h> + +#include <asm-generic/rtc.h> +#include <asm/intel_scu_ipc.h> +#include <asm/mrst.h> +#include <asm/mrst-vrtc.h> + +struct mrst_rtc { + struct rtc_device *rtc; + struct device *dev; + int irq; + struct resource *iomem; + + u8 enabled_wake; + u8 suspend_ctrl; +}; + +static const char driver_name[] = "rtc_mrst"; + +#define RTC_IRQMASK (RTC_PF | RTC_AF) + +static inline int is_intr(u8 rtc_intr) +{ + if (!(rtc_intr & RTC_IRQF)) + return 0; + return rtc_intr & RTC_IRQMASK; +} + +/* + * rtc_time's year contains the increment over 1900, but vRTC's YEAR + * register can't be programmed to value larger than 0x64, so vRTC + * driver chose to use 1960 (1970 is UNIX time start point) as the base, + * and does the translation at read/write time. + * + * Why not just use 1970 as the offset? it's because using 1960 will + * make it consistent in leap year setting for both vrtc and low-level + * physical rtc devices. + */ +static int mrst_read_time(struct device *dev, struct rtc_time *time) +{ + unsigned long flags; + + if (rtc_is_updating()) + mdelay(20); + + spin_lock_irqsave(&rtc_lock, flags); + time->tm_sec = vrtc_cmos_read(RTC_SECONDS); + time->tm_min = vrtc_cmos_read(RTC_MINUTES); + time->tm_hour = vrtc_cmos_read(RTC_HOURS); + time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + time->tm_mon = vrtc_cmos_read(RTC_MONTH); + time->tm_year = vrtc_cmos_read(RTC_YEAR); + spin_unlock_irqrestore(&rtc_lock, flags); + + /* Adjust for the 1960/1900 */ + time->tm_year += 60; + time->tm_mon--; + return RTC_24H; +} + +static int mrst_set_time(struct device *dev, struct rtc_time *time) +{ + int ret; + unsigned long flags; + unsigned char mon, day, hrs, min, sec; + unsigned int yrs; + + yrs = time->tm_year; + mon = time->tm_mon + 1; /* tm_mon starts at zero */ + day = time->tm_mday; + hrs = time->tm_hour; + min = time->tm_min; + sec = time->tm_sec; + + if (yrs < 70 || yrs > 138) + return -EINVAL; + yrs -= 60; + + spin_lock_irqsave(&rtc_lock, flags); + + vrtc_cmos_write(yrs, RTC_YEAR); + vrtc_cmos_write(mon, RTC_MONTH); + vrtc_cmos_write(day, RTC_DAY_OF_MONTH); + vrtc_cmos_write(hrs, RTC_HOURS); + vrtc_cmos_write(min, RTC_MINUTES); + vrtc_cmos_write(sec, RTC_SECONDS); + + spin_unlock_irqrestore(&rtc_lock, flags); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME); + return ret; +} + +static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char rtc_control; + + if (mrst->irq <= 0) + return -EIO; + + /* Basic alarms only support hour, minute, and seconds fields. + * Some also support day and month, for alarms up to a year in + * the future. + */ + t->time.tm_mday = -1; + t->time.tm_mon = -1; + t->time.tm_year = -1; + + /* vRTC only supports binary mode */ + spin_lock_irq(&rtc_lock); + t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM); + t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM); + t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM); + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + t->enabled = !!(rtc_control & RTC_AIE); + t->pending = 0; + + return 0; +} + +static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control) +{ + unsigned char rtc_intr; + + /* + * NOTE after changing RTC_xIE bits we always read INTR_FLAGS; + * allegedly some older rtcs need that to handle irqs properly + */ + rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS); + rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF; + if (is_intr(rtc_intr)) + rtc_update_irq(mrst->rtc, 1, rtc_intr); +} + +static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + /* + * Flush any pending IRQ status, notably for update irqs, + * before we enable new IRQs + */ + rtc_control = vrtc_cmos_read(RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); + + rtc_control |= mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + + mrst_checkintr(mrst, rtc_control); +} + +static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + rtc_control &= ~mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); +} + +static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char hrs, min, sec; + int ret = 0; + + if (!mrst->irq) + return -EIO; + + hrs = t->time.tm_hour; + min = t->time.tm_min; + sec = t->time.tm_sec; + + spin_lock_irq(&rtc_lock); + /* Next rtc irq must not be from previous alarm setting */ + mrst_irq_disable(mrst, RTC_AIE); + + /* Update alarm */ + vrtc_cmos_write(hrs, RTC_HOURS_ALARM); + vrtc_cmos_write(min, RTC_MINUTES_ALARM); + vrtc_cmos_write(sec, RTC_SECONDS_ALARM); + + spin_unlock_irq(&rtc_lock); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM); + if (ret) + return ret; + + spin_lock_irq(&rtc_lock); + if (t->enabled) + mrst_irq_enable(mrst, RTC_AIE); + + spin_unlock_irq(&rtc_lock); + + return 0; +} + +static int mrst_irq_set_state(struct device *dev, int enabled) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + if (!mrst->irq) + return -ENXIO; + + spin_lock_irqsave(&rtc_lock, flags); + + if (enabled) + mrst_irq_enable(mrst, RTC_PIE); + else + mrst_irq_disable(mrst, RTC_PIE); + + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) + +/* Currently, the vRTC doesn't support UIE ON/OFF */ +static int +mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + switch (cmd) { + case RTC_AIE_OFF: + case RTC_AIE_ON: + if (!mrst->irq) + return -EINVAL; + break; + default: + /* PIE ON/OFF is handled by mrst_irq_set_state() */ + return -ENOIOCTLCMD; + } + + spin_lock_irqsave(&rtc_lock, flags); + switch (cmd) { + case RTC_AIE_OFF: /* alarm off */ + mrst_irq_disable(mrst, RTC_AIE); + break; + case RTC_AIE_ON: /* alarm on */ + mrst_irq_enable(mrst, RTC_AIE); + break; + } + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#else +#define mrst_rtc_ioctl NULL +#endif + +#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) + +static int mrst_procfs(struct device *dev, struct seq_file *seq) +{ + unsigned char rtc_control, valid; + + spin_lock_irq(&rtc_lock); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + valid = vrtc_cmos_read(RTC_VALID); + spin_unlock_irq(&rtc_lock); + + return seq_printf(seq, + "periodic_IRQ\t: %s\n" + "alarm\t\t: %s\n" + "BCD\t\t: no\n" + "periodic_freq\t: daily (not adjustable)\n", + (rtc_control & RTC_PIE) ? "on" : "off", + (rtc_control & RTC_AIE) ? "on" : "off"); +} + +#else +#define mrst_procfs NULL +#endif + +static const struct rtc_class_ops mrst_rtc_ops = { + .ioctl = mrst_rtc_ioctl, + .read_time = mrst_read_time, + .set_time = mrst_set_time, + .read_alarm = mrst_read_alarm, + .set_alarm = mrst_set_alarm, + .proc = mrst_procfs, + .irq_set_state = mrst_irq_set_state, +}; + +static struct mrst_rtc mrst_rtc; + +/* + * When vRTC IRQ is captured by SCU FW, FW will clear the AIE bit in + * Reg B, so no need for this driver to clear it + */ +static irqreturn_t mrst_rtc_irq(int irq, void *p) +{ + u8 irqstat; + + spin_lock(&rtc_lock); + /* This read will clear all IRQ flags inside Reg C */ + irqstat = vrtc_cmos_read(RTC_INTR_FLAGS); + spin_unlock(&rtc_lock); + + irqstat &= RTC_IRQMASK | RTC_IRQF; + if (is_intr(irqstat)) { + rtc_update_irq(p, 1, irqstat); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int __init +vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) +{ + int retval = 0; + unsigned char rtc_control; + + /* There can be only one ... */ + if (mrst_rtc.dev) + return -EBUSY; + + if (!iomem) + return -ENODEV; + + iomem = request_mem_region(iomem->start, + iomem->end + 1 - iomem->start, + driver_name); + if (!iomem) { + dev_dbg(dev, "i/o mem already in use.\n"); + return -EBUSY; + } + + mrst_rtc.irq = rtc_irq; + mrst_rtc.iomem = iomem; + + mrst_rtc.rtc = rtc_device_register(driver_name, dev, + &mrst_rtc_ops, THIS_MODULE); + if (IS_ERR(mrst_rtc.rtc)) { + retval = PTR_ERR(mrst_rtc.rtc); + goto cleanup0; + } + + mrst_rtc.dev = dev; + dev_set_drvdata(dev, &mrst_rtc); + rename_region(iomem, dev_name(&mrst_rtc.rtc->dev)); + + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY))) + dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n"); + + if (rtc_irq) { + retval = request_irq(rtc_irq, mrst_rtc_irq, + IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev), + mrst_rtc.rtc); + if (retval < 0) { + dev_dbg(dev, "IRQ %d is already in use, err %d\n", + rtc_irq, retval); + goto cleanup1; + } + } + dev_dbg(dev, "initialised\n"); + return 0; + +cleanup1: + mrst_rtc.dev = NULL; + rtc_device_unregister(mrst_rtc.rtc); +cleanup0: + release_region(iomem->start, iomem->end + 1 - iomem->start); + dev_err(dev, "rtc-mrst: unable to initialise\n"); + return retval; +} + +static void rtc_mrst_do_shutdown(void) +{ + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_IRQMASK); + spin_unlock_irq(&rtc_lock); +} + +static void __exit rtc_mrst_do_remove(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + struct resource *iomem; + + rtc_mrst_do_shutdown(); + + if (mrst->irq) + free_irq(mrst->irq, mrst->rtc); + + rtc_device_unregister(mrst->rtc); + mrst->rtc = NULL; + + iomem = mrst->iomem; + release_region(iomem->start, iomem->end + 1 - iomem->start); + mrst->iomem = NULL; + + mrst->dev = NULL; + dev_set_drvdata(dev, NULL); +} + +#ifdef CONFIG_PM +static int mrst_suspend(struct device *dev, pm_message_t mesg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp; + + /* Only the alarm might be a wakeup event source */ + spin_lock_irq(&rtc_lock); + mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL); + if (tmp & (RTC_PIE | RTC_AIE)) { + unsigned char mask; + + if (device_may_wakeup(dev)) + mask = RTC_IRQMASK & ~RTC_AIE; + else + mask = RTC_IRQMASK; + tmp &= ~mask; + vrtc_cmos_write(tmp, RTC_CONTROL); + + mrst_checkintr(mrst, tmp); + } + spin_unlock_irq(&rtc_lock); + + if (tmp & RTC_AIE) { + mrst->enabled_wake = 1; + enable_irq_wake(mrst->irq); + } + + dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n", + (tmp & RTC_AIE) ? ", alarm may wake" : "", + tmp); + + return 0; +} + +/* + * We want RTC alarms to wake us from the deep power saving state + */ +static inline int mrst_poweroff(struct device *dev) +{ + return mrst_suspend(dev, PMSG_HIBERNATE); +} + +static int mrst_resume(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp = mrst->suspend_ctrl; + + /* Re-enable any irqs previously active */ + if (tmp & RTC_IRQMASK) { + unsigned char mask; + + if (mrst->enabled_wake) { + disable_irq_wake(mrst->irq); + mrst->enabled_wake = 0; + } + + spin_lock_irq(&rtc_lock); + do { + vrtc_cmos_write(tmp, RTC_CONTROL); + + mask = vrtc_cmos_read(RTC_INTR_FLAGS); + mask &= (tmp & RTC_IRQMASK) | RTC_IRQF; + if (!is_intr(mask)) + break; + + rtc_update_irq(mrst->rtc, 1, mask); + tmp &= ~RTC_AIE; + } while (mask & RTC_AIE); + spin_unlock_irq(&rtc_lock); + } + + dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp); + + return 0; +} + +#else +#define mrst_suspend NULL +#define mrst_resume NULL + +static inline int mrst_poweroff(struct device *dev) +{ + return -ENOSYS; +} + +#endif + +static int __init vrtc_mrst_platform_probe(struct platform_device *pdev) +{ + return vrtc_mrst_do_probe(&pdev->dev, + platform_get_resource(pdev, IORESOURCE_MEM, 0), + platform_get_irq(pdev, 0)); +} + +static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev) +{ + rtc_mrst_do_remove(&pdev->dev); + return 0; +} + +static void vrtc_mrst_platform_shutdown(struct platform_device *pdev) +{ + if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev)) + return; + + rtc_mrst_do_shutdown(); +} + +MODULE_ALIAS("platform:vrtc_mrst"); + +static struct platform_driver vrtc_mrst_platform_driver = { + .probe = vrtc_mrst_platform_probe, + .remove = __exit_p(vrtc_mrst_platform_remove), + .shutdown = vrtc_mrst_platform_shutdown, + .driver = { + .name = (char *) driver_name, + .suspend = mrst_suspend, + .resume = mrst_resume, + } +}; + +static int __init vrtc_mrst_init(void) +{ + return platform_driver_register(&vrtc_mrst_platform_driver); +} + +static void __exit vrtc_mrst_exit(void) +{ + platform_driver_unregister(&vrtc_mrst_platform_driver); +} + +module_init(vrtc_mrst_init); +module_exit(vrtc_mrst_exit); + +MODULE_AUTHOR("Jacob Pan; Feng Tang"); +MODULE_DESCRIPTION("Driver for Moorestown virtual RTC"); +MODULE_LICENSE("GPL"); diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c index e5e9e6735f7d..9739431092d1 100644 --- a/drivers/sh/intc/core.c +++ b/drivers/sh/intc/core.c @@ -198,6 +198,7 @@ int __init register_intc_controller(struct intc_desc *desc) list_add_tail(&d->list, &intc_list); raw_spin_lock_init(&d->lock); + INIT_RADIX_TREE(&d->tree, GFP_ATOMIC); d->index = nr_intc_controllers; diff --git a/drivers/spi/coldfire_qspi.c b/drivers/spi/coldfire_qspi.c index 052b3c7fa6a0..8856bcca9d29 100644 --- a/drivers/spi/coldfire_qspi.c +++ b/drivers/spi/coldfire_qspi.c @@ -317,7 +317,7 @@ static void mcfqspi_work(struct work_struct *work) msg = container_of(mcfqspi->msgq.next, struct spi_message, queue); - list_del_init(&mcfqspi->msgq); + list_del_init(&msg->queue); spin_unlock_irqrestore(&mcfqspi->lock, flags); spi = msg->spi; diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c index 2a651e61bfbf..951a160fc27f 100644 --- a/drivers/spi/omap2_mcspi.c +++ b/drivers/spi/omap2_mcspi.c @@ -1305,10 +1305,49 @@ static int __exit omap2_mcspi_remove(struct platform_device *pdev) /* work with hotplug and coldplug */ MODULE_ALIAS("platform:omap2_mcspi"); +#ifdef CONFIG_SUSPEND +/* + * When SPI wake up from off-mode, CS is in activate state. If it was in + * unactive state when driver was suspend, then force it to unactive state at + * wake up. + */ +static int omap2_mcspi_resume(struct device *dev) +{ + struct spi_master *master = dev_get_drvdata(dev); + struct omap2_mcspi *mcspi = spi_master_get_devdata(master); + struct omap2_mcspi_cs *cs; + + omap2_mcspi_enable_clocks(mcspi); + list_for_each_entry(cs, &omap2_mcspi_ctx[master->bus_num - 1].cs, + node) { + if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) { + + /* + * We need to toggle CS state for OMAP take this + * change in account. + */ + MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 1); + __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0); + MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 0); + __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0); + } + } + omap2_mcspi_disable_clocks(mcspi); + return 0; +} +#else +#define omap2_mcspi_resume NULL +#endif + +static const struct dev_pm_ops omap2_mcspi_pm_ops = { + .resume = omap2_mcspi_resume, +}; + static struct platform_driver omap2_mcspi_driver = { .driver = { .name = "omap2_mcspi", .owner = THIS_MODULE, + .pm = &omap2_mcspi_pm_ops }, .remove = __exit_p(omap2_mcspi_remove), }; diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 8c3c057aa847..d0e9e0207539 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -435,12 +435,6 @@ static int zram_make_request(struct request_queue *queue, struct bio *bio) int ret = 0; struct zram *zram = queue->queuedata; - if (unlikely(!zram->init_done)) { - set_bit(BIO_UPTODATE, &bio->bi_flags); - bio_endio(bio, 0); - return 0; - } - if (!valid_io_request(zram, bio)) { zram_stat64_inc(zram, &zram->stats.invalid_io); bio_io_error(bio); diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0e6aa3d96a42..4ac1201ad6c2 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1458,7 +1458,7 @@ static bool apertures_overlap(struct aperture *gen, struct aperture *hw) if (gen->base == hw->base) return true; /* is the generic aperture base inside the hw base->hw base+size */ - if (gen->base > hw->base && gen->base <= hw->base + hw->size) + if (gen->base > hw->base && gen->base < hw->base + hw->size) return true; return false; } diff --git a/drivers/video/imxfb.c b/drivers/video/imxfb.c index 5c363d026f64..1ab2c2588675 100644 --- a/drivers/video/imxfb.c +++ b/drivers/video/imxfb.c @@ -53,11 +53,8 @@ #define LCDC_SIZE 0x04 #define SIZE_XMAX(x) ((((x) >> 4) & 0x3f) << 20) -#ifdef CONFIG_ARCH_MX1 -#define SIZE_YMAX(y) ((y) & 0x1ff) -#else -#define SIZE_YMAX(y) ((y) & 0x3ff) -#endif +#define YMAX_MASK (cpu_is_mx1() ? 0x1ff : 0x3ff) +#define SIZE_YMAX(y) ((y) & YMAX_MASK) #define LCDC_VPW 0x08 #define VPW_VPW(x) ((x) & 0x3ff) @@ -623,7 +620,7 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf if (var->right_margin > 255) printk(KERN_ERR "%s: invalid right_margin %d\n", info->fix.id, var->right_margin); - if (var->yres < 1 || var->yres > 511) + if (var->yres < 1 || var->yres > YMAX_MASK) printk(KERN_ERR "%s: invalid yres %d\n", info->fix.id, var->yres); if (var->vsync_len > 100) diff --git a/drivers/video/sh_mobile_hdmi.c b/drivers/video/sh_mobile_hdmi.c index d7df10315d8d..fcda0e970113 100644 --- a/drivers/video/sh_mobile_hdmi.c +++ b/drivers/video/sh_mobile_hdmi.c @@ -787,6 +787,9 @@ static int sh_hdmi_read_edid(struct sh_hdmi *hdmi) found_rate_error = rate_error; } + hdmi->var.width = hdmi->monspec.max_x * 10; + hdmi->var.height = hdmi->monspec.max_y * 10; + /* * TODO 1: if no ->info is present, postpone running the config until * after ->info first gets registered. @@ -960,8 +963,12 @@ static bool sh_hdmi_must_reconfigure(struct sh_hdmi *hdmi) dev_dbg(info->dev, "Old %ux%u, new %ux%u\n", mode1.xres, mode1.yres, mode2.xres, mode2.yres); - if (fb_mode_is_equal(&mode1, &mode2)) + if (fb_mode_is_equal(&mode1, &mode2)) { + /* It can be a different monitor with an equal video-mode */ + old_var->width = new_var->width; + old_var->height = new_var->height; return false; + } dev_dbg(info->dev, "Switching %u -> %u lines\n", mode1.yres, mode2.yres); @@ -1057,8 +1064,11 @@ static void sh_hdmi_edid_work_fn(struct work_struct *work) * on, if we run a resume here, the logo disappears */ if (lock_fb_info(hdmi->info)) { - sh_hdmi_display_on(hdmi, hdmi->info); - unlock_fb_info(hdmi->info); + struct fb_info *info = hdmi->info; + info->var.width = hdmi->var.width; + info->var.height = hdmi->var.height; + sh_hdmi_display_on(hdmi, info); + unlock_fb_info(info); } } else { /* New monitor or have to wake up */ diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c index b02d97a879d6..c05326b61235 100644 --- a/drivers/video/sh_mobile_lcdcfb.c +++ b/drivers/video/sh_mobile_lcdcfb.c @@ -54,8 +54,8 @@ static int lcdc_shared_regs[] = { }; #define NR_SHARED_REGS ARRAY_SIZE(lcdc_shared_regs) -#define DEFAULT_XRES 1280 -#define DEFAULT_YRES 1024 +#define MAX_XRES 1920 +#define MAX_YRES 1080 static unsigned long lcdc_offs_mainlcd[NR_CH_REGS] = { [LDDCKPAT1R] = 0x400, @@ -914,22 +914,12 @@ static int sh_mobile_check_var(struct fb_var_screeninfo *var, struct fb_info *in { struct sh_mobile_lcdc_chan *ch = info->par; - if (var->xres < 160 || var->xres > 1920 || - var->yres < 120 || var->yres > 1080 || - var->left_margin < 32 || var->left_margin > 320 || - var->right_margin < 12 || var->right_margin > 240 || - var->upper_margin < 12 || var->upper_margin > 120 || - var->lower_margin < 1 || var->lower_margin > 64 || - var->hsync_len < 32 || var->hsync_len > 240 || - var->vsync_len < 2 || var->vsync_len > 64 || - var->pixclock < 6000 || var->pixclock > 40000 || + if (var->xres > MAX_XRES || var->yres > MAX_YRES || var->xres * var->yres * (ch->cfg.bpp / 8) * 2 > info->fix.smem_len) { - dev_warn(info->dev, "Invalid info: %u %u %u %u %u %u %u %u %u!\n", - var->xres, var->yres, - var->left_margin, var->right_margin, - var->upper_margin, var->lower_margin, - var->hsync_len, var->vsync_len, - var->pixclock); + dev_warn(info->dev, "Invalid info: %u-%u-%u-%u x %u-%u-%u-%u @ %ukHz!\n", + var->left_margin, var->xres, var->right_margin, var->hsync_len, + var->upper_margin, var->yres, var->lower_margin, var->vsync_len, + PICOS2KHZ(var->pixclock)); return -EINVAL; } return 0; @@ -1226,7 +1216,7 @@ static int __devinit sh_mobile_lcdc_probe(struct platform_device *pdev) } if (!mode) - max_size = DEFAULT_XRES * DEFAULT_YRES; + max_size = MAX_XRES * MAX_YRES; else if (max_cfg) dev_dbg(&pdev->dev, "Found largest videomode %ux%u\n", max_cfg->xres, max_cfg->yres); @@ -1238,12 +1228,14 @@ static int __devinit sh_mobile_lcdc_probe(struct platform_device *pdev) mode = &default_720p; num_cfg = 1; } else { - num_cfg = ch->cfg.num_cfg; + num_cfg = cfg->num_cfg; } fb_videomode_to_modelist(mode, num_cfg, &info->modelist); fb_videomode_to_var(var, mode); + var->width = cfg->lcd_size_cfg.width; + var->height = cfg->lcd_size_cfg.height; /* Default Y virtual resolution is 2x panel size */ var->yres_virtual = var->yres * 2; var->activate = FB_ACTIVATE_NOW; diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 3d77116e4634..dea7b5bf6e2c 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -642,19 +642,14 @@ static struct notifier_block die_notifier = { */ #ifdef CONFIG_HPWDT_NMI_DECODING -#ifdef ARCH_HAS_NMI_WATCHDOG +#ifdef CONFIG_X86_LOCAL_APIC static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev) { /* * If nmi_watchdog is turned off then we can turn on * our nmi decoding capability. */ - if (!nmi_watchdog_active()) - hpwdt_nmi_decoding = 1; - else - dev_warn(&dev->dev, "NMI decoding is disabled. To enable this " - "functionality you must reboot with nmi_watchdog=0 " - "and load the hpwdt driver with priority=1.\n"); + hpwdt_nmi_decoding = 1; } #else static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev) @@ -662,7 +657,7 @@ static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev) dev_warn(&dev->dev, "NMI decoding is disabled. " "Your kernel does not support a NMI Watchdog.\n"); } -#endif /* ARCH_HAS_NMI_WATCHDOG */ +#endif /* CONFIG_X86_LOCAL_APIC */ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5476c066d4ee..3c4039d5eef1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int metadata; unsigned int revokes = 0; int x; - int error; + int error = 0; if (!*top) sm->sm_first = 0; @@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + if (error) return error; @@ -879,7 +883,8 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f92c17704169..08a8beb152e6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -541,21 +541,6 @@ out_locked: spin_unlock(&gl->gl_spin); } -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int req_state, - unsigned int flags) -{ - int ret = LM_OUT_ERROR; - - if (!sdp->sd_lockstruct.ls_ops->lm_lock) - return req_state == LM_ST_UNLOCKED ? 0 : req_state; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, - req_state, flags); - return ret; -} - /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -575,13 +560,14 @@ __acquires(&gl->gl_spin) lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); - BUG_ON(gl->gl_state == target); - BUG_ON(gl->gl_state == gl->gl_target); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); do_error(gl, 0); /* Fail queued try locks */ } + gl->gl_req = target; spin_unlock(&gl->gl_spin); if (glops->go_xmote_th) glops->go_xmote_th(gl); @@ -594,15 +580,17 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl, target, lck_flags); - if (!(ret & LM_OUT_ASYNC)) { - finish_xmote(gl, ret); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - } else { - GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); } + spin_lock(&gl->gl_spin); } @@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); + if (seq) { struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); seq_printf(seq, gi->string); } else { - printk(KERN_ERR " "); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); } + va_end(args); } @@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) * @gl: Pointer to the glock * @ret: The return value from the dlm * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. */ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + spin_lock(&gl->gl_spin); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - spin_lock(&gl->gl_spin); if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); return; } - spin_unlock(&gl->gl_spin); } + + spin_unlock(&gl->gl_spin); set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); @@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; - char buffer[KSYM_SYMBOL_LEN]; char flags_buf[32]; - sprint_symbol(buffer, gh->gh_ip); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", - state2str(gh->gh_state), - hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", buffer); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", + (void *)gh->gh_ip); return 0; } @@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | - WQ_FREEZEABLE, 0); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZEABLE, + 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index db1c26d6d220..691851ceb615 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -87,11 +87,10 @@ enum { #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 /* - * lm_lock() and lm_async_cb return flags + * lm_async_cb return flags * * LM_OUT_ST_MASK * Masks the lower two bits of lock state in the returned value. @@ -99,15 +98,11 @@ enum { * LM_OUT_CANCELED * The lock request was canceled. * - * LM_OUT_ASYNC - * The result of the request will be returned in an LM_CB_ASYNC callback. - * */ #define LM_OUT_ST_MASK 0x00000003 #define LM_OUT_CANCELED 0x00000008 -#define LM_OUT_ASYNC 0x00000080 -#define LM_OUT_ERROR 0x00000100 +#define LM_OUT_ERROR 0x00000004 /* * lm_recovery_done() messages @@ -124,25 +119,12 @@ struct lm_lockops { void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); - unsigned int (*lm_lock) (struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); const match_table_t *lm_tokens; }; -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 - -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 - -#define GLR_TRYFAILED 13 - extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0d149dcc04e5..263561bf1a50 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 764fbb49efc8..8d3d2b4a0a7d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,12 +207,14 @@ struct gfs2_glock { spinlock_t gl_spin; - unsigned int gl_state; - unsigned int gl_target; - unsigned int gl_reply; + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + unsigned int gl_hash; - unsigned int gl_req; - unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ struct list_head gl_holders; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7f9217..14e682dbe8bf 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - - gfs2_assert_warn(GFS2_SB(inode), !error); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 1c09425b45fd..6e493aee28f8 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, return lkf; } -static unsigned int gdlm_lock(struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags) +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - int error; int req; u32 lkf; - gl->gl_req = req_state; req = make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); @@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, * Submit the actual lock request. */ - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); - if (error == -EAGAIN) - return 0; - if (error) - return LM_OUT_ERROR; - return LM_OUT_ASYNC; + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 12cbea7502c2..1db6b7343229 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; u32 ouid, ogid, nuid, ngid; int error; @@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_setattr_simple(ip, attr); if (error) goto out_end_trans; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(sdp, !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f606baf9ba72..a689901963de 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } } /* Write the quota into the quota file on disk */ @@ -1509,7 +1513,7 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *fdq) @@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if ((fdq->d_fieldmask & FS_DQ_BSOFT) && ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & FS_DQ_BHARD) && ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if (fdq->d_fieldmask == 0) goto out_i; @@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; - diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 33c8407b876f..7293ea27020c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -static int gfs2_ri_update(struct gfs2_inode *ip) +int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; @@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } /** - * gfs2_ri_update_special - Pull in a new resource index from the disk - * - * This is a special version that's safe to call from gfs2_inplace_reserve_i. - * In this case we know that we don't have any resource groups in memory yet. - * - * @ip: pointer to the rindex inode - * - * Returns: 0 on successful update, error code otherwise - */ -static int gfs2_ri_update_special(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; - int error; - - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - /* Ignore partials */ - if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - i_size_read(inode)) - break; - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; - - sdp->sd_rindex_uptodate = 1; - return 0; -} - -/** * gfs2_rindex_hold - Grab a lock on the rindex * @sdp: The GFS2 superblock * @ri_gh: the glock holder @@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, error = gfs2_rindex_hold(sdp, &al->al_ri_gh); else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + error = gfs2_ri_update(ip); if (error) return error; } +try_again: do { error = get_local_rgrp(ip, &last_unlinked); /* If there is no space, flushing the log may release some */ - if (error) + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } gfs2_log_flush(sdp, NULL); + } } while (error && tries++ < 3); if (error) { diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e35c0466f9a..50c2bb04369c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, extern void gfs2_inplace_release(struct gfs2_inode *ip); +extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 30b58f07c8a6..439b61c03262 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,10 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; - struct buffer_head *dibh; int error; error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); @@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_trans_end; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(GFS2_SB(inode), !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - -out_trans_end: + error = gfs2_setattr_simple(ip, attr); gfs2_trans_end(sdp); return error; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 182845147fe4..08cba2c3b612 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + long nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &nice); + if (err) + return -EINVAL; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), diff --git a/include/linux/completion.h b/include/linux/completion.h index 36d57f74cd01..51494e6b5548 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_interruptible_timeout( - struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_killable_timeout( - struct completion *x, unsigned long timeout); +extern long wait_for_completion_interruptible_timeout( + struct completion *x, unsigned long timeout); +extern long wait_for_completion_killable_timeout( + struct completion *x, unsigned long timeout); extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9d8688b92d8b..8cd00ad98d37 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -824,6 +824,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); #ifdef CONFIG_DMA_ENGINE enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); void dma_issue_pending_all(void); +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); +void dma_release_channel(struct dma_chan *chan); #else static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { @@ -831,7 +833,14 @@ static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descript } static inline void dma_issue_pending_all(void) { - do { } while (0); +} +static inline struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, + dma_filter_fn fn, void *fn_param) +{ + return NULL; +} +static inline void dma_release_channel(struct dma_chan *chan) +{ } #endif @@ -842,8 +851,6 @@ void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); #define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) -struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); -void dma_release_channel(struct dma_chan *chan); /* --- Helper iov-locking functions --- */ diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 8beabb958f61..47e3997f7b5c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -154,12 +154,14 @@ enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_RECORDED_CMD_BIT, + TRACE_EVENT_FL_CAP_ANY_BIT, }; enum { TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), + TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), }; struct ftrace_event_call { @@ -196,6 +198,14 @@ struct ftrace_event_call { #endif }; +#define __TRACE_EVENT_FLAGS(name, value) \ + static int __init trace_init_flags_##name(void) \ + { \ + event_##name.flags = value; \ + return 0; \ + } \ + early_initcall(trace_init_flags_##name); + #define PERF_MAX_TRACE_SIZE 2048 #define MAX_FILTER_PRED 32 @@ -215,6 +225,10 @@ enum { FILTER_PTR_STRING, }; +#define EVENT_STORAGE_SIZE 128 +extern struct mutex event_storage_mutex; +extern char event_storage[EVENT_STORAGE_SIZE]; + extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0c1b857d3d..330586ffffbb 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -22,7 +22,7 @@ #include <linux/wait.h> #include <linux/percpu.h> #include <linux/timer.h> - +#include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -79,8 +79,8 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: red black tree node for time ordered insertion - * @_expires: the absolute expiry time in the hrtimers internal + * @node: timerqueue node, which also manages node.expires, + * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers @@ -101,8 +101,7 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { - struct rb_node node; - ktime_t _expires; + struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; @@ -141,8 +140,7 @@ struct hrtimer_sleeper { struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; clockid_t index; - struct rb_root active; - struct rb_node *first; + struct timerqueue_head active; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; @@ -158,7 +156,6 @@ struct hrtimer_clock_base { * @lock: lock protecting the base and associated clock bases * and timers * @clock_base: array of clock bases for this cpu - * @curr_timer: the timer which is executing a callback right now * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -184,43 +181,43 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = time; + timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, delta); + timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, ns_to_ktime(delta)); + timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { - timer->_expires.tv64 = tv64; + timer->node.expires.tv64 = tv64; timer->_softexpires.tv64 = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = ktime_add_safe(timer->_expires, time); + timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { - timer->_expires = ktime_add_ns(timer->_expires, ns); + timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { - return timer->_expires; + return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) @@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { - return timer->_expires.tv64; + return timer->node.expires.tv64; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { @@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { - return ktime_to_ns(timer->_expires); + return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->_expires, timer->base->get_time()); + return ktime_sub(timer->node.expires, timer->base->get_time()); } #ifdef CONFIG_HIGH_RES_TIMERS diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f8c06ce0fa6..caa151fbebb7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,6 +12,13 @@ #include <linux/securebits.h> #include <net/net_namespace.h> +#ifdef CONFIG_SMP +# define INIT_PUSHABLE_TASKS(tsk) \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), +#else +# define INIT_PUSHABLE_TASKS(tsk) +#endif + extern struct files_struct init_files; extern struct fs_struct init_fs; @@ -83,6 +90,12 @@ extern struct group_info init_groups; */ # define CAP_INIT_BSET CAP_FULL_SET +#ifdef CONFIG_RCU_BOOST +#define INIT_TASK_RCU_BOOST() \ + .rcu_boost_mutex = NULL, +#else +#define INIT_TASK_RCU_BOOST() +#endif #ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_TREE_PREEMPT() \ .rcu_blocked_node = NULL, @@ -94,7 +107,8 @@ extern struct group_info init_groups; .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() + INIT_TASK_RCU_TREE_PREEMPT() \ + INIT_TASK_RCU_BOOST() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif @@ -137,7 +151,7 @@ extern struct cred init_cred; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ + INIT_PUSHABLE_TASKS(tsk) \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 79d0c4f6d071..55e0d4253e49 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); struct irqaction { irq_handler_t handler; unsigned long flags; - const char *name; void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; -}; + const char *name; + struct proc_dir_entry *dir; +} ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e7d1b2e0070d..b78edb58ee66 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -275,7 +275,9 @@ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); -extern int arch_optimize_kprobe(struct optimized_kprobe *op); +extern void arch_optimize_kprobes(struct list_head *oplist); +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); extern kprobe_opcode_t *get_optinsn_slot(void); extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); diff --git a/include/linux/module.h b/include/linux/module.h index 7575bbbdf2a2..8b17fd8c790d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -308,6 +308,9 @@ struct module /* The size of the executable code in each section. */ unsigned int init_text_size, core_text_size; + /* Size of RO sections of the module (text+rodata) */ + unsigned int init_ro_size, core_ro_size; + /* Arch-specific module values */ struct mod_arch_specific arch; @@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) { return 0; } - #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS @@ -687,6 +689,13 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +extern void set_all_modules_text_rw(void); +extern void set_all_modules_text_ro(void); +#else +static inline void set_all_modules_text_rw(void) { } +static inline void set_all_modules_text_ro(void) { } +#endif #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f363bc8fdc74..94b48bd40dd7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); +#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX +#define arch_mutex_cpu_relax() cpu_relax() +#endif + #endif diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 06aab5eee134..c536f8545f74 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -14,22 +14,14 @@ * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. */ -#ifdef ARCH_HAS_NMI_WATCHDOG +#if defined(ARCH_HAS_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) #include <asm/nmi.h> extern void touch_nmi_watchdog(void); -extern void acpi_nmi_disable(void); -extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_HARDLOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); } -#else -extern void touch_nmi_watchdog(void); -#endif -static inline void acpi_nmi_disable(void) { } -static inline void acpi_nmi_enable(void) { } #endif /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f1279e105ee..dda5b0a3ff60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -215,8 +215,9 @@ struct perf_event_attr { */ precise_ip : 2, /* skid constraint */ mmap_data : 1, /* non-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ - __reserved_1 : 46; + __reserved_1 : 45; union { __u32 wakeup_events; /* wakeup every n events */ @@ -327,6 +328,15 @@ struct perf_event_header { enum perf_event_type { /* + * If perf_event_attr.sample_id_all is set then all event types will + * have the sample_type selected fields related to where/when + * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID) + * described in PERF_RECORD_SAMPLE below, it will be stashed just after + * the perf_event_header and the fields already present for the existing + * fields, i.e. at the end of the payload. That way a newer perf.data + * file will be supported by older perf tools, with these new optional + * fields being ignored. + * * The MMAP events record the PROT_EXEC mappings so that we can * correlate userspace IPs to code. They have the following structure: * @@ -578,6 +588,10 @@ struct perf_event; struct pmu { struct list_head entry; + struct device *dev; + char *name; + int type; + int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; @@ -758,6 +772,9 @@ struct perf_event { u64 shadow_ctx_time; struct perf_event_attr attr; + u16 header_size; + u16 id_header_size; + u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; @@ -903,7 +920,7 @@ struct perf_output_handle { #ifdef CONFIG_PERF_EVENTS -extern int perf_pmu_register(struct pmu *pmu); +extern int perf_pmu_register(struct pmu *pmu, char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern int perf_num_counters(void); @@ -970,6 +987,11 @@ extern int perf_event_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs); +static inline bool is_sampling_event(struct perf_event *event) +{ + return event->attr.sample_period != 0; +} + /* * Return 1 for a software event, 0 for a hardware event */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index f31ef61f1c65..2dea94fc4402 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list, #define list_first_entry_rcu(ptr, type, member) \ list_entry_rcu((ptr)->next, type, member) -#define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw(list_next_rcu(head)); \ - pos != (head); \ - pos = rcu_dereference_raw(list_next_rcu((pos))) - /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 03cda7bed985..af5614856285 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,8 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) @@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); -extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); static inline void __rcu_read_lock_bh(void) @@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ -extern void rcu_init(void); extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 13877cb93a60..30ebd7c8d874 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,7 +27,9 @@ #include <linux/cache.h> -#define rcu_init_sched() do { } while (0) +static inline void rcu_init(void) +{ +} #ifdef CONFIG_TINY_RCU @@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched(); } +static inline void synchronize_sched_expedited(void) +{ + synchronize_sched(); +} + #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) @@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void) } #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern int rcu_scheduler_active __read_mostly; extern void rcu_scheduler_starting(void); - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - static inline void rcu_scheduler_starting(void) { } - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 95518e628794..3a933482734a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,6 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H +extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); @@ -47,6 +48,7 @@ static inline void exit_rcu(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ extern void synchronize_rcu_bh(void); +extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index 223874538b33..777cd01e240e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -316,6 +316,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; +void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) { @@ -326,6 +327,9 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } +static inline void lockup_detector_init(void) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -509,6 +513,8 @@ struct thread_group_cputimer { spinlock_t lock; }; +struct autogroup; + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -576,6 +582,9 @@ struct signal_struct { struct tty_struct *tty; /* NULL if no tty */ +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. @@ -1229,13 +1238,18 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rcu_boost_mutex; +#endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif struct list_head tasks; +#ifdef CONFIG_SMP struct plist_node pushable_tasks; +#endif struct mm_struct *mm, *active_mm; #if defined(SPLIT_RSS_COUNTING) @@ -1759,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */ +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */ static inline void rcu_copy_process(struct task_struct *p) { @@ -1767,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_read_unlock_special = 0; #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; -#endif +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + p->rcu_boost_mutex = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); } @@ -1872,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU -extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} #endif -extern void sched_idle_next(void); - #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) extern void wake_up_idle_cpu(int cpu); #else @@ -1889,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_shares_ratelimit; -extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { @@ -1906,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -1931,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_compat_yield; +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; + +extern void sched_autogroup_create_attach(struct task_struct *p); +extern void sched_autogroup_detach(struct task_struct *p); +extern void sched_autogroup_fork(struct signal_struct *sig); +extern void sched_autogroup_exit(struct signal_struct *sig); +#ifdef CONFIG_PROC_FS +extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +#endif +#else +static inline void sched_autogroup_create_attach(struct task_struct *p) { } +static inline void sched_autogroup_detach(struct task_struct *p) { } +static inline void sched_autogroup_fork(struct signal_struct *sig) { } +static inline void sched_autogroup_exit(struct signal_struct *sig) { } +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); @@ -1949,9 +1981,10 @@ extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler(struct task_struct *, int, + const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, - struct sched_param *); + const struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/include/linux/sfi.h b/include/linux/sfi.h index 7f770c638e99..fe817918b30e 100644 --- a/include/linux/sfi.h +++ b/include/linux/sfi.h @@ -77,6 +77,8 @@ #define SFI_OEM_ID_SIZE 6 #define SFI_OEM_TABLE_ID_SIZE 8 +#define SFI_NAME_LEN 16 + #define SFI_SYST_SEARCH_BEGIN 0x000E0000 #define SFI_SYST_SEARCH_END 0x000FFFFF @@ -156,13 +158,13 @@ struct sfi_device_table_entry { u16 addr; u8 irq; u32 max_freq; - char name[16]; + char name[SFI_NAME_LEN]; } __packed; struct sfi_gpio_table_entry { - char controller_name[16]; + char controller_name[SFI_NAME_LEN]; u16 pin_no; - char pin_name[16]; + char pin_name[SFI_NAME_LEN]; } __packed; typedef int (*sfi_table_handler) (struct sfi_table_header *table); diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 51efbef38fb0..25310f1d7f37 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -2,6 +2,7 @@ #define __LINUX_STACKTRACE_H struct task_struct; +struct pt_regs; #ifdef CONFIG_STACKTRACE struct task_struct; @@ -13,7 +14,8 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); -extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp); +extern void save_stack_trace_regs(struct stack_trace *trace, + struct pt_regs *regs); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cacc27a0e285..18cd0684fc4e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -127,8 +127,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct syscall_metadata \ __attribute__((__aligned__(4))) __syscall_meta_##sname; \ - static struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_enter_##sname; \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) \ @@ -137,13 +135,12 @@ extern struct trace_event_functions exit_syscall_print_funcs; .class = &event_class_syscall_enter, \ .event.funcs = &enter_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ - } + }; \ + __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY) #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct syscall_metadata \ __attribute__((__aligned__(4))) __syscall_meta_##sname; \ - static struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_exit_##sname; \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) \ @@ -152,7 +149,8 @@ extern struct trace_event_functions exit_syscall_print_funcs; .class = &event_class_syscall_exit, \ .event.funcs = &exit_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ - } + }; \ + __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY) #define SYSCALL_METADATA(sname, nb) \ SYSCALL_TRACE_ENTER_EVENT(sname); \ diff --git a/include/linux/timer.h b/include/linux/timer.h index 38cf093ef62c..6abd9138beda 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -24,9 +24,9 @@ struct timer_list { int slack; #ifdef CONFIG_TIMER_STATS + int start_pid; void *start_site; char start_comm[16]; - int start_pid; #endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases; #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif +/* + * Note that all tvec_bases are 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .entry = { .prev = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ .base = &boot_tvec_bases, \ + .slack = -1, \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ + } + +#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \ + ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG)) + +#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\ + .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .function = (_function), \ + .expires = (_expires), \ + .data = (_data), \ + .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) extern void add_timer(struct timer_list *timer); +extern int try_to_del_timer_sync(struct timer_list *timer); + #ifdef CONFIG_SMP - extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h new file mode 100644 index 000000000000..d24aabaca474 --- /dev/null +++ b/include/linux/timerqueue.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_TIMERQUEUE_H +#define _LINUX_TIMERQUEUE_H + +#include <linux/rbtree.h> +#include <linux/ktime.h> + + +struct timerqueue_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerqueue_head { + struct rb_root head; + struct timerqueue_node *next; +}; + + +extern void timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern void timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); +extern struct timerqueue_node *timerqueue_iterate_next( + struct timerqueue_node *node); + +/** + * timerqueue_getnext - Returns the timer with the earlies expiration time + * + * @head: head of timerqueue + * + * Returns a pointer to the timer node that has the + * earliest expiration time. + */ +static inline +struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +{ + return head->next; +} + +static inline void timerqueue_init(struct timerqueue_node *node) +{ + RB_CLEAR_NODE(&node->node); +} + +static inline void timerqueue_init_head(struct timerqueue_head *head) +{ + head->head = RB_ROOT; + head->next = NULL; +} +#endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index a4a90b6726ce..d3e4f87e95c0 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -106,6 +106,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, #define TP_PROTO(args...) args #define TP_ARGS(args...) args +#define TP_CONDITION(args...) args #ifdef CONFIG_TRACEPOINTS @@ -119,12 +120,14 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args) \ +#define __DO_TRACE(tp, proto, args, cond) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ void *__data; \ \ + if (!(cond)) \ + return; \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ @@ -142,7 +145,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. */ -#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ @@ -151,7 +154,8 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, do_trace: \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ - TP_ARGS(data_args)); \ + TP_ARGS(data_args), \ + TP_CONDITION(cond)); \ } \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ @@ -186,7 +190,7 @@ do_trace: \ EXPORT_SYMBOL(__tracepoint_##name) #else /* !CONFIG_TRACEPOINTS */ -#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ static inline void trace_##name(proto) \ { } \ static inline int \ @@ -227,13 +231,20 @@ do_trace: \ * "void *__data, proto" as the callback prototype. */ #define DECLARE_TRACE_NOARGS(name) \ - __DECLARE_TRACE(name, void, , void *__data, __data) + __DECLARE_TRACE(name, void, , 1, void *__data, __data) #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \ PARAMS(void *__data, proto), \ PARAMS(__data, args)) +#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \ + PARAMS(void *__data, proto), \ + PARAMS(__data, args)) + +#define TRACE_EVENT_FLAGS(event, flag) + #endif /* DECLARE_TRACE */ #ifndef TRACE_EVENT @@ -347,11 +358,21 @@ do_trace: \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define DEFINE_EVENT_CONDITION(template, name, proto, \ + args, cond) \ + DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + PARAMS(args), PARAMS(cond)) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN(name, proto, args, struct, \ assign, print, reg, unreg) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_CONDITION(name, proto, args, cond, \ + struct, assign, print) \ + DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + PARAMS(args), PARAMS(cond)) + +#define TRACE_EVENT_FLAGS(event, flag) #endif /* ifdef TRACE_EVENT (see note above) */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0c0771f06bfa..bd257fee6031 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -127,12 +127,20 @@ struct execute_work { .timer = TIMER_INITIALIZER(NULL, 0, 0), \ } +#define __DEFERRED_WORK_INITIALIZER(n, f) { \ + .work = __WORK_INITIALIZER((n).work, (f)), \ + .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \ + } + #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) +#define DECLARE_DEFERRED_WORK(n, f) \ + struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f) + /* * initialize a work item's function pointer */ diff --git a/include/media/wm8775.h b/include/media/wm8775.h index a1c4d417dfa2..60739c5a23ae 100644 --- a/include/media/wm8775.h +++ b/include/media/wm8775.h @@ -32,7 +32,4 @@ #define WM8775_AIN3 4 #define WM8775_AIN4 8 -/* subdev group ID */ -#define WM8775_GID (1 << 0) - #endif diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 1dfab5401511..b0b4eb24d592 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,6 +26,15 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) +#undef TRACE_EVENT_CONDITION +#define TRACE_EVENT_CONDITION(name, proto, args, cond, tstruct, assign, print) \ + TRACE_EVENT(name, \ + PARAMS(proto), \ + PARAMS(args), \ + PARAMS(tstruct), \ + PARAMS(assign), \ + PARAMS(print)) + #undef TRACE_EVENT_FN #define TRACE_EVENT_FN(name, proto, args, tstruct, \ assign, print, reg, unreg) \ @@ -39,6 +48,10 @@ #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_TRACE(name) +#undef DEFINE_EVENT_CONDITION +#define DEFINE_EVENT_CONDITION(template, name, proto, args, cond) \ + DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) + #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) @@ -75,9 +88,11 @@ #undef TRACE_EVENT #undef TRACE_EVENT_FN +#undef TRACE_EVENT_CONDITION #undef DECLARE_EVENT_CLASS #undef DEFINE_EVENT #undef DEFINE_EVENT_PRINT +#undef DEFINE_EVENT_CONDITION #undef TRACE_HEADER_MULTI_READ #undef DECLARE_TRACE diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 286784d69b8f..1bcc2a8c00e2 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -7,16 +7,67 @@ #include <linux/ktime.h> #include <linux/tracepoint.h> -#ifndef _TRACE_POWER_ENUM_ -#define _TRACE_POWER_ENUM_ -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, /* C-State */ - POWER_PSTATE = 2, /* Fequency change or DVFS */ - POWER_SSTATE = 3, /* Suspend */ -}; +DECLARE_EVENT_CLASS(cpu, + + TP_PROTO(unsigned int state, unsigned int cpu_id), + + TP_ARGS(state, cpu_id), + + TP_STRUCT__entry( + __field( u32, state ) + __field( u32, cpu_id ) + ), + + TP_fast_assign( + __entry->state = state; + __entry->cpu_id = cpu_id; + ), + + TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state, + (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(cpu, cpu_idle, + + TP_PROTO(unsigned int state, unsigned int cpu_id), + + TP_ARGS(state, cpu_id) +); + +/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */ +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING + +#define PWR_EVENT_EXIT -1 #endif +DEFINE_EVENT(cpu, cpu_frequency, + + TP_PROTO(unsigned int frequency, unsigned int cpu_id), + + TP_ARGS(frequency, cpu_id) +); + +TRACE_EVENT(machine_suspend, + + TP_PROTO(unsigned int state), + + TP_ARGS(state), + + TP_STRUCT__entry( + __field( u32, state ) + ), + + TP_fast_assign( + __entry->state = state; + ), + + TP_printk("state=%lu", (unsigned long)__entry->state) +); + +/* This code will be removed after deprecation time exceeded (2.6.41) */ +#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED + /* * The power events are used for cpuidle & suspend (power_start, power_end) * and for cpufreq (power_frequency) @@ -75,6 +126,36 @@ TRACE_EVENT(power_end, ); +/* Deprecated dummy functions must be protected against multi-declartion */ +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED + +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; +#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */ + +#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */ + +#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED +#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +/* These dummy declaration have to be ripped out when the deprecated + events get removed */ +static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {}; +static inline void trace_power_end(u64 cpuid) {}; +static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {}; +#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */ + +#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */ + /* * The clock events are used for clock enable/disable and for * clock rate change @@ -153,7 +234,6 @@ DEFINE_EVENT(power_domain, power_domain_target, TP_ARGS(name, state, cpu_id) ); - #endif /* _TRACE_POWER_H */ /* This part must be outside protection */ diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h index fb726ac7caee..5a4c04a75b3d 100644 --- a/include/trace/events/syscalls.h +++ b/include/trace/events/syscalls.h @@ -40,6 +40,8 @@ TRACE_EVENT_FN(sys_enter, syscall_regfunc, syscall_unregfunc ); +TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY) + TRACE_EVENT_FN(sys_exit, TP_PROTO(struct pt_regs *regs, long ret), @@ -62,6 +64,8 @@ TRACE_EVENT_FN(sys_exit, syscall_regfunc, syscall_unregfunc ); +TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY) + #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ #endif /* _TRACE_EVENTS_SYSCALLS_H */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a9377c0083ad..e16610c208c9 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -82,6 +82,10 @@ TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \ PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \ +#undef TRACE_EVENT_FLAGS +#define TRACE_EVENT_FLAGS(name, value) \ + __TRACE_EVENT_FLAGS(name, value) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) @@ -129,6 +133,9 @@ #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) +#undef TRACE_EVENT_FLAGS +#define TRACE_EVENT_FLAGS(event, flag) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* @@ -289,13 +296,19 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + do { \ + mutex_lock(&event_storage_mutex); \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); #undef __dynamic_array #define __dynamic_array(type, item, len) \ diff --git a/init/Kconfig b/init/Kconfig index c9728992a776..8dfd094e6875 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -393,7 +393,6 @@ config PREEMPT_RCU config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -459,6 +458,60 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && TINY_PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which preempted + RCU readers are to be boosted. If you are working with CPU-bound + real-time applications, you should specify a priority higher then + the highest-priority CPU-bound application. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config SRCU_SYNCHRONIZE_DELAY + int "Microseconds to delay before waiting for readers" + range 0 20 + default 10 + help + This option controls how long SRCU delays before entering its + loop waiting on SRCU readers. The purpose of this loop is + to avoid the unconditional context-switch penalty that would + otherwise be incurred if there was an active SRCU reader, + in a manner similar to adaptive locking schemes. This should + be set to be a bit longer than the common-case SRCU read-side + critical-section overhead. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config IKCONFIG @@ -741,6 +794,19 @@ config NET_NS endif # NAMESPACES +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select EVENTFD + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + config MM_OWNER bool diff --git a/init/do_mounts.c b/init/do_mounts.c index 830aaec9c7d5..2b54bef33b55 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -93,7 +93,7 @@ no_match: * * Returns the matching dev_t on success or 0 on failure. */ -static dev_t __init devt_from_partuuid(char *uuid_str) +static dev_t devt_from_partuuid(char *uuid_str) { dev_t res = 0; struct device *dev = NULL; diff --git a/init/main.c b/init/main.c index 8646401f7a0e..ea51770c0170 100644 --- a/init/main.c +++ b/init/main.c @@ -67,6 +67,7 @@ #include <linux/sfi.h> #include <linux/shmem_fs.h> #include <linux/slab.h> +#include <linux/perf_event.h> #include <asm/io.h> #include <asm/bugs.h> @@ -603,6 +604,8 @@ asmlinkage void __init start_kernel(void) "enabled *very* early, fixing it\n"); local_irq_disable(); } + idr_init_cache(); + perf_event_init(); rcu_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ @@ -658,7 +661,6 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); kmemleak_init(); debug_objects_mem_init(); - idr_init_cache(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) @@ -882,6 +884,7 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); + lockup_detector_init(); smp_init(); sched_init_smp(); diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f18491..156cc5556140 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { - struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -198,7 +197,6 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; - unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param) cpu_notify(CPU_DYING | param->mod, param->hcpu); - if (task_cpu(param->caller) == cpu) - move_task_off_dead_cpu(cpu, param->caller); - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); return 0; } @@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { - .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } BUG_ON(cpu_online(cpu)); - /* Wait for it to sleep (leaving idle task). */ + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + * + * Wait for the stop thread to go away. + */ while (!idle_cpu(cpu)) - yield(); + cpu_relax(); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -386,6 +384,14 @@ out: #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; +void __weak arch_disable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_disable_nonboot_cpus_end(void) +{ +} + int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -397,6 +403,7 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -412,6 +419,8 @@ int disable_nonboot_cpus(void) } } + arch_disable_nonboot_cpus_end(); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ diff --git a/kernel/fork.c b/kernel/fork.c index 5447dc7defa9..7d164e25b0f0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (atomic_dec_and_test(&sig->sigcnt)) { + sched_autogroup_exit(sig); free_signal_struct(sig); + } } void __put_task_struct(struct task_struct *tsk) @@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) posix_cpu_timers_init_group(sig); tty_audit_fork(sig); + sched_autogroup_fork(sig); sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; @@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); + put_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: diff --git a/kernel/futex.c b/kernel/futex.c index 40a8777a27d0..3019b92e6917 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* * Priority Inheritance state: */ struct futex_pi_state { @@ -123,6 +131,12 @@ struct futex_q { u32 bitset; }; +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task @@ -283,8 +297,7 @@ again: return 0; } -static inline -void put_futex_key(int fshared, union futex_key *key) +static inline void put_futex_key(union futex_key *key) { drop_futex_key_refs(key); } @@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) /* * Wake up waiters matching bitset queued on this futex (uaddr). */ -static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; } @@ -917,7 +931,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -962,11 +976,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } @@ -996,9 +1010,9 @@ retry_private: double_unlock_hb(hb1, hb2); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: return ret; } @@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @flags: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) + * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. @@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * >=0 - on success, the number of tasks requeued or woken * <0 - on error */ -static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval, - int requeue_pi) +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int drop_count = 0, task_count = 0, ret; @@ -1191,10 +1205,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1216,11 +1230,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -1260,8 +1274,8 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -1269,8 +1283,8 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; default: @@ -1352,9 +1366,9 @@ out_unlock: drop_futex_key_refs(&key1); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: if (pi_state != NULL) free_pi_state(pi_state); @@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, int fshared) + struct task_struct *newowner) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1587,20 +1601,11 @@ handle_fault: goto retry; } -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - static long futex_wait_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex - * @fshared: whether the futex is shared (1) or not (0) * @q: futex_q (contains pi_state and access to the rt_mutex) * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) * @@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart); * 0 - success, lock not taken * <0 - on error (-EFAULT) */ -static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, - int locked) +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { struct task_struct *owner; int ret = 0; @@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * did a lock-steal - fix up the PI-state in that case: */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current, fshared); + ret = fixup_pi_state_owner(uaddr, q, current); goto out; } @@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * lock. Fix the state up. */ owner = rt_mutex_owner(&q->pi_state->pi_mutex); - ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value - * @fshared: whether the futex is shared (1) or not (0) + * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * @@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * 0 - uaddr contains val and hb has been locked * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; @@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, * rare, but normal. */ retry: - q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); if (unlikely(ret != 0)) return ret; @@ -1769,10 +1772,10 @@ retry_private: if (ret) goto out; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); goto retry; } @@ -1783,32 +1786,29 @@ retry_private: out: if (ret) - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); return ret; } -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to = NULL; struct restart_block *restart; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; - - q.pi_state = NULL; q.bitset = bitset; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -1819,7 +1819,7 @@ retry: * Prepare to wait on uaddr. On success, holds hb lock and increments * q.key refs. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out; @@ -1852,12 +1852,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = FLAGS_HAS_TIMEOUT; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + restart->futex.flags = flags; ret = -ERESTART_RESTARTBLOCK; @@ -1873,7 +1868,6 @@ out: static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - int fshared = 0; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { @@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart) tp = &t; } restart->fn = do_no_restart_syscall; - if (restart->futex.flags & FLAGS_SHARED) - fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, - restart->futex.bitset, - restart->futex.flags & FLAGS_CLOCKRT); + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); } @@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int fshared, - int detect, ktime_t *time, int trylock) +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, + ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int res, ret; if (refill_pi_state_cache()) @@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, hrtimer_set_expires(&to->timer, *time); } - q.pi_state = NULL; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); if (unlikely(ret != 0)) goto out; @@ -1941,7 +1929,7 @@ retry_private: * exit to complete. */ queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); cond_resched(); goto retry; default: @@ -1971,7 +1959,7 @@ retry_private: * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr, fshared, &q, !ret); + res = fixup_owner(uaddr, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. @@ -1995,7 +1983,7 @@ out_unlock_put_key: queue_unlock(&q, hb); out_put_key: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out: if (to) destroy_hrtimer_on_stack(&to->timer); @@ -2008,10 +1996,10 @@ uaddr_faulted: if (ret) goto out_put_key; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); goto retry; } @@ -2020,7 +2008,7 @@ uaddr_faulted: * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, int fshared) +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -2038,7 +2026,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -2093,14 +2081,14 @@ retry: out_unlock: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; pi_faulted: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) - * @fshared: whether the futexes are shared (1) or not (0). They must be + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout @@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 0 - On success * <0 - On error */ -static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, - int clockrt, u32 __user *uaddr2) + u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; - union futex_key key2; - struct futex_q q; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; int res, ret; if (!bitset) @@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out; - q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; @@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Prepare to wait on uaddr. On success, increments q.key (key1) ref * count. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out_key2; @@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current, - fshared); + ret = fixup_pi_state_owner(uaddr2, &q, current); spin_unlock(q.lock_ptr); } } else { @@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr2, fshared, &q, !ret); + res = fixup_owner(uaddr2, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. @@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } out_put_keys: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out_key2: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out: if (to) { @@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int clockrt, ret = -ENOSYS; - int cmd = op & FUTEX_CMD_MASK; - int fshared = 0; + int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = 1; + flags |= FLAGS_SHARED; - clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) - return -ENOSYS; + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + ret = futex_wait(uaddr, flags, val, timeout, val3); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, val, val3); + ret = futex_wake(uaddr, flags, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + ret = futex_lock_pi(uaddr, flags, val, timeout, 0); break; case FUTEX_UNLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, fshared); + ret = futex_unlock_pi(uaddr, flags); break; case FUTEX_TRYLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); break; case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, - clockrt, uaddr2); + ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); break; case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 1); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); break; default: ret = -ENOSYS; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..f2429fc3438c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); /* * clock_was_set() has changed base->offset so the @@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct rb_node **link = &base->active.rb_node; - struct rb_node *parent = NULL; - struct hrtimer *entry; - int leftmost = 1; - debug_activate(timer); - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct hrtimer, node); - /* - * We dont care about collisions. Nodes with - * the same expiry time stay together. - */ - if (hrtimer_get_expires_tv64(timer) < - hrtimer_get_expires_tv64(entry)) { - link = &(*link)->rb_left; - } else { - link = &(*link)->rb_right; - leftmost = 0; - } - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - if (leftmost) - base->first = &timer->node; + timerqueue_add(&base->active, &timer->node); - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; - return leftmost; + return (&timer->node == base->active.next); } /* @@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - /* - * Remove the timer from the rbtree and replace the first - * entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); + if (&timer->node == timerqueue_getnext(&base->active)) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - rb_erase(&timer->node, &base->active); + timerqueue_del(&base->active, &timer->node); out: timer->state = newstate; } @@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) @@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &cpu_base->clock_base[clock_id]; hrtimer_init_timer_hres(timer); + timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -1278,14 +1248,14 @@ retry: for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; - struct rb_node *node; + struct timerqueue_node *node; basenow = ktime_add(now, base->offset); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is @@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct rb_node *node; + struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; - - if (!base->first) + if (!timerqueue_getnext(&base->active)) continue; if (gettime) { @@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) raw_spin_lock(&cpu_base->lock); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); if (base->softirq_time.tv64 <= hrtimer_get_expires_tv64(timer)) break; @@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } hrtimer_init_hres(cpu_base); } @@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; - struct rb_node *node; + struct timerqueue_node *node; - while ((node = rb_first(&old_base->active))) { - timer = rb_entry(node, struct hrtimer, node); + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e5325825aeb6..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void) constraints_initialized = 1; - perf_pmu_register(&perf_breakpoint); + perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f952..91a5fa25054e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + static struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); int wake, oneshot = desc->status & IRQ_ONESHOT; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106f..7663e5df0e6f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) return p->pre_handler == aggr_pre_handler; } +/* Return true(!0) if the kprobe is unused */ +static inline int kprobe_unused(struct kprobe *p) +{ + return kprobe_aggrprobe(p) && kprobe_disabled(p) && + list_empty(&p->list); +} + /* * Keep all fields in the kprobe consistent */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) { - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); + memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); } #ifdef CONFIG_OPTPROBES @@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) } } +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + arch_remove_kprobe(p); + kfree(op); +} + /* Return true(!0) if the kprobe is ready for optimization. */ static inline int kprobe_optready(struct kprobe *p) { @@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) return 0; } +/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ +static inline int kprobe_disarmed(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ + if (!kprobe_aggrprobe(p)) + return kprobe_disabled(p); + + op = container_of(p, struct optimized_kprobe, kp); + + return kprobe_disabled(p) && list_empty(&op->list); +} + +/* Return true(!0) if the probe is queued on (un)optimizing lists */ +static int __kprobes kprobe_queued(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + return 1; + } + return 0; +} + /* * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). @@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); +static LIST_HEAD(unoptimizing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +static DECLARE_COMPLETION(optimizer_comp); #define OPTIMIZE_DELAY 5 -/* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) +/* + * Optimize (replace a breakpoint with a jump) kprobes listed on + * optimizing_list. + */ +static __kprobes void do_optimize_kprobes(void) { - struct optimized_kprobe *op, *tmp; - - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); - if (kprobes_all_disarmed || !kprobes_allow_optimization) - goto end; - - /* - * Wait for quiesence period to ensure all running interrupts - * are done. Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. - */ - synchronize_sched(); + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; /* * The optimization/unoptimization refers online_cpus via @@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) */ get_online_cpus(); mutex_lock(&text_mutex); - list_for_each_entry_safe(op, tmp, &optimizing_list, list) { - WARN_ON(kprobe_disabled(&op->kp)); - if (arch_optimize_kprobe(op) < 0) - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - list_del_init(&op->list); + arch_optimize_kprobes(&optimizing_list); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +/* + * Unoptimize (replace a jump with a breakpoint and remove the breakpoint + * if need) kprobes listed on unoptimizing_list. + */ +static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + /* Unoptimization must be done anytime */ + if (list_empty(&unoptimizing_list)) + return; + + /* Ditto to do_optimize_kprobes */ + get_online_cpus(); + mutex_lock(&text_mutex); + arch_unoptimize_kprobes(&unoptimizing_list, free_list); + /* Loop free_list for disarming */ + list_for_each_entry_safe(op, tmp, free_list, list) { + /* Disarm probes if marked disabled */ + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); + if (kprobe_unused(&op->kp)) { + /* + * Remove unused probes from hash list. After waiting + * for synchronization, these probes are reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes.) + */ + hlist_del_rcu(&op->kp.hlist); + } else + list_del_init(&op->list); } mutex_unlock(&text_mutex); put_online_cpus(); -end: +} + +/* Reclaim all kprobes on the free_list */ +static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, free_list, list) { + BUG_ON(!kprobe_unused(&op->kp)); + list_del_init(&op->list); + free_aggr_kprobe(&op->kp); + } +} + +/* Start optimizer after OPTIMIZE_DELAY passed */ +static __kprobes void kick_kprobe_optimizer(void) +{ + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + LIST_HEAD(free_list); + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + + /* + * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) + * kprobes before waiting for quiesence period. + */ + do_unoptimize_kprobes(&free_list); + + /* + * Step 2: Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* Step 3: Optimize kprobes after quiesence period */ + do_optimize_kprobes(); + + /* Step 4: Free cleaned kprobes after quiesence period */ + do_free_cleaned_kprobes(&free_list); + mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + + /* Step 5: Kick optimizer again if needed */ + if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) + kick_kprobe_optimizer(); + else + /* Wake up all waiters */ + complete_all(&optimizer_comp); +} + +/* Wait for completing optimization and unoptimization */ +static __kprobes void wait_for_kprobe_optimizer(void) +{ + if (delayed_work_pending(&optimizing_work)) + wait_for_completion(&optimizer_comp); } /* Optimize kprobe if p is ready to be optimized */ @@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) /* Check if it is already optimized. */ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) return; - op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - list_add(&op->list, &optimizing_list); - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); + + if (!list_empty(&op->list)) + /* This is under unoptimizing. Just dequeue the probe */ + list_del_init(&op->list); + else { + list_add(&op->list, &optimizing_list); + kick_kprobe_optimizer(); + } +} + +/* Short cut to direct unoptimizing */ +static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +{ + get_online_cpus(); + arch_unoptimize_kprobe(op); + put_online_cpus(); + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); } /* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p) +static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; - if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { - op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) - /* Dequeue from the optimization queue */ + if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) + return; /* This is not an optprobe nor optimized */ + + op = container_of(p, struct optimized_kprobe, kp); + if (!kprobe_optimized(p)) { + /* Unoptimized or unoptimizing case */ + if (force && !list_empty(&op->list)) { + /* + * Only if this is unoptimizing kprobe and forced, + * forcibly unoptimize it. (No need to unoptimize + * unoptimized kprobe again :) + */ list_del_init(&op->list); - else - /* Replace jump with break */ - arch_unoptimize_kprobe(op); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + force_unoptimize_kprobe(op); + } + return; + } + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + return; + } + /* Optimized kprobe case */ + if (force) + /* Forcibly update the code: this is a special case */ + force_unoptimize_kprobe(op); + else { + list_add(&op->list, &unoptimizing_list); + kick_kprobe_optimizer(); } } +/* Cancel unoptimizing for reusing */ +static void reuse_unused_kprobe(struct kprobe *ap) +{ + struct optimized_kprobe *op; + + BUG_ON(!kprobe_unused(ap)); + /* + * Unused kprobe MUST be on the way of delayed unoptimizing (means + * there is still a relative jump) and disabled. + */ + op = container_of(ap, struct optimized_kprobe, kp); + if (unlikely(list_empty(&op->list))) + printk(KERN_WARNING "Warning: found a stray unused " + "aggrprobe@%p\n", ap->addr); + /* Enable the probe again */ + ap->flags &= ~KPROBE_FLAG_DISABLED; + /* Optimize it again (remove from op->list) */ + BUG_ON(!kprobe_optready(ap)); + optimize_kprobe(ap); +} + /* Remove optimized instructions */ static void __kprobes kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) { - /* Dequeue from the optimization queue */ + if (!list_empty(&op->list)) + /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - } - /* Don't unoptimize, because the target code will be freed. */ + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } @@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) arch_prepare_optimized_kprobe(op); } -/* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = container_of(p, struct optimized_kprobe, kp); - arch_remove_optimized_kprobe(op); - kfree(op); -} - /* Allocate new optimized_kprobe and try to prepare optimized instructions */ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { @@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ - free_aggr_kprobe(ap); + arch_remove_optimized_kprobe(op); + kfree(op); return; } @@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) return; kprobes_allow_optimization = false; - printk(KERN_INFO "Kprobes globally unoptimized\n"); - get_online_cpus(); /* For avoiding text_mutex deadlock */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!kprobe_disabled(p)) - unoptimize_kprobe(p); + unoptimize_kprobe(p, false); } } - - mutex_unlock(&text_mutex); - put_online_cpus(); - /* Allow all currently running kprobes to complete */ - synchronize_sched(); + /* Wait for unoptimizing completion */ + wait_for_kprobe_optimizer(); + printk(KERN_INFO "Kprobes globally unoptimized\n"); } int sysctl_kprobes_optimization; @@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ +/* Put a breakpoint for a probe. Must be called with text_mutex locked */ static void __kprobes __arm_kprobe(struct kprobe *p) { - struct kprobe *old_p; + struct kprobe *_p; /* Check collision with other optimized kprobes */ - old_p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(old_p)) - unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + _p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(_p)) + /* Fallback to unoptimized kprobe */ + unoptimize_kprobe(_p, true); arch_arm_kprobe(p); optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -static void __kprobes __disarm_kprobe(struct kprobe *p) +/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ +static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) { - struct kprobe *old_p; + struct kprobe *_p; - unoptimize_kprobe(p); /* Try to unoptimize */ - arch_disarm_kprobe(p); + unoptimize_kprobe(p, false); /* Try to unoptimize */ - /* If another kprobe was blocked, optimize it. */ - old_p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(old_p)) - optimize_kprobe(old_p); + if (!kprobe_queued(p)) { + arch_disarm_kprobe(p); + /* If another kprobe was blocked, optimize it. */ + _p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(_p) && reopt) + optimize_kprobe(_p); + } + /* TODO: reoptimize others after unoptimized this probe */ } #else /* !CONFIG_OPTPROBES */ #define optimize_kprobe(p) do {} while (0) -#define unoptimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p, f) do {} while (0) #define kill_optimized_kprobe(p) do {} while (0) #define prepare_optimized_kprobe(p) do {} while (0) #define try_to_optimize_kprobe(p) do {} while (0) #define __arm_kprobe(p) arch_arm_kprobe(p) -#define __disarm_kprobe(p) arch_disarm_kprobe(p) +#define __disarm_kprobe(p, o) arch_disarm_kprobe(p) +#define kprobe_disarmed(p) kprobe_disabled(p) +#define wait_for_kprobe_optimizer() do {} while (0) + +/* There should be no unused kprobes can be reused without optimization */ +static void reuse_unused_kprobe(struct kprobe *ap) +{ + printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + BUG_ON(kprobe_unused(ap)); +} static __kprobes void free_aggr_kprobe(struct kprobe *p) { + arch_remove_kprobe(p); kfree(p); } @@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { - get_online_cpus(); /* For avoiding text_mutex deadlock */ + /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp); + __disarm_kprobe(kp, true); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); if (p->break_handler || p->post_handler) - unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ if (p->break_handler) { if (ap->break_handler) @@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * This is the second or subsequent kprobe at the address - handle * the intricacies */ -static int __kprobes register_aggr_kprobe(struct kprobe *old_p, +static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; - struct kprobe *ap = old_p; + struct kprobe *ap = orig_p; - if (!kprobe_aggrprobe(old_p)) { - /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ - ap = alloc_aggr_kprobe(old_p); + if (!kprobe_aggrprobe(orig_p)) { + /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(orig_p); if (!ap) return -ENOMEM; - init_aggr_kprobe(ap, old_p); - } + init_aggr_kprobe(ap, orig_p); + } else if (kprobe_unused(ap)) + /* This probe is going to die. Rescue it */ + reuse_unused_kprobe(ap); if (kprobe_gone(ap)) { /* @@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, return add_new_kprobe(ap, p); } -/* Try to disable aggr_kprobe, and return 1 if succeeded.*/ -static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (!kprobe_disabled(kp)) - /* - * There is an active probe on the list. - * We can't disable aggr_kprobe. - */ - return 0; - } - p->flags |= KPROBE_FLAG_DISABLED; - return 1; -} - static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; @@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) /* Check passed kprobe is valid and return kprobe in kprobe_table. */ static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) { - struct kprobe *old_p, *list_p; + struct kprobe *ap, *list_p; - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) + ap = get_kprobe(p->addr); + if (unlikely(!ap)) return NULL; - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) + if (p != ap) { + list_for_each_entry_rcu(list_p, &ap->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid; return NULL; } valid: - return old_p; + return ap; } /* Return error if the kprobe is being re-registered */ static inline int check_kprobe_rereg(struct kprobe *p) { int ret = 0; - struct kprobe *old_p; mutex_lock(&kprobe_mutex); - old_p = __get_valid_kprobe(p); - if (old_p) + if (__get_valid_kprobe(p)) ret = -EINVAL; mutex_unlock(&kprobe_mutex); + return ret; } @@ -1229,67 +1403,121 @@ fail_with_jump_label: } EXPORT_SYMBOL_GPL(register_kprobe); +/* Check if all probes on the aggrprobe are disabled */ +static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &ap->list, list) + if (!kprobe_disabled(kp)) + /* + * There is an active probe on the list. + * We can't disable this ap. + */ + return 0; + + return 1; +} + +/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ +static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) +{ + struct kprobe *orig_p; + + /* Get an original kprobe for return */ + orig_p = __get_valid_kprobe(p); + if (unlikely(orig_p == NULL)) + return NULL; + + if (!kprobe_disabled(p)) { + /* Disable probe if it is a child probe */ + if (p != orig_p) + p->flags |= KPROBE_FLAG_DISABLED; + + /* Try to disarm and disable this/parent probe */ + if (p == orig_p || aggr_kprobe_disabled(orig_p)) { + disarm_kprobe(orig_p); + orig_p->flags |= KPROBE_FLAG_DISABLED; + } + } + + return orig_p; +} + /* * Unregister a kprobe without a scheduler synchronization. */ static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct kprobe *old_p, *list_p; + struct kprobe *ap, *list_p; - old_p = __get_valid_kprobe(p); - if (old_p == NULL) + /* Disable kprobe. This will disarm it if needed. */ + ap = __disable_kprobe(p); + if (ap == NULL) return -EINVAL; - if (old_p == p || - (kprobe_aggrprobe(old_p) && - list_is_singular(&old_p->list))) { + if (ap == p) /* - * Only probe on the hash list. Disarm only if kprobes are - * enabled and not gone - otherwise, the breakpoint would - * already have been removed. We save on flushing icache. + * This probe is an independent(and non-optimized) kprobe + * (not an aggrprobe). Remove from the hash list. */ - if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(old_p); - hlist_del_rcu(&old_p->hlist); - } else { + goto disarmed; + + /* Following process expects this probe is an aggrprobe */ + WARN_ON(!kprobe_aggrprobe(ap)); + + if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) + /* + * !disarmed could be happen if the probe is under delayed + * unoptimizing. + */ + goto disarmed; + else { + /* If disabling probe has special handlers, update aggrprobe */ if (p->break_handler && !kprobe_gone(p)) - old_p->break_handler = NULL; + ap->break_handler = NULL; if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry_rcu(list_p, &old_p->list, list) { + list_for_each_entry_rcu(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; } - old_p->post_handler = NULL; + ap->post_handler = NULL; } noclean: + /* + * Remove from the aggrprobe: this path will do nothing in + * __unregister_kprobe_bottom(). + */ list_del_rcu(&p->list); - if (!kprobe_disabled(old_p)) { - try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed) { - if (kprobe_disabled(old_p)) - disarm_kprobe(old_p); - else - /* Try to optimize this probe again */ - optimize_kprobe(old_p); - } - } + if (!kprobe_disabled(ap) && !kprobes_all_disarmed) + /* + * Try to optimize this probe again, because post + * handler may have been changed. + */ + optimize_kprobe(ap); } return 0; + +disarmed: + BUG_ON(!kprobe_disarmed(ap)); + hlist_del_rcu(&ap->hlist); + return 0; } static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct kprobe *old_p; + struct kprobe *ap; if (list_empty(&p->list)) + /* This is an independent kprobe */ arch_remove_kprobe(p); else if (list_is_singular(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); + /* This is the last child of an aggrprobe */ + ap = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); - arch_remove_kprobe(old_p); - free_aggr_kprobe(old_p); + free_aggr_kprobe(ap); } + /* Otherwise, do nothing. */ } int __kprobes register_kprobes(struct kprobe **kps, int num) @@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) int __kprobes disable_kprobe(struct kprobe *kp) { int ret = 0; - struct kprobe *p; mutex_lock(&kprobe_mutex); - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { + /* Disable this kprobe */ + if (__disable_kprobe(kp) == NULL) ret = -EINVAL; - goto out; - } - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. */ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: mutex_unlock(&kprobe_mutex); return ret; } @@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ - if (kprobes_all_disarmed) - goto already_disabled; + if (kprobes_all_disarmed) { + mutex_unlock(&kprobe_mutex); + return; + } kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - /* - * Here we call get_online_cpus() for avoiding text_mutex deadlock, - * because disarming may also unoptimize kprobes. - */ - get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p); + __disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); - put_online_cpus(); mutex_unlock(&kprobe_mutex); - /* Allow all currently running kprobes to complete */ - synchronize_sched(); - return; -already_disabled: - mutex_unlock(&kprobe_mutex); - return; + /* Wait for disarming all kprobes by optimizer */ + wait_for_kprobe_optimizer(); } /* diff --git a/kernel/kthread.c b/kernel/kthread.c index ca61bbdd44b2..5355cfd44a3f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - struct sched_param param = { .sched_priority = 0 }; + static struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d7..1969d2fc4b36 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) namelen += 2; for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contention_point[i] == 0) @@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contention_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contention_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); } for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contending_point[i] == 0) @@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contending_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contending_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); } if (i) { seq_puts(m, "\n"); diff --git a/kernel/module.c b/kernel/module.c index d190664f25ff..34e00b708fad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include <linux/percpu.h> #include <linux/kmemleak.h> #include <linux/jump_label.h> +#include <linux/pfn.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -70,6 +71,26 @@ #define ARCH_SHF_SMALL 0 #endif +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_DEBUG_SET_MODULE_RONX=y + */ +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* + * Given BASE and SIZE this macro calculates the number of pages the + * memory regions occupies + */ +#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ + (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ + PFN_DOWN((unsigned long)BASE) + 1) \ + : (0UL)) + /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) return 0; } +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + */ +void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +{ + unsigned long begin_pfn = PFN_DOWN((unsigned long)start); + unsigned long end_pfn = PFN_DOWN((unsigned long)end); + + if (end_pfn > begin_pfn) + set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +} + +static void set_section_ro_nx(void *base, + unsigned long text_size, + unsigned long ro_size, + unsigned long total_size) +{ + /* begin and end PFNs of the current subsection */ + unsigned long begin_pfn; + unsigned long end_pfn; + + /* + * Set RO for module text and RO-data: + * - Always protect first page. + * - Do not protect last partial page. + */ + if (ro_size > 0) + set_page_attributes(base, base + ro_size, set_memory_ro); + + /* + * Set NX permissions for module data: + * - Do not protect first partial page. + * - Always protect last page. + */ + if (total_size > text_size) { + begin_pfn = PFN_UP((unsigned long)base + text_size); + end_pfn = PFN_UP((unsigned long)base + total_size); + if (end_pfn > begin_pfn) + set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + } +} + +/* Setting memory back to RW+NX before releasing it */ +void unset_section_ro_nx(struct module *mod, void *module_region) +{ + unsigned long total_pages; + + if (mod->module_core == module_region) { + /* Set core as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); + set_memory_nx((unsigned long)mod->module_core, total_pages); + set_memory_rw((unsigned long)mod->module_core, total_pages); + + } else if (mod->module_init == module_region) { + /* Set init as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); + set_memory_nx((unsigned long)mod->module_init, total_pages); + set_memory_rw((unsigned long)mod->module_init, total_pages); + } +} + +/* Iterate through all modules and set each module's text as RW */ +void set_all_modules_text_rw() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_rw); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_rw); + } + } + mutex_unlock(&module_mutex); +} + +/* Iterate through all modules and set each module's text as RO */ +void set_all_modules_text_ro() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_ro); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_ro); + } + } + mutex_unlock(&module_mutex); +} +#else +static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +#endif + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ + unset_section_ro_nx(mod, mod->module_core); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", name); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->core_size = debug_align(mod->core_size); mod->core_text_size = mod->core_size; + break; + case 1: /* RO: text and ro-data */ + mod->core_size = debug_align(mod->core_size); + mod->core_ro_size = mod->core_size; + break; + case 3: /* whole core */ + mod->core_size = debug_align(mod->core_size); + break; + } } DEBUGP("Init section allocation order:\n"); @@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", sname); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->init_size = debug_align(mod->init_size); mod->init_text_size = mod->init_size; + break; + case 1: /* RO: text and ro-data */ + mod->init_size = debug_align(mod->init_size); + mod->init_ro_size = mod->init_size; + break; + case 3: /* whole init */ + mod->init_size = debug_align(mod->init_size); + break; + } } } @@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502f..a5889fb28ecf 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax(); + arch_mutex_cpu_relax(); } #endif spin_lock_mutex(&lock->wait_lock, flags); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2870feee81dd..11847bf1e8cc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> @@ -21,7 +22,9 @@ #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> +#include <linux/reboot.h> #include <linux/vmstat.h> +#include <linux/device.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> @@ -133,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx) } } +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + /* * If we inherit events we want to return the parent event id * to userspace. @@ -312,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + static void perf_group_attach(struct perf_event *event) { - struct perf_event *group_leader = event->group_leader; + struct perf_event *group_leader = event->group_leader, *pos; /* * We can have double attach due to group movement in perf_event_open. @@ -333,6 +433,11 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); } /* @@ -391,7 +496,7 @@ static void perf_group_detach(struct perf_event *event) if (event->group_leader != event) { list_del_init(&event->group_entry); event->group_leader->nr_siblings--; - return; + goto out; } if (!list_empty(&event->group_entry)) @@ -410,6 +515,12 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); } static inline int @@ -1073,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) /* * not supported on inherited events */ - if (event->attr.inherit) + if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); @@ -2289,31 +2400,6 @@ static int perf_release(struct inode *inode, struct file *file) return perf_event_release_kernel(event); } -static int perf_event_read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -2428,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) if (event->state == PERF_EVENT_STATE_ERROR) return 0; - if (count < perf_event_read_size(event)) + if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); @@ -2514,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) int ret = 0; u64 value; - if (!event->attr.sample_period) + if (!is_sampling_event(event)) return -EINVAL; if (copy_from_user(&value, arg, sizeof(value))) @@ -3305,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, } while (len); } +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if (event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) @@ -3312,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; + struct perf_sample_data sample_data; struct { struct perf_event_header header; u64 id; @@ -3338,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle, goto out; have_lost = local_read(&buffer->lost); - if (have_lost) - size += sizeof(lost_event); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } perf_output_get_handle(handle); @@ -3370,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); } return 0; @@ -3407,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, event->ns); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -3603,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header, { u64 sample_type = event->attr.sample_type; - data->type = sample_type; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header); + header->size = sizeof(*header) + event->header_size; header->misc = 0; header->misc |= perf_misc_flags(regs); - if (sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); - - header->size += sizeof(data->ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - - header->size += sizeof(data->tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - data->time = perf_clock(); - - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_event_id(event); - - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = event->id; - - header->size += sizeof(data->stream_id); - } - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); + __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_event_read_size(event); + if (sample_type & PERF_SAMPLE_IP) + data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; @@ -3722,23 +3813,26 @@ perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; + struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, - .size = sizeof(read_event) + perf_event_read_size(event), + .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; + perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } @@ -3768,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; + struct perf_sample_data sample; struct task_struct *task = task_event->task; - int size, ret; + int ret, size = task_event->event_id.header.size; - size = task_event->event_id.header.size; - ret = perf_output_begin(&handle, event, size, 0, 0); + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3785,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; } static int perf_event_task_match(struct perf_event *event) @@ -3900,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event, struct perf_comm_event *comm_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = comm_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); @@ -3912,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event, perf_output_put(&handle, comm_event->event_id); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; } static int perf_event_comm_match(struct perf_event *event) @@ -3957,7 +4067,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); @@ -4038,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_mmap_event *mmap_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = mmap_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); @@ -4050,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->event_id); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; } static int perf_event_mmap_match(struct perf_event *event, @@ -4205,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma) static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; + struct perf_sample_data sample; int ret; struct { @@ -4226,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); if (ret) return; perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } @@ -4246,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + if (!throttle) { hwc->interrupts++; } else { @@ -4391,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!regs) return; - if (!hwc->sample_period) + if (!is_sampling_event(event)) return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) @@ -4554,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; - if (hwc->sample_period) { + if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } @@ -4811,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((event->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - err = perf_trace_init(event); if (err) return err; @@ -4842,7 +4963,7 @@ static struct pmu perf_tracepoint = { static inline void perf_tp_register(void) { - perf_pmu_register(&perf_tracepoint); + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4932,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - s64 period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL_PINNED, 0); - } } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (hwc->sample_period) { + if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -5184,8 +5307,61 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } +static struct idr pmu_idr; + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} -int perf_pmu_register(struct pmu *pmu) +int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5195,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu) if (!pmu->pmu_disable_count) goto unlock; + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + +skip_type: pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) - goto free_pdc; + goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_context *cpuctx; @@ -5245,6 +5446,14 @@ unlock: return ret; +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; @@ -5264,6 +5473,10 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); free_pmu_context(pmu); } @@ -5273,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event) int idx; idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) + goto unlock; + list_for_each_entry_rcu(pmu, &pmus, entry) { int ret = pmu->event_init(event); if (!ret) @@ -5738,6 +5958,12 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + + /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in @@ -6090,6 +6316,12 @@ inherit_event(struct perf_event *parent_event, child_event->overflow_handler = parent_event->overflow_handler; /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + perf_event__id_header_size(child_event); + + /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); @@ -6320,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC static void perf_pmu_rotate_stop(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -6374,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu) static inline void perf_event_exit_cpu(int cpu) { } #endif +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. + */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + static int __cpuinit perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -6402,14 +6654,45 @@ void __init perf_event_init(void) { int ret; + idr_init(&pmu_idr); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent); - perf_pmu_register(&perf_cpu_clock); - perf_pmu_register(&perf_task_clock); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736d..93bd2eb2bc53 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { @@ -619,7 +625,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0d..031d5e3a6197 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/suspend.h> +#include <trace/events/power.h> #include "power.h" @@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (!suspend_ops) return -ENOSYS; + trace_machine_suspend(state); if (suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (suspend_ops->end) suspend_ops->end(); + trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: diff --git a/kernel/printk.c b/kernel/printk.c index a23315dc4498..ab3ffc5b3b64 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending); void printk_tick(void) { - if (__get_cpu_var(printk_pending)) { - __get_cpu_var(printk_pending) = 0; + if (__this_cpu_read(printk_pending)) { + __this_cpu_write(printk_pending, 0); wake_up_interruptible(&log_wait); } } int printk_needs_cpu(int cpu) { - if (unlikely(cpu_is_offline(cpu))) + if (cpu_is_offline(cpu)) printk_tick(); - return per_cpu(printk_pending, cpu); + return __this_cpu_read(printk_pending); } void wake_up_klogd(void) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735342ac..034493724749 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -36,31 +36,16 @@ #include <linux/time.h> #include <linux/cpu.h> -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; +static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +struct rcu_ctrlblk; +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu) { if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu) void rcu_bh_qs(int cpu) { if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user) } /* - * Helper function for rcu_process_callbacks() that operates on the - * specified rcu_ctrlkblk structure. + * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure + * whose grace period has elapsed. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) { struct rcu_head *next, *list; unsigned long flags; + RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) @@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); + local_bh_disable(); list->func(list); + local_bh_enable(); list = next; + RCU_TRACE(cb_count++); } + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); } /* - * Invoke any callbacks whose grace period has completed. + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that was used previously for this purpose. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static int rcu_kthread(void *arg) { - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) { + rcu_process_callbacks(&rcu_sched_ctrlblk); + rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); + } + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); + local_irq_restore(flags); } /* @@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; + RCU_TRACE(rcp->qlen++); local_irq_restore(flags); } @@ -282,7 +308,16 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -void __init rcu_init(void) +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) { - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; } +early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f745ff..015abaea962a 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -22,6 +22,40 @@ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/kthread.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ + RCU_TRACE(long qlen); /* Number of pending CBs. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_TINY_PREEMPT_RCU #include <linux/delay.h> @@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk { struct list_head *gp_tasks; /* Pointer to the first task blocking the */ /* current grace period, or NULL if there */ - /* is not such task. */ + /* is no such task. */ struct list_head *exp_tasks; /* Pointer to first task blocking the */ /* current expedited grace period, or NULL */ /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority-boosted, or NULL if no priority */ + /* boosting is needed. If there is no */ + /* current or expedited grace period, there */ + /* can be no such task. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ u8 gpnum; /* Current grace period. */ u8 gpcpu; /* Last grace period blocked by the CPU. */ u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ +#ifdef CONFIG_RCU_BOOST + s8 boosted_this_gp; /* Has boosting already happened? */ + unsigned long boost_time; /* When to start boosting (jiffies) */ +#endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_TRACE + unsigned long n_grace_periods; +#ifdef CONFIG_RCU_BOOST + unsigned long n_tasks_boosted; + unsigned long n_exp_boosts; + unsigned long n_normal_boosts; + unsigned long n_normal_balk_blkd_tasks; + unsigned long n_normal_balk_gp_tasks; + unsigned long n_normal_balk_boost_tasks; + unsigned long n_normal_balk_boosted; + unsigned long n_normal_balk_notyet; + unsigned long n_normal_balk_nos; + unsigned long n_exp_balk_blkd_tasks; + unsigned long n_exp_balk_nos; +#endif /* #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_TRACE */ }; static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { @@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + return np; +} + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST +static void rcu_initiate_boost_trace(void); +static void rcu_initiate_exp_boost_trace(void); +#endif /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Dump additional statistice for TINY_PREEMPT_RCU. + */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", + rcu_preempt_ctrlblk.rcb.qlen, + rcu_preempt_ctrlblk.n_grace_periods, + rcu_preempt_ctrlblk.gpnum, + rcu_preempt_ctrlblk.gpcpu, + rcu_preempt_ctrlblk.completed, + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], + "N."[!rcu_preempt_ctrlblk.gp_tasks], + "E."[!rcu_preempt_ctrlblk.exp_tasks]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " ttb=%c btg=", + "B."[!rcu_preempt_ctrlblk.boost_tasks]); + switch (rcu_preempt_ctrlblk.boosted_this_gp) { + case -1: + seq_puts(m, "exp"); + break; + case 0: + seq_puts(m, "no"); + break; + case 1: + seq_puts(m, "begun"); + break; + case 2: + seq_puts(m, "done"); + break; + default: + seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); + } + seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + rcu_preempt_ctrlblk.n_tasks_boosted, + rcu_preempt_ctrlblk.n_exp_boosts, + rcu_preempt_ctrlblk.n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", + "normal balk", + rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, + rcu_preempt_ctrlblk.n_normal_balk_boosted, + rcu_preempt_ctrlblk.n_normal_balk_notyet, + rcu_preempt_ctrlblk.n_normal_balk_nos); + seq_printf(m, " exp balk: bt=%lu nos=%lu\n", + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_exp_balk_nos); +#endif /* #ifdef CONFIG_RCU_BOOST */ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +/* + * Carry out RCU priority boosting on the task indicated by ->boost_tasks, + * and advance ->boost_tasks to the next task in the ->blkd_tasks list. + */ +static int rcu_boost(void) +{ + unsigned long flags; + struct rt_mutex mtx; + struct list_head *np; + struct task_struct *t; + + if (rcu_preempt_ctrlblk.boost_tasks == NULL) + return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); + rcu_preempt_ctrlblk.boosted_this_gp++; + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, + rcu_node_entry); + np = rcu_next_node_entry(t); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_local_irq_restore(flags); + rt_mutex_lock(&mtx); + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + rcu_preempt_ctrlblk.boosted_this_gp++; + rt_mutex_unlock(&mtx); + return rcu_preempt_ctrlblk.boost_tasks != NULL; +} + +/* + * Check to see if it is now time to start boosting RCU readers blocking + * the current grace period, and, if so, tell the rcu_kthread_task to + * start boosting them. If there is an expedited boost in progress, + * we wait for it to complete. + * + * If there are no blocked readers blocking the current grace period, + * return 0 to let the caller know, otherwise return 1. Note that this + * return value is independent of whether or not boosting was done. + */ +static int rcu_initiate_boost(void) +{ + if (!rcu_preempt_blocked_readers_cgp()) { + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + return 0; + } + if (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.boosted_this_gp == 0 && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + invoke_rcu_kthread(); + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } else + RCU_TRACE(rcu_initiate_boost_trace()); + return 1; +} + +/* + * Initiate boosting for an expedited grace period. + */ +static void rcu_initiate_expedited_boost(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + rcu_preempt_ctrlblk.boosted_this_gp = -1; + invoke_rcu_kthread(); + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else + RCU_TRACE(rcu_initiate_exp_boost_trace()); + raw_local_irq_restore(flags); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(void) +{ + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; + if (rcu_preempt_ctrlblk.boosted_this_gp > 0) + rcu_preempt_ctrlblk.boosted_this_gp = 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * If there is no RCU priority boosting, we don't boost. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* + * If there is no RCU priority boosting, we don't initiate boosting, + * but we do indicate whether there are blocked readers blocking the + * current grace period. + */ +static int rcu_initiate_boost(void) +{ + return rcu_preempt_blocked_readers_cgp(); +} + +/* + * If there is no RCU priority boosting, we don't initiate expedited boosting. + */ +static void rcu_initiate_expedited_boost(void) +{ +} + +/* + * If there is no RCU priority boosting, nothing to do at grace-period start. + */ +static void rcu_preempt_boost_start_gp(void) +{ +} + +#endif /* else #ifdef CONFIG_RCU_BOOST */ + +/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * in a quiescent state. There might be any number of tasks blocked @@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void) rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + /* If there is no GP then there is nothing more to do. */ + if (!rcu_preempt_gp_in_progress()) + return; /* - * If there is no GP, or if blocked readers are still blocking GP, - * then there is nothing more to do. + * Check up on boosting. If there are no readers blocking the + * current grace period, leave. */ - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + if (rcu_initiate_boost()) return; /* Advance callbacks. */ @@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_blocked_readers_any()) rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) rcu_preempt_ctrlblk.gp_tasks = rcu_preempt_ctrlblk.blkd_tasks.next; + /* Set up for RCU priority boosting. */ + rcu_preempt_boost_start_gp(); + /* If there is no running reader, CPU is done with GP. */ if (!rcu_preempt_running_reader()) rcu_preempt_cpu_qs(); @@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t) */ empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; + np = rcu_next_node_entry(t); list_del(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) rcu_preempt_ctrlblk.exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) + rcu_preempt_ctrlblk.boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&t->rcu_node_entry); /* @@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t) if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_report_exp_done(); } +#ifdef CONFIG_RCU_BOOST + /* Unboost self if was boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ local_irq_restore(flags); } @@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void) /* * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from __rcu_process_callbacks() to + * update, so this is invoked from rcu_process_callbacks() to * handle that case. Of course, it is invoked for all flavors of * RCU, but RCU callbacks can appear only on one of the lists, and * neither ->nexttail nor ->donetail can possibly be NULL, so there @@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) local_irq_save(flags); *rcu_preempt_ctrlblk.nexttail = head; rcu_preempt_ctrlblk.nexttail = &head->next; + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); rcu_preempt_start_gp(); /* checks to see if GP needed. */ local_irq_restore(flags); } @@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void) /* Wait for tail of ->blkd_tasks list to drain. */ if (rcu_preempted_readers_exp()) + rcu_initiate_expedited_boost(); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); @@ -572,6 +857,27 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_TRACE + +/* + * Because preemptible RCU does not exist, it is not necessary to + * dump out its statistics. + */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Because preemptible RCU does not exist, it is never necessary to + * boost preempted RCU readers. + */ +static int rcu_boost(void) +{ + return 0; +} + /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC - #include <linux/kernel_stat.h> /* * During boot, we forgive RCU lockdep issues. After this function is * invoked, we start taking RCU lockdep issues seriously. */ -void rcu_scheduler_starting(void) +void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); rcu_scheduler_active = 1; } #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else /* #ifdef CONFIG_RCU_BOOST */ +#define RCU_BOOST_PRIO 1 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST + +static void rcu_initiate_boost_trace(void) +{ + if (rcu_preempt_ctrlblk.gp_tasks == NULL) + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + else if (rcu_preempt_ctrlblk.boost_tasks != NULL) + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) + rcu_preempt_ctrlblk.n_normal_balk_boosted++; + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) + rcu_preempt_ctrlblk.n_normal_balk_notyet++; + else + rcu_preempt_ctrlblk.n_normal_balk_nos++; +} + +static void rcu_initiate_exp_boost_trace(void) +{ + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + else + rcu_preempt_ctrlblk.n_exp_balk_nos++; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) +{ + unsigned long flags; + + raw_local_irq_save(flags); + rcp->qlen -= n; + raw_local_irq_restore(flags); +} + +/* + * Dump statistics for TINY_RCU, such as they are. + */ +static int show_tiny_stats(struct seq_file *m, void *unused) +{ + show_tiny_preempt_stats(m); + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); + return 0; +} + +static int show_tiny_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_tiny_stats, NULL); +} + +static const struct file_operations show_tiny_stats_fops = { + .owner = THIS_MODULE, + .open = show_tiny_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutiny_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &show_tiny_stats_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutiny_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + +module_init(rcutiny_trace_init); +module_exit(rcutiny_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); +MODULE_LICENSE("GPL"); + +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb2515f..89613f97ff26 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,6 +47,7 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <asm/byteorder.h> +#include <linux/sched.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " @@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff = 0; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -109,6 +119,7 @@ static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; #define RCU_TORTURE_PIPE_LEN 10 @@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_allocerror; +static long n_rcu_torture_boost_afferror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -147,6 +164,16 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +#ifdef CONFIG_RCU_BOOST +#define rcu_can_boost() 1 +#else /* #ifdef CONFIG_RCU_BOOST */ +#define rcu_can_boost() 0 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ + /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ @@ -277,6 +304,7 @@ struct rcu_torture_ops { void (*fqs)(void); int (*stats)(char *page); int irq_capable; + int can_boost; char *name; }; @@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu" }; @@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_sync" }; @@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_expedited" }; @@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { }; /* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (jiffies - oldstarttime > ULONG_MAX / 2) { + schedule_timeout_uninterruptible(1); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (jiffies - endtime > ULONG_MAX / 2) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + return 0; +} + +/* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability * of occurrence of some important types of race conditions. @@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d nt: %ld", + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror, + n_rcu_torture_boost_allocerror, + n_rcu_torture_boost_afferror, + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, n_rcu_torture_timers); - if (atomic_read(&n_rcu_torture_mberror) != 0) + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_allocerror != 0 || + n_rcu_torture_boost_afferror != 0 || + n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { @@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration); } -static struct notifier_block rcutorture_nb = { +static struct notifier_block rcutorture_shutdown_nb = { .notifier_call = rcutorture_shutdown_notify, }; +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + static void rcu_torture_cleanup(void) { @@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_nb); + unregister_reboot_notifier(&rcutorture_shutdown_nb); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) kthread_stop(fqs_task); } fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } /* Wait for all RCU callbacks to fire. */ @@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms("End of test: FAILURE"); + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else - rcu_torture_print_module_parms("End of test: SUCCESS"); + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } static int __init @@ -1242,7 +1464,7 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms("Start of test"); + rcu_torture_print_module_parms(cur_ops, "Start of test"); fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ @@ -1263,6 +1485,12 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_allocerror = 0; + n_rcu_torture_boost_afferror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) atomic_set(&rcu_torture_wcount[i], 0); for_each_possible_cpu(cpu) { @@ -1376,7 +1604,27 @@ rcu_torture_init(void) goto unwind; } } - register_reboot_notifier(&rcutorture_nb); + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + int retval; + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. */ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + register_reboot_notifier(&rcutorture_shutdown_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c47981..d0ddfea6579d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &structname.orphan_cbs_list, \ - .orphan_qlen = 0, \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void) static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { if (rdp->gpnum != rnp->gpnum) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ rdp->gpnum = rnp->gpnum; + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + } else + rdp->qs_pending = 0; } } @@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + + /* + * If we were in an extended quiescent state, we may have + * missed some grace periods that others CPUs handled on + * our behalf. Catch up with this state to avoid noting + * spurious new grace periods. If another grace period + * has started, then rnp->gpnum will have advanced, so + * we will detect this later on. + */ + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + rdp->gpnum = rdp->completed; + + /* + * If RCU does not need a quiescent state from this CPU, + * then make sure that this CPU doesn't go looking for one. + */ + if ((rnp->qsmask & rdp->grpmask) == 0) + rdp->qs_pending = 0; } } @@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the - * specified flavor of RCU. The callbacks will be adopted by the next - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever - * comes first. Because this is invoked from the CPU_DYING notifier, - * irqs are already disabled. + * Move a dying CPU's RCU callbacks to online CPU's callback list. + * Synchronization is not required because this function executes + * in stop_machine() context. */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { int i; + /* current DYING CPU is cleared in the cpu_online_mask */ + int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - *rsp->orphan_cbs_tail = rdp->nxtlist; - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - rsp->orphan_qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ -} - -/* - * Adopt previously orphaned RCU callbacks. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp; - - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = this_cpu_ptr(rsp->rda); - if (rsp->orphan_cbs_list == NULL) { - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - return; - } - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; - rdp->qlen += rsp->orphan_qlen; - rdp->n_cbs_adopted += rsp->orphan_qlen; - rsp->orphan_cbs_list = NULL; - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; - rsp->orphan_qlen = 0; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); - - rcu_adopt_orphan_cbs(rsp); } /* @@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) -{ -} - -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { } @@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ - } - /* * Force the grace period if too many callbacks or too long waiting. * Enforce hysteresis, and don't invoke force_quiescent_state() @@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * is the only one waiting for a grace period to complete. */ if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp, * decrement rcu_barrier_cpu_count -- otherwise the first CPU * might complete its grace period before all of the other CPUs * did their increment, causing this function to return too - * early. + * early. Note that on_each_cpu() disables irqs, which prevents + * any CPUs from coming online or going offline until each online + * CPU has queued its RCU-barrier callback. */ atomic_set(&rcu_barrier_cpu_count, 1); - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ - rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in _rcu_barrier() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(). - * The dying CPU clears its cpu_online_mask bit and - * moves all of its RCU callbacks to ->orphan_cbs_list - * in the context of stop_machine(), so subsequent calls - * to _rcu_barrier() will adopt these callbacks and only - * then queue rcu_barrier_func() on all remaining CPUs. + * The whole machine is "stopped" except this CPU, so we can + * touch any data without introducing corruption. We send the + * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_send_cbs_to_orphanage(&rcu_bh_state); - rcu_send_cbs_to_orphanage(&rcu_sched_state); - rcu_preempt_send_cbs_to_orphanage(); + rcu_send_cbs_to_online(&rcu_bh_state); + rcu_send_cbs_to_online(&rcu_sched_state); + rcu_preempt_send_cbs_to_online(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[0] = RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170c5c13..e8f057e44e3e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -31,46 +31,51 @@ /* * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. * In theory, it should be possible to add more levels straightforwardly. - * In practice, this has not been tested, so there is probably some - * bug somewhere. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. */ #define MAX_RCU_LVLS 4 -#define RCU_FANOUT (CONFIG_RCU_FANOUT) -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT +#if CONFIG_RCU_FANOUT > 16 +#define RCU_FANOUT_LEAF 16 +#else /* #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 # define NUM_RCU_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_SQ +#elif NR_CPUS <= RCU_FANOUT_2 # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_CUBE +#elif NR_CPUS <= RCU_FANOUT_3 # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_FOURTH +#elif NR_CPUS <= RCU_FANOUT_4 # define NUM_RCU_LVLS 4 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_4 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_4 (NR_CPUS) #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT */ +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) @@ -203,8 +208,8 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -309,15 +314,7 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ raw_spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. Also */ - /* protects the following */ - /* orphan_cbs fields. */ - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ - /* orphaned by all CPUs in */ - /* a given leaf rcu_node */ - /* going offline. */ - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ - long orphan_qlen; /* Number of orphaned cbs. */ + /* starting new GP. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_orphanage(void); +static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147473f9..a3638710dc67 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/stop_machine.h> /* * Check the RCU kernel configuration parameters and print informative @@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks to ->orphan_cbs_list. + * Move preemptable RCU's callbacks from dying CPU to other online CPU. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { - rcu_send_cbs_to_orphanage(&rcu_preempt_state); + rcu_send_cbs_to_online(&rcu_preempt_state); } /* @@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) /* * Because there is no preemptable RCU, there are no callbacks to move. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { } @@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifndef CONFIG_SMP + +void synchronize_sched_expedited(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started) - 1; + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ + #if !defined(CONFIG_RCU_FAST_NO_HZ) /* diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b9d122..c8e97853b970 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->orphan_qlen); + rsp->n_force_qs_lh); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { static struct dentry *rcudir; -static int __init rcuclassic_trace_init(void) +static int __init rcutree_trace_init(void) { struct dentry *retval; @@ -337,14 +337,14 @@ free_out: return 1; } -static void __exit rcuclassic_trace_cleanup(void) +static void __exit rcutree_trace_cleanup(void) { debugfs_remove_recursive(rcudir); } -module_init(rcuclassic_trace_init); -module_exit(rcuclassic_trace_cleanup); +module_init(rcutree_trace_init); +module_exit(rcutree_trace_cleanup); MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); diff --git a/kernel/sched.c b/kernel/sched.c index 297d1a0eedb0..04949089e760 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,9 +75,11 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #include "sched_cpupri.h" #include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -253,6 +255,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -268,24 +272,19 @@ struct task_group { struct task_group *parent; struct list_head siblings; struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif }; #define root_task_group init_task_group -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - # define INIT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -342,6 +341,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -360,14 +360,17 @@ struct cfs_rq { unsigned long h_load; /* - * this cpu's part of tg->shares + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq) */ static inline struct task_group *task_group(struct task_struct *p) { + struct task_group *tg; struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); - return container_of(css, struct task_group, css); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -793,20 +799,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; - unsigned long flags; - int i; - - if (!tg->se[0]) - return 0; - - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; - - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } - - if (!rq_weight) - rq_weight = sum_weight; - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); - - local_irq_restore(flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) -{ - s64 elapsed; - u64 now; - - if (root_task_group_empty()) - return; - - now = local_clock(); - elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } -} - static void update_h_load(long cpu) { walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_autogroup.c" #include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" @@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, int dest_cpu) +static bool migrate_task(struct task_struct *p, struct rq *rq) { - struct rq *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. @@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif put_cpu(); } @@ -3549,7 +3418,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) if (task_thread_info(rq->curr) != owner || need_resched()) return 0; - cpu_relax(); + arch_mutex_cpu_relax(); } return 1; @@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { @@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout) { @@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -5056,7 +4925,7 @@ recheck: * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } @@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) @@ -5754,7 +5623,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -5830,7 +5698,7 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, dest_cpu)) { + if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); @@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. */ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } @@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; /* se could be NULL for init_task_group */ @@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, 0); se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -8164,13 +7946,9 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); - + autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8184,7 +7962,6 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED /* * How much cpu bandwidth does init_task_group get? * @@ -8204,16 +7981,13 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#endif + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#endif + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -8293,8 +8067,6 @@ void __init sched_init(void) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_event_init(); - scheduler_running = 1; } @@ -8488,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; @@ -8499,15 +8271,21 @@ err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -8520,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -8578,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; @@ -8588,17 +8362,6 @@ err_free_rq: err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -8609,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) @@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; @@ -9534,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - */ -void synchronize_sched_expedited(void) -{ - int snap, trycount = 0; - - smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 1; - get_online_cpus(); - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - get_online_cpus(); - } - atomic_inc(&synchronize_sched_expedited_count); - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 000000000000..c80fedcd476b --- /dev/null +++ b/kernel/sched_autogroup.c @@ -0,0 +1,238 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +static void autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &init_task_group; + init_task_group.autogroup = &autogroup_default; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +static inline void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&init_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; + tg->autogroup = ag; + + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra refrence added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (*nice < -20 || *nice > 19) + return -EINVAL; + + err = security_task_setnice(current, *nice); + if (err) + return err; + + if (*nice < 0 && !can_nice(current, *nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + if (!err) + ag->nice = *nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 000000000000..5358e241cb20 --- /dev/null +++ b/kernel/sched_autogroup.h @@ -0,0 +1,32 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb1..9d8af0b3fb64 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -static __read_mostly int sched_clock_running; +__read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __read_mostly int sched_clock_stable; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..1dfae3d014b5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) #ifdef CONFIG_FAIR_GROUP_SCHED -static void print_cfs_group_stats(struct seq_file *m, int cpu, - struct task_group *tg) +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; if (!se) @@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif -#ifdef CONFIG_CGROUP_SCHED - { - char path[64]; - - rcu_read_lock(); - cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); - rcu_read_unlock(); - SEQ_printf(m, " %s", path); - } -#endif SEQ_printf(m, "\n"); } @@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } -#if defined(CONFIG_CGROUP_SCHED) && \ - (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) -static void task_group_path(struct task_group *tg, char *buf, int buflen) -{ - /* may be NULL if the underlying cgroup isn't fully-created yet */ - if (!tg->css.cgroup) { - buf[0] = '\0'; - return; - } - cgroup_path(tg->css.cgroup, buf, buflen); -} -#endif - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128]; - struct task_group *tg = cfs_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&cfs_rq->tg->load_weight)); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128]; - struct task_group *tg = rt_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +extern __read_mostly int sched_clock_running; + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = { static int sched_debug_show(struct seq_file *m, void *v) { - u64 now = ktime_to_ns(ktime_get()); + u64 ktime, sched_clk, cpu_clk; + unsigned long flags; int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + local_irq_save(flags); + ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); +#define P(x) \ + SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); #define P(x) \ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - P(jiffies); PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 00ebd7686676..c62ebae65cf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * The exponential sliding window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + static const struct sched_class fair_sched_class; /************************************************************** @@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; +#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ + u64 period = sysctl_sched_shares_window; + u64 now, delta; + unsigned long load = cfs_rq->load.weight; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + + cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; + cfs_rq->load_period += delta; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } + + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } + + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + } + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) { @@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -1909,6 +2071,48 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int update_shares_cpu(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq, 0); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3174,8 +3381,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + update_shares(this_cpu); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a2..68e69acc29b9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9c..c914ec747ca6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + if (head) list_add(&rt_se->run_list, queue); else @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe0..d4d918a91881 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e07500536..98d8c1e80edb 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -31,6 +31,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/delay.h> #include <linux/srcu.h> static int init_srcu_struct_fields(struct srcu_struct *sp) @@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * all srcu_read_lock() calls using the old counters have completed. * Their corresponding critical sections might well be still * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. + * will have finished executing. We initially give readers + * an arbitrarily chosen 10 microseconds to get out of their + * SRCU read-side critical sections, then loop waiting 1/HZ + * seconds per iteration. */ + if (srcu_readers_active_idx(sp, idx)) + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a9..2745dcdb6c6c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); - if (err > 0) + if (err > 0) { proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); + } return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5abfa1518554..ae5cbb1e3ced 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -static int min_sched_shares_ratelimit = 100000; /* 100 usec */ -static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif #ifdef CONFIG_COMPACTION @@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_wakeup_granularity_ns, }, { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, - { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), @@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), @@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "sched_shares_window", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), @@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -745,21 +744,21 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, + .procname = "nmi_watchdog", + .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dowatchdog_enabled, }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_nmi_enabled, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_X86) diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..4b2545a136ff 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, {} }; diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176cc..a9ae369925ce 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/kernel.h> /* * fixed point arithmetic scale factor for skew @@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, int index; int num_samples = sync->num_samples; - if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + if (num_samples > ARRAY_SIZE(buffer)) { samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); if (!samples) { samples = buffer; - num_samples = sizeof(buffer)/sizeof(buffer[0]); + num_samples = ARRAY_SIZE(buffer); } } else { samples = buffer; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f72..5bb86da82003 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,6 +32,8 @@ struct timekeeper { cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ u64 xtime_interval; + /* shifted nano seconds left over when rounding cycle_interval */ + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; @@ -62,7 +64,7 @@ struct timekeeper timekeeper; static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; - u64 tmp; + u64 tmp, ntpinterval; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; + ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) @@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; @@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error -= + (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); return offset; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa92..32a19f9397fc 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, { struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct rb_node *curr; + struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = base->first; + curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = rb_next(curr); + curr = timerqueue_iterate_next(curr); i++; } if (curr) { - timer = rb_entry(curr, struct hrtimer, node); + timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); diff --git a/kernel/timer.c b/kernel/timer.c index 353b9227c2ec..43ca9936f2d0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB to - * indicate whether the timer is deferrable. - * - * A deferrable timer will work normally when the system is busy, but - * will not cause a CPU to come out of idle just to service it; instead, - * the timer will be serviced when the CPU eventually wakes up with a - * subsequent non-deferrable timer. - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = TBASE_MAKE_DEFERRED(timer->base); } static inline void @@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); - -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. */ int try_to_del_timer_sync(struct timer_list *timer) { @@ -973,6 +948,7 @@ out: } EXPORT_SYMBOL(try_to_del_timer_sync); +#ifdef CONFIG_SMP /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent + * hardirq contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP - unsigned long flags; - - local_irq_save(flags); + local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); + local_bh_enable(); #endif - + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq()); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); - set_running_timer(base, timer); + base->running_timer = timer; detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + base->running_timer = NULL; spin_unlock_irq(&base->lock); } @@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); unsigned long expires; /* @@ -1298,7 +1276,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); hrtimer_run_pending(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea37e2ff4164..14674dce77a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -69,6 +69,21 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config EVENT_POWER_TRACING_DEPRECATED + depends on EVENT_TRACING + bool "Deprecated power event trace API, to be removed" + default y + help + Provides old power event types: + C-state/idle accounting events: + power:power_start + power:power_end + and old cpufreq accounting event: + power:power_frequency + This is for userspace compatibility + and will vanish after 5 kernel iterations, + namely 2.6.41. + config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a06161..f55fcf61b223 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); +#ifdef EVENT_POWER_TRACING_DEPRECATED +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +#endif +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ed509a015d8..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; @@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); - } while (len > size); + } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670e..19a359d5e6d5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. + */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret = -ENOMEM; + int ret; int cpu; + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; + ret = -ENOMEM; + list = alloc_percpu(struct hlist_head); if (!list) goto fail; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab1937..35fde09b81de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,6 +27,12 @@ DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac25..4b74d71705c0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + do { \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); #undef __array_desc #define __array_desc(type, container, item, len) \ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..562c56e048fd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/user.c b/kernel/user.c index 2c7d8d5914b1..5c598ca781df 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e3c41a4024c..6e7b575ac33c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "0", 1)) + no_watchdog = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -static int __init spawn_watchdog_task(void) +void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; if (no_watchdog) - return 0; + return; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); WARN_ON(notifier_to_errno(err)); @@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void) cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - return 0; + return; } -early_initcall(spawn_watchdog_task); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28b42b9274d0..2d05adb98401 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -173,7 +173,8 @@ config LOCKUP_DETECTOR An NMI is generated every 60 seconds or so to check for hardlockups. config HARDLOCKUP_DETECTOR - def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI + def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \ + !ARCH_HAS_NMI_WATCHDOG config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" diff --git a/lib/Makefile b/lib/Makefile index 76d3b8514903..d7b6e30a3a1e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ diff --git a/lib/timerqueue.c b/lib/timerqueue.c new file mode 100644 index 000000000000..e3a1050e6820 --- /dev/null +++ b/lib/timerqueue.c @@ -0,0 +1,107 @@ +/* + * Generic Timer-queue + * + * Manages a simple queue of timers, ordered by expiration time. + * Uses rbtrees for quick list adds and expiration. + * + * NOTE: All of the following functions need to be serialized + * to avoid races. No locking is done by this libary code. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/timerqueue.h> +#include <linux/rbtree.h> +#include <linux/module.h> + +/** + * timerqueue_add - Adds timer to timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be added + * + * Adds the timer node to the timerqueue, sorted by the + * node's expires value. + */ +void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +{ + struct rb_node **p = &head->head.rb_node; + struct rb_node *parent = NULL; + struct timerqueue_node *ptr; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerqueue_node, node); + if (node->expires.tv64 < ptr->expires.tv64) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->node, parent, p); + rb_insert_color(&node->node, &head->head); + + if (!head->next || node->expires.tv64 < head->next->expires.tv64) + head->next = node; +} +EXPORT_SYMBOL_GPL(timerqueue_add); + +/** + * timerqueue_del - Removes a timer from the timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be removed + * + * Removes the timer node from the timerqueue. + */ +void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + + /* update next pointer */ + if (head->next == node) { + struct rb_node *rbn = rb_next(&node->node); + + head->next = rbn ? + rb_entry(rbn, struct timerqueue_node, node) : NULL; + } + rb_erase(&node->node, &head->head); + RB_CLEAR_NODE(&node->node); +} +EXPORT_SYMBOL_GPL(timerqueue_del); + +/** + * timerqueue_iterate_next - Returns the timer after the provided timer + * + * @node: Pointer to a timer. + * + * Provides the timer that is after the given node. This is used, when + * necessary, to iterate through the list of timers in a timer list + * without modifying the list. + */ +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + next = rb_next(&node->node); + if (!next) + return NULL; + return container_of(next, struct timerqueue_node, node); +} +EXPORT_SYMBOL_GPL(timerqueue_iterate_next); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7a22b4129211..00bb8a64d028 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1925,19 +1925,18 @@ again: rcu_read_lock(); p = rcu_dereference(mm->owner); - VM_BUG_ON(!p); /* - * because we don't have task_lock(), "p" can exit while - * we're here. In that case, "mem" can point to root - * cgroup but never be NULL. (and task_struct itself is freed - * by RCU, cgroup itself is RCU safe.) Then, we have small - * risk here to get wrong cgroup. But such kind of mis-account - * by race always happens because we don't have cgroup_mutex(). - * It's overkill and we allow that small race, here. + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. */ mem = mem_cgroup_from_task(p); - VM_BUG_ON(!mem); - if (mem_cgroup_is_root(mem)) { + if (!mem || mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; } diff --git a/mm/nommu.c b/mm/nommu.c index 27a9ac588516..ef4045d010d5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -328,6 +328,7 @@ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } +EXPORT_SYMBOL(vmalloc_node); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -440,6 +441,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 5ad25e17b6cb..4eb99ab34053 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -214,17 +214,22 @@ ifdef BUILD_C_RECORDMCOUNT # The empty.o file is created in the make process in order to determine # the target endianness and word size. It is made before all other C # files, including recordmcount. -cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then \ - $(objtree)/scripts/recordmcount "$(@)"; \ - fi; +sub_cmd_record_mcount = \ + if [ $(@) != "scripts/mod/empty.o" ]; then \ + $(objtree)/scripts/recordmcount "$(@)"; \ + fi; else -cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ +sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \ "$(if $(CONFIG_64BIT),64,32)" \ "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \ "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ "$(if $(part-of-module),1,0)" "$(@)"; endif +cmd_record_mcount = \ + if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then \ + $(sub_cmd_record_mcount) \ + fi; endif define rule_cc_o_c diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index b9d9aa18e6d6..5f77dcb8977e 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -140,6 +140,20 @@ struct property *menu_add_prop(enum prop_type type, char *prompt, struct expr *e } if (current_entry->prompt && current_entry != &rootmenu) prop_warn(prop, "prompt redefined"); + + /* Apply all upper menus' visibilities to actual prompts. */ + if(type == P_PROMPT) { + struct menu *menu = current_entry; + + while ((menu = menu->parent) != NULL) { + if (!menu->visibility) + continue; + prop->visible.expr + = expr_alloc_and(prop->visible.expr, + menu->visibility); + } + } + current_entry->prompt = prop; } prop->text = prompt; diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 39580a5dc5df..9f85012acf0d 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -155,6 +155,8 @@ use strict; # '@parameter' - name of a parameter # '%CONST' - name of a constant. +## init lots of data + my $errors = 0; my $warnings = 0; my $anon_struct_union = 0; @@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1", $type_param, "\$1" ); my $blankline_list = ""; -sub usage { - print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; - print " [ -no-doc-sections ]\n"; - print " [ -function funcname [ -function funcname ...] ]\n"; - print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; - print " c source file(s) > outputfile\n"; - print " -v : verbose output, more warnings & other info listed\n"; - exit 1; -} - # read arguments if ($#ARGV == -1) { usage(); } +my $kernelversion; +my $dohighlight = ""; + my $verbose = 0; my $output_mode = "man"; my $no_doc_sections = 0; @@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June', 'November', 'December')[(localtime)[4]] . " " . ((localtime)[5]+1900); -# Essentially these are globals +# Essentially these are globals. # They probably want to be tidied up, made more localised or something. # CAVEAT EMPTOR! Some of the others I localised may not want to be, which # could cause "use of undefined value" or other bugs. @@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) { } } +# continue execution near EOF; + +sub usage { + print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; + print " [ -no-doc-sections ]\n"; + print " [ -function funcname [ -function funcname ...] ]\n"; + print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; + print " c source file(s) > outputfile\n"; + print " -v : verbose output, more warnings & other info listed\n"; + exit 1; +} + # get kernel version from env sub get_kernel_version() { my $version = 'unknown kernel version'; @@ -362,15 +369,6 @@ sub get_kernel_version() { } return $version; } -my $kernelversion = get_kernel_version(); - -# generate a sequence of code that will splice in highlighting information -# using the s// operator. -my $dohighlight = ""; -foreach my $pattern (keys %highlights) { -# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; - $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; -} ## # dumps section contents to arrays/hashes intended for that purpose. @@ -1851,34 +1849,6 @@ sub dump_function($$) { }); } -sub process_file($); - -# Read the file that maps relative names to absolute names for -# separate source and object directories and for shadow trees. -if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { - my ($relname, $absname); - while(<SOURCE_MAP>) { - chop(); - ($relname, $absname) = (split())[0..1]; - $relname =~ s:^/+::; - $source_map{$relname} = $absname; - } - close(SOURCE_MAP); -} - -foreach (@ARGV) { - chomp; - process_file($_); -} -if ($verbose && $errors) { - print STDERR "$errors errors\n"; -} -if ($verbose && $warnings) { - print STDERR "$warnings warnings\n"; -} - -exit($errors); - sub reset_state { $function = ""; %constants = (); @@ -2285,3 +2255,39 @@ sub process_file($) { } } } + + +$kernelversion = get_kernel_version(); + +# generate a sequence of code that will splice in highlighting information +# using the s// operator. +foreach my $pattern (keys %highlights) { +# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; + $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; +} + +# Read the file that maps relative names to absolute names for +# separate source and object directories and for shadow trees. +if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { + my ($relname, $absname); + while(<SOURCE_MAP>) { + chop(); + ($relname, $absname) = (split())[0..1]; + $relname =~ s:^/+::; + $source_map{$relname} = $absname; + } + close(SOURCE_MAP); +} + +foreach (@ARGV) { + chomp; + process_file($_); +} +if ($verbose && $errors) { + print STDERR "$errors errors\n"; +} +if ($verbose && $warnings) { + print STDERR "$warnings warnings\n"; +} + +exit($errors); diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index aef8c0a923ab..d661afbe474c 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -253,6 +253,8 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry, result = security_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, args, &entry->lsm[lsm_rule].rule); + if (!entry->lsm[lsm_rule].rule) + return -EINVAL; return result; } diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c index 46c0d03dbecc..fcb14a099822 100644 --- a/sound/oss/soundcard.c +++ b/sound/oss/soundcard.c @@ -87,7 +87,7 @@ int *load_mixer_volumes(char *name, int *levels, int present) int i, n; for (i = 0; i < num_mixer_volumes; i++) { - if (strcmp(name, mixer_vols[i].name) == 0) { + if (strncmp(name, mixer_vols[i].name, 32) == 0) { if (present) mixer_vols[i].num = i; return mixer_vols[i].levels; @@ -99,7 +99,7 @@ int *load_mixer_volumes(char *name, int *levels, int present) } n = num_mixer_volumes++; - strcpy(mixer_vols[n].name, name); + strncpy(mixer_vols[n].name, name, 32); if (present) mixer_vols[n].num = n; diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index b030c8eba21f..a1c4008af891 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2300,6 +2300,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = { SND_PCI_QUIRK(0x1028, 0x01cc, "Dell D820", POS_FIX_LPIB), SND_PCI_QUIRK(0x1028, 0x01de, "Dell Precision 390", POS_FIX_LPIB), SND_PCI_QUIRK(0x1028, 0x01f6, "Dell Latitude 131L", POS_FIX_LPIB), + SND_PCI_QUIRK(0x1028, 0x0470, "Dell Inspiron 1120", POS_FIX_LPIB), SND_PCI_QUIRK(0x103c, 0x306d, "HP dv3", POS_FIX_LPIB), SND_PCI_QUIRK(0x1043, 0x813d, "ASUS P5AD2", POS_FIX_LPIB), SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c index d63e28773eb1..6447dbb2f123 100644 --- a/sound/soc/codecs/max98088.c +++ b/sound/soc/codecs/max98088.c @@ -40,7 +40,6 @@ struct max98088_cdata { }; struct max98088_priv { - u8 reg_cache[M98088_REG_CNT]; enum max98088_type devtype; void *control_data; struct max98088_pdata *pdata; @@ -1588,7 +1587,7 @@ static int max98088_dai2_set_fmt(struct snd_soc_dai *codec_dai, static void max98088_sync_cache(struct snd_soc_codec *codec) { - struct max98088_priv *max98088 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int i; if (!codec->cache_sync) @@ -1599,14 +1598,14 @@ static void max98088_sync_cache(struct snd_soc_codec *codec) /* write back cached values if they're writeable and * different from the hardware default. */ - for (i = 1; i < ARRAY_SIZE(max98088->reg_cache); i++) { + for (i = 1; i < codec->driver->reg_cache_size; i++) { if (!max98088_access[i].writable) continue; - if (max98088->reg_cache[i] == max98088_reg[i]) + if (reg_cache[i] == max98088_reg[i]) continue; - snd_soc_write(codec, i, max98088->reg_cache[i]); + snd_soc_write(codec, i, reg_cache[i]); } codec->cache_sync = 0; @@ -1951,7 +1950,6 @@ static int max98088_probe(struct snd_soc_codec *codec) int ret = 0; codec->cache_sync = 1; - memcpy(codec->reg_cache, max98088_reg, sizeof(max98088_reg)); ret = snd_soc_codec_set_cache_io(codec, 8, 8, SND_SOC_I2C); if (ret != 0) { diff --git a/sound/soc/codecs/wm8523.c b/sound/soc/codecs/wm8523.c index 9a433a5396cb..deca79ea2b4b 100644 --- a/sound/soc/codecs/wm8523.c +++ b/sound/soc/codecs/wm8523.c @@ -41,7 +41,6 @@ static const char *wm8523_supply_names[WM8523_NUM_SUPPLIES] = { /* codec private data */ struct wm8523_priv { enum snd_soc_control_type control_type; - u16 reg_cache[WM8523_REGISTER_COUNT]; struct regulator_bulk_data supplies[WM8523_NUM_SUPPLIES]; unsigned int sysclk; unsigned int rate_constraint_list[WM8523_NUM_RATES]; @@ -314,6 +313,7 @@ static int wm8523_set_bias_level(struct snd_soc_codec *codec, enum snd_soc_bias_level level) { struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int ret, i; switch (level) { @@ -344,7 +344,7 @@ static int wm8523_set_bias_level(struct snd_soc_codec *codec, /* Sync back default/cached values */ for (i = WM8523_AIF_CTRL1; i < WM8523_MAX_REGISTER; i++) - snd_soc_write(codec, i, wm8523->reg_cache[i]); + snd_soc_write(codec, i, reg_cache[i]); msleep(100); @@ -414,6 +414,7 @@ static int wm8523_resume(struct snd_soc_codec *codec) static int wm8523_probe(struct snd_soc_codec *codec) { struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int ret, i; codec->hw_write = (hw_write_t)i2c_master_send; @@ -470,8 +471,8 @@ static int wm8523_probe(struct snd_soc_codec *codec) } /* Change some default settings - latch VU and enable ZC */ - wm8523->reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU; - wm8523->reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC; + reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU; + reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC; wm8523_set_bias_level(codec, SND_SOC_BIAS_STANDBY); diff --git a/sound/soc/codecs/wm8741.c b/sound/soc/codecs/wm8741.c index 90e31e9aa6f7..aea60ef8aba7 100644 --- a/sound/soc/codecs/wm8741.c +++ b/sound/soc/codecs/wm8741.c @@ -41,7 +41,6 @@ static const char *wm8741_supply_names[WM8741_NUM_SUPPLIES] = { /* codec private data */ struct wm8741_priv { enum snd_soc_control_type control_type; - u16 reg_cache[WM8741_REGISTER_COUNT]; struct regulator_bulk_data supplies[WM8741_NUM_SUPPLIES]; unsigned int sysclk; struct snd_pcm_hw_constraint_list *sysclk_constraints; @@ -422,6 +421,7 @@ static int wm8741_resume(struct snd_soc_codec *codec) static int wm8741_probe(struct snd_soc_codec *codec) { struct wm8741_priv *wm8741 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int ret = 0; ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8741->control_type); @@ -437,10 +437,10 @@ static int wm8741_probe(struct snd_soc_codec *codec) } /* Change some default settings - latch VU */ - wm8741->reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL; - wm8741->reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM; - wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL; - wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERM; + reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL; + reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM; + reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL; + reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERM; snd_soc_add_controls(codec, wm8741_snd_controls, ARRAY_SIZE(wm8741_snd_controls)); diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c index 8f679a13f2bc..87caae59e939 100644 --- a/sound/soc/codecs/wm8753.c +++ b/sound/soc/codecs/wm8753.c @@ -65,22 +65,22 @@ static void wm8753_set_dai_mode(struct snd_soc_codec *codec, * are using 2 wire for device control, so we cache them instead. */ static const u16 wm8753_reg[] = { - 0x0008, 0x0000, 0x000a, 0x000a, - 0x0033, 0x0000, 0x0007, 0x00ff, - 0x00ff, 0x000f, 0x000f, 0x007b, - 0x0000, 0x0032, 0x0000, 0x00c3, - 0x00c3, 0x00c0, 0x0000, 0x0000, + 0x0000, 0x0008, 0x0000, 0x000a, + 0x000a, 0x0033, 0x0000, 0x0007, + 0x00ff, 0x00ff, 0x000f, 0x000f, + 0x007b, 0x0000, 0x0032, 0x0000, + 0x00c3, 0x00c3, 0x00c0, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0055, - 0x0005, 0x0050, 0x0055, 0x0050, - 0x0055, 0x0050, 0x0055, 0x0079, - 0x0079, 0x0079, 0x0079, 0x0079, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0097, 0x0097, 0x0000, 0x0004, - 0x0000, 0x0083, 0x0024, 0x01ba, - 0x0000, 0x0083, 0x0024, 0x01ba, - 0x0000, 0x0000, 0x0000 + 0x0055, 0x0005, 0x0050, 0x0055, + 0x0050, 0x0055, 0x0050, 0x0055, + 0x0079, 0x0079, 0x0079, 0x0079, + 0x0079, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0097, 0x0097, 0x0000, + 0x0004, 0x0000, 0x0083, 0x0024, + 0x01ba, 0x0000, 0x0083, 0x0024, + 0x01ba, 0x0000, 0x0000, 0x0000 }; /* codec private data */ @@ -88,57 +88,10 @@ struct wm8753_priv { enum snd_soc_control_type control_type; unsigned int sysclk; unsigned int pcmclk; - u16 reg_cache[ARRAY_SIZE(wm8753_reg)]; int dai_func; }; -/* - * read wm8753 register cache - */ -static inline unsigned int wm8753_read_reg_cache(struct snd_soc_codec *codec, - unsigned int reg) -{ - u16 *cache = codec->reg_cache; - if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1)) - return -1; - return cache[reg - 1]; -} - -/* - * write wm8753 register cache - */ -static inline void wm8753_write_reg_cache(struct snd_soc_codec *codec, - unsigned int reg, unsigned int value) -{ - u16 *cache = codec->reg_cache; - if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1)) - return; - cache[reg - 1] = value; -} - -/* - * write to the WM8753 register space - */ -static int wm8753_write(struct snd_soc_codec *codec, unsigned int reg, - unsigned int value) -{ - u8 data[2]; - - /* data is - * D15..D9 WM8753 register offset - * D8...D0 register data - */ - data[0] = (reg << 1) | ((value >> 8) & 0x0001); - data[1] = value & 0x00ff; - - wm8753_write_reg_cache(codec, reg, value); - if (codec->hw_write(codec->control_data, data, 2) == 2) - return 0; - else - return -EIO; -} - -#define wm8753_reset(c) wm8753_write(c, WM8753_RESET, 0) +#define wm8753_reset(c) snd_soc_write(c, WM8753_RESET, 0) /* * WM8753 Controls @@ -218,7 +171,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); - int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL); + int mode = snd_soc_read(codec, WM8753_IOCTL); ucontrol->value.integer.value[0] = (mode & 0xc) >> 2; return 0; @@ -228,7 +181,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); - int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL); + int mode = snd_soc_read(codec, WM8753_IOCTL); struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); if (((mode & 0xc) >> 2) == ucontrol->value.integer.value[0]) @@ -738,17 +691,17 @@ static int wm8753_set_dai_pll(struct snd_soc_dai *codec_dai, int pll_id, if (pll_id == WM8753_PLL1) { offset = 0; enable = 0x10; - reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xffef; + reg = snd_soc_read(codec, WM8753_CLOCK) & 0xffef; } else { offset = 4; enable = 0x8; - reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfff7; + reg = snd_soc_read(codec, WM8753_CLOCK) & 0xfff7; } if (!freq_in || !freq_out) { /* disable PLL */ - wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0026); - wm8753_write(codec, WM8753_CLOCK, reg); + snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0026); + snd_soc_write(codec, WM8753_CLOCK, reg); return 0; } else { u16 value = 0; @@ -759,20 +712,20 @@ static int wm8753_set_dai_pll(struct snd_soc_dai *codec_dai, int pll_id, /* set up N and K PLL divisor ratios */ /* bits 8:5 = PLL_N, bits 3:0 = PLL_K[21:18] */ value = (pll_div.n << 5) + ((pll_div.k & 0x3c0000) >> 18); - wm8753_write(codec, WM8753_PLL1CTL2 + offset, value); + snd_soc_write(codec, WM8753_PLL1CTL2 + offset, value); /* bits 8:0 = PLL_K[17:9] */ value = (pll_div.k & 0x03fe00) >> 9; - wm8753_write(codec, WM8753_PLL1CTL3 + offset, value); + snd_soc_write(codec, WM8753_PLL1CTL3 + offset, value); /* bits 8:0 = PLL_K[8:0] */ value = pll_div.k & 0x0001ff; - wm8753_write(codec, WM8753_PLL1CTL4 + offset, value); + snd_soc_write(codec, WM8753_PLL1CTL4 + offset, value); /* set PLL as input and enable */ - wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 | + snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 | (pll_div.div2 << 3)); - wm8753_write(codec, WM8753_CLOCK, reg | enable); + snd_soc_write(codec, WM8753_CLOCK, reg | enable); } return 0; } @@ -879,7 +832,7 @@ static int wm8753_vdac_adc_set_dai_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt) { struct snd_soc_codec *codec = codec_dai->codec; - u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01ec; + u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01ec; /* interface format */ switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { @@ -901,7 +854,7 @@ static int wm8753_vdac_adc_set_dai_fmt(struct snd_soc_dai *codec_dai, return -EINVAL; } - wm8753_write(codec, WM8753_PCM, voice); + snd_soc_write(codec, WM8753_PCM, voice); return 0; } @@ -922,8 +875,8 @@ static int wm8753_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_codec *codec = rtd->codec; struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); - u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01f3; - u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x017f; + u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01f3; + u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x017f; /* bit size */ switch (params_format(params)) { @@ -943,9 +896,9 @@ static int wm8753_pcm_hw_params(struct snd_pcm_substream *substream, /* sample rate */ if (params_rate(params) * 384 == wm8753->pcmclk) srate |= 0x80; - wm8753_write(codec, WM8753_SRATE1, srate); + snd_soc_write(codec, WM8753_SRATE1, srate); - wm8753_write(codec, WM8753_PCM, voice); + snd_soc_write(codec, WM8753_PCM, voice); return 0; } @@ -958,8 +911,8 @@ static int wm8753_pcm_set_dai_fmt(struct snd_soc_dai *codec_dai, struct snd_soc_codec *codec = codec_dai->codec; u16 voice, ioctl; - voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x011f; - ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x015d; + voice = snd_soc_read(codec, WM8753_PCM) & 0x011f; + ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x015d; /* set master/slave audio interface */ switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { @@ -1013,8 +966,8 @@ static int wm8753_pcm_set_dai_fmt(struct snd_soc_dai *codec_dai, return -EINVAL; } - wm8753_write(codec, WM8753_PCM, voice); - wm8753_write(codec, WM8753_IOCTL, ioctl); + snd_soc_write(codec, WM8753_PCM, voice); + snd_soc_write(codec, WM8753_IOCTL, ioctl); return 0; } @@ -1026,16 +979,16 @@ static int wm8753_set_dai_clkdiv(struct snd_soc_dai *codec_dai, switch (div_id) { case WM8753_PCMDIV: - reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0x003f; - wm8753_write(codec, WM8753_CLOCK, reg | div); + reg = snd_soc_read(codec, WM8753_CLOCK) & 0x003f; + snd_soc_write(codec, WM8753_CLOCK, reg | div); break; case WM8753_BCLKDIV: - reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x01c7; - wm8753_write(codec, WM8753_SRATE2, reg | div); + reg = snd_soc_read(codec, WM8753_SRATE2) & 0x01c7; + snd_soc_write(codec, WM8753_SRATE2, reg | div); break; case WM8753_VXCLKDIV: - reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x003f; - wm8753_write(codec, WM8753_SRATE2, reg | div); + reg = snd_soc_read(codec, WM8753_SRATE2) & 0x003f; + snd_soc_write(codec, WM8753_SRATE2, reg | div); break; default: return -EINVAL; @@ -1050,7 +1003,7 @@ static int wm8753_hdac_set_dai_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt) { struct snd_soc_codec *codec = codec_dai->codec; - u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01e0; + u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01e0; /* interface format */ switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { @@ -1072,7 +1025,7 @@ static int wm8753_hdac_set_dai_fmt(struct snd_soc_dai *codec_dai, return -EINVAL; } - wm8753_write(codec, WM8753_HIFI, hifi); + snd_soc_write(codec, WM8753_HIFI, hifi); return 0; } @@ -1085,8 +1038,8 @@ static int wm8753_i2s_set_dai_fmt(struct snd_soc_dai *codec_dai, struct snd_soc_codec *codec = codec_dai->codec; u16 ioctl, hifi; - hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x011f; - ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x00ae; + hifi = snd_soc_read(codec, WM8753_HIFI) & 0x011f; + ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x00ae; /* set master/slave audio interface */ switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { @@ -1140,8 +1093,8 @@ static int wm8753_i2s_set_dai_fmt(struct snd_soc_dai *codec_dai, return -EINVAL; } - wm8753_write(codec, WM8753_HIFI, hifi); - wm8753_write(codec, WM8753_IOCTL, ioctl); + snd_soc_write(codec, WM8753_HIFI, hifi); + snd_soc_write(codec, WM8753_IOCTL, ioctl); return 0; } @@ -1162,8 +1115,8 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream, struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_codec *codec = rtd->codec; struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); - u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x01c0; - u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01f3; + u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x01c0; + u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01f3; int coeff; /* is digital filter coefficient valid ? */ @@ -1172,7 +1125,7 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream, printk(KERN_ERR "wm8753 invalid MCLK or rate\n"); return coeff; } - wm8753_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) | + snd_soc_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) | coeff_div[coeff].usb); /* bit size */ @@ -1190,7 +1143,7 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream, break; } - wm8753_write(codec, WM8753_HIFI, hifi); + snd_soc_write(codec, WM8753_HIFI, hifi); return 0; } @@ -1201,8 +1154,8 @@ static int wm8753_mode1v_set_dai_fmt(struct snd_soc_dai *codec_dai, u16 clock; /* set clk source as pcmclk */ - clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb; - wm8753_write(codec, WM8753_CLOCK, clock); + clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb; + snd_soc_write(codec, WM8753_CLOCK, clock); if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0) return -EINVAL; @@ -1224,8 +1177,8 @@ static int wm8753_mode2_set_dai_fmt(struct snd_soc_dai *codec_dai, u16 clock; /* set clk source as pcmclk */ - clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb; - wm8753_write(codec, WM8753_CLOCK, clock); + clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb; + snd_soc_write(codec, WM8753_CLOCK, clock); if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0) return -EINVAL; @@ -1239,8 +1192,8 @@ static int wm8753_mode3_4_set_dai_fmt(struct snd_soc_dai *codec_dai, u16 clock; /* set clk source as mclk */ - clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb; - wm8753_write(codec, WM8753_CLOCK, clock | 0x4); + clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb; + snd_soc_write(codec, WM8753_CLOCK, clock | 0x4); if (wm8753_hdac_set_dai_fmt(codec_dai, fmt) < 0) return -EINVAL; @@ -1252,19 +1205,19 @@ static int wm8753_mode3_4_set_dai_fmt(struct snd_soc_dai *codec_dai, static int wm8753_mute(struct snd_soc_dai *dai, int mute) { struct snd_soc_codec *codec = dai->codec; - u16 mute_reg = wm8753_read_reg_cache(codec, WM8753_DAC) & 0xfff7; + u16 mute_reg = snd_soc_read(codec, WM8753_DAC) & 0xfff7; struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); /* the digital mute covers the HiFi and Voice DAC's on the WM8753. * make sure we check if they are not both active when we mute */ if (mute && wm8753->dai_func == 1) { if (!codec->active) - wm8753_write(codec, WM8753_DAC, mute_reg | 0x8); + snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8); } else { if (mute) - wm8753_write(codec, WM8753_DAC, mute_reg | 0x8); + snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8); else - wm8753_write(codec, WM8753_DAC, mute_reg); + snd_soc_write(codec, WM8753_DAC, mute_reg); } return 0; @@ -1273,23 +1226,23 @@ static int wm8753_mute(struct snd_soc_dai *dai, int mute) static int wm8753_set_bias_level(struct snd_soc_codec *codec, enum snd_soc_bias_level level) { - u16 pwr_reg = wm8753_read_reg_cache(codec, WM8753_PWR1) & 0xfe3e; + u16 pwr_reg = snd_soc_read(codec, WM8753_PWR1) & 0xfe3e; switch (level) { case SND_SOC_BIAS_ON: /* set vmid to 50k and unmute dac */ - wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x00c0); + snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x00c0); break; case SND_SOC_BIAS_PREPARE: /* set vmid to 5k for quick power up */ - wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x01c1); + snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x01c1); break; case SND_SOC_BIAS_STANDBY: /* mute dac and set vmid to 500k, enable VREF */ - wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x0141); + snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x0141); break; case SND_SOC_BIAS_OFF: - wm8753_write(codec, WM8753_PWR1, 0x0001); + snd_soc_write(codec, WM8753_PWR1, 0x0001); break; } codec->bias_level = level; @@ -1477,7 +1430,7 @@ static void wm8753_set_dai_mode(struct snd_soc_codec *codec, else dai->driver = &wm8753_all_dai[(wm8753->dai_func << 1) + 1]; } - wm8753_write(codec, WM8753_IOCTL, wm8753->dai_func); + snd_soc_write(codec, WM8753_IOCTL, wm8753->dai_func); } static void wm8753_work(struct work_struct *work) @@ -1495,22 +1448,19 @@ static int wm8753_suspend(struct snd_soc_codec *codec, pm_message_t state) static int wm8753_resume(struct snd_soc_codec *codec) { + u16 *reg_cache = codec->reg_cache; int i; - u8 data[2]; - u16 *cache = codec->reg_cache; /* Sync reg_cache with the hardware */ - for (i = 0; i < ARRAY_SIZE(wm8753_reg); i++) { - if (i + 1 == WM8753_RESET) + for (i = 1; i < ARRAY_SIZE(wm8753_reg); i++) { + if (i == WM8753_RESET) continue; /* No point in writing hardware default values back */ - if (cache[i] == wm8753_reg[i]) + if (reg_cache[i] == wm8753_reg[i]) continue; - data[0] = ((i + 1) << 1) | ((cache[i] >> 8) & 0x0001); - data[1] = cache[i] & 0x00ff; - codec->hw_write(codec->control_data, data, 2); + snd_soc_write(codec, i, reg_cache[i]); } wm8753_set_bias_level(codec, SND_SOC_BIAS_STANDBY); @@ -1548,7 +1498,7 @@ static int run_delayed_work(struct delayed_work *dwork) static int wm8753_probe(struct snd_soc_codec *codec) { struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); - int ret = 0, reg; + int ret; INIT_DELAYED_WORK(&codec->delayed_work, wm8753_work); @@ -1573,26 +1523,16 @@ static int wm8753_probe(struct snd_soc_codec *codec) msecs_to_jiffies(caps_charge)); /* set the update bits */ - reg = wm8753_read_reg_cache(codec, WM8753_LDAC); - wm8753_write(codec, WM8753_LDAC, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_RDAC); - wm8753_write(codec, WM8753_RDAC, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_LADC); - wm8753_write(codec, WM8753_LADC, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_RADC); - wm8753_write(codec, WM8753_RADC, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_LOUT1V); - wm8753_write(codec, WM8753_LOUT1V, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_ROUT1V); - wm8753_write(codec, WM8753_ROUT1V, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_LOUT2V); - wm8753_write(codec, WM8753_LOUT2V, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_ROUT2V); - wm8753_write(codec, WM8753_ROUT2V, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_LINVOL); - wm8753_write(codec, WM8753_LINVOL, reg | 0x0100); - reg = wm8753_read_reg_cache(codec, WM8753_RINVOL); - wm8753_write(codec, WM8753_RINVOL, reg | 0x0100); + snd_soc_update_bits(codec, WM8753_LDAC, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_RDAC, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_LDAC, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_RDAC, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_LOUT1V, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_ROUT1V, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_LOUT2V, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_ROUT2V, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_LINVOL, 0x0100, 0x0100); + snd_soc_update_bits(codec, WM8753_RINVOL, 0x0100, 0x0100); snd_soc_add_controls(codec, wm8753_snd_controls, ARRAY_SIZE(wm8753_snd_controls)); diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c index 9001cc48ba13..1ec12eff0620 100644 --- a/sound/soc/codecs/wm8904.c +++ b/sound/soc/codecs/wm8904.c @@ -50,8 +50,6 @@ static const char *wm8904_supply_names[WM8904_NUM_SUPPLIES] = { /* codec private data */ struct wm8904_priv { - u16 reg_cache[WM8904_MAX_REGISTER + 1]; - enum wm8904_type devtype; void *control_data; @@ -2094,7 +2092,7 @@ static int wm8904_digital_mute(struct snd_soc_dai *codec_dai, int mute) static void wm8904_sync_cache(struct snd_soc_codec *codec) { - struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int i; if (!codec->cache_sync) @@ -2105,14 +2103,14 @@ static void wm8904_sync_cache(struct snd_soc_codec *codec) /* Sync back cached values if they're different from the * hardware default. */ - for (i = 1; i < ARRAY_SIZE(wm8904->reg_cache); i++) { + for (i = 1; i < codec->driver->reg_cache_size; i++) { if (!wm8904_access[i].writable) continue; - if (wm8904->reg_cache[i] == wm8904_reg[i]) + if (reg_cache[i] == wm8904_reg[i]) continue; - snd_soc_write(codec, i, wm8904->reg_cache[i]); + snd_soc_write(codec, i, reg_cache[i]); } codec->cache_sync = 0; @@ -2371,6 +2369,7 @@ static int wm8904_probe(struct snd_soc_codec *codec) { struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); struct wm8904_pdata *pdata = wm8904->pdata; + u16 *reg_cache = codec->reg_cache; int ret, i; codec->cache_sync = 1; @@ -2437,19 +2436,19 @@ static int wm8904_probe(struct snd_soc_codec *codec) } /* Change some default settings - latch VU and enable ZC */ - wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU; - wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU; - wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU; - wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU; - wm8904->reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU | + reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU; + reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU; + reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU; + reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU; + reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU | WM8904_HPOUTLZC; - wm8904->reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU | + reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU | WM8904_HPOUTRZC; - wm8904->reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU | + reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU | WM8904_LINEOUTLZC; - wm8904->reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU | + reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU | WM8904_LINEOUTRZC; - wm8904->reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE; + reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE; /* Apply configuration from the platform data. */ if (wm8904->pdata) { @@ -2457,23 +2456,23 @@ static int wm8904_probe(struct snd_soc_codec *codec) if (!pdata->gpio_cfg[i]) continue; - wm8904->reg_cache[WM8904_GPIO_CONTROL_1 + i] + reg_cache[WM8904_GPIO_CONTROL_1 + i] = pdata->gpio_cfg[i] & 0xffff; } /* Zero is the default value for these anyway */ for (i = 0; i < WM8904_MIC_REGS; i++) - wm8904->reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i] + reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i] = pdata->mic_cfg[i]; } /* Set Class W by default - this will be managed by the Class * G widget at runtime where bypass paths are available. */ - wm8904->reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR; + reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR; /* Use normal bias source */ - wm8904->reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL; + reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL; wm8904_set_bias_level(codec, SND_SOC_BIAS_STANDBY); diff --git a/sound/soc/codecs/wm8940.c b/sound/soc/codecs/wm8940.c index 2cb16f895c46..23086e2c976a 100644 --- a/sound/soc/codecs/wm8940.c +++ b/sound/soc/codecs/wm8940.c @@ -768,6 +768,7 @@ static __devinit int wm8940_i2c_probe(struct i2c_client *i2c, i2c_set_clientdata(i2c, wm8940); wm8940->control_data = i2c; + wm8940->control_type = SND_SOC_I2C; ret = snd_soc_register_codec(&i2c->dev, &soc_codec_dev_wm8940, &wm8940_dai, 1); diff --git a/sound/soc/codecs/wm8955.c b/sound/soc/codecs/wm8955.c index 9cbab8e1de01..2ac35b0be86a 100644 --- a/sound/soc/codecs/wm8955.c +++ b/sound/soc/codecs/wm8955.c @@ -42,8 +42,6 @@ static const char *wm8955_supply_names[WM8955_NUM_SUPPLIES] = { struct wm8955_priv { enum snd_soc_control_type control_type; - u16 reg_cache[WM8955_MAX_REGISTER + 1]; - unsigned int mclk_rate; int deemph; @@ -768,6 +766,7 @@ static int wm8955_set_bias_level(struct snd_soc_codec *codec, enum snd_soc_bias_level level) { struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int ret, i; switch (level) { @@ -800,14 +799,14 @@ static int wm8955_set_bias_level(struct snd_soc_codec *codec, /* Sync back cached values if they're * different from the hardware default. */ - for (i = 0; i < ARRAY_SIZE(wm8955->reg_cache); i++) { + for (i = 0; i < codec->driver->reg_cache_size; i++) { if (i == WM8955_RESET) continue; - if (wm8955->reg_cache[i] == wm8955_reg[i]) + if (reg_cache[i] == wm8955_reg[i]) continue; - snd_soc_write(codec, i, wm8955->reg_cache[i]); + snd_soc_write(codec, i, reg_cache[i]); } /* Enable VREF and VMID */ @@ -902,6 +901,7 @@ static int wm8955_probe(struct snd_soc_codec *codec) { struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec); struct wm8955_pdata *pdata = dev_get_platdata(codec->dev); + u16 *reg_cache = codec->reg_cache; int ret, i; ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8955->control_type); @@ -934,25 +934,25 @@ static int wm8955_probe(struct snd_soc_codec *codec) } /* Change some default settings - latch VU and enable ZC */ - wm8955->reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU; - wm8955->reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU; - wm8955->reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC; - wm8955->reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC; - wm8955->reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC; - wm8955->reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC; - wm8955->reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC; + reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU; + reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU; + reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC; + reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC; + reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC; + reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC; + reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC; /* Also enable adaptive bass boost by default */ - wm8955->reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB; + reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB; /* Set platform data values */ if (pdata) { if (pdata->out2_speaker) - wm8955->reg_cache[WM8955_ADDITIONAL_CONTROL_2] + reg_cache[WM8955_ADDITIONAL_CONTROL_2] |= WM8955_ROUT2INV; if (pdata->monoin_diff) - wm8955->reg_cache[WM8955_MONO_OUT_MIX_1] + reg_cache[WM8955_MONO_OUT_MIX_1] |= WM8955_DMEN; } @@ -1003,6 +1003,7 @@ static __devinit int wm8955_i2c_probe(struct i2c_client *i2c, return -ENOMEM; i2c_set_clientdata(i2c, wm8955); + wm8955->control_type = SND_SOC_I2C; ret = snd_soc_register_codec(&i2c->dev, &soc_codec_dev_wm8955, &wm8955_dai, 1); diff --git a/sound/soc/codecs/wm8960.c b/sound/soc/codecs/wm8960.c index 21986c42272f..ff6ff2f529d2 100644 --- a/sound/soc/codecs/wm8960.c +++ b/sound/soc/codecs/wm8960.c @@ -1013,6 +1013,7 @@ static __devinit int wm8960_i2c_probe(struct i2c_client *i2c, return -ENOMEM; i2c_set_clientdata(i2c, wm8960); + wm8960->control_type = SND_SOC_I2C; wm8960->control_data = i2c; ret = snd_soc_register_codec(&i2c->dev, diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c index 1304ca91a11c..7c421cc837bd 100644 --- a/sound/soc/codecs/wm8962.c +++ b/sound/soc/codecs/wm8962.c @@ -52,8 +52,6 @@ static const char *wm8962_supply_names[WM8962_NUM_SUPPLIES] = { struct wm8962_priv { struct snd_soc_codec *codec; - u16 reg_cache[WM8962_MAX_REGISTER + 1]; - int sysclk; int sysclk_rate; @@ -1991,8 +1989,7 @@ static int wm8962_put_hp_sw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); - struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec); - u16 *reg_cache = wm8962->reg_cache; + u16 *reg_cache = codec->reg_cache; int ret; /* Apply the update (if any) */ @@ -2020,8 +2017,7 @@ static int wm8962_put_spk_sw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); - struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec); - u16 *reg_cache = wm8962->reg_cache; + u16 *reg_cache = codec->reg_cache; int ret; /* Apply the update (if any) */ @@ -2329,8 +2325,7 @@ static int out_pga_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { struct snd_soc_codec *codec = w->codec; - struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec); - u16 *reg_cache = wm8962->reg_cache; + u16 *reg_cache = codec->reg_cache; int reg; switch (w->shift) { @@ -2719,7 +2714,7 @@ static int wm8962_add_widgets(struct snd_soc_codec *codec) static void wm8962_sync_cache(struct snd_soc_codec *codec) { - struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int i; if (!codec->cache_sync) @@ -2732,13 +2727,13 @@ static void wm8962_sync_cache(struct snd_soc_codec *codec) /* Sync back cached values if they're different from the * hardware default. */ - for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) { + for (i = 1; i < codec->driver->reg_cache_size; i++) { if (i == WM8962_SOFTWARE_RESET) continue; - if (wm8962->reg_cache[i] == wm8962_reg[i]) + if (reg_cache[i] == wm8962_reg[i]) continue; - snd_soc_write(codec, i, wm8962->reg_cache[i]); + snd_soc_write(codec, i, reg_cache[i]); } codec->cache_sync = 0; @@ -3406,12 +3401,11 @@ EXPORT_SYMBOL_GPL(wm8962_mic_detect); #ifdef CONFIG_PM static int wm8962_resume(struct snd_soc_codec *codec) { - struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec); u16 *reg_cache = codec->reg_cache; int i; /* Restore the registers */ - for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) { + for (i = 1; i < codec->driver->reg_cache_size; i++) { switch (i) { case WM8962_SOFTWARE_RESET: continue; @@ -3705,6 +3699,7 @@ static int wm8962_probe(struct snd_soc_codec *codec) struct wm8962_pdata *pdata = dev_get_platdata(codec->dev); struct i2c_client *i2c = container_of(codec->dev, struct i2c_client, dev); + u16 *reg_cache = codec->reg_cache; int i, trigger, irq_pol; wm8962->codec = codec; @@ -3804,7 +3799,7 @@ static int wm8962_probe(struct snd_soc_codec *codec) /* Put the speakers into mono mode? */ if (pdata->spk_mono) - wm8962->reg_cache[WM8962_CLASS_D_CONTROL_2] + reg_cache[WM8962_CLASS_D_CONTROL_2] |= WM8962_SPK_MONO; /* Micbias setup, detection enable and detection @@ -3819,16 +3814,16 @@ static int wm8962_probe(struct snd_soc_codec *codec) } /* Latch volume update bits */ - wm8962->reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU; - wm8962->reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU; - wm8962->reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU; - wm8962->reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU; - wm8962->reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU; - wm8962->reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU; - wm8962->reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU; - wm8962->reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU; - wm8962->reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU; - wm8962->reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU; + reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU; + reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU; + reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU; + reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU; + reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU; + reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU; + reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU; + reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU; + reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU; + reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU; wm8962_add_widgets(codec); diff --git a/sound/soc/codecs/wm8971.c b/sound/soc/codecs/wm8971.c index 63f6dbf5d070..9f18db6e167c 100644 --- a/sound/soc/codecs/wm8971.c +++ b/sound/soc/codecs/wm8971.c @@ -718,6 +718,7 @@ static __devinit int wm8971_i2c_probe(struct i2c_client *i2c, if (wm8971 == NULL) return -ENOMEM; + wm8971->control_type = SND_SOC_I2C; i2c_set_clientdata(i2c, wm8971); ret = snd_soc_register_codec(&i2c->dev, diff --git a/sound/soc/codecs/wm9081.c b/sound/soc/codecs/wm9081.c index ecc7c37180c7..a486670966bd 100644 --- a/sound/soc/codecs/wm9081.c +++ b/sound/soc/codecs/wm9081.c @@ -1335,6 +1335,7 @@ static __devinit int wm9081_i2c_probe(struct i2c_client *i2c, return -ENOMEM; i2c_set_clientdata(i2c, wm9081); + wm9081->control_type = SND_SOC_I2C; wm9081->control_data = i2c; ret = snd_soc_register_codec(&i2c->dev, diff --git a/sound/soc/codecs/wm9090.c b/sound/soc/codecs/wm9090.c index 99c046ba46bb..6e5f64f627cb 100644 --- a/sound/soc/codecs/wm9090.c +++ b/sound/soc/codecs/wm9090.c @@ -141,7 +141,6 @@ static const u16 wm9090_reg_defaults[] = { /* This struct is used to save the context */ struct wm9090_priv { struct mutex mutex; - u16 reg_cache[WM9090_MAX_REGISTER + 1]; struct wm9090_platform_data pdata; void *control_data; }; @@ -552,6 +551,7 @@ static int wm9090_set_bias_level(struct snd_soc_codec *codec, static int wm9090_probe(struct snd_soc_codec *codec) { struct wm9090_priv *wm9090 = snd_soc_codec_get_drvdata(codec); + u16 *reg_cache = codec->reg_cache; int ret; codec->control_data = wm9090->control_data; @@ -576,22 +576,22 @@ static int wm9090_probe(struct snd_soc_codec *codec) /* Configure some defaults; they will be written out when we * bring the bias up. */ - wm9090->reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU + reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU | WM9090_IN1A_ZC; - wm9090->reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU + reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU | WM9090_IN1B_ZC; - wm9090->reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU + reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU | WM9090_IN2A_ZC; - wm9090->reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU + reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU | WM9090_IN2B_ZC; - wm9090->reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |= + reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |= WM9090_SPKOUT_VU | WM9090_SPKOUTL_ZC; - wm9090->reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |= + reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |= WM9090_HPOUT1_VU | WM9090_HPOUT1L_ZC; - wm9090->reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |= + reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |= WM9090_HPOUT1_VU | WM9090_HPOUT1R_ZC; - wm9090->reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA; + reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA; wm9090_set_bias_level(codec, SND_SOC_BIAS_STANDBY); diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index b2c63309a651..6f5a498608b2 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -24,12 +24,47 @@ OPTIONS --input=:: Input file name. (default: perf.data) +-d:: +--dsos=<dso[,dso...]>:: + Only consider symbols in these dsos. +-s:: +--symbol=<symbol>:: + Symbol to annotate. + +-f:: +--force:: + Don't complain, do it. + +-v:: +--verbose:: + Be more verbose. (Show symbol address, etc) + +-D:: +--dump-raw-trace:: + Dump raw trace in ASCII. + +-k:: +--vmlinux=<file>:: + vmlinux pathname. + +-m:: +--modules:: + Load module symbols. WARNING: use only with -k and LIVE kernel. + +-l:: +--print-line:: + Print matching source lines (may be slow). + +-P:: +--full-paths:: + Don't shorten the displayed pathnames. + --stdio:: Use the stdio interface. --tui:: Use the TUI interface Use of --tui requires a tty, if one is not present, as when piping to other commands, the stdio interface is used. This interfaces starts by centering on the line with more - samples, TAB/UNTAB cycles thru the lines with more samples. + samples, TAB/UNTAB cycles through the lines with more samples. SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt index 01b642c0bf8f..5eaac6f26d51 100644 --- a/tools/perf/Documentation/perf-buildid-list.txt +++ b/tools/perf/Documentation/perf-buildid-list.txt @@ -18,6 +18,9 @@ perf report. OPTIONS ------- +-H:: +--with-hits:: + Show only DSOs with hits. -i:: --input=:: Input file name. (default: perf.data) diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index 20d97d84ea1c..74d7481ed7a6 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -19,6 +19,18 @@ If no parameters are passed it will assume perf.data.old and perf.data. OPTIONS ------- +-M:: +--displacement:: + Show position displacement relative to baseline. + +-D:: +--dump-raw-trace:: + Dump raw trace in ASCII. + +-m:: +--modules:: + Load module symbols. WARNING: use only with -k and LIVE kernel + -d:: --dsos=:: Only consider symbols in these dsos. CSV that understands @@ -42,7 +54,7 @@ OPTIONS --field-separator=:: Use a special separator character and don't pad with spaces, replacing - all occurances of this separator in symbol names (and other output) + all occurrences of this separator in symbol names (and other output) with a '.' character, that thus it's the only non valid separator. -v:: @@ -50,6 +62,13 @@ OPTIONS Be verbose, for instance, show the raw counts in addition to the diff. +-f:: +--force:: + Don't complain, do it. + +--symfs=<directory>:: + Look for files with symbols relative to this directory. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt index d004e19fe6d6..dd84cb2f0a88 100644 --- a/tools/perf/Documentation/perf-kvm.txt +++ b/tools/perf/Documentation/perf-kvm.txt @@ -22,7 +22,7 @@ There are a couple of variants of perf kvm: a performance counter profile of guest os in realtime of an arbitrary workload. - 'perf kvm record <command>' to record the performance couinter profile + 'perf kvm record <command>' to record the performance counter profile of an arbitrary workload and save it into a perf data file. If both --host and --guest are input, the perf data file name is perf.data.kvm. If there is no --host but --guest, the file name is perf.data.guest. @@ -40,6 +40,12 @@ There are a couple of variants of perf kvm: OPTIONS ------- +-i:: +--input=:: + Input file name. +-o:: +--output:: + Output file name. --host=:: Collect host side performance profile. --guest=:: diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index b317102138c8..921de259ea10 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -24,6 +24,21 @@ and statistics with this 'perf lock' command. 'perf lock report' reports statistical data. +OPTIONS +------- + +-i:: +--input=<file>:: + Input file name. + +-v:: +--verbose:: + Be more verbose (show symbol address, etc). + +-D:: +--dump-raw-trace:: + Dump raw trace in ASCII. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 62de1b7f4e76..86b797a35aa6 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -115,9 +115,9 @@ Each probe argument follows below syntax. LINE SYNTAX ----------- -Line range is descripted by following syntax. +Line range is described by following syntax. - "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]" + "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]" FUNC specifies the function name of showing lines. 'RLN' is the start line number from function entry line, and 'RLN2' is the end line number. As same as diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index a91f9f9e6e5c..52462ae26455 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -39,15 +39,24 @@ OPTIONS be passed as follows: '\mem:addr[:[r][w][x]]'. If you want to profile read-write accesses in 0x1000, just set 'mem:0x1000:rw'. + +--filter=<filter>:: + Event filter. + -a:: - System-wide collection. +--all-cpus:: + System-wide collection from all CPUs. -l:: Scale counter values. -p:: --pid=:: - Record events on existing pid. + Record events on existing process ID. + +-t:: +--tid=:: + Record events on existing thread ID. -r:: --realtime=:: @@ -99,6 +108,11 @@ OPTIONS --data:: Sample addresses. +-T:: +--timestamp:: + Sample timestamps. Use it with 'perf report -D' to see the timestamps, + for instance. + -n:: --no-samples:: Don't sample. @@ -109,8 +123,8 @@ Collect raw sample records from all opened counters (default for tracepoint coun -C:: --cpu:: -Collect samples only on the list of cpus provided. Multiple CPUs can be provided as a -comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. +Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a +comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. In per-thread mode with inheritance mode on (default), samples are captured only when the thread executes on the designated CPUs. Default is to monitor all CPUs. diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 12052c9ed0ba..8ba03d6e5398 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -20,6 +20,11 @@ OPTIONS -i:: --input=:: Input file name. (default: perf.data) + +-v:: +--verbose:: + Be more verbose. (show symbol address, etc) + -d:: --dsos=:: Only consider symbols in these dsos. CSV that understands @@ -27,6 +32,10 @@ OPTIONS -n:: --show-nr-samples:: Show the number of samples for each symbol + +--showcpuutilization:: + Show sample percentage for different cpu modes. + -T:: --threads:: Show per-thread event counters @@ -39,12 +48,24 @@ OPTIONS Only consider these symbols. CSV that understands file://filename entries. +-U:: +--hide-unresolved:: + Only display entries resolved to a symbol. + -s:: --sort=:: Sort by key(s): pid, comm, dso, symbol, parent. +-p:: +--parent=<regex>:: + regex filter to identify parent, see: '--sort parent' + +-x:: +--exclude-other:: + Only display entries with parent-match. + -w:: ---field-width=:: +--column-widths=<width[,width...]>:: Force each column width to the provided list, for large terminal readability. @@ -52,19 +73,26 @@ OPTIONS --field-separator=:: Use a special separator character and don't pad with spaces, replacing - all occurances of this separator in symbol names (and other output) + all occurrences of this separator in symbol names (and other output) with a '.' character, that thus it's the only non valid separator. +-D:: +--dump-raw-trace:: + Dump raw trace in ASCII. + -g [type,min]:: --call-graph:: - Display callchains using type and min percent threshold. + Display call chains using type and min percent threshold. type can be either: - - flat: single column, linear exposure of callchains. + - flat: single column, linear exposure of call chains. - graph: use a graph tree, displaying absolute overhead rates. - fractal: like graph, but displays relative rates. Each branch of the tree is considered as a new profiled object. + Default: fractal,0.5. +--pretty=<key>:: + Pretty printing style. key: normal, raw + --stdio:: Use the stdio interface. --tui:: Use the TUI interface, that is integrated with annotate and allows @@ -72,6 +100,25 @@ OPTIONS requires a tty, if one is not present, as when piping to other commands, the stdio interface is used. +-k:: +--vmlinux=<file>:: + vmlinux pathname + +--kallsyms=<file>:: + kallsyms pathname + +-m:: +--modules:: + Load module symbols. WARNING: This should only be used with -k and + a LIVE kernel. + +-f:: +--force:: + Don't complain, do it. + +--symfs=<directory>:: + Look for files with symbols relative to this directory. + SEE ALSO -------- linkperf:perf-stat[1] diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 8417644a6166..46822d5fde1c 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies) SYNOPSIS -------- [verse] -'perf sched' {record|latency|replay|trace} +'perf sched' {record|latency|map|replay|trace} DESCRIPTION ----------- -There are four variants of perf sched: +There are five variants of perf sched: 'perf sched record <command>' to record the scheduling events of an arbitrary workload. @@ -30,8 +30,22 @@ There are four variants of perf sched: of the workload as it occurred when it was recorded - and can repeat it a number of times, measuring its performance.) + 'perf sched map' to print a textual context-switching outline of + workload captured via perf sched record. Columns stand for + individual CPUs, and the two-letter shortcuts stand for tasks that + are running on a CPU. A '*' denotes the CPU that had the event, and + a dot signals an idle CPU. + OPTIONS ------- +-i:: +--input=<file>:: + Input file name. (default: perf.data) + +-v:: +--verbose:: + Be more verbose. (show symbol address, etc) + -D:: --dump-raw-trace=:: Display verbose dump of the sched data. diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index ee6525ee6d69..5bb41e55a3ac 100644 --- a/tools/perf/Documentation/perf-trace-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -1,19 +1,19 @@ -perf-trace-perl(1) +perf-script-perl(1) ================== NAME ---- -perf-trace-perl - Process trace data with a Perl script +perf-script-perl - Process trace data with a Perl script SYNOPSIS -------- [verse] -'perf trace' [-s [Perl]:script[.pl] ] +'perf script' [-s [Perl]:script[.pl] ] DESCRIPTION ----------- -This perf trace option is used to process perf trace data using perf's +This perf script option is used to process perf script data using perf's built-in Perl interpreter. It reads and processes the input file and displays the results of the trace analysis implemented in the given Perl script, if any. @@ -21,7 +21,7 @@ Perl script, if any. STARTER SCRIPTS --------------- -You can avoid reading the rest of this document by running 'perf trace +You can avoid reading the rest of this document by running 'perf script -g perl' in the same directory as an existing perf.data trace file. That will generate a starter script containing a handler for each of the event types in the trace file; it simply prints every available @@ -30,13 +30,13 @@ field for each event in the trace file. You can also look at the existing scripts in ~/libexec/perf-core/scripts/perl for typical examples showing how to do basic things like aggregate event data, print results, etc. Also, -the check-perf-trace.pl script, while not interesting for its results, +the check-perf-script.pl script, while not interesting for its results, attempts to exercise all of the main scripting features. EVENT HANDLERS -------------- -When perf trace is invoked using a trace script, a user-defined +When perf script is invoked using a trace script, a user-defined 'handler function' is called for each event in the trace. If there's no handler function defined for a given event type, the event is ignored (or passed to a 'trace_handled' function, see below) and the @@ -112,13 +112,13 @@ write a useful trace script. The sections below cover the rest. SCRIPT LAYOUT ------------- -Every perf trace Perl script should start by setting up a Perl module +Every perf script Perl script should start by setting up a Perl module search path and 'use'ing a few support modules (see module descriptions below): ---- - use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib"; - use lib "./Perf-Trace-Util/lib"; + use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib"; + use lib "./perf-script-Util/lib"; use Perf::Trace::Core; use Perf::Trace::Context; use Perf::Trace::Util; @@ -162,7 +162,7 @@ sub trace_unhandled ---- The remaining sections provide descriptions of each of the available -built-in perf trace Perl modules and their associated functions. +built-in perf script Perl modules and their associated functions. AVAILABLE MODULES AND FUNCTIONS ------------------------------- @@ -170,7 +170,7 @@ AVAILABLE MODULES AND FUNCTIONS The following sections describe the functions and variables available via the various Perf::Trace::* Perl modules. To use the functions and variables from the given module, add the corresponding 'use -Perf::Trace::XXX' line to your perf trace script. +Perf::Trace::XXX' line to your perf script script. Perf::Trace::Core Module ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -204,7 +204,7 @@ argument. Perf::Trace::Util Module ~~~~~~~~~~~~~~~~~~~~~~~~ -Various utility functions for use with perf trace: +Various utility functions for use with perf script: nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair nsecs_secs($nsecs) - returns whole secs portion given nsecs @@ -214,4 +214,4 @@ Various utility functions for use with perf trace: SEE ALSO -------- -linkperf:perf-trace[1] +linkperf:perf-script[1] diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-script-python.txt index 693be804dd3d..36b38277422c 100644 --- a/tools/perf/Documentation/perf-trace-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -1,19 +1,19 @@ -perf-trace-python(1) +perf-script-python(1) ==================== NAME ---- -perf-trace-python - Process trace data with a Python script +perf-script-python - Process trace data with a Python script SYNOPSIS -------- [verse] -'perf trace' [-s [Python]:script[.py] ] +'perf script' [-s [Python]:script[.py] ] DESCRIPTION ----------- -This perf trace option is used to process perf trace data using perf's +This perf script option is used to process perf script data using perf's built-in Python interpreter. It reads and processes the input file and displays the results of the trace analysis implemented in the given Python script, if any. @@ -23,15 +23,15 @@ A QUICK EXAMPLE This section shows the process, start to finish, of creating a working Python script that aggregates and extracts useful information from a -raw perf trace stream. You can avoid reading the rest of this +raw perf script stream. You can avoid reading the rest of this document if an example is enough for you; the rest of the document provides more details on each step and lists the library functions available to script writers. This example actually details the steps that were used to create the -'syscall-counts' script you see when you list the available perf trace -scripts via 'perf trace -l'. As such, this script also shows how to -integrate your script into the list of general-purpose 'perf trace' +'syscall-counts' script you see when you list the available perf script +scripts via 'perf script -l'. As such, this script also shows how to +integrate your script into the list of general-purpose 'perf script' scripts listed by that command. The syscall-counts script is a simple script, but demonstrates all the @@ -105,31 +105,31 @@ That single stream will be recorded in a file in the current directory called perf.data. Once we have a perf.data file containing our data, we can use the -g -'perf trace' option to generate a Python script that will contain a +'perf script' option to generate a Python script that will contain a callback handler for each event type found in the perf.data trace stream (for more details, see the STARTER SCRIPTS section). ---- -# perf trace -g python -generated Python script: perf-trace.py +# perf script -g python +generated Python script: perf-script.py The output file created also in the current directory is named -perf-trace.py. Here's the file in its entirety: +perf-script.py. Here's the file in its entirety: -# perf trace event handlers, generated by perf trace -g python +# perf script event handlers, generated by perf script -g python # Licensed under the terms of the GNU GPL License version 2 # The common_* event handler fields are the most useful fields common to # all events. They don't necessarily correspond to the 'common_*' fields # in the format files. Those fields not available as handler params can # be retrieved using Python functions of the form common_*(context). -# See the perf-trace-python Documentation for the list of available functions. +# See the perf-script-python Documentation for the list of available functions. import os import sys sys.path.append(os.environ['PERF_EXEC_PATH'] + \ - '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + '/scripts/python/perf-script-Util/lib/Perf/Trace') from perf_trace_context import * from Core import * @@ -160,7 +160,7 @@ def print_header(event_name, cpu, secs, nsecs, pid, comm): ---- At the top is a comment block followed by some import statements and a -path append which every perf trace script should include. +path append which every perf script script should include. Following that are a couple generated functions, trace_begin() and trace_end(), which are called at the beginning and the end of the @@ -189,8 +189,8 @@ simply a utility function used for that purpose. Let's rename the script and run it to see the default output: ---- -# mv perf-trace.py syscall-counts.py -# perf trace -s syscall-counts.py +# mv perf-script.py syscall-counts.py +# perf script -s syscall-counts.py raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args= raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args= @@ -216,7 +216,7 @@ import os import sys sys.path.append(os.environ['PERF_EXEC_PATH'] + \ - '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + '/scripts/python/perf-script-Util/lib/Perf/Trace') from perf_trace_context import * from Core import * @@ -279,7 +279,7 @@ import os import sys sys.path.append(os.environ['PERF_EXEC_PATH'] + \ - '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + '/scripts/python/perf-script-Util/lib/Perf/Trace') from perf_trace_context import * from Core import * @@ -315,7 +315,7 @@ def print_syscall_totals(): The script can be run just as before: - # perf trace -s syscall-counts.py + # perf script -s syscall-counts.py So those are the essential steps in writing and running a script. The process can be generalized to any tracepoint or set of tracepoints @@ -324,17 +324,17 @@ interested in by looking at the list of available events shown by 'perf list' and/or look in /sys/kernel/debug/tracing events for detailed event and field info, record the corresponding trace data using 'perf record', passing it the list of interesting events, -generate a skeleton script using 'perf trace -g python' and modify the +generate a skeleton script using 'perf script -g python' and modify the code to aggregate and display it for your particular needs. After you've done that you may end up with a general-purpose script that you want to keep around and have available for future use. By writing a couple of very simple shell scripts and putting them in the right place, you can have your script listed alongside the other -scripts listed by the 'perf trace -l' command e.g.: +scripts listed by the 'perf script -l' command e.g.: ---- -root@tropicana:~# perf trace -l +root@tropicana:~# perf script -l List of available trace scripts: workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency @@ -365,14 +365,14 @@ perf record -a -e raw_syscalls:sys_enter The 'report' script is also a shell script with the same base name as your script, but with -report appended. It should also be located in the perf/scripts/python/bin directory. In that script, you write the -'perf trace -s' command-line needed for running your script: +'perf script -s' command-line needed for running your script: ---- # cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report #!/bin/bash # description: system-wide syscall counts -perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py +perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py ---- Note that the location of the Python script given in the shell script @@ -390,17 +390,17 @@ total 32 drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 . drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 .. drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin --rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py -drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util +-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py +drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util -rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py ---- Once you've done that (don't forget to do a new 'make install', -otherwise your script won't show up at run-time), 'perf trace -l' +otherwise your script won't show up at run-time), 'perf script -l' should show a new entry for your script: ---- -root@tropicana:~# perf trace -l +root@tropicana:~# perf script -l List of available trace scripts: workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency @@ -409,19 +409,19 @@ List of available trace scripts: syscall-counts system-wide syscall counts ---- -You can now perform the record step via 'perf trace record': +You can now perform the record step via 'perf script record': - # perf trace record syscall-counts + # perf script record syscall-counts -and display the output using 'perf trace report': +and display the output using 'perf script report': - # perf trace report syscall-counts + # perf script report syscall-counts STARTER SCRIPTS --------------- You can quickly get started writing a script for a particular set of -trace data by generating a skeleton script using 'perf trace -g +trace data by generating a skeleton script using 'perf script -g python' in the same directory as an existing perf.data trace file. That will generate a starter script containing a handler for each of the event types in the trace file; it simply prints every available @@ -430,13 +430,13 @@ field for each event in the trace file. You can also look at the existing scripts in ~/libexec/perf-core/scripts/python for typical examples showing how to do basic things like aggregate event data, print results, etc. Also, -the check-perf-trace.py script, while not interesting for its results, +the check-perf-script.py script, while not interesting for its results, attempts to exercise all of the main scripting features. EVENT HANDLERS -------------- -When perf trace is invoked using a trace script, a user-defined +When perf script is invoked using a trace script, a user-defined 'handler function' is called for each event in the trace. If there's no handler function defined for a given event type, the event is ignored (or passed to a 'trace_handled' function, see below) and the @@ -510,7 +510,7 @@ write a useful trace script. The sections below cover the rest. SCRIPT LAYOUT ------------- -Every perf trace Python script should start by setting up a Python +Every perf script Python script should start by setting up a Python module search path and 'import'ing a few support modules (see module descriptions below): @@ -519,7 +519,7 @@ descriptions below): import sys sys.path.append(os.environ['PERF_EXEC_PATH'] + \ - '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + '/scripts/python/perf-script-Util/lib/Perf/Trace') from perf_trace_context import * from Core import * @@ -559,15 +559,15 @@ def trace_unhandled(event_name, context, common_cpu, common_secs, ---- The remaining sections provide descriptions of each of the available -built-in perf trace Python modules and their associated functions. +built-in perf script Python modules and their associated functions. AVAILABLE MODULES AND FUNCTIONS ------------------------------- The following sections describe the functions and variables available -via the various perf trace Python modules. To use the functions and +via the various perf script Python modules. To use the functions and variables from the given module, add the corresponding 'from XXXX -import' line to your perf trace script. +import' line to your perf script script. Core.py Module ~~~~~~~~~~~~~~ @@ -610,7 +610,7 @@ argument. Util.py Module ~~~~~~~~~~~~~~ -Various utility functions for use with perf trace: +Various utility functions for use with perf script: nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair nsecs_secs(nsecs) - returns whole secs portion given nsecs @@ -620,4 +620,4 @@ Various utility functions for use with perf trace: SEE ALSO -------- -linkperf:perf-trace[1] +linkperf:perf-script[1] diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-script.txt index 26aff6bf9e50..29ad94293cd2 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -1,71 +1,71 @@ -perf-trace(1) +perf-script(1) ============= NAME ---- -perf-trace - Read perf.data (created by perf record) and display trace output +perf-script - Read perf.data (created by perf record) and display trace output SYNOPSIS -------- [verse] -'perf trace' [<options>] -'perf trace' [<options>] record <script> [<record-options>] <command> -'perf trace' [<options>] report <script> [script-args] -'perf trace' [<options>] <script> <required-script-args> [<record-options>] <command> -'perf trace' [<options>] <top-script> [script-args] +'perf script' [<options>] +'perf script' [<options>] record <script> [<record-options>] <command> +'perf script' [<options>] report <script> [script-args] +'perf script' [<options>] <script> <required-script-args> [<record-options>] <command> +'perf script' [<options>] <top-script> [script-args] DESCRIPTION ----------- This command reads the input file and displays the trace recorded. -There are several variants of perf trace: +There are several variants of perf script: - 'perf trace' to see a detailed trace of the workload that was + 'perf script' to see a detailed trace of the workload that was recorded. You can also run a set of pre-canned scripts that aggregate and summarize the raw trace data in various ways (the list of scripts is - available via 'perf trace -l'). The following variants allow you to + available via 'perf script -l'). The following variants allow you to record and run those scripts: - 'perf trace record <script> <command>' to record the events required - for 'perf trace report'. <script> is the name displayed in the - output of 'perf trace --list' i.e. the actual script name minus any + 'perf script record <script> <command>' to record the events required + for 'perf script report'. <script> is the name displayed in the + output of 'perf script --list' i.e. the actual script name minus any language extension. If <command> is not specified, the events are recorded using the -a (system-wide) 'perf record' option. - 'perf trace report <script> [args]' to run and display the results + 'perf script report <script> [args]' to run and display the results of <script>. <script> is the name displayed in the output of 'perf trace --list' i.e. the actual script name minus any language - extension. The perf.data output from a previous run of 'perf trace + extension. The perf.data output from a previous run of 'perf script record <script>' is used and should be present for this command to succeed. [args] refers to the (mainly optional) args expected by the script. - 'perf trace <script> <required-script-args> <command>' to both + 'perf script <script> <required-script-args> <command>' to both record the events required for <script> and to run the <script> using 'live-mode' i.e. without writing anything to disk. <script> - is the name displayed in the output of 'perf trace --list' i.e. the + is the name displayed in the output of 'perf script --list' i.e. the actual script name minus any language extension. If <command> is not specified, the events are recorded using the -a (system-wide) 'perf record' option. If <script> has any required args, they should be specified before <command>. This mode doesn't allow for optional script args to be specified; if optional script args are - desired, they can be specified using separate 'perf trace record' - and 'perf trace report' commands, with the stdout of the record step + desired, they can be specified using separate 'perf script record' + and 'perf script report' commands, with the stdout of the record step piped to the stdin of the report script, using the '-o -' and '-i -' options of the corresponding commands. - 'perf trace <top-script>' to both record the events required for + 'perf script <top-script>' to both record the events required for <top-script> and to run the <top-script> using 'live-mode' i.e. without writing anything to disk. <top-script> is the name - displayed in the output of 'perf trace --list' i.e. the actual + displayed in the output of 'perf script --list' i.e. the actual script name minus any language extension; a <top-script> is defined as any script name ending with the string 'top'. - [<record-options>] can be passed to the record steps of 'perf trace + [<record-options>] can be passed to the record steps of 'perf script record' and 'live-mode' variants; this isn't possible however for - <top-script> 'live-mode' or 'perf trace report' variants. + <top-script> 'live-mode' or 'perf script report' variants. See the 'SEE ALSO' section for links to language-specific information on how to write and run your own trace scripts. @@ -76,7 +76,7 @@ OPTIONS Any command you can specify in a shell. -D:: ---dump-raw-trace=:: +--dump-raw-script=:: Display verbose dump of the trace data. -L:: @@ -95,7 +95,7 @@ OPTIONS -g:: --gen-script=:: - Generate perf-trace.[ext] starter script for given language, + Generate perf-script.[ext] starter script for given language, using current perf.data. -a:: @@ -104,8 +104,15 @@ OPTIONS normally don't - this option allows the latter to be run in system-wide mode. +-i:: +--input=:: + Input file name. + +-d:: +--debug-mode:: + Do various checks like samples ordering and lost events. SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-trace-perl[1], -linkperf:perf-trace-python[1] +linkperf:perf-record[1], linkperf:perf-script-perl[1], +linkperf:perf-script-python[1] diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4b3a2d46b437..b6da7affbbee 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -8,8 +8,8 @@ perf-stat - Run a command and gather performance counter statistics SYNOPSIS -------- [verse] -'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command> -'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>] +'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command> +'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>] DESCRIPTION ----------- @@ -35,24 +35,54 @@ OPTIONS child tasks do not inherit counters -p:: --pid=<pid>:: - stat events on existing pid + stat events on existing process id + +-t:: +--tid=<tid>:: + stat events on existing thread id + -a:: - system-wide collection +--all-cpus:: + system-wide collection from all CPUs -c:: - scale counter values +--scale:: + scale/normalize counter values + +-r:: +--repeat=<n>:: + repeat command and print average + stddev (max: 100) -B:: +--big-num:: print large numbers with thousands' separators according to locale -C:: --cpu=:: -Count only on the list of cpus provided. Multiple CPUs can be provided as a -comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. +Count only on the list of CPUs provided. Multiple CPUs can be provided as a +comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. In per-thread mode, this option is ignored. The -a option is still necessary to activate system-wide monitoring. Default is to count on all CPUs. +-A:: +--no-aggr:: +Do not aggregate counts across all monitored CPUs in system-wide mode (-a). +This option is only valid in system-wide mode. + +-n:: +--null:: + null run - don't start any counters + +-v:: +--verbose:: + be more verbose (show counter open errors, etc) + +-x SEP:: +--field-separator SEP:: +print counts using a CSV-style output to make it easy to import directly into +spreadsheets. Columns are separated by the string specified in SEP. + EXAMPLES -------- diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index 1c4b5f5b7f71..2c3b462f64b0 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -12,7 +12,7 @@ SYNOPSIS DESCRIPTION ----------- -This command does assorted sanity tests, initially thru linked routines but +This command does assorted sanity tests, initially through linked routines but also will look for a directory with more tests in the form of scripts. OPTIONS diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt index 4b1788355eca..d7b79e2ba2ad 100644 --- a/tools/perf/Documentation/perf-timechart.txt +++ b/tools/perf/Documentation/perf-timechart.txt @@ -38,6 +38,8 @@ OPTIONS --process:: Select the processes to display, by name or PID +--symfs=<directory>:: + Look for files with symbols relative to this directory. SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 1f9687663f2a..f6eb1cdafb77 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -12,7 +12,7 @@ SYNOPSIS DESCRIPTION ----------- -This command generates and displays a performance counter profile in realtime. +This command generates and displays a performance counter profile in real time. OPTIONS @@ -27,8 +27,8 @@ OPTIONS -C <cpu-list>:: --cpu=<cpu>:: -Monitor only on the list of cpus provided. Multiple CPUs can be provided as a -comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. +Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a +comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to monitor all CPUS. -d <seconds>:: @@ -50,6 +50,10 @@ Default is to monitor all CPUS. --count-filter=<count>:: Only display functions with more events than this. +-g:: +--group:: + Put the counters into a counter group. + -F <freq>:: --freq=<freq>:: Profile at this frequency. @@ -68,7 +72,11 @@ Default is to monitor all CPUS. -p <pid>:: --pid=<pid>:: - Profile events on existing pid. + Profile events on existing Process ID. + +-t <tid>:: +--tid=<tid>:: + Profile events on existing thread ID. -r <priority>:: --realtime=<priority>:: @@ -78,6 +86,18 @@ Default is to monitor all CPUS. --sym-annotate=<symbol>:: Annotate this symbol. +-K:: +--hide_kernel_symbols:: + Hide kernel symbols. + +-U:: +--hide_user_symbols:: + Hide user symbols. + +-D:: +--dump-symtab:: + Dump the symbol table used for profiling. + -v:: --verbose:: Be more verbose (show counter open errors, etc). diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 8c7fc0c8f0b8..c12659d8cb26 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -7,6 +7,7 @@ include/linux/stringify.h lib/rbtree.c include/linux/swab.h arch/*/include/asm/unistd*.h +arch/*/lib/memcpy*.S include/linux/poison.h include/linux/magic.h include/linux/hw_breakpoint.h diff --git a/tools/perf/Makefile b/tools/perf/Makefile index d1db0f676a4b..1b9b13ee2a72 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -185,7 +185,10 @@ ifeq ($(ARCH),i386) ARCH := x86 endif ifeq ($(ARCH),x86_64) + RAW_ARCH := x86_64 ARCH := x86 + ARCH_CFLAGS := -DARCH_X86_64 + ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S endif # CFLAGS and LDFLAGS are for the users to override from the command line. @@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h LIB_H += util/include/linux/rbtree.h LIB_H += util/include/linux/string.h LIB_H += util/include/linux/types.h +LIB_H += util/include/linux/linkage.h LIB_H += util/include/asm/asm-offsets.h LIB_H += util/include/asm/bug.h LIB_H += util/include/asm/byteorder.h @@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h LIB_H += util/include/asm/system.h LIB_H += util/include/asm/uaccess.h LIB_H += util/include/dwarf-regs.h +LIB_H += util/include/asm/dwarf2.h +LIB_H += util/include/asm/cpufeature.h LIB_H += perf.h LIB_H += util/cache.h LIB_H += util/callchain.h @@ -390,6 +396,7 @@ LIB_H += util/build-id.h LIB_H += util/debug.h LIB_H += util/debugfs.h LIB_H += util/event.h +LIB_H += util/evsel.h LIB_H += util/exec_cmd.h LIB_H += util/types.h LIB_H += util/levenshtein.h @@ -398,6 +405,7 @@ LIB_H += util/parse-options.h LIB_H += util/parse-events.h LIB_H += util/quote.h LIB_H += util/util.h +LIB_H += util/xyarray.h LIB_H += util/header.h LIB_H += util/help.h LIB_H += util/session.h @@ -417,6 +425,7 @@ LIB_H += util/probe-finder.h LIB_H += util/probe-event.h LIB_H += util/pstack.h LIB_H += util/cpumap.h +LIB_H += $(ARCH_INCLUDE) LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o @@ -426,6 +435,7 @@ LIB_OBJS += $(OUTPUT)util/ctype.o LIB_OBJS += $(OUTPUT)util/debugfs.o LIB_OBJS += $(OUTPUT)util/environment.o LIB_OBJS += $(OUTPUT)util/event.o +LIB_OBJS += $(OUTPUT)util/evsel.o LIB_OBJS += $(OUTPUT)util/exec_cmd.o LIB_OBJS += $(OUTPUT)util/help.o LIB_OBJS += $(OUTPUT)util/levenshtein.o @@ -463,6 +473,7 @@ LIB_OBJS += $(OUTPUT)util/sort.o LIB_OBJS += $(OUTPUT)util/hist.o LIB_OBJS += $(OUTPUT)util/probe-event.o LIB_OBJS += $(OUTPUT)util/util.o +LIB_OBJS += $(OUTPUT)util/xyarray.o LIB_OBJS += $(OUTPUT)util/cpumap.o BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o @@ -472,6 +483,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o # Benchmark modules BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o +ifeq ($(RAW_ARCH),x86_64) +BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o +endif BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o BUILTIN_OBJS += $(OUTPUT)builtin-diff.o @@ -485,7 +499,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-report.o BUILTIN_OBJS += $(OUTPUT)builtin-stat.o BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o BUILTIN_OBJS += $(OUTPUT)builtin-top.o -BUILTIN_OBJS += $(OUTPUT)builtin-trace.o +BUILTIN_OBJS += $(OUTPUT)builtin-script.o BUILTIN_OBJS += $(OUTPUT)builtin-probe.o BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o BUILTIN_OBJS += $(OUTPUT)builtin-lock.o @@ -507,7 +521,7 @@ PERFLIBS = $(LIB_FILE) -include config.mak ifndef NO_DWARF -FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS) +FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS) ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y) msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); NO_DWARF := 1 @@ -554,7 +568,7 @@ ifndef NO_DWARF ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); else - BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT + BASIC_CFLAGS += -DDWARF_SUPPORT EXTLIBS += -lelf -ldw LIB_OBJS += $(OUTPUT)util/probe-finder.o endif # PERF_HAVE_DWARF_REGS @@ -891,13 +905,14 @@ prefix_SQ = $(subst ','\'',$(prefix)) SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) -LIBS = $(PERFLIBS) $(EXTLIBS) +LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS) BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ $(COMPAT_CFLAGS) LIB_OBJS += $(COMPAT_OBJS) ALL_CFLAGS += $(BASIC_CFLAGS) +ALL_CFLAGS += $(ARCH_CFLAGS) ALL_LDFLAGS += $(BASIC_LDFLAGS) export TAR INSTALL DESTDIR SHELL_PATH diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h new file mode 100644 index 000000000000..a72e36cb5394 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -0,0 +1,12 @@ + +#ifdef ARCH_X86_64 + +#define MEMCPY_FN(fn, name, desc) \ + extern void *fn(void *, const void *, size_t); + +#include "mem-memcpy-x86-64-asm-def.h" + +#undef MEMCPY_FN + +#endif + diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h new file mode 100644 index 000000000000..d588b87696fc --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -0,0 +1,4 @@ + +MEMCPY_FN(__memcpy, + "x86-64-unrolled", + "unrolled memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S new file mode 100644 index 000000000000..a57b66e853c2 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -0,0 +1,2 @@ + +#include "../../../arch/x86/lib/memcpy_64.S" diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 38dae7465142..db82021f4b91 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -12,6 +12,7 @@ #include "../util/parse-options.h" #include "../util/header.h" #include "bench.h" +#include "mem-memcpy-arch.h" #include <stdio.h> #include <stdlib.h> @@ -23,8 +24,10 @@ static const char *length_str = "1MB"; static const char *routine = "default"; -static bool use_clock = false; +static bool use_clock; static int clock_fd; +static bool only_prefault; +static bool no_prefault; static const struct option options[] = { OPT_STRING('l', "length", &length_str, "1MB", @@ -34,19 +37,33 @@ static const struct option options[] = { "Specify routine to copy"), OPT_BOOLEAN('c', "clock", &use_clock, "Use CPU clock for measuring"), + OPT_BOOLEAN('o', "only-prefault", &only_prefault, + "Show only the result with page faults before memcpy()"), + OPT_BOOLEAN('n', "no-prefault", &no_prefault, + "Show only the result without page faults before memcpy()"), OPT_END() }; +typedef void *(*memcpy_t)(void *, const void *, size_t); + struct routine { const char *name; const char *desc; - void * (*fn)(void *dst, const void *src, size_t len); + memcpy_t fn; }; struct routine routines[] = { { "default", "Default memcpy() provided by glibc", memcpy }, +#ifdef ARCH_X86_64 + +#define MEMCPY_FN(fn, name, desc) { name, desc, fn }, +#include "mem-memcpy-x86-64-asm-def.h" +#undef MEMCPY_FN + +#endif + { NULL, NULL, NULL } @@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts) (double)ts->tv_usec / (double)1000000; } +static void alloc_mem(void **dst, void **src, size_t length) +{ + *dst = zalloc(length); + if (!dst) + die("memory allocation failed - maybe length is too large?\n"); + + *src = zalloc(length); + if (!src) + die("memory allocation failed - maybe length is too large?\n"); +} + +static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault) +{ + u64 clock_start = 0ULL, clock_end = 0ULL; + void *src = NULL, *dst = NULL; + + alloc_mem(&src, &dst, len); + + if (prefault) + fn(dst, src, len); + + clock_start = get_clock(); + fn(dst, src, len); + clock_end = get_clock(); + + free(src); + free(dst); + return clock_end - clock_start; +} + +static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault) +{ + struct timeval tv_start, tv_end, tv_diff; + void *src = NULL, *dst = NULL; + + alloc_mem(&src, &dst, len); + + if (prefault) + fn(dst, src, len); + + BUG_ON(gettimeofday(&tv_start, NULL)); + fn(dst, src, len); + BUG_ON(gettimeofday(&tv_end, NULL)); + + timersub(&tv_end, &tv_start, &tv_diff); + + free(src); + free(dst); + return (double)((double)len / timeval2double(&tv_diff)); +} + +#define pf (no_prefault ? 0 : 1) + +#define print_bps(x) do { \ + if (x < K) \ + printf(" %14lf B/Sec", x); \ + else if (x < K * K) \ + printf(" %14lfd KB/Sec", x / K); \ + else if (x < K * K * K) \ + printf(" %14lf MB/Sec", x / K / K); \ + else \ + printf(" %14lf GB/Sec", x / K / K / K); \ + } while (0) + int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used) { int i; - void *dst, *src; - size_t length; - double bps = 0.0; - struct timeval tv_start, tv_end, tv_diff; - u64 clock_start, clock_end, clock_diff; + size_t len; + double result_bps[2]; + u64 result_clock[2]; - clock_start = clock_end = clock_diff = 0ULL; argc = parse_options(argc, argv, options, bench_mem_memcpy_usage, 0); - tv_diff.tv_sec = 0; - tv_diff.tv_usec = 0; - length = (size_t)perf_atoll((char *)length_str); + if (use_clock) + init_clock(); + + len = (size_t)perf_atoll((char *)length_str); - if ((s64)length <= 0) { + result_clock[0] = result_clock[1] = 0ULL; + result_bps[0] = result_bps[1] = 0.0; + + if ((s64)len <= 0) { fprintf(stderr, "Invalid length:%s\n", length_str); return 1; } + /* same to without specifying either of prefault and no-prefault */ + if (only_prefault && no_prefault) + only_prefault = no_prefault = false; + for (i = 0; routines[i].name; i++) { if (!strcmp(routines[i].name, routine)) break; @@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv, return 1; } - dst = zalloc(length); - if (!dst) - die("memory allocation failed - maybe length is too large?\n"); - - src = zalloc(length); - if (!src) - die("memory allocation failed - maybe length is too large?\n"); - - if (bench_format == BENCH_FORMAT_DEFAULT) { - printf("# Copying %s Bytes from %p to %p ...\n\n", - length_str, src, dst); - } - - if (use_clock) { - init_clock(); - clock_start = get_clock(); - } else { - BUG_ON(gettimeofday(&tv_start, NULL)); - } - - routines[i].fn(dst, src, length); + if (bench_format == BENCH_FORMAT_DEFAULT) + printf("# Copying %s Bytes ...\n\n", length_str); - if (use_clock) { - clock_end = get_clock(); - clock_diff = clock_end - clock_start; + if (!only_prefault && !no_prefault) { + /* show both of results */ + if (use_clock) { + result_clock[0] = + do_memcpy_clock(routines[i].fn, len, false); + result_clock[1] = + do_memcpy_clock(routines[i].fn, len, true); + } else { + result_bps[0] = + do_memcpy_gettimeofday(routines[i].fn, + len, false); + result_bps[1] = + do_memcpy_gettimeofday(routines[i].fn, + len, true); + } } else { - BUG_ON(gettimeofday(&tv_end, NULL)); - timersub(&tv_end, &tv_start, &tv_diff); - bps = (double)((double)length / timeval2double(&tv_diff)); + if (use_clock) { + result_clock[pf] = + do_memcpy_clock(routines[i].fn, + len, only_prefault); + } else { + result_bps[pf] = + do_memcpy_gettimeofday(routines[i].fn, + len, only_prefault); + } } switch (bench_format) { case BENCH_FORMAT_DEFAULT: - if (use_clock) { - printf(" %14lf Clock/Byte\n", - (double)clock_diff / (double)length); - } else { - if (bps < K) - printf(" %14lf B/Sec\n", bps); - else if (bps < K * K) - printf(" %14lfd KB/Sec\n", bps / 1024); - else if (bps < K * K * K) - printf(" %14lf MB/Sec\n", bps / 1024 / 1024); - else { - printf(" %14lf GB/Sec\n", - bps / 1024 / 1024 / 1024); + if (!only_prefault && !no_prefault) { + if (use_clock) { + printf(" %14lf Clock/Byte\n", + (double)result_clock[0] + / (double)len); + printf(" %14lf Clock/Byte (with prefault)\n", + (double)result_clock[1] + / (double)len); + } else { + print_bps(result_bps[0]); + printf("\n"); + print_bps(result_bps[1]); + printf(" (with prefault)\n"); } + } else { + if (use_clock) { + printf(" %14lf Clock/Byte", + (double)result_clock[pf] + / (double)len); + } else + print_bps(result_bps[pf]); + + printf("%s\n", only_prefault ? " (with prefault)" : ""); } break; case BENCH_FORMAT_SIMPLE: - if (use_clock) { - printf("%14lf\n", - (double)clock_diff / (double)length); - } else - printf("%lf\n", bps); + if (!only_prefault && !no_prefault) { + if (use_clock) { + printf("%lf %lf\n", + (double)result_clock[0] / (double)len, + (double)result_clock[1] / (double)len); + } else { + printf("%lf %lf\n", + result_bps[0], result_bps[1]); + } + } else { + if (use_clock) { + printf("%lf\n", (double)result_clock[pf] + / (double)len); + } else + printf("%lf\n", result_bps[pf]); + } break; default: /* reaching this means there's some disaster: */ diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 6d5604d8df95..c056cdc06912 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -58,12 +58,12 @@ static int hists__add_entry(struct hists *self, struct addr_location *al) return hist_entry__inc_addr_samples(he, al->addr); } -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event, struct sample_data *sample, + struct perf_session *session) { struct addr_location al; - struct sample_data data; - if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) { + if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -375,6 +375,8 @@ static struct perf_event_ops event_ops = { .mmap = event__process_mmap, .comm = event__process_comm, .fork = event__process_task, + .ordered_samples = true, + .ordering_requires_timestamps = true, }; static int __cmd_annotate(void) @@ -382,7 +384,7 @@ static int __cmd_annotate(void) int ret; struct perf_session *session; - session = perf_session__new(input_name, O_RDONLY, force, false); + session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); if (session == NULL) return -ENOMEM; diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index c49837de7d3f..5af32ae9031e 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -38,7 +38,8 @@ static int __cmd_buildid_list(void) { struct perf_session *session; - session = perf_session__new(input_name, O_RDONLY, force, false); + session = perf_session__new(input_name, O_RDONLY, force, false, + &build_id__mark_dso_hit_ops); if (session == NULL) return -1; diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index fca1d4402910..3153e492dbcc 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -30,12 +30,13 @@ static int hists__add_entry(struct hists *self, return -ENOMEM; } -static int diff__process_sample_event(event_t *event, struct perf_session *session) +static int diff__process_sample_event(event_t *event, + struct sample_data *sample, + struct perf_session *session) { struct addr_location al; - struct sample_data data = { .period = 1, }; - if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) { + if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -44,12 +45,12 @@ static int diff__process_sample_event(event_t *event, struct perf_session *sessi if (al.filtered || al.sym == NULL) return 0; - if (hists__add_entry(&session->hists, &al, data.period)) { + if (hists__add_entry(&session->hists, &al, sample->period)) { pr_warning("problem incrementing symbol period, skipping event\n"); return -1; } - session->hists.stats.total_period += data.period; + session->hists.stats.total_period += sample->period; return 0; } @@ -60,6 +61,8 @@ static struct perf_event_ops event_ops = { .exit = event__process_task, .fork = event__process_task, .lost = event__process_lost, + .ordered_samples = true, + .ordering_requires_timestamps = true, }; static void perf_session__insert_hist_entry_by_name(struct rb_root *root, @@ -141,8 +144,8 @@ static int __cmd_diff(void) int ret, i; struct perf_session *session[2]; - session[0] = perf_session__new(input_old, O_RDONLY, force, false); - session[1] = perf_session__new(input_new, O_RDONLY, force, false); + session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops); + session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops); if (session[0] == NULL || session[1] == NULL) return -ENOMEM; @@ -173,7 +176,7 @@ static const char * const diff_usage[] = { static const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), - OPT_BOOLEAN('m', "displacement", &show_displacement, + OPT_BOOLEAN('M', "displacement", &show_displacement, "Show position displacement relative to baseline"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), @@ -191,6 +194,8 @@ static const struct option options[] = { OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added between " "columns '.' is reserved."), + OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", + "Look for files with symbols relative to this directory"), OPT_END() }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8e3e47b064ce..0c78ffa7bf67 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -16,8 +16,8 @@ static char const *input_name = "-"; static bool inject_build_ids; -static int event__repipe(event_t *event __used, - struct perf_session *session __used) +static int event__repipe_synth(event_t *event, + struct perf_session *session __used) { uint32_t size; void *buf = event; @@ -36,22 +36,30 @@ static int event__repipe(event_t *event __used, return 0; } -static int event__repipe_mmap(event_t *self, struct perf_session *session) +static int event__repipe(event_t *event, struct sample_data *sample __used, + struct perf_session *session) +{ + return event__repipe_synth(event, session); +} + +static int event__repipe_mmap(event_t *self, struct sample_data *sample, + struct perf_session *session) { int err; - err = event__process_mmap(self, session); - event__repipe(self, session); + err = event__process_mmap(self, sample, session); + event__repipe(self, sample, session); return err; } -static int event__repipe_task(event_t *self, struct perf_session *session) +static int event__repipe_task(event_t *self, struct sample_data *sample, + struct perf_session *session) { int err; - err = event__process_task(self, session); - event__repipe(self, session); + err = event__process_task(self, sample, session); + event__repipe(self, sample, session); return err; } @@ -61,7 +69,7 @@ static int event__repipe_tracing_data(event_t *self, { int err; - event__repipe(self, session); + event__repipe_synth(self, session); err = event__process_tracing_data(self, session); return err; @@ -111,7 +119,8 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) return 0; } -static int event__inject_buildid(event_t *event, struct perf_session *session) +static int event__inject_buildid(event_t *event, struct sample_data *sample, + struct perf_session *session) { struct addr_location al; struct thread *thread; @@ -146,7 +155,7 @@ static int event__inject_buildid(event_t *event, struct perf_session *session) } repipe: - event__repipe(event, session); + event__repipe(event, sample, session); return 0; } @@ -160,10 +169,10 @@ struct perf_event_ops inject_ops = { .read = event__repipe, .throttle = event__repipe, .unthrottle = event__repipe, - .attr = event__repipe, - .event_type = event__repipe, - .tracing_data = event__repipe, - .build_id = event__repipe, + .attr = event__repipe_synth, + .event_type = event__repipe_synth, + .tracing_data = event__repipe_synth, + .build_id = event__repipe_synth, }; extern volatile int session_done; @@ -187,7 +196,7 @@ static int __cmd_inject(void) inject_ops.tracing_data = event__repipe_tracing_data; } - session = perf_session__new(input_name, O_RDONLY, false, true); + session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); if (session == NULL) return -ENOMEM; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 31f60a2535e0..def7ddc2fd4f 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -304,22 +304,11 @@ process_raw_event(event_t *raw_event __used, void *data, } } -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event, struct sample_data *sample, + struct perf_session *session) { - struct sample_data data; - struct thread *thread; + struct thread *thread = perf_session__findnew(session, event->ip.pid); - memset(&data, 0, sizeof(data)); - data.time = -1; - data.cpu = -1; - data.period = 1; - - event__parse_sample(event, session->sample_type, &data); - - dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc, - data.pid, data.tid, data.ip, data.period); - - thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -328,8 +317,8 @@ static int process_sample_event(event_t *event, struct perf_session *session) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, data.raw_data, data.cpu, - data.time, thread); + process_raw_event(event, sample->raw_data, sample->cpu, + sample->time, thread); return 0; } @@ -492,7 +481,8 @@ static void sort_result(void) static int __cmd_kmem(void) { int err = -EINVAL; - struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false); + struct perf_session *session = perf_session__new(input_name, O_RDONLY, + 0, false, &event_ops); if (session == NULL) return -ENOMEM; @@ -747,6 +737,9 @@ static int __cmd_record(int argc, const char **argv) rec_argc = ARRAY_SIZE(record_args) + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); + if (rec_argv == NULL) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 821c1586a22b..b9c6e5432971 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -834,22 +834,18 @@ static void dump_info(void) die("Unknown type of information\n"); } -static int process_sample_event(event_t *self, struct perf_session *s) +static int process_sample_event(event_t *self, struct sample_data *sample, + struct perf_session *s) { - struct sample_data data; - struct thread *thread; + struct thread *thread = perf_session__findnew(s, sample->tid); - bzero(&data, sizeof(data)); - event__parse_sample(self, s->sample_type, &data); - - thread = perf_session__findnew(s, data.tid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", self->header.type); return -1; } - process_raw_event(data.raw_data, data.cpu, data.time, thread); + process_raw_event(sample->raw_data, sample->cpu, sample->time, thread); return 0; } @@ -862,7 +858,7 @@ static struct perf_event_ops eops = { static int read_events(void) { - session = perf_session__new(input_name, O_RDONLY, 0, false); + session = perf_session__new(input_name, O_RDONLY, 0, false, &eops); if (!session) die("Initializing perf session failed\n"); @@ -947,6 +943,9 @@ static int __cmd_record(int argc, const char **argv) rec_argc = ARRAY_SIZE(record_args) + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); + if (rec_argv == NULL) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); @@ -982,9 +981,9 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used) usage_with_options(report_usage, report_options); } __cmd_report(); - } else if (!strcmp(argv[0], "trace")) { - /* Aliased to 'perf trace' */ - return cmd_trace(argc, argv, prefix); + } else if (!strcmp(argv[0], "script")) { + /* Aliased to 'perf script' */ + return cmd_script(argc, argv, prefix); } else if (!strcmp(argv[0], "info")) { if (argc) { argc = parse_options(argc, argv, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 564491fa18b2..7bc049035484 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -18,6 +18,7 @@ #include "util/header.h" #include "util/event.h" +#include "util/evsel.h" #include "util/debug.h" #include "util/session.h" #include "util/symbol.h" @@ -27,17 +28,18 @@ #include <sched.h> #include <sys/mman.h> +#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) + enum write_mode_t { WRITE_FORCE, WRITE_APPEND }; -static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; - static u64 user_interval = ULLONG_MAX; static u64 default_interval = 0; +static u64 sample_type; -static int nr_cpus = 0; +static struct cpu_map *cpus; static unsigned int page_size; static unsigned int mmap_pages = 128; static unsigned int user_freq = UINT_MAX; @@ -48,11 +50,11 @@ static const char *output_name = "perf.data"; static int group = 0; static int realtime_prio = 0; static bool raw_samples = false; +static bool sample_id_all_avail = true; static bool system_wide = false; static pid_t target_pid = -1; static pid_t target_tid = -1; -static pid_t *all_tids = NULL; -static int thread_num = 0; +static struct thread_map *threads; static pid_t child_pid = -1; static bool no_inherit = false; static enum write_mode_t write_mode = WRITE_FORCE; @@ -60,7 +62,9 @@ static bool call_graph = false; static bool inherit_stat = false; static bool no_samples = false; static bool sample_address = false; +static bool sample_time = false; static bool no_buildid = false; +static bool no_buildid_cache = false; static long samples = 0; static u64 bytes_written = 0; @@ -77,7 +81,6 @@ static struct perf_session *session; static const char *cpu_list; struct mmap_data { - int counter; void *base; unsigned int mask; unsigned int prev; @@ -128,6 +131,7 @@ static void write_output(void *buf, size_t size) } static int process_synthesized_event(event_t *event, + struct sample_data *sample __used, struct perf_session *self __used) { write_output(event, event->header.size); @@ -224,12 +228,12 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n return h_attr; } -static void create_counter(int counter, int cpu) +static void create_counter(struct perf_evsel *evsel, int cpu) { - char *filter = filters[counter]; - struct perf_event_attr *attr = attrs + counter; + char *filter = evsel->filter; + struct perf_event_attr *attr = &evsel->attr; struct perf_header_attr *h_attr; - int track = !counter; /* only the first counter needs these */ + int track = !evsel->idx; /* only the first counter needs these */ int thread_index; int ret; struct { @@ -238,6 +242,19 @@ static void create_counter(int counter, int cpu) u64 time_running; u64 id; } read_data; + /* + * Check if parse_single_tracepoint_event has already asked for + * PERF_SAMPLE_TIME. + * + * XXX this is kludgy but short term fix for problems introduced by + * eac23d1c that broke 'perf script' by having different sample_types + * when using multiple tracepoint events when we use a perf binary + * that tries to use sample_id_all on an older kernel. + * + * We need to move counter creation to perf_session, support + * different sample_types, etc. + */ + bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | @@ -280,6 +297,10 @@ static void create_counter(int counter, int cpu) if (system_wide) attr->sample_type |= PERF_SAMPLE_CPU; + if (sample_id_all_avail && + (sample_time || system_wide || !no_inherit || cpu_list)) + attr->sample_type |= PERF_SAMPLE_TIME; + if (raw_samples) { attr->sample_type |= PERF_SAMPLE_TIME; attr->sample_type |= PERF_SAMPLE_RAW; @@ -293,13 +314,14 @@ static void create_counter(int counter, int cpu) attr->disabled = 1; attr->enable_on_exec = 1; } +retry_sample_id: + attr->sample_id_all = sample_id_all_avail ? 1 : 0; - for (thread_index = 0; thread_index < thread_num; thread_index++) { + for (thread_index = 0; thread_index < threads->nr; thread_index++) { try_again: - fd[nr_cpu][counter][thread_index] = sys_perf_event_open(attr, - all_tids[thread_index], cpu, group_fd, 0); + FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0); - if (fd[nr_cpu][counter][thread_index] < 0) { + if (FD(evsel, nr_cpu, thread_index) < 0) { int err = errno; if (err == EPERM || err == EACCES) @@ -309,6 +331,15 @@ try_again: else if (err == ENODEV && cpu_list) { die("No such device - did you specify" " an out-of-range profile CPU?\n"); + } else if (err == EINVAL && sample_id_all_avail) { + /* + * Old kernel, no attr->sample_id_type_all field + */ + sample_id_all_avail = false; + if (!sample_time && !raw_samples && !time_needed) + attr->sample_type &= ~PERF_SAMPLE_TIME; + + goto retry_sample_id; } /* @@ -326,8 +357,8 @@ try_again: goto try_again; } printf("\n"); - error("perfcounter syscall returned with %d (%s)\n", - fd[nr_cpu][counter][thread_index], strerror(err)); + error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", + FD(evsel, nr_cpu, thread_index), strerror(err)); #if defined(__i386__) || defined(__x86_64__) if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP) @@ -341,7 +372,7 @@ try_again: exit(-1); } - h_attr = get_header_attr(attr, counter); + h_attr = get_header_attr(attr, evsel->idx); if (h_attr == NULL) die("nomem\n"); @@ -352,7 +383,7 @@ try_again: } } - if (read(fd[nr_cpu][counter][thread_index], &read_data, sizeof(read_data)) == -1) { + if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) { perror("Unable to read perf file descriptor"); exit(-1); } @@ -362,43 +393,44 @@ try_again: exit(-1); } - assert(fd[nr_cpu][counter][thread_index] >= 0); - fcntl(fd[nr_cpu][counter][thread_index], F_SETFL, O_NONBLOCK); + assert(FD(evsel, nr_cpu, thread_index) >= 0); + fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK); /* * First counter acts as the group leader: */ if (group && group_fd == -1) - group_fd = fd[nr_cpu][counter][thread_index]; - - if (counter || thread_index) { - ret = ioctl(fd[nr_cpu][counter][thread_index], - PERF_EVENT_IOC_SET_OUTPUT, - fd[nr_cpu][0][0]); + group_fd = FD(evsel, nr_cpu, thread_index); + + if (evsel->idx || thread_index) { + struct perf_evsel *first; + first = list_entry(evsel_list.next, struct perf_evsel, node); + ret = ioctl(FD(evsel, nr_cpu, thread_index), + PERF_EVENT_IOC_SET_OUTPUT, + FD(first, nr_cpu, 0)); if (ret) { error("failed to set output: %d (%s)\n", errno, strerror(errno)); exit(-1); } } else { - mmap_array[nr_cpu].counter = counter; mmap_array[nr_cpu].prev = 0; mmap_array[nr_cpu].mask = mmap_pages*page_size - 1; mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter][thread_index], 0); + PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0); if (mmap_array[nr_cpu].base == MAP_FAILED) { error("failed to mmap with %d (%s)\n", errno, strerror(errno)); exit(-1); } - event_array[nr_poll].fd = fd[nr_cpu][counter][thread_index]; + event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index); event_array[nr_poll].events = POLLIN; nr_poll++; } if (filter != NULL) { - ret = ioctl(fd[nr_cpu][counter][thread_index], - PERF_EVENT_IOC_SET_FILTER, filter); + ret = ioctl(FD(evsel, nr_cpu, thread_index), + PERF_EVENT_IOC_SET_FILTER, filter); if (ret) { error("failed to set filter with %d (%s)\n", errno, strerror(errno)); @@ -406,15 +438,19 @@ try_again: } } } + + if (!sample_type) + sample_type = attr->sample_type; } static void open_counters(int cpu) { - int counter; + struct perf_evsel *pos; group_fd = -1; - for (counter = 0; counter < nr_counters; counter++) - create_counter(counter, cpu); + + list_for_each_entry(pos, &evsel_list, node) + create_counter(pos, cpu); nr_cpu++; } @@ -437,7 +473,8 @@ static void atexit_header(void) if (!pipe_output) { session->header.data_size += bytes_written; - process_buildids(); + if (!no_buildid) + process_buildids(); perf_header__write(&session->header, output, true); perf_session__delete(session); symbol__exit(); @@ -500,7 +537,7 @@ static void mmap_read_all(void) static int __cmd_record(int argc, const char **argv) { - int i, counter; + int i; struct stat st; int flags; int err; @@ -552,19 +589,22 @@ static int __cmd_record(int argc, const char **argv) } session = perf_session__new(output_name, O_WRONLY, - write_mode == WRITE_FORCE, false); + write_mode == WRITE_FORCE, false, NULL); if (session == NULL) { pr_err("Not enough memory for reading perf file header\n"); return -1; } + if (!no_buildid) + perf_header__set_feat(&session->header, HEADER_BUILD_ID); + if (!file_new) { err = perf_header__read(session, output); if (err < 0) goto out_delete_session; } - if (have_tracepoints(attrs, nr_counters)) + if (have_tracepoints(&evsel_list)) perf_header__set_feat(&session->header, HEADER_TRACE_INFO); /* @@ -612,7 +652,7 @@ static int __cmd_record(int argc, const char **argv) } if (!system_wide && target_tid == -1 && target_pid == -1) - all_tids[0] = child_pid; + threads->map[0] = child_pid; close(child_ready_pipe[1]); close(go_pipe[0]); @@ -626,19 +666,15 @@ static int __cmd_record(int argc, const char **argv) close(child_ready_pipe[0]); } - nr_cpus = read_cpu_map(cpu_list); - if (nr_cpus < 1) { - perror("failed to collect number of CPUs"); - return -1; - } - if (!system_wide && no_inherit && !cpu_list) { open_counters(-1); } else { - for (i = 0; i < nr_cpus; i++) - open_counters(cpumap[i]); + for (i = 0; i < cpus->nr; i++) + open_counters(cpus->map[i]); } + perf_session__set_sample_type(session, sample_type); + if (pipe_output) { err = perf_header__write_pipe(output); if (err < 0) @@ -651,6 +687,8 @@ static int __cmd_record(int argc, const char **argv) post_processing_offset = lseek(output, 0, SEEK_CUR); + perf_session__set_sample_id_all(session, sample_id_all_avail); + if (pipe_output) { err = event__synthesize_attrs(&session->header, process_synthesized_event, @@ -667,7 +705,7 @@ static int __cmd_record(int argc, const char **argv) return err; } - if (have_tracepoints(attrs, nr_counters)) { + if (have_tracepoints(&evsel_list)) { /* * FIXME err <= 0 here actually means that * there were no tracepoints so its not really @@ -676,8 +714,7 @@ static int __cmd_record(int argc, const char **argv) * return this more properly and also * propagate errors that now are calling die() */ - err = event__synthesize_tracing_data(output, attrs, - nr_counters, + err = event__synthesize_tracing_data(output, &evsel_list, process_synthesized_event, session); if (err <= 0) { @@ -751,13 +788,13 @@ static int __cmd_record(int argc, const char **argv) if (done) { for (i = 0; i < nr_cpu; i++) { - for (counter = 0; - counter < nr_counters; - counter++) { + struct perf_evsel *pos; + + list_for_each_entry(pos, &evsel_list, node) { for (thread = 0; - thread < thread_num; + thread < threads->nr; thread++) - ioctl(fd[i][counter][thread], + ioctl(FD(pos, i, thread), PERF_EVENT_IOC_DISABLE); } } @@ -831,16 +868,20 @@ const struct option record_options[] = { "per thread counts"), OPT_BOOLEAN('d', "data", &sample_address, "Sample addresses"), + OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"), OPT_BOOLEAN('n', "no-samples", &no_samples, "don't sample"), - OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid, + OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache, "do not update the buildid cache"), + OPT_BOOLEAN('B', "no-buildid", &no_buildid, + "do not collect buildids in perf.data"), OPT_END() }; int cmd_record(int argc, const char **argv, const char *prefix __used) { - int i, j, err = -ENOMEM; + int err = -ENOMEM; + struct perf_evsel *pos; argc = parse_options(argc, argv, record_options, record_usage, PARSE_OPT_STOP_AT_NON_OPTION); @@ -859,41 +900,36 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) } symbol__init(); - if (no_buildid) + + if (no_buildid_cache || no_buildid) disable_buildid_cache(); - if (!nr_counters) { - nr_counters = 1; - attrs[0].type = PERF_TYPE_HARDWARE; - attrs[0].config = PERF_COUNT_HW_CPU_CYCLES; + if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) { + pr_err("Not enough memory for event selector list\n"); + goto out_symbol_exit; } - if (target_pid != -1) { + if (target_pid != -1) target_tid = target_pid; - thread_num = find_all_tid(target_pid, &all_tids); - if (thread_num <= 0) { - fprintf(stderr, "Can't find all threads of pid %d\n", - target_pid); - usage_with_options(record_usage, record_options); - } - } else { - all_tids=malloc(sizeof(pid_t)); - if (!all_tids) - goto out_symbol_exit; - all_tids[0] = target_tid; - thread_num = 1; + threads = thread_map__new(target_pid, target_tid); + if (threads == NULL) { + pr_err("Problems finding threads of monitor\n"); + usage_with_options(record_usage, record_options); } - for (i = 0; i < MAX_NR_CPUS; i++) { - for (j = 0; j < MAX_COUNTERS; j++) { - fd[i][j] = malloc(sizeof(int)*thread_num); - if (!fd[i][j]) - goto out_free_fd; - } + cpus = cpu_map__new(cpu_list); + if (cpus == NULL) { + perror("failed to parse CPUs map"); + return -1; } - event_array = malloc( - sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num); + + list_for_each_entry(pos, &evsel_list, node) { + if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + goto out_free_fd; + } + event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS * + MAX_COUNTERS * threads->nr)); if (!event_array) goto out_free_fd; @@ -920,12 +956,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) out_free_event_array: free(event_array); out_free_fd: - for (i = 0; i < MAX_NR_CPUS; i++) { - for (j = 0; j < MAX_COUNTERS; j++) - free(fd[i][j]); - } - free(all_tids); - all_tids = NULL; + thread_map__delete(threads); + threads = NULL; out_symbol_exit: symbol__exit(); return err; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5de405d45230..75183a4518e6 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -150,13 +150,13 @@ static int add_event_total(struct perf_session *session, return 0; } -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event, struct sample_data *sample, + struct perf_session *session) { - struct sample_data data = { .period = 1, }; struct addr_location al; struct perf_event_attr *attr; - if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) { + if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -165,14 +165,14 @@ static int process_sample_event(event_t *event, struct perf_session *session) if (al.filtered || (hide_unresolved && al.sym == NULL)) return 0; - if (perf_session__add_hist_entry(session, &al, &data)) { + if (perf_session__add_hist_entry(session, &al, sample)) { pr_debug("problem incrementing symbol period, skipping event\n"); return -1; } - attr = perf_header__find_attr(data.id, &session->header); + attr = perf_header__find_attr(sample->id, &session->header); - if (add_event_total(session, &data, attr)) { + if (add_event_total(session, sample, attr)) { pr_debug("problem adding event period\n"); return -1; } @@ -180,7 +180,8 @@ static int process_sample_event(event_t *event, struct perf_session *session) return 0; } -static int process_read_event(event_t *event, struct perf_session *session __used) +static int process_read_event(event_t *event, struct sample_data *sample __used, + struct perf_session *session __used) { struct perf_event_attr *attr; @@ -243,6 +244,8 @@ static struct perf_event_ops event_ops = { .event_type = event__process_event_type, .tracing_data = event__process_tracing_data, .build_id = event__process_build_id, + .ordered_samples = true, + .ordering_requires_timestamps = true, }; extern volatile int session_done; @@ -307,7 +310,7 @@ static int __cmd_report(void) signal(SIGINT, sig_handler); - session = perf_session__new(input_name, O_RDONLY, force, false); + session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); if (session == NULL) return -ENOMEM; @@ -442,6 +445,8 @@ static const struct option options[] = { "dump raw trace in ASCII"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), + OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, + "file", "kallsyms pathname"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), @@ -478,6 +483,8 @@ static const struct option options[] = { "columns '.' is reserved."), OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved, "Only display entries resolved to a symbol"), + OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", + "Look for files with symbols relative to this directory"), OPT_END() }; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 55f3b5dcc731..7a4ebeb8b016 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1606,25 +1606,15 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session, process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread); } -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event, struct sample_data *sample, + struct perf_session *session) { - struct sample_data data; struct thread *thread; if (!(session->sample_type & PERF_SAMPLE_RAW)) return 0; - memset(&data, 0, sizeof(data)); - data.time = -1; - data.cpu = -1; - data.period = -1; - - event__parse_sample(event, session->sample_type, &data); - - dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc, - data.pid, data.tid, data.ip, data.period); - - thread = perf_session__findnew(session, data.pid); + thread = perf_session__findnew(session, sample->pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -1633,10 +1623,11 @@ static int process_sample_event(event_t *event, struct perf_session *session) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - if (profile_cpu != -1 && profile_cpu != (int)data.cpu) + if (profile_cpu != -1 && profile_cpu != (int)sample->cpu) return 0; - process_raw_event(event, session, data.raw_data, data.cpu, data.time, thread); + process_raw_event(event, session, sample->raw_data, sample->cpu, + sample->time, thread); return 0; } @@ -1652,7 +1643,8 @@ static struct perf_event_ops event_ops = { static int read_events(void) { int err = -EINVAL; - struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false); + struct perf_session *session = perf_session__new(input_name, O_RDONLY, + 0, false, &event_ops); if (session == NULL) return -ENOMEM; @@ -1869,6 +1861,9 @@ static int __cmd_record(int argc, const char **argv) rec_argc = ARRAY_SIZE(record_args) + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); + if (rec_argv) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); @@ -1888,10 +1883,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) usage_with_options(sched_usage, sched_options); /* - * Aliased to 'perf trace' for now: + * Aliased to 'perf script' for now: */ - if (!strcmp(argv[0], "trace")) - return cmd_trace(argc, argv, prefix); + if (!strcmp(argv[0], "script")) + return cmd_script(argc, argv, prefix); symbol__init(); if (!strncmp(argv[0], "rec", 3)) { diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-script.c index 86cfe3800e6b..150a606002eb 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-script.c @@ -56,29 +56,18 @@ static void setup_scripting(void) static int cleanup_scripting(void) { - pr_debug("\nperf trace script stopped\n"); + pr_debug("\nperf script stopped\n"); return scripting_ops->stop_script(); } static char const *input_name = "perf.data"; -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event, struct sample_data *sample, + struct perf_session *session) { - struct sample_data data; - struct thread *thread; + struct thread *thread = perf_session__findnew(session, event->ip.pid); - memset(&data, 0, sizeof(data)); - data.time = -1; - data.cpu = -1; - data.period = 1; - - event__parse_sample(event, session->sample_type, &data); - - dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc, - data.pid, data.tid, data.ip, data.period); - - thread = perf_session__findnew(session, event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -87,13 +76,13 @@ static int process_sample_event(event_t *event, struct perf_session *session) if (session->sample_type & PERF_SAMPLE_RAW) { if (debug_mode) { - if (data.time < last_timestamp) { + if (sample->time < last_timestamp) { pr_err("Samples misordered, previous: %llu " "this: %llu\n", last_timestamp, - data.time); + sample->time); nr_unordered++; } - last_timestamp = data.time; + last_timestamp = sample->time; return 0; } /* @@ -101,21 +90,12 @@ static int process_sample_event(event_t *event, struct perf_session *session) * field, although it should be the same than this perf * event pid */ - scripting_ops->process_event(data.cpu, data.raw_data, - data.raw_size, - data.time, thread->comm); + scripting_ops->process_event(sample->cpu, sample->raw_data, + sample->raw_size, + sample->time, thread->comm); } - session->hists.stats.total_period += data.period; - return 0; -} - -static u64 nr_lost; - -static int process_lost_event(event_t *event, struct perf_session *session __used) -{ - nr_lost += event->lost.lost; - + session->hists.stats.total_period += sample->period; return 0; } @@ -126,7 +106,7 @@ static struct perf_event_ops event_ops = { .event_type = event__process_event_type, .tracing_data = event__process_tracing_data, .build_id = event__process_build_id, - .lost = process_lost_event, + .ordering_requires_timestamps = true, .ordered_samples = true, }; @@ -137,7 +117,7 @@ static void sig_handler(int sig __unused) session_done = 1; } -static int __cmd_trace(struct perf_session *session) +static int __cmd_script(struct perf_session *session) { int ret; @@ -145,10 +125,8 @@ static int __cmd_trace(struct perf_session *session) ret = perf_session__process_events(session, &event_ops); - if (debug_mode) { + if (debug_mode) pr_err("Misordered timestamps: %llu\n", nr_unordered); - pr_err("Lost events: %llu\n", nr_lost); - } return ret; } @@ -159,7 +137,7 @@ struct script_spec { char spec[0]; }; -LIST_HEAD(script_specs); +static LIST_HEAD(script_specs); static struct script_spec *script_spec__new(const char *spec, struct scripting_ops *ops) @@ -247,7 +225,7 @@ static void list_available_languages(void) fprintf(stderr, "\n"); fprintf(stderr, "Scripting language extensions (used in " - "perf trace -s [spec:]script.[spec]):\n\n"); + "perf script -s [spec:]script.[spec]):\n\n"); list_for_each_entry(s, &script_specs, node) fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name); @@ -301,17 +279,34 @@ static int parse_scriptname(const struct option *opt __used, return 0; } -#define for_each_lang(scripts_dir, lang_dirent, lang_next) \ +/* Helper function for filesystems that return a dent->d_type DT_UNKNOWN */ +static int is_directory(const char *base_path, const struct dirent *dent) +{ + char path[PATH_MAX]; + struct stat st; + + sprintf(path, "%s/%s", base_path, dent->d_name); + if (stat(path, &st)) + return 0; + + return S_ISDIR(st.st_mode); +} + +#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\ while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \ lang_next) \ - if (lang_dirent.d_type == DT_DIR && \ + if ((lang_dirent.d_type == DT_DIR || \ + (lang_dirent.d_type == DT_UNKNOWN && \ + is_directory(scripts_path, &lang_dirent))) && \ (strcmp(lang_dirent.d_name, ".")) && \ (strcmp(lang_dirent.d_name, ".."))) -#define for_each_script(lang_dir, script_dirent, script_next) \ +#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\ while (!readdir_r(lang_dir, &script_dirent, &script_next) && \ script_next) \ - if (script_dirent.d_type != DT_DIR) + if (script_dirent.d_type != DT_DIR && \ + (script_dirent.d_type != DT_UNKNOWN || \ + !is_directory(lang_path, &script_dirent))) #define RECORD_SUFFIX "-record" @@ -324,7 +319,7 @@ struct script_desc { char *args; }; -LIST_HEAD(script_descs); +static LIST_HEAD(script_descs); static struct script_desc *script_desc__new(const char *name) { @@ -380,10 +375,10 @@ out_delete_desc: return NULL; } -static char *ends_with(char *str, const char *suffix) +static const char *ends_with(const char *str, const char *suffix) { size_t suffix_len = strlen(suffix); - char *p = str; + const char *p = str; if (strlen(str) > suffix_len) { p = str + strlen(str) - suffix_len; @@ -466,16 +461,16 @@ static int list_available_scripts(const struct option *opt __used, if (!scripts_dir) return -1; - for_each_lang(scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, lang_dirent.d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_dir, script_dirent, script_next) { + for_each_script(lang_path, lang_dir, script_dirent, script_next) { script_root = strdup(script_dirent.d_name); - str = ends_with(script_root, REPORT_SUFFIX); + str = (char *)ends_with(script_root, REPORT_SUFFIX); if (str) { *str = '\0'; desc = script_desc__findnew(script_root); @@ -514,16 +509,16 @@ static char *get_script_path(const char *script_root, const char *suffix) if (!scripts_dir) return NULL; - for_each_lang(scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, lang_dirent.d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_dir, script_dirent, script_next) { + for_each_script(lang_path, lang_dir, script_dirent, script_next) { __script_root = strdup(script_dirent.d_name); - str = ends_with(__script_root, suffix); + str = (char *)ends_with(__script_root, suffix); if (str) { *str = '\0'; if (strcmp(__script_root, script_root)) @@ -543,7 +538,7 @@ static char *get_script_path(const char *script_root, const char *suffix) static bool is_top_script(const char *script_path) { - return ends_with((char *)script_path, "top") == NULL ? false : true; + return ends_with(script_path, "top") == NULL ? false : true; } static int has_required_arg(char *script_path) @@ -569,12 +564,12 @@ out: return n_args; } -static const char * const trace_usage[] = { - "perf trace [<options>]", - "perf trace [<options>] record <script> [<record-options>] <command>", - "perf trace [<options>] report <script> [script-args]", - "perf trace [<options>] <script> [<record-options>] <command>", - "perf trace [<options>] <top-script> [script-args]", +static const char * const script_usage[] = { + "perf script [<options>]", + "perf script [<options>] record <script> [<record-options>] <command>", + "perf script [<options>] report <script> [script-args]", + "perf script [<options>] <script> [<record-options>] <command>", + "perf script [<options>] <top-script> [script-args]", NULL }; @@ -591,7 +586,7 @@ static const struct option options[] = { "script file name (lang:script name, script name, or *)", parse_scriptname), OPT_STRING('g', "gen-script", &generate_script_lang, "lang", - "generate perf-trace.xx script in specified language"), + "generate perf-script.xx script in specified language"), OPT_STRING('i', "input", &input_name, "file", "input file name"), OPT_BOOLEAN('d', "debug-mode", &debug_mode, @@ -614,7 +609,7 @@ static bool have_cmd(int argc, const char **argv) return argc != 0; } -int cmd_trace(int argc, const char **argv, const char *prefix __used) +int cmd_script(int argc, const char **argv, const char *prefix __used) { char *rec_script_path = NULL; char *rep_script_path = NULL; @@ -626,7 +621,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) setup_scripting(); - argc = parse_options(argc, argv, options, trace_usage, + argc = parse_options(argc, argv, options, script_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { @@ -640,7 +635,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) if (!rep_script_path) { fprintf(stderr, "Please specify a valid report script" - "(see 'perf trace -l' for listing)\n"); + "(see 'perf script -l' for listing)\n"); return -1; } } @@ -658,8 +653,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) if (!rec_script_path && !rep_script_path) { fprintf(stderr, " Couldn't find script %s\n\n See perf" - " trace -l for available scripts.\n", argv[0]); - usage_with_options(trace_usage, options); + " script -l for available scripts.\n", argv[0]); + usage_with_options(script_usage, options); } if (is_top_script(argv[0])) { @@ -671,9 +666,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) rec_args = (argc - 1) - rep_args; if (rec_args < 0) { fprintf(stderr, " %s script requires options." - "\n\n See perf trace -l for available " + "\n\n See perf script -l for available " "scripts and options.\n", argv[0]); - usage_with_options(trace_usage, options); + usage_with_options(script_usage, options); } } @@ -772,7 +767,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) if (!script_name) setup_pager(); - session = perf_session__new(input_name, O_RDONLY, 0, false); + session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops); if (session == NULL) return -ENOMEM; @@ -806,7 +801,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) return -1; } - err = scripting_ops->generate_script("perf-trace"); + err = scripting_ops->generate_script("perf-script"); goto out; } @@ -814,10 +809,10 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) err = scripting_ops->start_script(script_name, argc, argv); if (err) goto out; - pr_debug("perf trace started with script %s\n\n", script_name); + pr_debug("perf script started with script %s\n\n", script_name); } - err = __cmd_trace(session); + err = __cmd_script(session); perf_session__delete(session); cleanup_scripting(); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a6b4d44f9502..02b2d8013a61 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -43,6 +43,7 @@ #include "util/parse-options.h" #include "util/parse-events.h" #include "util/event.h" +#include "util/evsel.h" #include "util/debug.h" #include "util/header.h" #include "util/cpumap.h" @@ -52,6 +53,8 @@ #include <math.h> #include <locale.h> +#define DEFAULT_SEPARATOR " " + static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, @@ -69,25 +72,23 @@ static struct perf_event_attr default_attrs[] = { }; static bool system_wide = false; -static int nr_cpus = 0; +static struct cpu_map *cpus; static int run_idx = 0; static int run_count = 1; static bool no_inherit = false; static bool scale = true; +static bool no_aggr = false; static pid_t target_pid = -1; static pid_t target_tid = -1; -static pid_t *all_tids = NULL; -static int thread_num = 0; +static struct thread_map *threads; static pid_t child_pid = -1; static bool null_run = false; -static bool big_num = false; +static bool big_num = true; +static int big_num_opt = -1; static const char *cpu_list; - - -static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; - -static int event_scaled[MAX_COUNTERS]; +static const char *csv_sep = NULL; +static bool csv_output = false; static volatile int done = 0; @@ -96,6 +97,22 @@ struct stats double n, mean, M2; }; +struct perf_stat { + struct stats res_stats[3]; +}; + +static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) +{ + evsel->priv = zalloc(sizeof(struct perf_stat)); + return evsel->priv == NULL ? -ENOMEM : 0; +} + +static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) +{ + free(evsel->priv); + evsel->priv = NULL; +} + static void update_stats(struct stats *stats, u64 val) { double delta; @@ -135,69 +152,38 @@ static double stddev_stats(struct stats *stats) return sqrt(variance_mean); } -struct stats event_res_stats[MAX_COUNTERS][3]; -struct stats runtime_nsecs_stats; +struct stats runtime_nsecs_stats[MAX_NR_CPUS]; +struct stats runtime_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; -struct stats runtime_cycles_stats; -struct stats runtime_branches_stats; -#define MATCH_EVENT(t, c, counter) \ - (attrs[counter].type == PERF_TYPE_##t && \ - attrs[counter].config == PERF_COUNT_##c) - -#define ERR_PERF_OPEN \ -"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" - -static int create_perf_stat_counter(int counter) +static int create_perf_stat_counter(struct perf_evsel *evsel) { - struct perf_event_attr *attr = attrs + counter; - int thread; - int ncreated = 0; + struct perf_event_attr *attr = &evsel->attr; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - if (system_wide) { - int cpu; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - fd[cpu][counter][0] = sys_perf_event_open(attr, - -1, cpumap[cpu], -1, 0); - if (fd[cpu][counter][0] < 0) - pr_debug(ERR_PERF_OPEN, counter, - fd[cpu][counter][0], strerror(errno)); - else - ++ncreated; - } - } else { - attr->inherit = !no_inherit; - if (target_pid == -1 && target_tid == -1) { - attr->disabled = 1; - attr->enable_on_exec = 1; - } - for (thread = 0; thread < thread_num; thread++) { - fd[0][counter][thread] = sys_perf_event_open(attr, - all_tids[thread], -1, -1, 0); - if (fd[0][counter][thread] < 0) - pr_debug(ERR_PERF_OPEN, counter, - fd[0][counter][thread], - strerror(errno)); - else - ++ncreated; - } + if (system_wide) + return perf_evsel__open_per_cpu(evsel, cpus); + + attr->inherit = !no_inherit; + if (target_pid == -1 && target_tid == -1) { + attr->disabled = 1; + attr->enable_on_exec = 1; } - return ncreated; + return perf_evsel__open_per_thread(evsel, threads); } /* * Does the counter have nsecs as a unit? */ -static inline int nsec_counter(int counter) +static inline int nsec_counter(struct perf_evsel *evsel) { - if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) || - MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) + if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || + perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) return 1; return 0; @@ -205,55 +191,19 @@ static inline int nsec_counter(int counter) /* * Read out the results of a single counter: + * aggregate counts across CPUs in system-wide mode */ -static void read_counter(int counter) +static int read_counter_aggr(struct perf_evsel *counter) { - u64 count[3], single_count[3]; - int cpu; - size_t res, nv; - int scaled; - int i, thread; - - count[0] = count[1] = count[2] = 0; - - nv = scale ? 3 : 1; - for (cpu = 0; cpu < nr_cpus; cpu++) { - for (thread = 0; thread < thread_num; thread++) { - if (fd[cpu][counter][thread] < 0) - continue; - - res = read(fd[cpu][counter][thread], - single_count, nv * sizeof(u64)); - assert(res == nv * sizeof(u64)); - - close(fd[cpu][counter][thread]); - fd[cpu][counter][thread] = -1; - - count[0] += single_count[0]; - if (scale) { - count[1] += single_count[1]; - count[2] += single_count[2]; - } - } - } - - scaled = 0; - if (scale) { - if (count[2] == 0) { - event_scaled[counter] = -1; - count[0] = 0; - return; - } + struct perf_stat *ps = counter->priv; + u64 *count = counter->counts->aggr.values; + int i; - if (count[2] < count[1]) { - event_scaled[counter] = 1; - count[0] = (unsigned long long) - ((double)count[0] * count[1] / count[2] + 0.5); - } - } + if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0) + return -1; for (i = 0; i < 3; i++) - update_stats(&event_res_stats[counter][i], count[i]); + update_stats(&ps->res_stats[i], count[i]); if (verbose) { fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter), @@ -263,26 +213,51 @@ static void read_counter(int counter) /* * Save the full runtime - to allow normalization during printout: */ - if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) - update_stats(&runtime_nsecs_stats, count[0]); - if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) - update_stats(&runtime_cycles_stats, count[0]); - if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) - update_stats(&runtime_branches_stats, count[0]); + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) + update_stats(&runtime_nsecs_stats[0], count[0]); + if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + update_stats(&runtime_cycles_stats[0], count[0]); + if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + update_stats(&runtime_branches_stats[0], count[0]); + + return 0; +} + +/* + * Read out the results of a single counter: + * do not aggregate counts across CPUs in system-wide mode + */ +static int read_counter(struct perf_evsel *counter) +{ + u64 *count; + int cpu; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) + return -1; + + count = counter->counts->cpu[cpu].values; + + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) + update_stats(&runtime_nsecs_stats[cpu], count[0]); + if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + update_stats(&runtime_cycles_stats[cpu], count[0]); + if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + update_stats(&runtime_branches_stats[cpu], count[0]); + } + + return 0; } static int run_perf_stat(int argc __used, const char **argv) { unsigned long long t0, t1; + struct perf_evsel *counter; int status = 0; - int counter, ncreated = 0; int child_ready_pipe[2], go_pipe[2]; const bool forks = (argc > 0); char buf; - if (!system_wide) - nr_cpus = 1; - if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { perror("failed to create pipes"); exit(1); @@ -322,7 +297,7 @@ static int run_perf_stat(int argc __used, const char **argv) } if (target_tid == -1 && target_pid == -1 && !system_wide) - all_tids[0] = child_pid; + threads->map[0] = child_pid; /* * Wait for the child to be ready to exec. @@ -334,16 +309,23 @@ static int run_perf_stat(int argc __used, const char **argv) close(child_ready_pipe[0]); } - for (counter = 0; counter < nr_counters; counter++) - ncreated += create_perf_stat_counter(counter); - - if (ncreated == 0) { - pr_err("No permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n", - system_wide ? "system-wide " : ""); - if (child_pid != -1) - kill(child_pid, SIGTERM); - return -1; + list_for_each_entry(counter, &evsel_list, node) { + if (create_perf_stat_counter(counter) < 0) { + if (errno == -EPERM || errno == -EACCES) { + error("You may not have permission to collect %sstats.\n" + "\t Consider tweaking" + " /proc/sys/kernel/perf_event_paranoid or running as root.", + system_wide ? "system-wide " : ""); + } else { + error("open_counter returned with %d (%s). " + "/bin/dmesg may provide additional information.\n", + errno, strerror(errno)); + } + if (child_pid != -1) + kill(child_pid, SIGTERM); + die("Not all events could be opened.\n"); + return -1; + } } /* @@ -362,60 +344,97 @@ static int run_perf_stat(int argc __used, const char **argv) update_stats(&walltime_nsecs_stats, t1 - t0); - for (counter = 0; counter < nr_counters; counter++) - read_counter(counter); + if (no_aggr) { + list_for_each_entry(counter, &evsel_list, node) { + read_counter(counter); + perf_evsel__close_fd(counter, cpus->nr, 1); + } + } else { + list_for_each_entry(counter, &evsel_list, node) { + read_counter_aggr(counter); + perf_evsel__close_fd(counter, cpus->nr, threads->nr); + } + } return WEXITSTATUS(status); } -static void print_noise(int counter, double avg) +static void print_noise(struct perf_evsel *evsel, double avg) { + struct perf_stat *ps; + if (run_count == 1) return; + ps = evsel->priv; fprintf(stderr, " ( +- %7.3f%% )", - 100 * stddev_stats(&event_res_stats[counter][0]) / avg); + 100 * stddev_stats(&ps->res_stats[0]) / avg); } -static void nsec_printout(int counter, double avg) +static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) { double msecs = avg / 1e6; + char cpustr[16] = { '\0', }; + const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s"; - fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); + if (no_aggr) + sprintf(cpustr, "CPU%*d%s", + csv_output ? 0 : -4, + cpus->map[cpu], csv_sep); + + fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel)); + + if (csv_output) + return; - if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { + if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) fprintf(stderr, " # %10.3f CPUs ", avg / avg_stats(&walltime_nsecs_stats)); - } } -static void abs_printout(int counter, double avg) +static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; + char cpustr[16] = { '\0', }; + const char *fmt; + + if (csv_output) + fmt = "%s%.0f%s%s"; + else if (big_num) + fmt = "%s%'18.0f%s%-24s"; + else + fmt = "%s%18.0f%s%-24s"; - if (big_num) - fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter)); + if (no_aggr) + sprintf(cpustr, "CPU%*d%s", + csv_output ? 0 : -4, + cpus->map[cpu], csv_sep); else - fprintf(stderr, " %18.0f %-24s", avg, event_name(counter)); + cpu = 0; + + fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel)); - if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { - total = avg_stats(&runtime_cycles_stats); + if (csv_output) + return; + + if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { + total = avg_stats(&runtime_cycles_stats[cpu]); if (total) ratio = avg / total; fprintf(stderr, " # %10.3f IPC ", ratio); - } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && - runtime_branches_stats.n != 0) { - total = avg_stats(&runtime_branches_stats); + } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && + runtime_branches_stats[cpu].n != 0) { + total = avg_stats(&runtime_branches_stats[cpu]); if (total) ratio = avg * 100 / total; fprintf(stderr, " # %10.3f %% ", ratio); - } else if (runtime_nsecs_stats.n != 0) { - total = avg_stats(&runtime_nsecs_stats); + } else if (runtime_nsecs_stats[cpu].n != 0) { + total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) ratio = 1000.0 * avg / total; @@ -426,30 +445,38 @@ static void abs_printout(int counter, double avg) /* * Print out the results of a single counter: + * aggregated counts in system-wide mode */ -static void print_counter(int counter) +static void print_counter_aggr(struct perf_evsel *counter) { - double avg = avg_stats(&event_res_stats[counter][0]); - int scaled = event_scaled[counter]; + struct perf_stat *ps = counter->priv; + double avg = avg_stats(&ps->res_stats[0]); + int scaled = counter->counts->scaled; if (scaled == -1) { - fprintf(stderr, " %18s %-24s\n", - "<not counted>", event_name(counter)); + fprintf(stderr, "%*s%s%-24s\n", + csv_output ? 0 : 18, + "<not counted>", csv_sep, event_name(counter)); return; } if (nsec_counter(counter)) - nsec_printout(counter, avg); + nsec_printout(-1, counter, avg); else - abs_printout(counter, avg); + abs_printout(-1, counter, avg); + + if (csv_output) { + fputc('\n', stderr); + return; + } print_noise(counter, avg); if (scaled) { double avg_enabled, avg_running; - avg_enabled = avg_stats(&event_res_stats[counter][1]); - avg_running = avg_stats(&event_res_stats[counter][2]); + avg_enabled = avg_stats(&ps->res_stats[1]); + avg_running = avg_stats(&ps->res_stats[2]); fprintf(stderr, " (scaled from %.2f%%)", 100 * avg_running / avg_enabled); @@ -458,40 +485,92 @@ static void print_counter(int counter) fprintf(stderr, "\n"); } +/* + * Print out the results of a single counter: + * does not use aggregated count in system-wide + */ +static void print_counter(struct perf_evsel *counter) +{ + u64 ena, run, val; + int cpu; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + val = counter->counts->cpu[cpu].val; + ena = counter->counts->cpu[cpu].ena; + run = counter->counts->cpu[cpu].run; + if (run == 0 || ena == 0) { + fprintf(stderr, "CPU%*d%s%*s%s%-24s", + csv_output ? 0 : -4, + cpus->map[cpu], csv_sep, + csv_output ? 0 : 18, + "<not counted>", csv_sep, + event_name(counter)); + + fprintf(stderr, "\n"); + continue; + } + + if (nsec_counter(counter)) + nsec_printout(cpu, counter, val); + else + abs_printout(cpu, counter, val); + + if (!csv_output) { + print_noise(counter, 1.0); + + if (run != ena) { + fprintf(stderr, " (scaled from %.2f%%)", + 100.0 * run / ena); + } + } + fprintf(stderr, "\n"); + } +} + static void print_stat(int argc, const char **argv) { - int i, counter; + struct perf_evsel *counter; + int i; fflush(stdout); - fprintf(stderr, "\n"); - fprintf(stderr, " Performance counter stats for "); - if(target_pid == -1 && target_tid == -1) { - fprintf(stderr, "\'%s", argv[0]); - for (i = 1; i < argc; i++) - fprintf(stderr, " %s", argv[i]); - } else if (target_pid != -1) - fprintf(stderr, "process id \'%d", target_pid); - else - fprintf(stderr, "thread id \'%d", target_tid); - - fprintf(stderr, "\'"); - if (run_count > 1) - fprintf(stderr, " (%d runs)", run_count); - fprintf(stderr, ":\n\n"); + if (!csv_output) { + fprintf(stderr, "\n"); + fprintf(stderr, " Performance counter stats for "); + if(target_pid == -1 && target_tid == -1) { + fprintf(stderr, "\'%s", argv[0]); + for (i = 1; i < argc; i++) + fprintf(stderr, " %s", argv[i]); + } else if (target_pid != -1) + fprintf(stderr, "process id \'%d", target_pid); + else + fprintf(stderr, "thread id \'%d", target_tid); + + fprintf(stderr, "\'"); + if (run_count > 1) + fprintf(stderr, " (%d runs)", run_count); + fprintf(stderr, ":\n\n"); + } - for (counter = 0; counter < nr_counters; counter++) - print_counter(counter); + if (no_aggr) { + list_for_each_entry(counter, &evsel_list, node) + print_counter(counter); + } else { + list_for_each_entry(counter, &evsel_list, node) + print_counter_aggr(counter); + } - fprintf(stderr, "\n"); - fprintf(stderr, " %18.9f seconds time elapsed", - avg_stats(&walltime_nsecs_stats)/1e9); - if (run_count > 1) { - fprintf(stderr, " ( +- %7.3f%% )", + if (!csv_output) { + fprintf(stderr, "\n"); + fprintf(stderr, " %18.9f seconds time elapsed", + avg_stats(&walltime_nsecs_stats)/1e9); + if (run_count > 1) { + fprintf(stderr, " ( +- %7.3f%% )", 100*stddev_stats(&walltime_nsecs_stats) / avg_stats(&walltime_nsecs_stats)); + } + fprintf(stderr, "\n\n"); } - fprintf(stderr, "\n\n"); } static volatile int signr = -1; @@ -521,6 +600,13 @@ static const char * const stat_usage[] = { NULL }; +static int stat__set_big_num(const struct option *opt __used, + const char *s __used, int unset) +{ + big_num_opt = unset ? 0 : 1; + return 0; +} + static const struct option options[] = { OPT_CALLBACK('e', "event", NULL, "event", "event selector. use 'perf list' to list available events", @@ -541,64 +627,96 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), - OPT_BOOLEAN('B', "big-num", &big_num, - "print large numbers with thousands\' separators"), + OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, + "print large numbers with thousands\' separators", + stat__set_big_num), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to monitor in system-wide"), + OPT_BOOLEAN('A', "no-aggr", &no_aggr, + "disable CPU count aggregation"), + OPT_STRING('x', "field-separator", &csv_sep, "separator", + "print counts with custom separator"), OPT_END() }; int cmd_stat(int argc, const char **argv, const char *prefix __used) { - int status; - int i,j; + struct perf_evsel *pos; + int status = -ENOMEM; setlocale(LC_ALL, ""); argc = parse_options(argc, argv, options, stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); + + if (csv_sep) + csv_output = true; + else + csv_sep = DEFAULT_SEPARATOR; + + /* + * let the spreadsheet do the pretty-printing + */ + if (csv_output) { + /* User explicitely passed -B? */ + if (big_num_opt == 1) { + fprintf(stderr, "-B option not supported with -x\n"); + usage_with_options(stat_usage, options); + } else /* Nope, so disable big number formatting */ + big_num = false; + } else if (big_num_opt == 0) /* User passed --no-big-num */ + big_num = false; + if (!argc && target_pid == -1 && target_tid == -1) usage_with_options(stat_usage, options); if (run_count <= 0) usage_with_options(stat_usage, options); + /* no_aggr is for system-wide only */ + if (no_aggr && !system_wide) + usage_with_options(stat_usage, options); + /* Set attrs and nr_counters if no event is selected and !null_run */ if (!null_run && !nr_counters) { - memcpy(attrs, default_attrs, sizeof(default_attrs)); + size_t c; + nr_counters = ARRAY_SIZE(default_attrs); + + for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { + pos = perf_evsel__new(default_attrs[c].type, + default_attrs[c].config, + nr_counters); + if (pos == NULL) + goto out; + list_add(&pos->node, &evsel_list); + } } - if (system_wide) - nr_cpus = read_cpu_map(cpu_list); - else - nr_cpus = 1; + if (target_pid != -1) + target_tid = target_pid; - if (nr_cpus < 1) + threads = thread_map__new(target_pid, target_tid); + if (threads == NULL) { + pr_err("Problems finding threads of monitor\n"); usage_with_options(stat_usage, options); + } - if (target_pid != -1) { - target_tid = target_pid; - thread_num = find_all_tid(target_pid, &all_tids); - if (thread_num <= 0) { - fprintf(stderr, "Can't find all threads of pid %d\n", - target_pid); - usage_with_options(stat_usage, options); - } - } else { - all_tids=malloc(sizeof(pid_t)); - if (!all_tids) - return -ENOMEM; + if (system_wide) + cpus = cpu_map__new(cpu_list); + else + cpus = cpu_map__dummy_new(); - all_tids[0] = target_tid; - thread_num = 1; + if (cpus == NULL) { + perror("failed to parse CPUs map"); + usage_with_options(stat_usage, options); + return -1; } - for (i = 0; i < MAX_NR_CPUS; i++) { - for (j = 0; j < MAX_COUNTERS; j++) { - fd[i][j] = malloc(sizeof(int)*thread_num); - if (!fd[i][j]) - return -ENOMEM; - } + list_for_each_entry(pos, &evsel_list, node) { + if (perf_evsel__alloc_stat_priv(pos) < 0 || + perf_evsel__alloc_counts(pos, cpus->nr) < 0 || + perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + goto out_free_fd; } /* @@ -621,6 +739,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) if (status != -1) print_stat(argc, argv); - +out_free_fd: + list_for_each_entry(pos, &evsel_list, node) + perf_evsel__free_stat_priv(pos); +out: + thread_map__delete(threads); + threads = NULL; return status; } diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 035b9fa063a9..1c984342a579 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -119,10 +119,16 @@ static int test__vmlinux_matches_kallsyms(void) * end addresses too. */ for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) { - struct symbol *pair; + struct symbol *pair, *first_pair; + bool backwards = true; sym = rb_entry(nd, struct symbol, rb_node); - pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL); + + if (sym->start == sym->end) + continue; + + first_pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL); + pair = first_pair; if (pair && pair->start == sym->start) { next_pair: @@ -143,8 +149,10 @@ next_pair: pr_debug("%#Lx: diff end addr for %s v: %#Lx k: %#Lx\n", sym->start, sym->name, sym->end, pair->end); } else { - struct rb_node *nnd = rb_prev(&pair->rb_node); - + struct rb_node *nnd; +detour: + nnd = backwards ? rb_prev(&pair->rb_node) : + rb_next(&pair->rb_node); if (nnd) { struct symbol *next = rb_entry(nnd, struct symbol, rb_node); @@ -153,6 +161,13 @@ next_pair: goto next_pair; } } + + if (backwards) { + backwards = false; + pair = first_pair; + goto detour; + } + pr_debug("%#Lx: diff name v: %s k: %s\n", sym->start, sym->name, pair->name); } @@ -219,6 +234,89 @@ out: return err; } +#include "util/evsel.h" +#include <sys/types.h> + +static int trace_event__id(const char *event_name) +{ + char *filename; + int err = -1, fd; + + if (asprintf(&filename, + "/sys/kernel/debug/tracing/events/syscalls/%s/id", + event_name) < 0) + return -1; + + fd = open(filename, O_RDONLY); + if (fd >= 0) { + char id[16]; + if (read(fd, id, sizeof(id)) > 0) + err = atoi(id); + close(fd); + } + + free(filename); + return err; +} + +static int test__open_syscall_event(void) +{ + int err = -1, fd; + struct thread_map *threads; + struct perf_evsel *evsel; + unsigned int nr_open_calls = 111, i; + int id = trace_event__id("sys_enter_open"); + + if (id < 0) { + pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); + return -1; + } + + threads = thread_map__new(-1, getpid()); + if (threads == NULL) { + pr_debug("thread_map__new\n"); + return -1; + } + + evsel = perf_evsel__new(PERF_TYPE_TRACEPOINT, id, 0); + if (evsel == NULL) { + pr_debug("perf_evsel__new\n"); + goto out_thread_map_delete; + } + + if (perf_evsel__open_per_thread(evsel, threads) < 0) { + pr_debug("failed to open counter: %s, " + "tweak /proc/sys/kernel/perf_event_paranoid?\n", + strerror(errno)); + goto out_evsel_delete; + } + + for (i = 0; i < nr_open_calls; ++i) { + fd = open("/etc/passwd", O_RDONLY); + close(fd); + } + + if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) { + pr_debug("perf_evsel__open_read_on_cpu\n"); + goto out_close_fd; + } + + if (evsel->counts->cpu[0].val != nr_open_calls) { + pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %Ld\n", + nr_open_calls, evsel->counts->cpu[0].val); + goto out_close_fd; + } + + err = 0; +out_close_fd: + perf_evsel__close_fd(evsel, 1, threads->nr); +out_evsel_delete: + perf_evsel__delete(evsel); +out_thread_map_delete: + thread_map__delete(threads); + return err; +} + static struct test { const char *desc; int (*func)(void); @@ -228,6 +326,10 @@ static struct test { .func = test__vmlinux_matches_kallsyms, }, { + .desc = "detect open syscall event", + .func = test__open_syscall_event, + }, + { .func = NULL, }, }; diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 9bcc38f0b706..746cf03cb05d 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -32,6 +32,10 @@ #include "util/session.h" #include "util/svghelper.h" +#define SUPPORT_OLD_POWER_EVENTS 1 +#define PWR_EVENT_EXIT -1 + + static char const *input_name = "perf.data"; static char const *output_name = "output.svg"; @@ -272,19 +276,22 @@ static int cpus_cstate_state[MAX_CPUS]; static u64 cpus_pstate_start_times[MAX_CPUS]; static u64 cpus_pstate_state[MAX_CPUS]; -static int process_comm_event(event_t *event, struct perf_session *session __used) +static int process_comm_event(event_t *event, struct sample_data *sample __used, + struct perf_session *session __used) { pid_set_comm(event->comm.tid, event->comm.comm); return 0; } -static int process_fork_event(event_t *event, struct perf_session *session __used) +static int process_fork_event(event_t *event, struct sample_data *sample __used, + struct perf_session *session __used) { pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); return 0; } -static int process_exit_event(event_t *event, struct perf_session *session __used) +static int process_exit_event(event_t *event, struct sample_data *sample __used, + struct perf_session *session __used) { pid_exit(event->fork.pid, event->fork.time); return 0; @@ -298,12 +305,21 @@ struct trace_entry { int lock_depth; }; -struct power_entry { +#ifdef SUPPORT_OLD_POWER_EVENTS +static int use_old_power_events; +struct power_entry_old { struct trace_entry te; u64 type; u64 value; u64 cpu_id; }; +#endif + +struct power_processor_entry { + struct trace_entry te; + u32 state; + u32 cpu_id; +}; #define TASK_COMM_LEN 16 struct wakeup_entry { @@ -470,48 +486,65 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) } -static int process_sample_event(event_t *event, struct perf_session *session) +static int process_sample_event(event_t *event __used, + struct sample_data *sample, + struct perf_session *session) { - struct sample_data data; struct trace_entry *te; - memset(&data, 0, sizeof(data)); - - event__parse_sample(event, session->sample_type, &data); - if (session->sample_type & PERF_SAMPLE_TIME) { - if (!first_time || first_time > data.time) - first_time = data.time; - if (last_time < data.time) - last_time = data.time; + if (!first_time || first_time > sample->time) + first_time = sample->time; + if (last_time < sample->time) + last_time = sample->time; } - te = (void *)data.raw_data; - if (session->sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) { + te = (void *)sample->raw_data; + if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) { char *event_str; - struct power_entry *pe; - - pe = (void *)te; - +#ifdef SUPPORT_OLD_POWER_EVENTS + struct power_entry_old *peo; + peo = (void *)te; +#endif event_str = perf_header__find_event(te->type); if (!event_str) return 0; - if (strcmp(event_str, "power:power_start") == 0) - c_state_start(pe->cpu_id, data.time, pe->value); + if (strcmp(event_str, "power:cpu_idle") == 0) { + struct power_processor_entry *ppe = (void *)te; + if (ppe->state == (u32)PWR_EVENT_EXIT) + c_state_end(ppe->cpu_id, sample->time); + else + c_state_start(ppe->cpu_id, sample->time, + ppe->state); + } + else if (strcmp(event_str, "power:cpu_frequency") == 0) { + struct power_processor_entry *ppe = (void *)te; + p_state_change(ppe->cpu_id, sample->time, ppe->state); + } + + else if (strcmp(event_str, "sched:sched_wakeup") == 0) + sched_wakeup(sample->cpu, sample->time, sample->pid, te); - if (strcmp(event_str, "power:power_end") == 0) - c_state_end(pe->cpu_id, data.time); + else if (strcmp(event_str, "sched:sched_switch") == 0) + sched_switch(sample->cpu, sample->time, te); - if (strcmp(event_str, "power:power_frequency") == 0) - p_state_change(pe->cpu_id, data.time, pe->value); +#ifdef SUPPORT_OLD_POWER_EVENTS + if (use_old_power_events) { + if (strcmp(event_str, "power:power_start") == 0) + c_state_start(peo->cpu_id, sample->time, + peo->value); - if (strcmp(event_str, "sched:sched_wakeup") == 0) - sched_wakeup(data.cpu, data.time, data.pid, te); + else if (strcmp(event_str, "power:power_end") == 0) + c_state_end(sample->cpu, sample->time); - if (strcmp(event_str, "sched:sched_switch") == 0) - sched_switch(data.cpu, data.time, te); + else if (strcmp(event_str, + "power:power_frequency") == 0) + p_state_change(peo->cpu_id, sample->time, + peo->value); + } +#endif } return 0; } @@ -937,7 +970,8 @@ static struct perf_event_ops event_ops = { static int __cmd_timechart(void) { - struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false); + struct perf_session *session = perf_session__new(input_name, O_RDONLY, + 0, false, &event_ops); int ret = -EINVAL; if (session == NULL) @@ -968,7 +1002,8 @@ static const char * const timechart_usage[] = { NULL }; -static const char *record_args[] = { +#ifdef SUPPORT_OLD_POWER_EVENTS +static const char * const record_old_args[] = { "record", "-a", "-R", @@ -980,16 +1015,43 @@ static const char *record_args[] = { "-e", "sched:sched_wakeup", "-e", "sched:sched_switch", }; +#endif + +static const char * const record_new_args[] = { + "record", + "-a", + "-R", + "-f", + "-c", "1", + "-e", "power:cpu_frequency", + "-e", "power:cpu_idle", + "-e", "sched:sched_wakeup", + "-e", "sched:sched_switch", +}; static int __cmd_record(int argc, const char **argv) { unsigned int rec_argc, i, j; const char **rec_argv; + const char * const *record_args = record_new_args; + unsigned int record_elems = ARRAY_SIZE(record_new_args); + +#ifdef SUPPORT_OLD_POWER_EVENTS + if (!is_valid_tracepoint("power:cpu_idle") && + is_valid_tracepoint("power:power_start")) { + use_old_power_events = 1; + record_args = record_old_args; + record_elems = ARRAY_SIZE(record_old_args); + } +#endif - rec_argc = ARRAY_SIZE(record_args) + argc - 1; + rec_argc = record_elems + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); - for (i = 0; i < ARRAY_SIZE(record_args); i++) + if (rec_argv == NULL) + return -ENOMEM; + + for (i = 0; i < record_elems; i++) rec_argv[i] = strdup(record_args[i]); for (j = 1; j < (unsigned int)argc; j++, i++) @@ -1018,6 +1080,8 @@ static const struct option options[] = { OPT_CALLBACK('p', "process", NULL, "process", "process selector. Pass a pid or process name.", parse_process), + OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", + "Look for files with symbols relative to this directory"), OPT_END() }; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index dd625808c2a5..1e67ab9c7ebc 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -21,6 +21,7 @@ #include "perf.h" #include "util/color.h" +#include "util/evsel.h" #include "util/session.h" #include "util/symbol.h" #include "util/thread.h" @@ -29,6 +30,7 @@ #include "util/parse-options.h" #include "util/parse-events.h" #include "util/cpumap.h" +#include "util/xyarray.h" #include "util/debug.h" @@ -55,7 +57,7 @@ #include <linux/unistd.h> #include <linux/types.h> -static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; +#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) static bool system_wide = false; @@ -66,10 +68,9 @@ static int print_entries; static int target_pid = -1; static int target_tid = -1; -static pid_t *all_tids = NULL; -static int thread_num = 0; +static struct thread_map *threads; static bool inherit = false; -static int nr_cpus = 0; +static struct cpu_map *cpus; static int realtime_prio = 0; static bool group = false; static unsigned int page_size; @@ -100,6 +101,7 @@ struct sym_entry *sym_filter_entry = NULL; struct sym_entry *sym_filter_entry_sched = NULL; static int sym_pcnt_filter = 5; static int sym_counter = 0; +static struct perf_evsel *sym_evsel = NULL; static int display_weighted = -1; static const char *cpu_list; @@ -353,7 +355,7 @@ static void show_details(struct sym_entry *syme) return; symbol = sym_entry__symbol(syme); - printf("Showing %s for %s\n", event_name(sym_counter), symbol->name); + printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); pthread_mutex_lock(&syme->src->lock); @@ -460,7 +462,8 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) static void print_sym_table(void) { int printed = 0, j; - int counter, snap = !display_weighted ? sym_counter : 0; + struct perf_evsel *counter; + int snap = !display_weighted ? sym_counter : 0; float samples_per_sec = samples/delay_secs; float ksamples_per_sec = kernel_samples/delay_secs; float us_samples_per_sec = (us_samples)/delay_secs; @@ -532,7 +535,9 @@ static void print_sym_table(void) } if (nr_counters == 1 || !display_weighted) { - printf("%Ld", (u64)attrs[0].sample_period); + struct perf_evsel *first; + first = list_entry(evsel_list.next, struct perf_evsel, node); + printf("%Ld", first->attr.sample_period); if (freq) printf("Hz "); else @@ -540,9 +545,9 @@ static void print_sym_table(void) } if (!display_weighted) - printf("%s", event_name(sym_counter)); - else for (counter = 0; counter < nr_counters; counter++) { - if (counter) + printf("%s", event_name(sym_evsel)); + else list_for_each_entry(counter, &evsel_list, node) { + if (counter->idx) printf("/"); printf("%s", event_name(counter)); @@ -558,12 +563,12 @@ static void print_sym_table(void) printf(" (all"); if (cpu_list) - printf(", CPU%s: %s)\n", nr_cpus > 1 ? "s" : "", cpu_list); + printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list); else { if (target_tid != -1) printf(")\n"); else - printf(", %d CPU%s)\n", nr_cpus, nr_cpus > 1 ? "s" : ""); + printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : ""); } printf("%-*.*s\n", win_width, win_width, graph_dotted_line); @@ -739,7 +744,7 @@ static void print_mapped_keys(void) fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); if (nr_counters > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_evsel)); fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); @@ -826,19 +831,23 @@ static void handle_keypress(struct perf_session *session, int c) break; case 'E': if (nr_counters > 1) { - int i; - fprintf(stderr, "\nAvailable events:"); - for (i = 0; i < nr_counters; i++) - fprintf(stderr, "\n\t%d %s", i, event_name(i)); + + list_for_each_entry(sym_evsel, &evsel_list, node) + fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel)); prompt_integer(&sym_counter, "Enter details event counter"); if (sym_counter >= nr_counters) { - fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0)); + sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node); sym_counter = 0; + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel)); sleep(1); + break; } + list_for_each_entry(sym_evsel, &evsel_list, node) + if (sym_evsel->idx == sym_counter) + break; } else sym_counter = 0; break; case 'f': @@ -977,12 +986,13 @@ static int symbol_filter(struct map *map, struct symbol *sym) } static void event__process_sample(const event_t *self, - struct perf_session *session, int counter) + struct sample_data *sample, + struct perf_session *session, + struct perf_evsel *evsel) { u64 ip = self->ip.ip; struct sym_entry *syme; struct addr_location al; - struct sample_data data; struct machine *machine; u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; @@ -1025,7 +1035,7 @@ static void event__process_sample(const event_t *self, if (self->header.misc & PERF_RECORD_MISC_EXACT_IP) exact_samples++; - if (event__preprocess_sample(self, session, &al, &data, + if (event__preprocess_sample(self, session, &al, sample, symbol_filter) < 0 || al.filtered) return; @@ -1071,9 +1081,9 @@ static void event__process_sample(const event_t *self, syme = symbol__priv(al.sym); if (!syme->skip) { - syme->count[counter]++; + syme->count[evsel->idx]++; syme->origin = origin; - record_precise_ip(syme, counter, ip); + record_precise_ip(syme, evsel->idx, ip); pthread_mutex_lock(&active_symbols_lock); if (list_empty(&syme->node) || !syme->node.next) __list_insert_active_sym(syme); @@ -1082,12 +1092,24 @@ static void event__process_sample(const event_t *self, } struct mmap_data { - int counter; void *base; int mask; unsigned int prev; }; +static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel, + int ncpus, int nthreads) +{ + evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data)); + return evsel->priv != NULL ? 0 : -ENOMEM; +} + +static void perf_evsel__free_mmap(struct perf_evsel *evsel) +{ + xyarray__delete(evsel->priv); + evsel->priv = NULL; +} + static unsigned int mmap_read_head(struct mmap_data *md) { struct perf_event_mmap_page *pc = md->base; @@ -1100,11 +1122,15 @@ static unsigned int mmap_read_head(struct mmap_data *md) } static void perf_session__mmap_read_counter(struct perf_session *self, - struct mmap_data *md) + struct perf_evsel *evsel, + int cpu, int thread_idx) { + struct xyarray *mmap_array = evsel->priv; + struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx); unsigned int head = mmap_read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; + struct sample_data sample; int diff; /* @@ -1152,10 +1178,11 @@ static void perf_session__mmap_read_counter(struct perf_session *self, event = &event_copy; } + event__parse_sample(event, self, &sample); if (event->header.type == PERF_RECORD_SAMPLE) - event__process_sample(event, self, md->counter); + event__process_sample(event, &sample, self, evsel); else - event__process(event, self); + event__process(event, &sample, self); old += size; } @@ -1163,36 +1190,39 @@ static void perf_session__mmap_read_counter(struct perf_session *self, } static struct pollfd *event_array; -static struct mmap_data *mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static void perf_session__mmap_read(struct perf_session *self) { - int i, counter, thread_index; + struct perf_evsel *counter; + int i, thread_index; - for (i = 0; i < nr_cpus; i++) { - for (counter = 0; counter < nr_counters; counter++) + for (i = 0; i < cpus->nr; i++) { + list_for_each_entry(counter, &evsel_list, node) { for (thread_index = 0; - thread_index < thread_num; + thread_index < threads->nr; thread_index++) { perf_session__mmap_read_counter(self, - &mmap_array[i][counter][thread_index]); + counter, i, thread_index); } + } } } int nr_poll; int group_fd; -static void start_counter(int i, int counter) +static void start_counter(int i, struct perf_evsel *evsel) { + struct xyarray *mmap_array = evsel->priv; + struct mmap_data *mm; struct perf_event_attr *attr; int cpu = -1; int thread_index; if (target_tid == -1) - cpu = cpumap[i]; + cpu = cpus->map[i]; - attr = attrs + counter; + attr = &evsel->attr; attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; @@ -1205,16 +1235,18 @@ static void start_counter(int i, int counter) attr->inherit = (cpu < 0) && inherit; attr->mmap = 1; - for (thread_index = 0; thread_index < thread_num; thread_index++) { + for (thread_index = 0; thread_index < threads->nr; thread_index++) { try_again: - fd[i][counter][thread_index] = sys_perf_event_open(attr, - all_tids[thread_index], cpu, group_fd, 0); + FD(evsel, i, thread_index) = sys_perf_event_open(attr, + threads->map[thread_index], cpu, group_fd, 0); - if (fd[i][counter][thread_index] < 0) { + if (FD(evsel, i, thread_index) < 0) { int err = errno; if (err == EPERM || err == EACCES) - die("No permission - are you root?\n"); + die("Permission error - are you root?\n" + "\t Consider tweaking" + " /proc/sys/kernel/perf_event_paranoid.\n"); /* * If it's cycles then fall back to hrtimer * based cpu-clock-tick sw counter, which @@ -1231,30 +1263,30 @@ try_again: goto try_again; } printf("\n"); - error("perfcounter syscall returned with %d (%s)\n", - fd[i][counter][thread_index], strerror(err)); + error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", + FD(evsel, i, thread_index), strerror(err)); die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } - assert(fd[i][counter][thread_index] >= 0); - fcntl(fd[i][counter][thread_index], F_SETFL, O_NONBLOCK); + assert(FD(evsel, i, thread_index) >= 0); + fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK); /* * First counter acts as the group leader: */ if (group && group_fd == -1) - group_fd = fd[i][counter][thread_index]; + group_fd = FD(evsel, i, thread_index); - event_array[nr_poll].fd = fd[i][counter][thread_index]; + event_array[nr_poll].fd = FD(evsel, i, thread_index); event_array[nr_poll].events = POLLIN; nr_poll++; - mmap_array[i][counter][thread_index].counter = counter; - mmap_array[i][counter][thread_index].prev = 0; - mmap_array[i][counter][thread_index].mask = mmap_pages*page_size - 1; - mmap_array[i][counter][thread_index].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ, MAP_SHARED, fd[i][counter][thread_index], 0); - if (mmap_array[i][counter][thread_index].base == MAP_FAILED) + mm = xyarray__entry(mmap_array, i, thread_index); + mm->prev = 0; + mm->mask = mmap_pages*page_size - 1; + mm->base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0); + if (mm->base == MAP_FAILED) die("failed to mmap with %d (%s)\n", errno, strerror(errno)); } } @@ -1262,13 +1294,13 @@ try_again: static int __cmd_top(void) { pthread_t thread; - int i, counter; - int ret; + struct perf_evsel *counter; + int i, ret; /* * FIXME: perf_session__new should allow passing a O_MMAP, so that all this * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. */ - struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false); + struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false, NULL); if (session == NULL) return -ENOMEM; @@ -1277,9 +1309,9 @@ static int __cmd_top(void) else event__synthesize_threads(event__process, session); - for (i = 0; i < nr_cpus; i++) { + for (i = 0; i < cpus->nr; i++) { group_fd = -1; - for (counter = 0; counter < nr_counters; counter++) + list_for_each_entry(counter, &evsel_list, node) start_counter(i, counter); } @@ -1368,8 +1400,8 @@ static const struct option options[] = { int cmd_top(int argc, const char **argv, const char *prefix __used) { - int counter; - int i,j; + struct perf_evsel *pos; + int status = -ENOMEM; page_size = sysconf(_SC_PAGE_SIZE); @@ -1377,34 +1409,17 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) if (argc) usage_with_options(top_usage, options); - if (target_pid != -1) { + if (target_pid != -1) target_tid = target_pid; - thread_num = find_all_tid(target_pid, &all_tids); - if (thread_num <= 0) { - fprintf(stderr, "Can't find all threads of pid %d\n", - target_pid); - usage_with_options(top_usage, options); - } - } else { - all_tids=malloc(sizeof(pid_t)); - if (!all_tids) - return -ENOMEM; - all_tids[0] = target_tid; - thread_num = 1; + threads = thread_map__new(target_pid, target_tid); + if (threads == NULL) { + pr_err("Problems finding threads of monitor\n"); + usage_with_options(top_usage, options); } - for (i = 0; i < MAX_NR_CPUS; i++) { - for (j = 0; j < MAX_COUNTERS; j++) { - fd[i][j] = malloc(sizeof(int)*thread_num); - mmap_array[i][j] = zalloc( - sizeof(struct mmap_data)*thread_num); - if (!fd[i][j] || !mmap_array[i][j]) - return -ENOMEM; - } - } - event_array = malloc( - sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num); + event_array = malloc((sizeof(struct pollfd) * + MAX_NR_CPUS * MAX_COUNTERS * threads->nr)); if (!event_array) return -ENOMEM; @@ -1415,15 +1430,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) cpu_list = NULL; } - if (!nr_counters) - nr_counters = 1; - - symbol_conf.priv_size = (sizeof(struct sym_entry) + - (nr_counters + 1) * sizeof(unsigned long)); - - symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); - if (symbol__init() < 0) - return -1; + if (!nr_counters && perf_evsel_list__create_default() < 0) { + pr_err("Not enough memory for event selector list\n"); + return -ENOMEM; + } if (delay_secs < 1) delay_secs = 1; @@ -1440,23 +1450,33 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) exit(EXIT_FAILURE); } - /* - * Fill in the ones not specifically initialized via -c: - */ - for (counter = 0; counter < nr_counters; counter++) { - if (attrs[counter].sample_period) + if (target_tid != -1) + cpus = cpu_map__dummy_new(); + else + cpus = cpu_map__new(cpu_list); + + if (cpus == NULL) + usage_with_options(top_usage, options); + + list_for_each_entry(pos, &evsel_list, node) { + if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 || + perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0) + goto out_free_fd; + /* + * Fill in the ones not specifically initialized via -c: + */ + if (pos->attr.sample_period) continue; - attrs[counter].sample_period = default_interval; + pos->attr.sample_period = default_interval; } - if (target_tid != -1) - nr_cpus = 1; - else - nr_cpus = read_cpu_map(cpu_list); + symbol_conf.priv_size = (sizeof(struct sym_entry) + + (nr_counters + 1) * sizeof(unsigned long)); - if (nr_cpus < 1) - usage_with_options(top_usage, options); + symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); + if (symbol__init() < 0) + return -1; get_term_dimensions(&winsize); if (print_entries == 0) { @@ -1464,5 +1484,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) signal(SIGWINCH, sig_winch_handler); } - return __cmd_top(); + status = __cmd_top(); +out_free_fd: + list_for_each_entry(pos, &evsel_list, node) + perf_evsel__free_mmap(pos); + + return status; } diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index 921245b28583..c7798c7f24ed 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -27,7 +27,7 @@ extern int cmd_report(int argc, const char **argv, const char *prefix); extern int cmd_stat(int argc, const char **argv, const char *prefix); extern int cmd_timechart(int argc, const char **argv, const char *prefix); extern int cmd_top(int argc, const char **argv, const char *prefix); -extern int cmd_trace(int argc, const char **argv, const char *prefix); +extern int cmd_script(int argc, const char **argv, const char *prefix); extern int cmd_version(int argc, const char **argv, const char *prefix); extern int cmd_probe(int argc, const char **argv, const char *prefix); extern int cmd_kmem(int argc, const char **argv, const char *prefix); diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 949d77fc0b97..16b5088cf8f4 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -16,7 +16,7 @@ perf-report mainporcelain common perf-stat mainporcelain common perf-timechart mainporcelain common perf-top mainporcelain common -perf-trace mainporcelain common +perf-script mainporcelain common perf-probe mainporcelain common perf-kmem mainporcelain common perf-lock mainporcelain common diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak index b253db634f04..b041ca67a2cb 100644 --- a/tools/perf/feature-tests.mak +++ b/tools/perf/feature-tests.mak @@ -9,8 +9,8 @@ endef ifndef NO_DWARF define SOURCE_DWARF #include <dwarf.h> -#include <libdw.h> -#include <version.h> +#include <elfutils/libdw.h> +#include <elfutils/version.h> #ifndef _ELFUTILS_PREREQ #error #endif diff --git a/tools/perf/perf.c b/tools/perf/perf.c index cdd6c03f1e14..5b1ecd66bb36 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -286,6 +286,8 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) status = p->fn(argc, argv, prefix); exit_browser(status); + perf_evsel_list__delete(); + if (status) return status & 0xff; @@ -323,7 +325,7 @@ static void handle_internal_command(int argc, const char **argv) { "top", cmd_top, 0 }, { "annotate", cmd_annotate, 0 }, { "version", cmd_version, 0 }, - { "trace", cmd_trace, 0 }, + { "script", cmd_script, 0 }, { "sched", cmd_sched, 0 }, { "probe", cmd_probe, 0 }, { "kmem", cmd_kmem, 0 }, diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c index 01a64ad693f2..790ceba6ad3f 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c +++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c @@ -8,7 +8,7 @@ #line 1 "Context.xs" /* - * Context.xs. XS interfaces for perf trace. + * Context.xs. XS interfaces for perf script. * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> * diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs index 549cf0467d30..c1e2ed1ed34e 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs +++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs @@ -1,5 +1,5 @@ /* - * Context.xs. XS interfaces for perf trace. + * Context.xs. XS interfaces for perf script. * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> * @@ -23,7 +23,7 @@ #include "perl.h" #include "XSUB.h" #include "../../../perf.h" -#include "../../../util/trace-event.h" +#include "../../../util/script-event.h" MODULE = Perf::Trace::Context PACKAGE = Perf::Trace::Context PROTOTYPES: ENABLE diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/README b/tools/perf/scripts/perl/Perf-Trace-Util/README index 9a9707630791..2f0c7f3043ee 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/README +++ b/tools/perf/scripts/perl/Perf-Trace-Util/README @@ -1,7 +1,7 @@ Perf-Trace-Util version 0.01 ============================ -This module contains utility functions for use with perf trace. +This module contains utility functions for use with perf script. Core.pm and Util.pm are pure Perl modules; Core.pm contains routines that the core perf support for Perl calls on and should always be @@ -33,7 +33,7 @@ After you do that: INSTALLATION -Building perf with perf trace Perl scripting should install this +Building perf with perf script Perl scripting should install this module in the right place. You should make sure libperl and ExtUtils/Embed.pm are installed first diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm index 6c7f3659cb17..4e2f6039ac92 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm +++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm @@ -34,7 +34,7 @@ Perf::Trace::Context - Perl extension for accessing functions in perf. =head1 SEE ALSO -Perf (trace) documentation +Perf (script) documentation =head1 AUTHOR diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm index 9df376a9f629..9158458d3eeb 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm +++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm @@ -163,7 +163,7 @@ sub dump_symbolic_fields __END__ =head1 NAME -Perf::Trace::Core - Perl extension for perf trace +Perf::Trace::Core - Perl extension for perf script =head1 SYNOPSIS @@ -171,7 +171,7 @@ Perf::Trace::Core - Perl extension for perf trace =head1 SEE ALSO -Perf (trace) documentation +Perf (script) documentation =head1 AUTHOR diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm index d94b40c8ac85..053500114625 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm +++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm @@ -65,7 +65,7 @@ sub clear_term __END__ =head1 NAME -Perf::Trace::Util - Perl extension for perf trace +Perf::Trace::Util - Perl extension for perf script =head1 SYNOPSIS @@ -73,7 +73,7 @@ Perf::Trace::Util - Perl extension for perf trace =head1 SEE ALSO -Perf (trace) documentation +Perf (script) documentation =head1 AUTHOR diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-report b/tools/perf/scripts/perl/bin/failed-syscalls-report index 4028d92dc4ae..9f83cc1ad8ba 100644 --- a/tools/perf/scripts/perl/bin/failed-syscalls-report +++ b/tools/perf/scripts/perl/bin/failed-syscalls-report @@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then shift fi fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm diff --git a/tools/perf/scripts/perl/bin/rw-by-file-report b/tools/perf/scripts/perl/bin/rw-by-file-report index ba25f4d41fb0..77200b3f3100 100644 --- a/tools/perf/scripts/perl/bin/rw-by-file-report +++ b/tools/perf/scripts/perl/bin/rw-by-file-report @@ -7,7 +7,4 @@ if [ $# -lt 1 ] ; then fi comm=$1 shift -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm - - - +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-report b/tools/perf/scripts/perl/bin/rw-by-pid-report index 641a3f5d085c..a27b9f311f95 100644 --- a/tools/perf/scripts/perl/bin/rw-by-pid-report +++ b/tools/perf/scripts/perl/bin/rw-by-pid-report @@ -1,6 +1,3 @@ #!/bin/bash # description: system-wide r/w activity -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl - - - +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl diff --git a/tools/perf/scripts/perl/bin/rwtop-report b/tools/perf/scripts/perl/bin/rwtop-report index 4918dba77021..83e11ec2e190 100644 --- a/tools/perf/scripts/perl/bin/rwtop-report +++ b/tools/perf/scripts/perl/bin/rwtop-report @@ -17,7 +17,4 @@ if [ "$n_args" -gt 0 ] ; then interval=$1 shift fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval - - - +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-report b/tools/perf/scripts/perl/bin/wakeup-latency-report index 49052ebcb632..889e8130cca5 100644 --- a/tools/perf/scripts/perl/bin/wakeup-latency-report +++ b/tools/perf/scripts/perl/bin/wakeup-latency-report @@ -1,6 +1,3 @@ #!/bin/bash # description: system-wide min/max/avg wakeup latency -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl - - - +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report index df0c65f4ca93..6d91411d248c 100644 --- a/tools/perf/scripts/perl/bin/workqueue-stats-report +++ b/tools/perf/scripts/perl/bin/workqueue-stats-report @@ -1,7 +1,3 @@ #!/bin/bash # description: workqueue stats (ins/exe/create/destroy) -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl - - - - +perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl index 4e7dc0a407a5..4e7076c20616 100644 --- a/tools/perf/scripts/perl/check-perf-trace.pl +++ b/tools/perf/scripts/perl/check-perf-trace.pl @@ -1,4 +1,4 @@ -# perf trace event handlers, generated by perf trace -g perl +# perf script event handlers, generated by perf script -g perl # (c) 2009, Tom Zanussi <tzanussi@gmail.com> # Licensed under the terms of the GNU GPL License version 2 diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl index 2a39097687b9..74844ee2be3e 100644 --- a/tools/perf/scripts/perl/rw-by-file.pl +++ b/tools/perf/scripts/perl/rw-by-file.pl @@ -18,7 +18,7 @@ use lib "./Perf-Trace-Util/lib"; use Perf::Trace::Core; use Perf::Trace::Util; -my $usage = "perf trace -s rw-by-file.pl <comm>\n"; +my $usage = "perf script -s rw-by-file.pl <comm>\n"; my $for_comm = shift or die $usage; diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl index b84b12699b70..a8eaff5119e0 100644 --- a/tools/perf/scripts/perl/workqueue-stats.pl +++ b/tools/perf/scripts/perl/workqueue-stats.pl @@ -10,7 +10,7 @@ # workqueue:workqueue_destruction -e workqueue:workqueue_execution # -e workqueue:workqueue_insertion # -# perf trace -p -s tools/perf/scripts/perl/workqueue-stats.pl +# perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl use 5.010000; use strict; diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c index 957085dd5d8d..315067b8f552 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c +++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c @@ -1,5 +1,5 @@ /* - * Context.c. Python interfaces for perf trace. + * Context.c. Python interfaces for perf script. * * Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com> * diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py index aad7525bca1d..de7211e4fa47 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py @@ -1,4 +1,4 @@ -# Core.py - Python extension for perf trace, core functions +# Core.py - Python extension for perf script, core functions # # Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com> # diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py index ae9a56e43e05..fdd92f699055 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py @@ -1,4 +1,4 @@ -# SchedGui.py - Python extension for perf trace, basic GUI code for +# SchedGui.py - Python extension for perf script, basic GUI code for # traces drawing and overview. # # Copyright (C) 2010 by Frederic Weisbecker <fweisbec@gmail.com> diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py index 13cc02b5893a..15c8400240fd 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py @@ -1,4 +1,4 @@ -# Util.py - Python extension for perf trace, miscellaneous utility code +# Util.py - Python extension for perf script, miscellaneous utility code # # Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com> # diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report index 03587021463d..fda5096d0cbf 100644 --- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report +++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report @@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then shift fi fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm diff --git a/tools/perf/scripts/python/bin/futex-contention-report b/tools/perf/scripts/python/bin/futex-contention-report index c8268138fb7e..6c44271091ab 100644 --- a/tools/perf/scripts/python/bin/futex-contention-report +++ b/tools/perf/scripts/python/bin/futex-contention-report @@ -1,4 +1,4 @@ #!/bin/bash # description: futext contention measurement -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report index 4ad361b31249..8f759291da86 100644 --- a/tools/perf/scripts/python/bin/netdev-times-report +++ b/tools/perf/scripts/python/bin/netdev-times-report @@ -2,4 +2,4 @@ # description: display a process of packet and processing time # args: [tx] [rx] [dev=] [debug] -perf trace -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@ +perf script -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@ diff --git a/tools/perf/scripts/python/bin/sched-migration-report b/tools/perf/scripts/python/bin/sched-migration-report index df1791f07c24..68b037a1849b 100644 --- a/tools/perf/scripts/python/bin/sched-migration-report +++ b/tools/perf/scripts/python/bin/sched-migration-report @@ -1,3 +1,3 @@ #!/bin/bash # description: sched migration overview -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py diff --git a/tools/perf/scripts/python/bin/sctop-report b/tools/perf/scripts/python/bin/sctop-report index 36b409c05e50..c32db294124d 100644 --- a/tools/perf/scripts/python/bin/sctop-report +++ b/tools/perf/scripts/python/bin/sctop-report @@ -21,4 +21,4 @@ elif [ "$n_args" -gt 0 ] ; then interval=$1 shift fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report index 4eb88c9fc83c..16eb8d65c543 100644 --- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report +++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report @@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then shift fi fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm diff --git a/tools/perf/scripts/python/bin/syscall-counts-report b/tools/perf/scripts/python/bin/syscall-counts-report index cb2f9c5cf17e..0f0e9d453bb4 100644 --- a/tools/perf/scripts/python/bin/syscall-counts-report +++ b/tools/perf/scripts/python/bin/syscall-counts-report @@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then shift fi fi -perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm +perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm diff --git a/tools/perf/scripts/python/check-perf-trace.py b/tools/perf/scripts/python/check-perf-trace.py index d9f7893e315c..4647a7694cf6 100644 --- a/tools/perf/scripts/python/check-perf-trace.py +++ b/tools/perf/scripts/python/check-perf-trace.py @@ -1,4 +1,4 @@ -# perf trace event handlers, generated by perf trace -g python +# perf script event handlers, generated by perf script -g python # (c) 2010, Tom Zanussi <tzanussi@gmail.com> # Licensed under the terms of the GNU GPL License version 2 # diff --git a/tools/perf/scripts/python/failed-syscalls-by-pid.py b/tools/perf/scripts/python/failed-syscalls-by-pid.py index acd7848717b3..85805fac4116 100644 --- a/tools/perf/scripts/python/failed-syscalls-by-pid.py +++ b/tools/perf/scripts/python/failed-syscalls-by-pid.py @@ -15,7 +15,7 @@ from perf_trace_context import * from Core import * from Util import * -usage = "perf trace -s syscall-counts-by-pid.py [comm|pid]\n"; +usage = "perf script -s syscall-counts-by-pid.py [comm|pid]\n"; for_comm = None for_pid = None diff --git a/tools/perf/scripts/python/sched-migration.py b/tools/perf/scripts/python/sched-migration.py index b934383c3364..74d55ec08aed 100644 --- a/tools/perf/scripts/python/sched-migration.py +++ b/tools/perf/scripts/python/sched-migration.py @@ -4,7 +4,7 @@ # # Copyright (C) 2010 Frederic Weisbecker <fweisbec@gmail.com> # -# perf trace event handlers have been generated by perf trace -g python +# perf script event handlers have been generated by perf script -g python # # This software is distributed under the terms of the GNU General # Public License ("GPL") version 2 as published by the Free Software diff --git a/tools/perf/scripts/python/sctop.py b/tools/perf/scripts/python/sctop.py index 7a6ec2c7d8ab..42c267e292fa 100644 --- a/tools/perf/scripts/python/sctop.py +++ b/tools/perf/scripts/python/sctop.py @@ -17,7 +17,7 @@ from perf_trace_context import * from Core import * from Util import * -usage = "perf trace -s sctop.py [comm] [interval]\n"; +usage = "perf script -s sctop.py [comm] [interval]\n"; for_comm = None default_interval = 3 diff --git a/tools/perf/scripts/python/syscall-counts-by-pid.py b/tools/perf/scripts/python/syscall-counts-by-pid.py index d1ee3ec10cf2..c64d1c55d745 100644 --- a/tools/perf/scripts/python/syscall-counts-by-pid.py +++ b/tools/perf/scripts/python/syscall-counts-by-pid.py @@ -14,7 +14,7 @@ from perf_trace_context import * from Core import * from Util import syscall_name -usage = "perf trace -s syscall-counts-by-pid.py [comm]\n"; +usage = "perf script -s syscall-counts-by-pid.py [comm]\n"; for_comm = None for_pid = None diff --git a/tools/perf/scripts/python/syscall-counts.py b/tools/perf/scripts/python/syscall-counts.py index ea183dc82d29..b435d3f188e8 100644 --- a/tools/perf/scripts/python/syscall-counts.py +++ b/tools/perf/scripts/python/syscall-counts.py @@ -15,7 +15,7 @@ from perf_trace_context import * from Core import * from Util import syscall_name -usage = "perf trace -s syscall-counts.py [comm]\n"; +usage = "perf script -s syscall-counts.py [comm]\n"; for_comm = None diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index e437edb72417..deffb8c96071 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -14,7 +14,9 @@ #include <linux/kernel.h> #include "debug.h" -static int build_id__mark_dso_hit(event_t *event, struct perf_session *session) +static int build_id__mark_dso_hit(event_t *event, + struct sample_data *sample __used, + struct perf_session *session) { struct addr_location al; u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; @@ -35,7 +37,8 @@ static int build_id__mark_dso_hit(event_t *event, struct perf_session *session) return 0; } -static int event__exit_del_thread(event_t *self, struct perf_session *session) +static int event__exit_del_thread(event_t *self, struct sample_data *sample __used, + struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->fork.tid); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 0f9b8d7a7d7e..3ccaa1043383 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -4,32 +4,53 @@ #include <assert.h> #include <stdio.h> -int cpumap[MAX_NR_CPUS]; - -static int default_cpu_map(void) +static struct cpu_map *cpu_map__default_new(void) { - int nr_cpus, i; + struct cpu_map *cpus; + int nr_cpus; nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - assert(nr_cpus <= MAX_NR_CPUS); - assert((int)nr_cpus >= 0); + if (nr_cpus < 0) + return NULL; + + cpus = malloc(sizeof(*cpus) + nr_cpus * sizeof(int)); + if (cpus != NULL) { + int i; + for (i = 0; i < nr_cpus; ++i) + cpus->map[i] = i; - for (i = 0; i < nr_cpus; ++i) - cpumap[i] = i; + cpus->nr = nr_cpus; + } - return nr_cpus; + return cpus; } -static int read_all_cpu_map(void) +static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus) { + size_t payload_size = nr_cpus * sizeof(int); + struct cpu_map *cpus = malloc(sizeof(*cpus) + payload_size); + + if (cpus != NULL) { + cpus->nr = nr_cpus; + memcpy(cpus->map, tmp_cpus, payload_size); + } + + return cpus; +} + +static struct cpu_map *cpu_map__read_all_cpu_map(void) +{ + struct cpu_map *cpus = NULL; FILE *onlnf; int nr_cpus = 0; + int *tmp_cpus = NULL, *tmp; + int max_entries = 0; int n, cpu, prev; char sep; onlnf = fopen("/sys/devices/system/cpu/online", "r"); if (!onlnf) - return default_cpu_map(); + return cpu_map__default_new(); sep = 0; prev = -1; @@ -38,12 +59,28 @@ static int read_all_cpu_map(void) if (n <= 0) break; if (prev >= 0) { - assert(nr_cpus + cpu - prev - 1 < MAX_NR_CPUS); + int new_max = nr_cpus + cpu - prev - 1; + + if (new_max >= max_entries) { + max_entries = new_max + MAX_NR_CPUS / 2; + tmp = realloc(tmp_cpus, max_entries * sizeof(int)); + if (tmp == NULL) + goto out_free_tmp; + tmp_cpus = tmp; + } + while (++prev < cpu) - cpumap[nr_cpus++] = prev; + tmp_cpus[nr_cpus++] = prev; + } + if (nr_cpus == max_entries) { + max_entries += MAX_NR_CPUS; + tmp = realloc(tmp_cpus, max_entries * sizeof(int)); + if (tmp == NULL) + goto out_free_tmp; + tmp_cpus = tmp; } - assert (nr_cpus < MAX_NR_CPUS); - cpumap[nr_cpus++] = cpu; + + tmp_cpus[nr_cpus++] = cpu; if (n == 2 && sep == '-') prev = cpu; else @@ -51,24 +88,31 @@ static int read_all_cpu_map(void) if (n == 1 || sep == '\n') break; } - fclose(onlnf); - if (nr_cpus > 0) - return nr_cpus; - return default_cpu_map(); + if (nr_cpus > 0) + cpus = cpu_map__trim_new(nr_cpus, tmp_cpus); + else + cpus = cpu_map__default_new(); +out_free_tmp: + free(tmp_cpus); + fclose(onlnf); + return cpus; } -int read_cpu_map(const char *cpu_list) +struct cpu_map *cpu_map__new(const char *cpu_list) { + struct cpu_map *cpus = NULL; unsigned long start_cpu, end_cpu = 0; char *p = NULL; int i, nr_cpus = 0; + int *tmp_cpus = NULL, *tmp; + int max_entries = 0; if (!cpu_list) - return read_all_cpu_map(); + return cpu_map__read_all_cpu_map(); if (!isdigit(*cpu_list)) - goto invalid; + goto out; while (isdigit(*cpu_list)) { p = NULL; @@ -94,21 +138,42 @@ int read_cpu_map(const char *cpu_list) for (; start_cpu <= end_cpu; start_cpu++) { /* check for duplicates */ for (i = 0; i < nr_cpus; i++) - if (cpumap[i] == (int)start_cpu) + if (tmp_cpus[i] == (int)start_cpu) goto invalid; - assert(nr_cpus < MAX_NR_CPUS); - cpumap[nr_cpus++] = (int)start_cpu; + if (nr_cpus == max_entries) { + max_entries += MAX_NR_CPUS; + tmp = realloc(tmp_cpus, max_entries * sizeof(int)); + if (tmp == NULL) + goto invalid; + tmp_cpus = tmp; + } + tmp_cpus[nr_cpus++] = (int)start_cpu; } if (*p) ++p; cpu_list = p; } - if (nr_cpus > 0) - return nr_cpus; - return default_cpu_map(); + if (nr_cpus > 0) + cpus = cpu_map__trim_new(nr_cpus, tmp_cpus); + else + cpus = cpu_map__default_new(); invalid: - return -1; + free(tmp_cpus); +out: + return cpus; +} + +struct cpu_map *cpu_map__dummy_new(void) +{ + struct cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int)); + + if (cpus != NULL) { + cpus->nr = 1; + cpus->map[0] = -1; + } + + return cpus; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 3e60f56e490e..f7a4f42f6307 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -1,7 +1,13 @@ #ifndef __PERF_CPUMAP_H #define __PERF_CPUMAP_H -extern int read_cpu_map(const char *cpu_list); -extern int cpumap[]; +struct cpu_map { + int nr; + int map[]; +}; + +struct cpu_map *cpu_map__new(const char *cpu_list); +struct cpu_map *cpu_map__dummy_new(void); +void *cpu_map__delete(struct cpu_map *map); #endif /* __PERF_CPUMAP_H */ diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index c8d81b00089d..01bbe8ecec3f 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -46,20 +46,16 @@ int dump_printf(const char *fmt, ...) return ret; } -static int dump_printf_color(const char *fmt, const char *color, ...) +#ifdef NO_NEWT_SUPPORT +void ui__warning(const char *format, ...) { va_list args; - int ret = 0; - if (dump_trace) { - va_start(args, color); - ret = color_vfprintf(stdout, color, fmt, args); - va_end(args); - } - - return ret; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); } - +#endif void trace_event(event_t *event) { @@ -70,29 +66,29 @@ void trace_event(event_t *event) if (!dump_trace) return; - dump_printf("."); - dump_printf_color("\n. ... raw event: size %d bytes\n", color, - event->header.size); + printf("."); + color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n", + event->header.size); for (i = 0; i < event->header.size; i++) { if ((i & 15) == 0) { - dump_printf("."); - dump_printf_color(" %04x: ", color, i); + printf("."); + color_fprintf(stdout, color, " %04x: ", i); } - dump_printf_color(" %02x", color, raw_event[i]); + color_fprintf(stdout, color, " %02x", raw_event[i]); if (((i & 15) == 15) || i == event->header.size-1) { - dump_printf_color(" ", color); + color_fprintf(stdout, color, " "); for (j = 0; j < 15-(i & 15); j++) - dump_printf_color(" ", color); + color_fprintf(stdout, color, " "); for (j = i & ~15; j <= i; j++) { - dump_printf_color("%c", color, - isprint(raw_event[j]) ? - raw_event[j] : '.'); + color_fprintf(stdout, color, "%c", + isprint(raw_event[j]) ? + raw_event[j] : '.'); } - dump_printf_color("\n", color); + color_fprintf(stdout, color, "\n"); } } - dump_printf(".\n"); + printf(".\n"); } diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 7b514082bbaf..ca35fd66b5df 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -35,4 +35,6 @@ int ui_helpline__show_help(const char *format, va_list ap); #include "ui/progress.h" #endif +void ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2))); + #endif /* __PERF_DEBUG_H */ diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index dab9e754a281..2302ec051bb4 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -7,7 +7,7 @@ #include "strlist.h" #include "thread.h" -const char *event__name[] = { +static const char *event__name[] = { [0] = "TOTAL", [PERF_RECORD_MMAP] = "MMAP", [PERF_RECORD_LOST] = "LOST", @@ -22,13 +22,31 @@ const char *event__name[] = { [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE", [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA", [PERF_RECORD_HEADER_BUILD_ID] = "BUILD_ID", + [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND", }; -static pid_t event__synthesize_comm(pid_t pid, int full, +const char *event__get_event_name(unsigned int id) +{ + if (id >= ARRAY_SIZE(event__name)) + return "INVALID"; + if (!event__name[id]) + return "UNKNOWN"; + return event__name[id]; +} + +static struct sample_data synth_sample = { + .pid = -1, + .tid = -1, + .time = -1, + .stream_id = -1, + .cpu = -1, + .period = 1, +}; + +static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full, event__handler_t process, struct perf_session *session) { - event_t ev; char filename[PATH_MAX]; char bf[BUFSIZ]; FILE *fp; @@ -49,34 +67,39 @@ out_race: return 0; } - memset(&ev.comm, 0, sizeof(ev.comm)); - while (!ev.comm.comm[0] || !ev.comm.pid) { - if (fgets(bf, sizeof(bf), fp) == NULL) - goto out_failure; + memset(&event->comm, 0, sizeof(event->comm)); + + while (!event->comm.comm[0] || !event->comm.pid) { + if (fgets(bf, sizeof(bf), fp) == NULL) { + pr_warning("couldn't get COMM and pgid, malformed %s\n", filename); + goto out; + } if (memcmp(bf, "Name:", 5) == 0) { char *name = bf + 5; while (*name && isspace(*name)) ++name; size = strlen(name) - 1; - memcpy(ev.comm.comm, name, size++); + memcpy(event->comm.comm, name, size++); } else if (memcmp(bf, "Tgid:", 5) == 0) { char *tgids = bf + 5; while (*tgids && isspace(*tgids)) ++tgids; - tgid = ev.comm.pid = atoi(tgids); + tgid = event->comm.pid = atoi(tgids); } } - ev.comm.header.type = PERF_RECORD_COMM; + event->comm.header.type = PERF_RECORD_COMM; size = ALIGN(size, sizeof(u64)); - ev.comm.header.size = sizeof(ev.comm) - (sizeof(ev.comm.comm) - size); - + memset(event->comm.comm + size, 0, session->id_hdr_size); + event->comm.header.size = (sizeof(event->comm) - + (sizeof(event->comm.comm) - size) + + session->id_hdr_size); if (!full) { - ev.comm.tid = pid; + event->comm.tid = pid; - process(&ev, session); - goto out_fclose; + process(event, &synth_sample, session); + goto out; } snprintf(filename, sizeof(filename), "/proc/%d/task", pid); @@ -91,22 +114,19 @@ out_race: if (*end) continue; - ev.comm.tid = pid; + event->comm.tid = pid; - process(&ev, session); + process(event, &synth_sample, session); } - closedir(tasks); -out_fclose: + closedir(tasks); +out: fclose(fp); - return tgid; -out_failure: - pr_warning("couldn't get COMM and pgid, malformed %s\n", filename); - return -1; + return tgid; } -static int event__synthesize_mmap_events(pid_t pid, pid_t tgid, +static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid, event__handler_t process, struct perf_session *session) { @@ -124,29 +144,25 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid, return -1; } + event->header.type = PERF_RECORD_MMAP; + /* + * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c + */ + event->header.misc = PERF_RECORD_MISC_USER; + while (1) { char bf[BUFSIZ], *pbf = bf; - event_t ev = { - .header = { - .type = PERF_RECORD_MMAP, - /* - * Just like the kernel, see __perf_event_mmap - * in kernel/perf_event.c - */ - .misc = PERF_RECORD_MISC_USER, - }, - }; int n; size_t size; if (fgets(bf, sizeof(bf), fp) == NULL) break; /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = hex2u64(pbf, &ev.mmap.start); + n = hex2u64(pbf, &event->mmap.start); if (n < 0) continue; pbf += n + 1; - n = hex2u64(pbf, &ev.mmap.len); + n = hex2u64(pbf, &event->mmap.len); if (n < 0) continue; pbf += n + 3; @@ -161,19 +177,21 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid, continue; pbf += 3; - n = hex2u64(pbf, &ev.mmap.pgoff); + n = hex2u64(pbf, &event->mmap.pgoff); size = strlen(execname); execname[size - 1] = '\0'; /* Remove \n */ - memcpy(ev.mmap.filename, execname, size); + memcpy(event->mmap.filename, execname, size); size = ALIGN(size, sizeof(u64)); - ev.mmap.len -= ev.mmap.start; - ev.mmap.header.size = (sizeof(ev.mmap) - - (sizeof(ev.mmap.filename) - size)); - ev.mmap.pid = tgid; - ev.mmap.tid = pid; - - process(&ev, session); + event->mmap.len -= event->mmap.start; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, session->id_hdr_size); + event->mmap.header.size += session->id_hdr_size; + event->mmap.pid = tgid; + event->mmap.tid = pid; + + process(event, &synth_sample, session); } } @@ -187,20 +205,27 @@ int event__synthesize_modules(event__handler_t process, { struct rb_node *nd; struct map_groups *kmaps = &machine->kmaps; - u16 misc; + event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); + + if (event == NULL) { + pr_debug("Not enough memory synthesizing mmap event " + "for kernel modules\n"); + return -1; + } + + event->header.type = PERF_RECORD_MMAP; /* * kernel uses 0 for user space maps, see kernel/perf_event.c * __perf_event_mmap */ if (machine__is_host(machine)) - misc = PERF_RECORD_MISC_KERNEL; + event->header.misc = PERF_RECORD_MISC_KERNEL; else - misc = PERF_RECORD_MISC_GUEST_KERNEL; + event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) { - event_t ev; size_t size; struct map *pos = rb_entry(nd, struct map, rb_node); @@ -208,39 +233,78 @@ int event__synthesize_modules(event__handler_t process, continue; size = ALIGN(pos->dso->long_name_len + 1, sizeof(u64)); - memset(&ev, 0, sizeof(ev)); - ev.mmap.header.misc = misc; - ev.mmap.header.type = PERF_RECORD_MMAP; - ev.mmap.header.size = (sizeof(ev.mmap) - - (sizeof(ev.mmap.filename) - size)); - ev.mmap.start = pos->start; - ev.mmap.len = pos->end - pos->start; - ev.mmap.pid = machine->pid; - - memcpy(ev.mmap.filename, pos->dso->long_name, + event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, session->id_hdr_size); + event->mmap.header.size += session->id_hdr_size; + event->mmap.start = pos->start; + event->mmap.len = pos->end - pos->start; + event->mmap.pid = machine->pid; + + memcpy(event->mmap.filename, pos->dso->long_name, pos->dso->long_name_len + 1); - process(&ev, session); + process(event, &synth_sample, session); } + free(event); return 0; } -int event__synthesize_thread(pid_t pid, event__handler_t process, - struct perf_session *session) +static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event, + pid_t pid, event__handler_t process, + struct perf_session *session) { - pid_t tgid = event__synthesize_comm(pid, 1, process, session); + pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process, + session); if (tgid == -1) return -1; - return event__synthesize_mmap_events(pid, tgid, process, session); + return event__synthesize_mmap_events(mmap_event, pid, tgid, + process, session); +} + +int event__synthesize_thread(pid_t pid, event__handler_t process, + struct perf_session *session) +{ + event_t *comm_event, *mmap_event; + int err = -1; + + comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); + if (comm_event == NULL) + goto out; + + mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); + if (mmap_event == NULL) + goto out_free_comm; + + err = __event__synthesize_thread(comm_event, mmap_event, pid, + process, session); + free(mmap_event); +out_free_comm: + free(comm_event); +out: + return err; } -void event__synthesize_threads(event__handler_t process, - struct perf_session *session) +int event__synthesize_threads(event__handler_t process, + struct perf_session *session) { DIR *proc; struct dirent dirent, *next; + event_t *comm_event, *mmap_event; + int err = -1; + + comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); + if (comm_event == NULL) + goto out; + + mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); + if (mmap_event == NULL) + goto out_free_comm; proc = opendir("/proc"); + if (proc == NULL) + goto out_free_mmap; while (!readdir_r(proc, &dirent, &next) && next) { char *end; @@ -249,10 +313,18 @@ void event__synthesize_threads(event__handler_t process, if (*end) /* only interested in proper numerical dirents */ continue; - event__synthesize_thread(pid, process, session); + __event__synthesize_thread(comm_event, mmap_event, pid, + process, session); } closedir(proc); + err = 0; +out_free_mmap: + free(mmap_event); +out_free_comm: + free(comm_event); +out: + return err; } struct process_symbol_args { @@ -260,7 +332,8 @@ struct process_symbol_args { u64 start; }; -static int find_symbol_cb(void *arg, const char *name, char type, u64 start) +static int find_symbol_cb(void *arg, const char *name, char type, + u64 start, u64 end __used) { struct process_symbol_args *args = arg; @@ -286,18 +359,20 @@ int event__synthesize_kernel_mmap(event__handler_t process, char path[PATH_MAX]; char name_buff[PATH_MAX]; struct map *map; - - event_t ev = { - .header = { - .type = PERF_RECORD_MMAP, - }, - }; + int err; /* * We should get this from /sys/kernel/sections/.text, but till that is * available use this, and after it is use this as a fallback for older * kernels. */ struct process_symbol_args args = { .name = symbol_name, }; + event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size); + + if (event == NULL) { + pr_debug("Not enough memory synthesizing mmap event " + "for kernel modules\n"); + return -1; + } mmap_name = machine__mmap_name(machine, name_buff, sizeof(name_buff)); if (machine__is_host(machine)) { @@ -305,10 +380,10 @@ int event__synthesize_kernel_mmap(event__handler_t process, * kernel uses PERF_RECORD_MISC_USER for user space maps, * see kernel/perf_event.c __perf_event_mmap */ - ev.header.misc = PERF_RECORD_MISC_KERNEL; + event->header.misc = PERF_RECORD_MISC_KERNEL; filename = "/proc/kallsyms"; } else { - ev.header.misc = PERF_RECORD_MISC_GUEST_KERNEL; + event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; if (machine__is_default_guest(machine)) filename = (char *) symbol_conf.default_guest_kallsyms; else { @@ -321,17 +396,21 @@ int event__synthesize_kernel_mmap(event__handler_t process, return -ENOENT; map = machine->vmlinux_maps[MAP__FUNCTION]; - size = snprintf(ev.mmap.filename, sizeof(ev.mmap.filename), + size = snprintf(event->mmap.filename, sizeof(event->mmap.filename), "%s%s", mmap_name, symbol_name) + 1; size = ALIGN(size, sizeof(u64)); - ev.mmap.header.size = (sizeof(ev.mmap) - - (sizeof(ev.mmap.filename) - size)); - ev.mmap.pgoff = args.start; - ev.mmap.start = map->start; - ev.mmap.len = map->end - ev.mmap.start; - ev.mmap.pid = machine->pid; - - return process(&ev, session); + event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size) + session->id_hdr_size); + event->mmap.pgoff = args.start; + event->mmap.start = map->start; + event->mmap.len = map->end - event->mmap.start; + event->mmap.pid = machine->pid; + + err = process(event, &synth_sample, session); + free(event); + + return err; } static void thread__comm_adjust(struct thread *self, struct hists *hists) @@ -361,7 +440,8 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm, return 0; } -int event__process_comm(event_t *self, struct perf_session *session) +int event__process_comm(event_t *self, struct sample_data *sample __used, + struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->comm.tid); @@ -376,7 +456,8 @@ int event__process_comm(event_t *self, struct perf_session *session) return 0; } -int event__process_lost(event_t *self, struct perf_session *session) +int event__process_lost(event_t *self, struct sample_data *sample __used, + struct perf_session *session) { dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost); session->hists.stats.total_lost += self->lost.lost; @@ -392,7 +473,7 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self) * a zero sized synthesized MMAP event for the kernel. */ if (maps[MAP__FUNCTION]->end == 0) - maps[MAP__FUNCTION]->end = ~0UL; + maps[MAP__FUNCTION]->end = ~0ULL; } static int event__process_kernel_mmap(event_t *self, @@ -485,7 +566,8 @@ out_problem: return -1; } -int event__process_mmap(event_t *self, struct perf_session *session) +int event__process_mmap(event_t *self, struct sample_data *sample __used, + struct perf_session *session) { struct machine *machine; struct thread *thread; @@ -526,7 +608,8 @@ out_problem: return 0; } -int event__process_task(event_t *self, struct perf_session *session) +int event__process_task(event_t *self, struct sample_data *sample __used, + struct perf_session *session) { struct thread *thread = perf_session__findnew(session, self->fork.tid); struct thread *parent = perf_session__findnew(session, self->fork.ptid); @@ -548,18 +631,19 @@ int event__process_task(event_t *self, struct perf_session *session) return 0; } -int event__process(event_t *event, struct perf_session *session) +int event__process(event_t *event, struct sample_data *sample, + struct perf_session *session) { switch (event->header.type) { case PERF_RECORD_COMM: - event__process_comm(event, session); + event__process_comm(event, sample, session); break; case PERF_RECORD_MMAP: - event__process_mmap(event, session); + event__process_mmap(event, sample, session); break; case PERF_RECORD_FORK: case PERF_RECORD_EXIT: - event__process_task(event, session); + event__process_task(event, sample, session); break; default: break; @@ -674,32 +758,8 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session, symbol_filter_t filter) { u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread; - - event__parse_sample(self, session->sample_type, data); - - dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld cpu:%d\n", - self->header.misc, data->pid, data->tid, data->ip, - data->period, data->cpu); - - if (session->sample_type & PERF_SAMPLE_CALLCHAIN) { - unsigned int i; - - dump_printf("... chain: nr:%Lu\n", data->callchain->nr); + struct thread *thread = perf_session__findnew(session, self->ip.pid); - if (!ip_callchain__valid(data->callchain, self)) { - pr_debug("call-chain problem with event, " - "skipping it.\n"); - goto out_filtered; - } - - if (dump_trace) { - for (i = 0; i < data->callchain->nr; i++) - dump_printf("..... %2d: %016Lx\n", - i, data->callchain->ips[i]); - } - } - thread = perf_session__findnew(session, self->ip.pid); if (thread == NULL) return -1; @@ -766,9 +826,65 @@ out_filtered: return 0; } -int event__parse_sample(const event_t *event, u64 type, struct sample_data *data) +static int event__parse_id_sample(const event_t *event, + struct perf_session *session, + struct sample_data *sample) { - const u64 *array = event->sample.array; + const u64 *array; + u64 type; + + sample->cpu = sample->pid = sample->tid = -1; + sample->stream_id = sample->id = sample->time = -1ULL; + + if (!session->sample_id_all) + return 0; + + array = event->sample.array; + array += ((event->header.size - + sizeof(event->header)) / sizeof(u64)) - 1; + type = session->sample_type; + + if (type & PERF_SAMPLE_CPU) { + u32 *p = (u32 *)array; + sample->cpu = *p; + array--; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + sample->stream_id = *array; + array--; + } + + if (type & PERF_SAMPLE_ID) { + sample->id = *array; + array--; + } + + if (type & PERF_SAMPLE_TIME) { + sample->time = *array; + array--; + } + + if (type & PERF_SAMPLE_TID) { + u32 *p = (u32 *)array; + sample->pid = p[0]; + sample->tid = p[1]; + } + + return 0; +} + +int event__parse_sample(const event_t *event, struct perf_session *session, + struct sample_data *data) +{ + const u64 *array; + u64 type; + + if (event->header.type != PERF_RECORD_SAMPLE) + return event__parse_id_sample(event, session, data); + + array = event->sample.array; + type = session->sample_type; if (type & PERF_SAMPLE_IP) { data->ip = event->ip.ip; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 8e790dae7026..2b7e91902f10 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -85,6 +85,7 @@ struct build_id_event { }; enum perf_user_event_type { /* above any possible kernel type */ + PERF_RECORD_USER_TYPE_START = 64, PERF_RECORD_HEADER_ATTR = 64, PERF_RECORD_HEADER_EVENT_TYPE = 65, PERF_RECORD_HEADER_TRACING_DATA = 66, @@ -135,12 +136,15 @@ void event__print_totals(void); struct perf_session; -typedef int (*event__handler_t)(event_t *event, struct perf_session *session); +typedef int (*event__handler_synth_t)(event_t *event, + struct perf_session *session); +typedef int (*event__handler_t)(event_t *event, struct sample_data *sample, + struct perf_session *session); int event__synthesize_thread(pid_t pid, event__handler_t process, struct perf_session *session); -void event__synthesize_threads(event__handler_t process, - struct perf_session *session); +int event__synthesize_threads(event__handler_t process, + struct perf_session *session); int event__synthesize_kernel_mmap(event__handler_t process, struct perf_session *session, struct machine *machine, @@ -150,18 +154,24 @@ int event__synthesize_modules(event__handler_t process, struct perf_session *session, struct machine *machine); -int event__process_comm(event_t *self, struct perf_session *session); -int event__process_lost(event_t *self, struct perf_session *session); -int event__process_mmap(event_t *self, struct perf_session *session); -int event__process_task(event_t *self, struct perf_session *session); -int event__process(event_t *event, struct perf_session *session); +int event__process_comm(event_t *self, struct sample_data *sample, + struct perf_session *session); +int event__process_lost(event_t *self, struct sample_data *sample, + struct perf_session *session); +int event__process_mmap(event_t *self, struct sample_data *sample, + struct perf_session *session); +int event__process_task(event_t *self, struct sample_data *sample, + struct perf_session *session); +int event__process(event_t *event, struct sample_data *sample, + struct perf_session *session); struct addr_location; int event__preprocess_sample(const event_t *self, struct perf_session *session, struct addr_location *al, struct sample_data *data, symbol_filter_t filter); -int event__parse_sample(const event_t *event, u64 type, struct sample_data *data); +int event__parse_sample(const event_t *event, struct perf_session *session, + struct sample_data *sample); -extern const char *event__name[]; +const char *event__get_event_name(unsigned int id); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c new file mode 100644 index 000000000000..c95267e63c5b --- /dev/null +++ b/tools/perf/util/evsel.c @@ -0,0 +1,186 @@ +#include "evsel.h" +#include "../perf.h" +#include "util.h" +#include "cpumap.h" +#include "thread.h" + +#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) + +struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx) +{ + struct perf_evsel *evsel = zalloc(sizeof(*evsel)); + + if (evsel != NULL) { + evsel->idx = idx; + evsel->attr.type = type; + evsel->attr.config = config; + INIT_LIST_HEAD(&evsel->node); + } + + return evsel; +} + +int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int)); + return evsel->fd != NULL ? 0 : -ENOMEM; +} + +int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus) +{ + evsel->counts = zalloc((sizeof(*evsel->counts) + + (ncpus * sizeof(struct perf_counts_values)))); + return evsel->counts != NULL ? 0 : -ENOMEM; +} + +void perf_evsel__free_fd(struct perf_evsel *evsel) +{ + xyarray__delete(evsel->fd); + evsel->fd = NULL; +} + +void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + int cpu, thread; + + for (cpu = 0; cpu < ncpus; cpu++) + for (thread = 0; thread < nthreads; ++thread) { + close(FD(evsel, cpu, thread)); + FD(evsel, cpu, thread) = -1; + } +} + +void perf_evsel__delete(struct perf_evsel *evsel) +{ + assert(list_empty(&evsel->node)); + xyarray__delete(evsel->fd); + free(evsel); +} + +int __perf_evsel__read_on_cpu(struct perf_evsel *evsel, + int cpu, int thread, bool scale) +{ + struct perf_counts_values count; + size_t nv = scale ? 3 : 1; + + if (FD(evsel, cpu, thread) < 0) + return -EINVAL; + + if (evsel->counts == NULL && perf_evsel__alloc_counts(evsel, cpu + 1) < 0) + return -ENOMEM; + + if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) < 0) + return -errno; + + if (scale) { + if (count.run == 0) + count.val = 0; + else if (count.run < count.ena) + count.val = (u64)((double)count.val * count.ena / count.run + 0.5); + } else + count.ena = count.run = 0; + + evsel->counts->cpu[cpu] = count; + return 0; +} + +int __perf_evsel__read(struct perf_evsel *evsel, + int ncpus, int nthreads, bool scale) +{ + size_t nv = scale ? 3 : 1; + int cpu, thread; + struct perf_counts_values *aggr = &evsel->counts->aggr, count; + + aggr->val = 0; + + for (cpu = 0; cpu < ncpus; cpu++) { + for (thread = 0; thread < nthreads; thread++) { + if (FD(evsel, cpu, thread) < 0) + continue; + + if (readn(FD(evsel, cpu, thread), + &count, nv * sizeof(u64)) < 0) + return -errno; + + aggr->val += count.val; + if (scale) { + aggr->ena += count.ena; + aggr->run += count.run; + } + } + } + + evsel->counts->scaled = 0; + if (scale) { + if (aggr->run == 0) { + evsel->counts->scaled = -1; + aggr->val = 0; + return 0; + } + + if (aggr->run < aggr->ena) { + evsel->counts->scaled = 1; + aggr->val = (u64)((double)aggr->val * aggr->ena / aggr->run + 0.5); + } + } else + aggr->ena = aggr->run = 0; + + return 0; +} + +int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus) +{ + int cpu; + + if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, 1) < 0) + return -1; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + FD(evsel, cpu, 0) = sys_perf_event_open(&evsel->attr, -1, + cpus->map[cpu], -1, 0); + if (FD(evsel, cpu, 0) < 0) + goto out_close; + } + + return 0; + +out_close: + while (--cpu >= 0) { + close(FD(evsel, cpu, 0)); + FD(evsel, cpu, 0) = -1; + } + return -1; +} + +int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads) +{ + int thread; + + if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, 1, threads->nr)) + return -1; + + for (thread = 0; thread < threads->nr; thread++) { + FD(evsel, 0, thread) = sys_perf_event_open(&evsel->attr, + threads->map[thread], -1, -1, 0); + if (FD(evsel, 0, thread) < 0) + goto out_close; + } + + return 0; + +out_close: + while (--thread >= 0) { + close(FD(evsel, 0, thread)); + FD(evsel, 0, thread) = -1; + } + return -1; +} + +int perf_evsel__open(struct perf_evsel *evsel, + struct cpu_map *cpus, struct thread_map *threads) +{ + if (threads == NULL) + return perf_evsel__open_per_cpu(evsel, cpus); + + return perf_evsel__open_per_thread(evsel, threads); +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h new file mode 100644 index 000000000000..a0ccd69c3fc2 --- /dev/null +++ b/tools/perf/util/evsel.h @@ -0,0 +1,115 @@ +#ifndef __PERF_EVSEL_H +#define __PERF_EVSEL_H 1 + +#include <linux/list.h> +#include <stdbool.h> +#include "../../../include/linux/perf_event.h" +#include "types.h" +#include "xyarray.h" + +struct perf_counts_values { + union { + struct { + u64 val; + u64 ena; + u64 run; + }; + u64 values[3]; + }; +}; + +struct perf_counts { + s8 scaled; + struct perf_counts_values aggr; + struct perf_counts_values cpu[]; +}; + +struct perf_evsel { + struct list_head node; + struct perf_event_attr attr; + char *filter; + struct xyarray *fd; + struct perf_counts *counts; + int idx; + void *priv; +}; + +struct cpu_map; +struct thread_map; + +struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx); +void perf_evsel__delete(struct perf_evsel *evsel); + +int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); +int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); +void perf_evsel__free_fd(struct perf_evsel *evsel); +void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); + +int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus); +int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads); +int perf_evsel__open(struct perf_evsel *evsel, + struct cpu_map *cpus, struct thread_map *threads); + +#define perf_evsel__match(evsel, t, c) \ + (evsel->attr.type == PERF_TYPE_##t && \ + evsel->attr.config == PERF_COUNT_##c) + +int __perf_evsel__read_on_cpu(struct perf_evsel *evsel, + int cpu, int thread, bool scale); + +/** + * perf_evsel__read_on_cpu - Read out the results on a CPU and thread + * + * @evsel - event selector to read value + * @cpu - CPU of interest + * @thread - thread of interest + */ +static inline int perf_evsel__read_on_cpu(struct perf_evsel *evsel, + int cpu, int thread) +{ + return __perf_evsel__read_on_cpu(evsel, cpu, thread, false); +} + +/** + * perf_evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled + * + * @evsel - event selector to read value + * @cpu - CPU of interest + * @thread - thread of interest + */ +static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel, + int cpu, int thread) +{ + return __perf_evsel__read_on_cpu(evsel, cpu, thread, true); +} + +int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads, + bool scale); + +/** + * perf_evsel__read - Read the aggregate results on all CPUs + * + * @evsel - event selector to read value + * @ncpus - Number of cpus affected, from zero + * @nthreads - Number of threads affected, from zero + */ +static inline int perf_evsel__read(struct perf_evsel *evsel, + int ncpus, int nthreads) +{ + return __perf_evsel__read(evsel, ncpus, nthreads, false); +} + +/** + * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled + * + * @evsel - event selector to read value + * @ncpus - Number of cpus affected, from zero + * @nthreads - Number of threads affected, from zero + */ +static inline int perf_evsel__read_scaled(struct perf_evsel *evsel, + int ncpus, int nthreads) +{ + return __perf_evsel__read(evsel, ncpus, nthreads, true); +} + +#endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 7cba0551a565..989fa2dee2fd 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -152,6 +152,11 @@ void perf_header__set_feat(struct perf_header *self, int feat) set_bit(feat, self->adds_features); } +void perf_header__clear_feat(struct perf_header *self, int feat) +{ + clear_bit(feat, self->adds_features); +} + bool perf_header__has_feat(const struct perf_header *self, int feat) { return test_bit(feat, self->adds_features); @@ -433,8 +438,10 @@ static int perf_header__adds_write(struct perf_header *self, int fd) int idx = 0, err; session = container_of(self, struct perf_session, header); - if (perf_session__read_build_ids(session, true)) - perf_header__set_feat(self, HEADER_BUILD_ID); + + if (perf_header__has_feat(self, HEADER_BUILD_ID && + !perf_session__read_build_ids(session, true))) + perf_header__clear_feat(self, HEADER_BUILD_ID); nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); if (!nr_sections) @@ -456,7 +463,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd) /* Write trace info */ trace_sec->offset = lseek(fd, 0, SEEK_CUR); - read_tracing_data(fd, attrs, nr_counters); + read_tracing_data(fd, &evsel_list); trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset; } @@ -599,7 +606,7 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit) static int perf_header__getbuffer64(struct perf_header *self, int fd, void *buf, size_t size) { - if (do_read(fd, buf, size) <= 0) + if (readn(fd, buf, size) <= 0) return -1; if (self->needs_swap) @@ -655,7 +662,7 @@ int perf_file_header__read(struct perf_file_header *self, { lseek(fd, 0, SEEK_SET); - if (do_read(fd, self, sizeof(*self)) <= 0 || + if (readn(fd, self, sizeof(*self)) <= 0 || memcmp(&self->magic, __perf_magic, sizeof(self->magic))) return -1; @@ -816,7 +823,7 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *self, struct perf_header *ph, int fd, bool repipe) { - if (do_read(fd, self, sizeof(*self)) <= 0 || + if (readn(fd, self, sizeof(*self)) <= 0 || memcmp(&self->magic, __perf_magic, sizeof(self->magic))) return -1; @@ -941,6 +948,24 @@ u64 perf_header__sample_type(struct perf_header *header) return type; } +bool perf_header__sample_id_all(const struct perf_header *header) +{ + bool value = false, first = true; + int i; + + for (i = 0; i < header->attrs; i++) { + struct perf_header_attr *attr = header->attr[i]; + + if (first) { + value = attr->attr.sample_id_all; + first = false; + } else if (value != attr->attr.sample_id_all) + die("non matching sample_id_all"); + } + + return value; +} + struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header) { @@ -987,21 +1012,23 @@ int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, ev = malloc(size); + if (ev == NULL) + return -ENOMEM; + ev->attr.attr = *attr; memcpy(ev->attr.id, id, ids * sizeof(u64)); ev->attr.header.type = PERF_RECORD_HEADER_ATTR; ev->attr.header.size = size; - err = process(ev, session); + err = process(ev, NULL, session); free(ev); return err; } -int event__synthesize_attrs(struct perf_header *self, - event__handler_t process, +int event__synthesize_attrs(struct perf_header *self, event__handler_t process, struct perf_session *session) { struct perf_header_attr *attr; @@ -1071,7 +1098,7 @@ int event__synthesize_event_type(u64 event_id, char *name, ev.event_type.header.size = sizeof(ev.event_type) - (sizeof(ev.event_type.event_type.name) - size); - err = process(&ev, session); + err = process(&ev, NULL, session); return err; } @@ -1106,8 +1133,7 @@ int event__process_event_type(event_t *self, return 0; } -int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs, - int nb_events, +int event__synthesize_tracing_data(int fd, struct list_head *pattrs, event__handler_t process, struct perf_session *session __unused) { @@ -1118,7 +1144,7 @@ int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs, memset(&ev, 0, sizeof(ev)); ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; - size = read_tracing_data_size(fd, pattrs, nb_events); + size = read_tracing_data_size(fd, pattrs); if (size <= 0) return size; aligned_size = ALIGN(size, sizeof(u64)); @@ -1126,9 +1152,9 @@ int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs, ev.tracing_data.header.size = sizeof(ev.tracing_data); ev.tracing_data.size = aligned_size; - process(&ev, session); + process(&ev, NULL, session); - err = read_tracing_data(fd, pattrs, nb_events); + err = read_tracing_data(fd, pattrs); write_padded(fd, NULL, 0, padding); return aligned_size; @@ -1186,7 +1212,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc, ev.build_id.header.size = sizeof(ev.build_id) + len; memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); - err = process(&ev, session); + err = process(&ev, NULL, session); return err; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 402ac2454cf8..33f16be7b72f 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -81,9 +81,11 @@ void perf_header_attr__delete(struct perf_header_attr *self); int perf_header_attr__add_id(struct perf_header_attr *self, u64 id); u64 perf_header__sample_type(struct perf_header *header); +bool perf_header__sample_id_all(const struct perf_header *header); struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header); void perf_header__set_feat(struct perf_header *self, int feat); +void perf_header__clear_feat(struct perf_header *self, int feat); bool perf_header__has_feat(const struct perf_header *self, int feat); int perf_header__process_sections(struct perf_header *self, int fd, @@ -111,8 +113,7 @@ int event__synthesize_event_types(event__handler_t process, int event__process_event_type(event_t *self, struct perf_session *session); -int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs, - int nb_events, +int event__synthesize_tracing_data(int fd, struct list_head *pattrs, event__handler_t process, struct perf_session *session); int event__process_tracing_data(event_t *self, diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 2022e8740994..c749ba6136a0 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -356,7 +356,7 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask, static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth, int depth_mask, int period, - u64 total_samples, int hits, + u64 total_samples, u64 hits, int left_margin) { int i; @@ -1092,6 +1092,12 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head, FILE *file; int err = 0; u64 len; + char symfs_filename[PATH_MAX]; + + if (filename) { + snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", + symbol_conf.symfs, filename); + } if (filename == NULL) { if (dso->has_build_id) { @@ -1100,9 +1106,9 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head, return -ENOMEM; } goto fallback; - } else if (readlink(filename, command, sizeof(command)) < 0 || + } else if (readlink(symfs_filename, command, sizeof(command)) < 0 || strstr(command, "[kernel.kallsyms]") || - access(filename, R_OK)) { + access(symfs_filename, R_OK)) { free(filename); fallback: /* @@ -1111,6 +1117,8 @@ fallback: * DSO is the same as when 'perf record' ran. */ filename = dso->long_name; + snprintf(symfs_filename, sizeof(symfs_filename), "%s%s", + symbol_conf.symfs, filename); free_filename = false; } @@ -1137,7 +1145,7 @@ fallback: "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS -C %s|grep -v %s|expand", map__rip_2objdump(map, sym->start), map__rip_2objdump(map, sym->end), - filename, filename); + symfs_filename, filename); pr_debug("Executing: %s\n", command); @@ -1168,10 +1176,13 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp) size_t ret = 0; for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { - if (!event__name[i]) + const char *name = event__get_event_name(i); + + if (!strcmp(name, "UNKNOWN")) continue; - ret += fprintf(fp, "%10s events: %10d\n", - event__name[i], self->stats.nr_events[i]); + + ret += fprintf(fp, "%16s events: %10d\n", name, + self->stats.nr_events[i]); } return ret; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 587d375d3430..ee789856a8c9 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -52,8 +52,10 @@ struct sym_priv { struct events_stats { u64 total_period; u64 total_lost; + u64 total_invalid_chains; u32 nr_events[PERF_RECORD_HEADER_MAX]; u32 nr_unknown_events; + u32 nr_invalid_chains; }; enum hist_column { diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h new file mode 100644 index 000000000000..acffd5e4d1d4 --- /dev/null +++ b/tools/perf/util/include/asm/cpufeature.h @@ -0,0 +1,9 @@ + +#ifndef PERF_CPUFEATURE_H +#define PERF_CPUFEATURE_H + +/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */ + +#define X86_FEATURE_REP_GOOD 0 + +#endif /* PERF_CPUFEATURE_H */ diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h new file mode 100644 index 000000000000..bb4198e7837a --- /dev/null +++ b/tools/perf/util/include/asm/dwarf2.h @@ -0,0 +1,11 @@ + +#ifndef PERF_DWARF2_H +#define PERF_DWARF2_H + +/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */ + +#define CFI_STARTPROC +#define CFI_ENDPROC + +#endif /* PERF_DWARF2_H */ + diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index bb4ac2e05385..8be0b968ca0b 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -13,6 +13,11 @@ static inline void set_bit(int nr, unsigned long *addr) addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); } +static inline void clear_bit(int nr, unsigned long *addr) +{ + addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); +} + static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h new file mode 100644 index 000000000000..06387cffe125 --- /dev/null +++ b/tools/perf/util/include/linux/linkage.h @@ -0,0 +1,13 @@ + +#ifndef PERF_LINUX_LINKAGE_H_ +#define PERF_LINUX_LINKAGE_H_ + +/* linkage.h ... for including arch/x86/lib/memcpy_64.S */ + +#define ENTRY(name) \ + .globl name; \ + name: + +#define ENDPROC(name) + +#endif /* PERF_LINUX_LINKAGE_H_ */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4af5bd59cfd1..649083f27e08 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1,6 +1,7 @@ #include "../../../include/linux/hw_breakpoint.h" #include "util.h" #include "../perf.h" +#include "evsel.h" #include "parse-options.h" #include "parse-events.h" #include "exec_cmd.h" @@ -12,8 +13,7 @@ int nr_counters; -struct perf_event_attr attrs[MAX_COUNTERS]; -char *filters[MAX_COUNTERS]; +LIST_HEAD(evsel_list); struct event_symbol { u8 type; @@ -266,10 +266,10 @@ static char *event_cache_name(u8 cache_type, u8 cache_op, u8 cache_result) return name; } -const char *event_name(int counter) +const char *event_name(struct perf_evsel *evsel) { - u64 config = attrs[counter].config; - int type = attrs[counter].type; + u64 config = evsel->attr.config; + int type = evsel->attr.type; return __event_name(type, config); } @@ -434,7 +434,7 @@ parse_single_tracepoint_event(char *sys_name, id = atoll(id_buf); attr->config = id; attr->type = PERF_TYPE_TRACEPOINT; - *strp = evt_name + evt_length; + *strp += strlen(sys_name) + evt_length + 1; /* + 1 for the ':' */ attr->sample_type |= PERF_SAMPLE_RAW; attr->sample_type |= PERF_SAMPLE_TIME; @@ -495,7 +495,7 @@ static enum event_result parse_tracepoint_event(const char **strp, struct perf_event_attr *attr) { const char *evt_name; - char *flags; + char *flags = NULL, *comma_loc; char sys_name[MAX_EVENT_LENGTH]; unsigned int sys_length, evt_length; @@ -514,6 +514,11 @@ static enum event_result parse_tracepoint_event(const char **strp, sys_name[sys_length] = '\0'; evt_name = evt_name + 1; + comma_loc = strchr(evt_name, ','); + if (comma_loc) { + /* take the event name up to the comma */ + evt_name = strndup(evt_name, comma_loc - evt_name); + } flags = strchr(evt_name, ':'); if (flags) { /* split it out: */ @@ -524,9 +529,8 @@ static enum event_result parse_tracepoint_event(const char **strp, evt_length = strlen(evt_name); if (evt_length >= MAX_EVENT_LENGTH) return EVT_FAILED; - if (strpbrk(evt_name, "*?")) { - *strp = evt_name + evt_length; + *strp += strlen(sys_name) + evt_length; return parse_multiple_tracepoint_event(sys_name, evt_name, flags); } else @@ -810,9 +814,6 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u return -1; for (;;) { - if (nr_counters == MAX_COUNTERS) - return -1; - memset(&attr, 0, sizeof(attr)); ret = parse_event_symbols(&str, &attr); if (ret == EVT_FAILED) @@ -822,8 +823,13 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u return -1; if (ret != EVT_HANDLED_ALL) { - attrs[nr_counters] = attr; - nr_counters++; + struct perf_evsel *evsel; + evsel = perf_evsel__new(attr.type, attr.config, + nr_counters); + if (evsel == NULL) + return -1; + list_add_tail(&evsel->node, &evsel_list); + ++nr_counters; } if (*str == 0) @@ -840,21 +846,22 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u int parse_filter(const struct option *opt __used, const char *str, int unset __used) { - int i = nr_counters - 1; - int len = strlen(str); + struct perf_evsel *last = NULL; - if (i < 0 || attrs[i].type != PERF_TYPE_TRACEPOINT) { + if (!list_empty(&evsel_list)) + last = list_entry(evsel_list.prev, struct perf_evsel, node); + + if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { fprintf(stderr, "-F option should follow a -e tracepoint option\n"); return -1; } - filters[i] = malloc(len + 1); - if (!filters[i]) { + last->filter = strdup(str); + if (last->filter == NULL) { fprintf(stderr, "not enough memory to hold filter string\n"); return -1; } - strcpy(filters[i], str); return 0; } @@ -906,6 +913,47 @@ static void print_tracepoint_events(void) } /* + * Check whether event is in <debugfs_mount_point>/tracing/events + */ + +int is_valid_tracepoint(const char *event_string) +{ + DIR *sys_dir, *evt_dir; + struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + char evt_path[MAXPATHLEN]; + char dir_path[MAXPATHLEN]; + + if (debugfs_valid_mountpoint(debugfs_path)) + return 0; + + sys_dir = opendir(debugfs_path); + if (!sys_dir) + return 0; + + for_each_subsystem(sys_dir, sys_dirent, sys_next) { + + snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, + sys_dirent.d_name); + evt_dir = opendir(dir_path); + if (!evt_dir) + continue; + + for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + snprintf(evt_path, MAXPATHLEN, "%s:%s", + sys_dirent.d_name, evt_dirent.d_name); + if (!strcmp(evt_path, event_string)) { + closedir(evt_dir); + closedir(sys_dir); + return 1; + } + } + closedir(evt_dir); + } + closedir(sys_dir); + return 0; +} + +/* * Print the help text for the event symbols: */ void print_events(void) @@ -963,3 +1011,26 @@ void print_events(void) exit(129); } + +int perf_evsel_list__create_default(void) +{ + struct perf_evsel *evsel = perf_evsel__new(PERF_TYPE_HARDWARE, + PERF_COUNT_HW_CPU_CYCLES, 0); + if (evsel == NULL) + return -ENOMEM; + + list_add(&evsel->node, &evsel_list); + ++nr_counters; + return 0; +} + +void perf_evsel_list__delete(void) +{ + struct perf_evsel *pos, *n; + + list_for_each_entry_safe(pos, n, &evsel_list, node) { + list_del_init(&pos->node); + perf_evsel__delete(pos); + } + nr_counters = 0; +} diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index fc4ab3fe877a..b82cafb83772 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -4,6 +4,16 @@ * Parse symbolic events/counts passed in as options: */ +#include "../../../include/linux/perf_event.h" + +struct list_head; +struct perf_evsel; + +extern struct list_head evsel_list; + +int perf_evsel_list__create_default(void); +void perf_evsel_list__delete(void); + struct option; struct tracepoint_path { @@ -13,14 +23,11 @@ struct tracepoint_path { }; extern struct tracepoint_path *tracepoint_id_to_path(u64 config); -extern bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events); +extern bool have_tracepoints(struct list_head *evsel_list); extern int nr_counters; -extern struct perf_event_attr attrs[MAX_COUNTERS]; -extern char *filters[MAX_COUNTERS]; - -extern const char *event_name(int ctr); +const char *event_name(struct perf_evsel *event); extern const char *__event_name(int type, u64 config); extern int parse_events(const struct option *opt, const char *str, int unset); @@ -29,9 +36,9 @@ extern int parse_filter(const struct option *opt, const char *str, int unset); #define EVENTS_HELP_MAX (128*1024) extern void print_events(void); +extern int is_valid_tracepoint(const char *event_string); extern char debugfs_path[]; extern int valid_debugfs_mount(const char *debugfs); - #endif /* __PERF_PARSE_EVENTS_H */ diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index c7d72dce54b2..abc31a1dac1a 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -119,6 +119,10 @@ struct option { { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT } +#define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\ + .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\ + .flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG} /* parse_options() will filter out the processed options and leave the * non-option argments in argv[]. diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 61191c6cbe7a..128aaab0aeda 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -95,7 +95,7 @@ static int init_vmlinux(void) goto out; if (machine__create_kernel_maps(&machine) < 0) { - pr_debug("machine__create_kernel_maps "); + pr_debug("machine__create_kernel_maps() failed.\n"); goto out; } out: @@ -149,7 +149,8 @@ static int open_vmlinux(const char *module) { const char *path = kernel_get_module_path(module); if (!path) { - pr_err("Failed to find path of %s module", module ?: "kernel"); + pr_err("Failed to find path of %s module.\n", + module ?: "kernel"); return -ENOENT; } pr_debug("Try to open %s\n", path); @@ -226,7 +227,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, pr_warning("Warning: No dwarf info found in the vmlinux - " "please rebuild kernel with CONFIG_DEBUG_INFO=y.\n"); if (!need_dwarf) { - pr_debug("Trying to use symbols.\nn"); + pr_debug("Trying to use symbols.\n"); return 0; } } @@ -295,42 +296,49 @@ static int get_real_path(const char *raw_path, const char *comp_dir, #define LINEBUF_SIZE 256 #define NR_ADDITIONAL_LINES 2 -static int show_one_line(FILE *fp, int l, bool skip, bool show_num) +static int __show_one_line(FILE *fp, int l, bool skip, bool show_num) { char buf[LINEBUF_SIZE]; - const char *color = PERF_COLOR_BLUE; + const char *color = show_num ? "" : PERF_COLOR_BLUE; + const char *prefix = NULL; - if (fgets(buf, LINEBUF_SIZE, fp) == NULL) - goto error; - if (!skip) { - if (show_num) - fprintf(stdout, "%7d %s", l, buf); - else - color_fprintf(stdout, color, " %s", buf); - } - - while (strlen(buf) == LINEBUF_SIZE - 1 && - buf[LINEBUF_SIZE - 2] != '\n') { + do { if (fgets(buf, LINEBUF_SIZE, fp) == NULL) goto error; - if (!skip) { - if (show_num) - fprintf(stdout, "%s", buf); - else - color_fprintf(stdout, color, "%s", buf); + if (skip) + continue; + if (!prefix) { + prefix = show_num ? "%7d " : " "; + color_fprintf(stdout, color, prefix, l); } - } + color_fprintf(stdout, color, "%s", buf); - return 0; + } while (strchr(buf, '\n') == NULL); + + return 1; error: - if (feof(fp)) - pr_warning("Source file is shorter than expected.\n"); - else + if (ferror(fp)) { pr_warning("File read error: %s\n", strerror(errno)); + return -1; + } + return 0; +} - return -1; +static int _show_one_line(FILE *fp, int l, bool skip, bool show_num) +{ + int rv = __show_one_line(fp, l, skip, show_num); + if (rv == 0) { + pr_warning("Source file is shorter than expected.\n"); + rv = -1; + } + return rv; } +#define show_one_line_with_num(f,l) _show_one_line(f,l,false,true) +#define show_one_line(f,l) _show_one_line(f,l,false,false) +#define skip_one_line(f,l) _show_one_line(f,l,true,false) +#define show_one_line_or_eof(f,l) __show_one_line(f,l,false,false) + /* * Show line-range always requires debuginfo to find source file and * line number. @@ -379,7 +387,7 @@ int show_line_range(struct line_range *lr, const char *module) fprintf(stdout, "<%s:%d>\n", lr->function, lr->start - lr->offset); else - fprintf(stdout, "<%s:%d>\n", lr->file, lr->start); + fprintf(stdout, "<%s:%d>\n", lr->path, lr->start); fp = fopen(lr->path, "r"); if (fp == NULL) { @@ -388,26 +396,30 @@ int show_line_range(struct line_range *lr, const char *module) return -errno; } /* Skip to starting line number */ - while (l < lr->start && ret >= 0) - ret = show_one_line(fp, l++, true, false); - if (ret < 0) - goto end; + while (l < lr->start) { + ret = skip_one_line(fp, l++); + if (ret < 0) + goto end; + } list_for_each_entry(ln, &lr->line_list, list) { - while (ln->line > l && ret >= 0) - ret = show_one_line(fp, (l++) - lr->offset, - false, false); - if (ret >= 0) - ret = show_one_line(fp, (l++) - lr->offset, - false, true); + for (; ln->line > l; l++) { + ret = show_one_line(fp, l - lr->offset); + if (ret < 0) + goto end; + } + ret = show_one_line_with_num(fp, l++ - lr->offset); if (ret < 0) goto end; } if (lr->end == INT_MAX) lr->end = l + NR_ADDITIONAL_LINES; - while (l <= lr->end && !feof(fp) && ret >= 0) - ret = show_one_line(fp, (l++) - lr->offset, false, false); + while (l <= lr->end) { + ret = show_one_line_or_eof(fp, l++ - lr->offset); + if (ret <= 0) + break; + } end: fclose(fp); return ret; @@ -466,7 +478,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs, fd = open_vmlinux(module); if (fd < 0) { - pr_warning("Failed to open debuginfo file.\n"); + pr_warning("Failed to open debug information file.\n"); return fd; } @@ -526,56 +538,87 @@ int show_available_vars(struct perf_probe_event *pevs __unused, } #endif +static int parse_line_num(char **ptr, int *val, const char *what) +{ + const char *start = *ptr; + + errno = 0; + *val = strtol(*ptr, ptr, 0); + if (errno || *ptr == start) { + semantic_error("'%s' is not a valid number.\n", what); + return -EINVAL; + } + return 0; +} + +/* + * Stuff 'lr' according to the line range described by 'arg'. + * The line range syntax is described by: + * + * SRC[:SLN[+NUM|-ELN]] + * FNC[:SLN[+NUM|-ELN]] + */ int parse_line_range_desc(const char *arg, struct line_range *lr) { - const char *ptr; - char *tmp; - /* - * <Syntax> - * SRC:SLN[+NUM|-ELN] - * FUNC[:SLN[+NUM|-ELN]] - */ - ptr = strchr(arg, ':'); - if (ptr) { - lr->start = (int)strtoul(ptr + 1, &tmp, 0); - if (*tmp == '+') { - lr->end = lr->start + (int)strtoul(tmp + 1, &tmp, 0); - lr->end--; /* - * Adjust the number of lines here. - * If the number of lines == 1, the - * the end of line should be equal to - * the start of line. - */ - } else if (*tmp == '-') - lr->end = (int)strtoul(tmp + 1, &tmp, 0); - else - lr->end = INT_MAX; + char *range, *name = strdup(arg); + int err; + + if (!name) + return -ENOMEM; + + lr->start = 0; + lr->end = INT_MAX; + + range = strchr(name, ':'); + if (range) { + *range++ = '\0'; + + err = parse_line_num(&range, &lr->start, "start line"); + if (err) + goto err; + + if (*range == '+' || *range == '-') { + const char c = *range++; + + err = parse_line_num(&range, &lr->end, "end line"); + if (err) + goto err; + + if (c == '+') { + lr->end += lr->start; + /* + * Adjust the number of lines here. + * If the number of lines == 1, the + * the end of line should be equal to + * the start of line. + */ + lr->end--; + } + } + pr_debug("Line range is %d to %d\n", lr->start, lr->end); + + err = -EINVAL; if (lr->start > lr->end) { semantic_error("Start line must be smaller" " than end line.\n"); - return -EINVAL; + goto err; } - if (*tmp != '\0') { - semantic_error("Tailing with invalid character '%d'.\n", - *tmp); - return -EINVAL; + if (*range != '\0') { + semantic_error("Tailing with invalid str '%s'.\n", range); + goto err; } - tmp = strndup(arg, (ptr - arg)); - } else { - tmp = strdup(arg); - lr->end = INT_MAX; } - if (tmp == NULL) - return -ENOMEM; - - if (strchr(tmp, '.')) - lr->file = tmp; + if (strchr(name, '.')) + lr->file = name; else - lr->function = tmp; + lr->function = name; return 0; +err: + free(name); + return err; } /* Check the name is good for event/group */ @@ -699,39 +742,40 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev) /* Exclusion check */ if (pp->lazy_line && pp->line) { - semantic_error("Lazy pattern can't be used with line number."); + semantic_error("Lazy pattern can't be used with" + " line number.\n"); return -EINVAL; } if (pp->lazy_line && pp->offset) { - semantic_error("Lazy pattern can't be used with offset."); + semantic_error("Lazy pattern can't be used with offset.\n"); return -EINVAL; } if (pp->line && pp->offset) { - semantic_error("Offset can't be used with line number."); + semantic_error("Offset can't be used with line number.\n"); return -EINVAL; } if (!pp->line && !pp->lazy_line && pp->file && !pp->function) { semantic_error("File always requires line number or " - "lazy pattern."); + "lazy pattern.\n"); return -EINVAL; } if (pp->offset && !pp->function) { - semantic_error("Offset requires an entry function."); + semantic_error("Offset requires an entry function.\n"); return -EINVAL; } if (pp->retprobe && !pp->function) { - semantic_error("Return probe requires an entry function."); + semantic_error("Return probe requires an entry function.\n"); return -EINVAL; } if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) { semantic_error("Offset/Line/Lazy pattern can't be used with " - "return probe."); + "return probe.\n"); return -EINVAL; } @@ -1005,7 +1049,7 @@ int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len) return tmp - buf; error: - pr_debug("Failed to synthesize perf probe argument: %s", + pr_debug("Failed to synthesize perf probe argument: %s\n", strerror(-ret)); return ret; } @@ -1033,13 +1077,13 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) goto error; } if (pp->file) { - len = strlen(pp->file) - 31; - if (len < 0) - len = 0; - tmp = strchr(pp->file + len, '/'); - if (!tmp) - tmp = pp->file + len; - ret = e_snprintf(file, 32, "@%s", tmp + 1); + tmp = pp->file; + len = strlen(tmp); + if (len > 30) { + tmp = strchr(pp->file + len - 30, '/'); + tmp = tmp ? tmp + 1 : pp->file + len - 30; + } + ret = e_snprintf(file, 32, "@%s", tmp); if (ret <= 0) goto error; } @@ -1055,7 +1099,7 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) return buf; error: - pr_debug("Failed to synthesize perf probe point: %s", + pr_debug("Failed to synthesize perf probe point: %s\n", strerror(-ret)); if (buf) free(buf); @@ -1796,7 +1840,7 @@ static int del_trace_probe_event(int fd, const char *group, ret = e_snprintf(buf, 128, "%s:%s", group, event); if (ret < 0) { - pr_err("Failed to copy event."); + pr_err("Failed to copy event.\n"); return ret; } diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index ddf4d4556321..ab83b6ac5d65 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -652,8 +652,8 @@ static_var: regs = get_arch_regstr(regn); if (!regs) { /* This should be a bug in DWARF or this tool */ - pr_warning("Mapping for DWARF register number %u " - "missing on this architecture.", regn); + pr_warning("Mapping for the register number %u " + "missing on this architecture.\n", regn); return -ERANGE; } @@ -699,13 +699,14 @@ static int convert_variable_type(Dwarf_Die *vr_die, if (ret != DW_TAG_pointer_type && ret != DW_TAG_array_type) { pr_warning("Failed to cast into string: " - "%s(%s) is not a pointer nor array.", + "%s(%s) is not a pointer nor array.\n", dwarf_diename(vr_die), dwarf_diename(&type)); return -EINVAL; } if (ret == DW_TAG_pointer_type) { if (die_get_real_type(&type, &type) == NULL) { - pr_warning("Failed to get a type information."); + pr_warning("Failed to get a type" + " information.\n"); return -ENOENT; } while (*ref_ptr) @@ -720,7 +721,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, if (!die_compare_name(&type, "char") && !die_compare_name(&type, "unsigned char")) { pr_warning("Failed to cast into string: " - "%s is not (unsigned) char *.", + "%s is not (unsigned) char *.\n", dwarf_diename(vr_die)); return -EINVAL; } @@ -830,8 +831,8 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, return -EINVAL; } if (field->name[0] == '[') { - pr_err("Semantic error: %s is not a pointor nor array.", - varname); + pr_err("Semantic error: %s is not a pointor" + " nor array.\n", varname); return -EINVAL; } if (field->ref) { @@ -978,7 +979,7 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwarf_Addr paddr, name = dwarf_diename(sp_die); if (name) { if (dwarf_entrypc(sp_die, &eaddr) != 0) { - pr_warning("Failed to get entry pc of %s\n", + pr_warning("Failed to get entry address of %s\n", dwarf_diename(sp_die)); return -ENOENT; } @@ -994,7 +995,7 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwarf_Addr paddr, if (retprobe) { if (eaddr != paddr) { pr_warning("Return probe must be on the head of" - " a real function\n"); + " a real function.\n"); return -EINVAL; } tp->retprobe = true; @@ -1033,7 +1034,7 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf) Dwarf_Frame *frame; if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 || dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) { - pr_warning("Failed to get CFA on 0x%jx\n", + pr_warning("Failed to get call frame on 0x%jx\n", (uintmax_t)pf->addr); return -ENOENT; } @@ -1060,7 +1061,7 @@ static int find_probe_point_by_line(struct probe_finder *pf) int ret = 0; if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found in this CU.\n"); + pr_warning("No source lines found.\n"); return -ENOENT; } @@ -1162,7 +1163,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) } if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found in this CU.\n"); + pr_warning("No source lines found.\n"); return -ENOENT; } @@ -1220,7 +1221,7 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data) else { /* Get probe address */ if (dwarf_entrypc(in_die, &addr) != 0) { - pr_warning("Failed to get entry pc of %s.\n", + pr_warning("Failed to get entry address of %s.\n", dwarf_diename(in_die)); param->retval = -ENOENT; return DWARF_CB_ABORT; @@ -1261,8 +1262,8 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data) param->retval = find_probe_point_lazy(sp_die, pf); else { if (dwarf_entrypc(sp_die, &pf->addr) != 0) { - pr_warning("Failed to get entry pc of %s.\n", - dwarf_diename(sp_die)); + pr_warning("Failed to get entry address of " + "%s.\n", dwarf_diename(sp_die)); param->retval = -ENOENT; return DWARF_CB_ABORT; } @@ -1304,7 +1305,7 @@ static int find_probes(int fd, struct probe_finder *pf) dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias); if (!dbg) { - pr_warning("No dwarf info found in the vmlinux - " + pr_warning("No debug information found in the vmlinux - " "please rebuild with CONFIG_DEBUG_INFO=y.\n"); return -EBADF; } @@ -1549,7 +1550,7 @@ int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt) /* Open the live linux kernel */ dbg = dwfl_init_live_kernel_dwarf(addr, &dwfl, &bias); if (!dbg) { - pr_warning("No dwarf info found in the vmlinux - " + pr_warning("No debug information found in the vmlinux - " "please rebuild with CONFIG_DEBUG_INFO=y.\n"); ret = -EINVAL; goto end; @@ -1559,7 +1560,8 @@ int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt) addr += bias; /* Find cu die */ if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr - bias, &cudie)) { - pr_warning("No CU DIE is found at %lx\n", addr); + pr_warning("Failed to find debug information for address %lx\n", + addr); ret = -EINVAL; goto end; } @@ -1684,7 +1686,7 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf) line_list__init(&lf->lr->line_list); if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) { - pr_warning("No source lines found in this CU.\n"); + pr_warning("No source lines found.\n"); return -ENOENT; } @@ -1809,7 +1811,7 @@ int find_line_range(int fd, struct line_range *lr) dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias); if (!dbg) { - pr_warning("No dwarf info found in the vmlinux - " + pr_warning("No debug information found in the vmlinux - " "please rebuild with CONFIG_DEBUG_INFO=y.\n"); return -EBADF; } diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index bba69d455699..beaefc3c1223 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -34,9 +34,9 @@ extern int find_available_vars_at(int fd, struct perf_probe_event *pev, bool externs); #include <dwarf.h> -#include <libdw.h> -#include <libdwfl.h> -#include <version.h> +#include <elfutils/libdw.h> +#include <elfutils/libdwfl.h> +#include <elfutils/version.h> struct probe_finder { struct perf_probe_event *pev; /* Target probe event */ diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b059dc50cc2d..93680818e244 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -1,5 +1,5 @@ /* - * trace-event-perl. Feed perf trace events to an embedded Perl interpreter. + * trace-event-perl. Feed perf script events to an embedded Perl interpreter. * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> * @@ -411,8 +411,8 @@ static int perl_generate_script(const char *outfile) return -1; } - fprintf(ofp, "# perf trace event handlers, " - "generated by perf trace -g perl\n"); + fprintf(ofp, "# perf script event handlers, " + "generated by perf script -g perl\n"); fprintf(ofp, "# Licensed under the terms of the GNU GPL" " License version 2\n\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 33a632523743..c6d99334bdfa 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -442,8 +442,8 @@ static int python_generate_script(const char *outfile) fprintf(stderr, "couldn't open %s\n", fname); return -1; } - fprintf(ofp, "# perf trace event handlers, " - "generated by perf trace -g python\n"); + fprintf(ofp, "# perf script event handlers, " + "generated by perf script -g python\n"); fprintf(ofp, "# Licensed under the terms of the GNU GPL" " License version 2\n\n"); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index fa9d652c2dc3..6fb4694d05fa 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -65,9 +65,49 @@ out_close: return -1; } +static void perf_session__id_header_size(struct perf_session *session) +{ + struct sample_data *data; + u64 sample_type = session->sample_type; + u16 size = 0; + + if (!session->sample_id_all) + goto out; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid) * 2; + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu) * 2; +out: + session->id_hdr_size = size; +} + +void perf_session__set_sample_id_all(struct perf_session *session, bool value) +{ + session->sample_id_all = value; + perf_session__id_header_size(session); +} + +void perf_session__set_sample_type(struct perf_session *session, u64 type) +{ + session->sample_type = type; +} + void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_header__sample_type(&self->header); + self->sample_id_all = perf_header__sample_id_all(&self->header); + perf_session__id_header_size(self); } int perf_session__create_kernel_maps(struct perf_session *self) @@ -85,7 +125,9 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self) machines__destroy_guest_kernel_maps(&self->machines); } -struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe) +struct perf_session *perf_session__new(const char *filename, int mode, + bool force, bool repipe, + struct perf_event_ops *ops) { size_t len = filename ? strlen(filename) + 1 : 0; struct perf_session *self = zalloc(sizeof(*self) + len); @@ -101,10 +143,20 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc INIT_LIST_HEAD(&self->dead_threads); self->hists_tree = RB_ROOT; self->last_match = NULL; - self->mmap_window = 32; + /* + * On 64bit we can mmap the data file in one go. No need for tiny mmap + * slices. On 32bit we use 32MB. + */ +#if BITS_PER_LONG == 64 + self->mmap_window = ULLONG_MAX; +#else + self->mmap_window = 32 * 1024 * 1024ULL; +#endif self->machines = RB_ROOT; self->repipe = repipe; - INIT_LIST_HEAD(&self->ordered_samples.samples_head); + INIT_LIST_HEAD(&self->ordered_samples.samples); + INIT_LIST_HEAD(&self->ordered_samples.sample_cache); + INIT_LIST_HEAD(&self->ordered_samples.to_free); machine__init(&self->host_machine, "", HOST_KERNEL_ID); if (mode == O_RDONLY) { @@ -120,6 +172,13 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc } perf_session__update_sample_type(self); + + if (ops && ops->ordering_requires_timestamps && + ops->ordered_samples && !self->sample_id_all) { + dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); + ops->ordered_samples = false; + } + out: return self; out_free: @@ -230,7 +289,15 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self, return syms; } +static int process_event_synth_stub(event_t *event __used, + struct perf_session *session __used) +{ + dump_printf(": unhandled!\n"); + return 0; +} + static int process_event_stub(event_t *event __used, + struct sample_data *sample __used, struct perf_session *session __used) { dump_printf(": unhandled!\n"); @@ -262,7 +329,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) if (handler->exit == NULL) handler->exit = process_event_stub; if (handler->lost == NULL) - handler->lost = process_event_stub; + handler->lost = event__process_lost; if (handler->read == NULL) handler->read = process_event_stub; if (handler->throttle == NULL) @@ -270,13 +337,13 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) if (handler->unthrottle == NULL) handler->unthrottle = process_event_stub; if (handler->attr == NULL) - handler->attr = process_event_stub; + handler->attr = process_event_synth_stub; if (handler->event_type == NULL) - handler->event_type = process_event_stub; + handler->event_type = process_event_synth_stub; if (handler->tracing_data == NULL) - handler->tracing_data = process_event_stub; + handler->tracing_data = process_event_synth_stub; if (handler->build_id == NULL) - handler->build_id = process_event_stub; + handler->build_id = process_event_synth_stub; if (handler->finished_round == NULL) { if (handler->ordered_samples) handler->finished_round = process_finished_round; @@ -386,33 +453,61 @@ static event__swap_op event__swap_ops[] = { struct sample_queue { u64 timestamp; - struct sample_event *event; + u64 file_offset; + event_t *event; struct list_head list; }; +static void perf_session_free_sample_buffers(struct perf_session *session) +{ + struct ordered_samples *os = &session->ordered_samples; + + while (!list_empty(&os->to_free)) { + struct sample_queue *sq; + + sq = list_entry(os->to_free.next, struct sample_queue, list); + list_del(&sq->list); + free(sq); + } +} + +static int perf_session_deliver_event(struct perf_session *session, + event_t *event, + struct sample_data *sample, + struct perf_event_ops *ops, + u64 file_offset); + static void flush_sample_queue(struct perf_session *s, struct perf_event_ops *ops) { - struct list_head *head = &s->ordered_samples.samples_head; - u64 limit = s->ordered_samples.next_flush; + struct ordered_samples *os = &s->ordered_samples; + struct list_head *head = &os->samples; struct sample_queue *tmp, *iter; + struct sample_data sample; + u64 limit = os->next_flush; + u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL; if (!ops->ordered_samples || !limit) return; list_for_each_entry_safe(iter, tmp, head, list) { if (iter->timestamp > limit) - return; + break; - if (iter == s->ordered_samples.last_inserted) - s->ordered_samples.last_inserted = NULL; + event__parse_sample(iter->event, s, &sample); + perf_session_deliver_event(s, iter->event, &sample, ops, + iter->file_offset); - ops->sample((event_t *)iter->event, s); - - s->ordered_samples.last_flush = iter->timestamp; + os->last_flush = iter->timestamp; list_del(&iter->list); - free(iter->event); - free(iter); + list_add(&iter->list, &os->sample_cache); + } + + if (list_empty(head)) { + os->last_sample = NULL; + } else if (last_ts <= limit) { + os->last_sample = + list_entry(head->prev, struct sample_queue, list); } } @@ -465,178 +560,265 @@ static int process_finished_round(event_t *event __used, return 0; } -static void __queue_sample_end(struct sample_queue *new, struct list_head *head) -{ - struct sample_queue *iter; - - list_for_each_entry_reverse(iter, head, list) { - if (iter->timestamp < new->timestamp) { - list_add(&new->list, &iter->list); - return; - } - } - - list_add(&new->list, head); -} - -static void __queue_sample_before(struct sample_queue *new, - struct sample_queue *iter, - struct list_head *head) -{ - list_for_each_entry_continue_reverse(iter, head, list) { - if (iter->timestamp < new->timestamp) { - list_add(&new->list, &iter->list); - return; - } - } - - list_add(&new->list, head); -} - -static void __queue_sample_after(struct sample_queue *new, - struct sample_queue *iter, - struct list_head *head) -{ - list_for_each_entry_continue(iter, head, list) { - if (iter->timestamp > new->timestamp) { - list_add_tail(&new->list, &iter->list); - return; - } - } - list_add_tail(&new->list, head); -} - /* The queue is ordered by time */ -static void __queue_sample_event(struct sample_queue *new, - struct perf_session *s) +static void __queue_event(struct sample_queue *new, struct perf_session *s) { - struct sample_queue *last_inserted = s->ordered_samples.last_inserted; - struct list_head *head = &s->ordered_samples.samples_head; + struct ordered_samples *os = &s->ordered_samples; + struct sample_queue *sample = os->last_sample; + u64 timestamp = new->timestamp; + struct list_head *p; + os->last_sample = new; - if (!last_inserted) { - __queue_sample_end(new, head); + if (!sample) { + list_add(&new->list, &os->samples); + os->max_timestamp = timestamp; return; } /* - * Most of the time the current event has a timestamp - * very close to the last event inserted, unless we just switched - * to another event buffer. Having a sorting based on a list and - * on the last inserted event that is close to the current one is - * probably more efficient than an rbtree based sorting. + * last_sample might point to some random place in the list as it's + * the last queued event. We expect that the new event is close to + * this. */ - if (last_inserted->timestamp >= new->timestamp) - __queue_sample_before(new, last_inserted, head); - else - __queue_sample_after(new, last_inserted, head); + if (sample->timestamp <= timestamp) { + while (sample->timestamp <= timestamp) { + p = sample->list.next; + if (p == &os->samples) { + list_add_tail(&new->list, &os->samples); + os->max_timestamp = timestamp; + return; + } + sample = list_entry(p, struct sample_queue, list); + } + list_add_tail(&new->list, &sample->list); + } else { + while (sample->timestamp > timestamp) { + p = sample->list.prev; + if (p == &os->samples) { + list_add(&new->list, &os->samples); + return; + } + sample = list_entry(p, struct sample_queue, list); + } + list_add(&new->list, &sample->list); + } } -static int queue_sample_event(event_t *event, struct sample_data *data, - struct perf_session *s) +#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue)) + +static int perf_session_queue_event(struct perf_session *s, event_t *event, + struct sample_data *data, u64 file_offset) { + struct ordered_samples *os = &s->ordered_samples; + struct list_head *sc = &os->sample_cache; u64 timestamp = data->time; struct sample_queue *new; + if (!timestamp || timestamp == ~0ULL) + return -ETIME; if (timestamp < s->ordered_samples.last_flush) { printf("Warning: Timestamp below last timeslice flush\n"); return -EINVAL; } - new = malloc(sizeof(*new)); - if (!new) - return -ENOMEM; + if (!list_empty(sc)) { + new = list_entry(sc->next, struct sample_queue, list); + list_del(&new->list); + } else if (os->sample_buffer) { + new = os->sample_buffer + os->sample_buffer_idx; + if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER) + os->sample_buffer = NULL; + } else { + os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new)); + if (!os->sample_buffer) + return -ENOMEM; + list_add(&os->sample_buffer->list, &os->to_free); + os->sample_buffer_idx = 2; + new = os->sample_buffer + 1; + } new->timestamp = timestamp; + new->file_offset = file_offset; + new->event = event; - new->event = malloc(event->header.size); - if (!new->event) { - free(new); - return -ENOMEM; - } + __queue_event(new, s); - memcpy(new->event, event, event->header.size); + return 0; +} - __queue_sample_event(new, s); - s->ordered_samples.last_inserted = new; +static void callchain__printf(struct sample_data *sample) +{ + unsigned int i; - if (new->timestamp > s->ordered_samples.max_timestamp) - s->ordered_samples.max_timestamp = new->timestamp; + printf("... chain: nr:%Lu\n", sample->callchain->nr); - return 0; + for (i = 0; i < sample->callchain->nr; i++) + printf("..... %2d: %016Lx\n", i, sample->callchain->ips[i]); } -static int perf_session__process_sample(event_t *event, struct perf_session *s, - struct perf_event_ops *ops) +static void perf_session__print_tstamp(struct perf_session *session, + event_t *event, + struct sample_data *sample) { - struct sample_data data; + if (event->header.type != PERF_RECORD_SAMPLE && + !session->sample_id_all) { + fputs("-1 -1 ", stdout); + return; + } - if (!ops->ordered_samples) - return ops->sample(event, s); + if ((session->sample_type & PERF_SAMPLE_CPU)) + printf("%u ", sample->cpu); - bzero(&data, sizeof(struct sample_data)); - event__parse_sample(event, s->sample_type, &data); + if (session->sample_type & PERF_SAMPLE_TIME) + printf("%Lu ", sample->time); +} - queue_sample_event(event, &data, s); +static void dump_event(struct perf_session *session, event_t *event, + u64 file_offset, struct sample_data *sample) +{ + if (!dump_trace) + return; - return 0; + printf("\n%#Lx [%#x]: event: %d\n", file_offset, event->header.size, + event->header.type); + + trace_event(event); + + if (sample) + perf_session__print_tstamp(session, event, sample); + + printf("%#Lx [%#x]: PERF_RECORD_%s", file_offset, event->header.size, + event__get_event_name(event->header.type)); } -static int perf_session__process_event(struct perf_session *self, - event_t *event, - struct perf_event_ops *ops, - u64 offset, u64 head) +static void dump_sample(struct perf_session *session, event_t *event, + struct sample_data *sample) { - trace_event(event); + if (!dump_trace) + return; - if (event->header.type < PERF_RECORD_HEADER_MAX) { - dump_printf("%#Lx [%#x]: PERF_RECORD_%s", - offset + head, event->header.size, - event__name[event->header.type]); - hists__inc_nr_events(&self->hists, event->header.type); - } + printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc, + sample->pid, sample->tid, sample->ip, sample->period); - if (self->header.needs_swap && event__swap_ops[event->header.type]) - event__swap_ops[event->header.type](event); + if (session->sample_type & PERF_SAMPLE_CALLCHAIN) + callchain__printf(sample); +} + +static int perf_session_deliver_event(struct perf_session *session, + event_t *event, + struct sample_data *sample, + struct perf_event_ops *ops, + u64 file_offset) +{ + dump_event(session, event, file_offset, sample); switch (event->header.type) { case PERF_RECORD_SAMPLE: - return perf_session__process_sample(event, self, ops); + dump_sample(session, event, sample); + return ops->sample(event, sample, session); case PERF_RECORD_MMAP: - return ops->mmap(event, self); + return ops->mmap(event, sample, session); case PERF_RECORD_COMM: - return ops->comm(event, self); + return ops->comm(event, sample, session); case PERF_RECORD_FORK: - return ops->fork(event, self); + return ops->fork(event, sample, session); case PERF_RECORD_EXIT: - return ops->exit(event, self); + return ops->exit(event, sample, session); case PERF_RECORD_LOST: - return ops->lost(event, self); + return ops->lost(event, sample, session); case PERF_RECORD_READ: - return ops->read(event, self); + return ops->read(event, sample, session); case PERF_RECORD_THROTTLE: - return ops->throttle(event, self); + return ops->throttle(event, sample, session); case PERF_RECORD_UNTHROTTLE: - return ops->unthrottle(event, self); + return ops->unthrottle(event, sample, session); + default: + ++session->hists.stats.nr_unknown_events; + return -1; + } +} + +static int perf_session__preprocess_sample(struct perf_session *session, + event_t *event, struct sample_data *sample) +{ + if (event->header.type != PERF_RECORD_SAMPLE || + !(session->sample_type & PERF_SAMPLE_CALLCHAIN)) + return 0; + + if (!ip_callchain__valid(sample->callchain, event)) { + pr_debug("call-chain problem with event, skipping it.\n"); + ++session->hists.stats.nr_invalid_chains; + session->hists.stats.total_invalid_chains += sample->period; + return -EINVAL; + } + return 0; +} + +static int perf_session__process_user_event(struct perf_session *session, event_t *event, + struct perf_event_ops *ops, u64 file_offset) +{ + dump_event(session, event, file_offset, NULL); + + /* These events are processed right away */ + switch (event->header.type) { case PERF_RECORD_HEADER_ATTR: - return ops->attr(event, self); + return ops->attr(event, session); case PERF_RECORD_HEADER_EVENT_TYPE: - return ops->event_type(event, self); + return ops->event_type(event, session); case PERF_RECORD_HEADER_TRACING_DATA: /* setup for reading amidst mmap */ - lseek(self->fd, offset + head, SEEK_SET); - return ops->tracing_data(event, self); + lseek(session->fd, file_offset, SEEK_SET); + return ops->tracing_data(event, session); case PERF_RECORD_HEADER_BUILD_ID: - return ops->build_id(event, self); + return ops->build_id(event, session); case PERF_RECORD_FINISHED_ROUND: - return ops->finished_round(event, self, ops); + return ops->finished_round(event, session, ops); default: - ++self->hists.stats.nr_unknown_events; - return -1; + return -EINVAL; } } +static int perf_session__process_event(struct perf_session *session, + event_t *event, + struct perf_event_ops *ops, + u64 file_offset) +{ + struct sample_data sample; + int ret; + + if (session->header.needs_swap && event__swap_ops[event->header.type]) + event__swap_ops[event->header.type](event); + + if (event->header.type >= PERF_RECORD_HEADER_MAX) + return -EINVAL; + + hists__inc_nr_events(&session->hists, event->header.type); + + if (event->header.type >= PERF_RECORD_USER_TYPE_START) + return perf_session__process_user_event(session, event, ops, file_offset); + + /* + * For all kernel events we get the sample data + */ + event__parse_sample(event, session, &sample); + + /* Preprocess sample records - precheck callchains */ + if (perf_session__preprocess_sample(session, event, &sample)) + return 0; + + if (ops->ordered_samples) { + ret = perf_session_queue_event(session, event, &sample, + file_offset); + if (ret != -ETIME) + return ret; + } + + return perf_session_deliver_event(session, event, &sample, ops, + file_offset); +} + void perf_event_header__bswap(struct perf_event_header *self) { self->type = bswap_32(self->type); @@ -656,21 +838,33 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se return thread; } -int do_read(int fd, void *buf, size_t size) +static void perf_session__warn_about_errors(const struct perf_session *session, + const struct perf_event_ops *ops) { - void *buf_start = buf; - - while (size) { - int ret = read(fd, buf, size); - - if (ret <= 0) - return ret; + if (ops->lost == event__process_lost && + session->hists.stats.total_lost != 0) { + ui__warning("Processed %Lu events and LOST %Lu!\n\n" + "Check IO/CPU overload!\n\n", + session->hists.stats.total_period, + session->hists.stats.total_lost); + } - size -= ret; - buf += ret; + if (session->hists.stats.nr_unknown_events != 0) { + ui__warning("Found %u unknown events!\n\n" + "Is this an older tool processing a perf.data " + "file generated by a more recent tool?\n\n" + "If that is not the case, consider " + "reporting to linux-kernel@vger.kernel.org.\n\n", + session->hists.stats.nr_unknown_events); } - return buf - buf_start; + if (session->hists.stats.nr_invalid_chains != 0) { + ui__warning("Found invalid callchains!\n\n" + "%u out of %u events were discarded for this reason.\n\n" + "Consider reporting to linux-kernel@vger.kernel.org.\n\n", + session->hists.stats.nr_invalid_chains, + session->hists.stats.nr_events[PERF_RECORD_SAMPLE]); + } } #define session_done() (*(volatile int *)(&session_done)) @@ -690,7 +884,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self, head = 0; more: - err = do_read(self->fd, &event, sizeof(struct perf_event_header)); + err = readn(self->fd, &event, sizeof(struct perf_event_header)); if (err <= 0) { if (err == 0) goto done; @@ -710,8 +904,7 @@ more: p += sizeof(struct perf_event_header); if (size - sizeof(struct perf_event_header)) { - err = do_read(self->fd, p, - size - sizeof(struct perf_event_header)); + err = readn(self->fd, p, size - sizeof(struct perf_event_header)); if (err <= 0) { if (err == 0) { pr_err("unexpected end of event stream\n"); @@ -724,8 +917,7 @@ more: } if (size == 0 || - (skip = perf_session__process_event(self, &event, ops, - 0, head)) < 0) { + (skip = perf_session__process_event(self, &event, ops, head)) < 0) { dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n", head, event.header.size, event.header.type); /* @@ -740,9 +932,6 @@ more: head += size; - dump_printf("\n%#Lx [%#x]: event: %d\n", - head, event.header.size, event.header.type); - if (skip > 0) head += skip; @@ -751,82 +940,91 @@ more: done: err = 0; out_err: + perf_session__warn_about_errors(self, ops); + perf_session_free_sample_buffers(self); return err; } -int __perf_session__process_events(struct perf_session *self, +int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, u64 file_size, struct perf_event_ops *ops) { - int err, mmap_prot, mmap_flags; - u64 head, shift; - u64 offset = 0; - size_t page_size; + u64 head, page_offset, file_offset, file_pos, progress_next; + int err, mmap_prot, mmap_flags, map_idx = 0; + struct ui_progress *progress; + size_t page_size, mmap_size; + char *buf, *mmaps[8]; event_t *event; uint32_t size; - char *buf; - struct ui_progress *progress = ui_progress__new("Processing events...", - self->size); - if (progress == NULL) - return -1; perf_event_ops__fill_defaults(ops); page_size = sysconf(_SC_PAGESIZE); - head = data_offset; - shift = page_size * (head / page_size); - offset += shift; - head -= shift; + page_offset = page_size * (data_offset / page_size); + file_offset = page_offset; + head = data_offset - page_offset; + + if (data_offset + data_size < file_size) + file_size = data_offset + data_size; + + progress_next = file_size / 16; + progress = ui_progress__new("Processing events...", file_size); + if (progress == NULL) + return -1; + + mmap_size = session->mmap_window; + if (mmap_size > file_size) + mmap_size = file_size; + + memset(mmaps, 0, sizeof(mmaps)); mmap_prot = PROT_READ; mmap_flags = MAP_SHARED; - if (self->header.needs_swap) { + if (session->header.needs_swap) { mmap_prot |= PROT_WRITE; mmap_flags = MAP_PRIVATE; } remap: - buf = mmap(NULL, page_size * self->mmap_window, mmap_prot, - mmap_flags, self->fd, offset); + buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd, + file_offset); if (buf == MAP_FAILED) { pr_err("failed to mmap file\n"); err = -errno; goto out_err; } + mmaps[map_idx] = buf; + map_idx = (map_idx + 1) & (ARRAY_SIZE(mmaps) - 1); + file_pos = file_offset + head; more: event = (event_t *)(buf + head); - ui_progress__update(progress, offset); - if (self->header.needs_swap) + if (session->header.needs_swap) perf_event_header__bswap(&event->header); size = event->header.size; if (size == 0) size = 8; - if (head + event->header.size >= page_size * self->mmap_window) { - int munmap_ret; - - shift = page_size * (head / page_size); - - munmap_ret = munmap(buf, page_size * self->mmap_window); - assert(munmap_ret == 0); + if (head + event->header.size >= mmap_size) { + if (mmaps[map_idx]) { + munmap(mmaps[map_idx], mmap_size); + mmaps[map_idx] = NULL; + } - offset += shift; - head -= shift; + page_offset = page_size * (head / page_size); + file_offset += page_offset; + head -= page_offset; goto remap; } size = event->header.size; - dump_printf("\n%#Lx [%#x]: event: %d\n", - offset + head, event->header.size, event->header.type); - if (size == 0 || - perf_session__process_event(self, event, ops, offset, head) < 0) { + perf_session__process_event(session, event, ops, file_pos) < 0) { dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n", - offset + head, event->header.size, + file_offset + head, event->header.size, event->header.type); /* * assume we lost track of the stream, check alignment, and @@ -839,19 +1037,24 @@ more: } head += size; + file_pos += size; - if (offset + head >= data_offset + data_size) - goto done; + if (file_pos >= progress_next) { + progress_next += file_size / 16; + ui_progress__update(progress, file_pos); + } - if (offset + head < file_size) + if (file_pos < file_size) goto more; -done: + err = 0; /* do the final flush for ordered samples */ - self->ordered_samples.next_flush = ULLONG_MAX; - flush_sample_queue(self, ops); + session->ordered_samples.next_flush = ULLONG_MAX; + flush_sample_queue(session, ops); out_err: ui_progress__delete(progress); + perf_session__warn_about_errors(session, ops); + perf_session_free_sample_buffers(session); return err; } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 9fa0fc2a863f..decd83f274fd 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -17,8 +17,12 @@ struct ordered_samples { u64 last_flush; u64 next_flush; u64 max_timestamp; - struct list_head samples_head; - struct sample_queue *last_inserted; + struct list_head samples; + struct list_head sample_cache; + struct list_head to_free; + struct sample_queue *sample_buffer; + struct sample_queue *last_sample; + int sample_buffer_idx; }; struct perf_session { @@ -42,6 +46,8 @@ struct perf_session { int fd; bool fd_pipe; bool repipe; + bool sample_id_all; + u16 id_hdr_size; int cwdlen; char *cwd; struct ordered_samples ordered_samples; @@ -50,7 +56,9 @@ struct perf_session { struct perf_event_ops; -typedef int (*event_op)(event_t *self, struct perf_session *session); +typedef int (*event_op)(event_t *self, struct sample_data *sample, + struct perf_session *session); +typedef int (*event_synth_op)(event_t *self, struct perf_session *session); typedef int (*event_op2)(event_t *self, struct perf_session *session, struct perf_event_ops *ops); @@ -63,16 +71,19 @@ struct perf_event_ops { lost, read, throttle, - unthrottle, - attr, + unthrottle; + event_synth_op attr, event_type, tracing_data, build_id; event_op2 finished_round; bool ordered_samples; + bool ordering_requires_timestamps; }; -struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe); +struct perf_session *perf_session__new(const char *filename, int mode, + bool force, bool repipe, + struct perf_event_ops *ops); void perf_session__delete(struct perf_session *self); void perf_event_header__bswap(struct perf_event_header *self); @@ -98,8 +109,9 @@ void mem_bswap_64(void *src, int byte_size); int perf_session__create_kernel_maps(struct perf_session *self); -int do_read(int fd, void *buf, size_t size); void perf_session__update_sample_type(struct perf_session *self); +void perf_session__set_sample_id_all(struct perf_session *session, bool value); +void perf_session__set_sample_type(struct perf_session *session, u64 type); void perf_session__remove_thread(struct perf_session *self, struct thread *th); static inline diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index b62a553cc67d..f44fa541d56e 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -170,7 +170,7 @@ static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, return repsep_snprintf(bf, size, "%-*s", width, dso_name); } - return repsep_snprintf(bf, size, "%*Lx", width, self->ip); + return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); } /* --sort symbol */ @@ -196,7 +196,7 @@ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, if (verbose) { char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!'; - ret += repsep_snprintf(bf, size, "%*Lx %c ", + ret += repsep_snprintf(bf, size, "%-#*llx %c ", BITS_PER_LONG / 4, self->ip, o); } @@ -205,7 +205,7 @@ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, ret += repsep_snprintf(bf + ret, size - ret, "%s", self->ms.sym->name); else - ret += repsep_snprintf(bf + ret, size - ret, "%*Lx", + ret += repsep_snprintf(bf + ret, size - ret, "%-#*llx", BITS_PER_LONG / 4, self->ip); return ret; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 439ab947daf4..15ccfba8cdf8 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -22,6 +22,10 @@ #include <limits.h> #include <sys/utsname.h> +#ifndef KSYM_NAME_LEN +#define KSYM_NAME_LEN 128 +#endif + #ifndef NT_GNU_BUILD_ID #define NT_GNU_BUILD_ID 3 #endif @@ -41,6 +45,7 @@ struct symbol_conf symbol_conf = { .exclude_other = true, .use_modules = true, .try_vmlinux_path = true, + .symfs = "", }; int dso__name_len(const struct dso *self) @@ -92,7 +97,7 @@ static void symbols__fixup_end(struct rb_root *self) prev = curr; curr = rb_entry(nd, struct symbol, rb_node); - if (prev->end == prev->start) + if (prev->end == prev->start && prev->end != curr->start) prev->end = curr->start - 1; } @@ -121,7 +126,7 @@ static void __map_groups__fixup_end(struct map_groups *self, enum map_type type) * We still haven't the actual symbols, so guess the * last map final address. */ - curr->end = ~0UL; + curr->end = ~0ULL; } static void map_groups__fixup_end(struct map_groups *self) @@ -425,16 +430,25 @@ size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp) int kallsyms__parse(const char *filename, void *arg, int (*process_symbol)(void *arg, const char *name, - char type, u64 start)) + char type, u64 start, u64 end)) { char *line = NULL; size_t n; - int err = 0; + int err = -1; + u64 prev_start = 0; + char prev_symbol_type = 0; + char *prev_symbol_name; FILE *file = fopen(filename, "r"); if (file == NULL) goto out_failure; + prev_symbol_name = malloc(KSYM_NAME_LEN); + if (prev_symbol_name == NULL) + goto out_close; + + err = 0; + while (!feof(file)) { u64 start; int line_len, len; @@ -454,14 +468,33 @@ int kallsyms__parse(const char *filename, void *arg, continue; symbol_type = toupper(line[len]); - symbol_name = line + len + 2; + len += 2; + symbol_name = line + len; + len = line_len - len; - err = process_symbol(arg, symbol_name, symbol_type, start); - if (err) + if (len >= KSYM_NAME_LEN) { + err = -1; break; + } + + if (prev_symbol_type) { + u64 end = start; + if (end != prev_start) + --end; + err = process_symbol(arg, prev_symbol_name, + prev_symbol_type, prev_start, end); + if (err) + break; + } + + memcpy(prev_symbol_name, symbol_name, len + 1); + prev_symbol_type = symbol_type; + prev_start = start; } + free(prev_symbol_name); free(line); +out_close: fclose(file); return err; @@ -483,7 +516,7 @@ static u8 kallsyms2elf_type(char type) } static int map__process_kallsym_symbol(void *arg, const char *name, - char type, u64 start) + char type, u64 start, u64 end) { struct symbol *sym; struct process_kallsyms_args *a = arg; @@ -492,11 +525,8 @@ static int map__process_kallsym_symbol(void *arg, const char *name, if (!symbol_type__is_a(type, a->map->type)) return 0; - /* - * Will fix up the end later, when we have all symbols sorted. - */ - sym = symbol__new(start, 0, kallsyms2elf_type(type), name); - + sym = symbol__new(start, end - start + 1, + kallsyms2elf_type(type), name); if (sym == NULL) return -ENOMEM; /* @@ -649,7 +679,6 @@ int dso__load_kallsyms(struct dso *self, const char *filename, if (dso__load_all_kallsyms(self, filename, map) < 0) return -1; - symbols__fixup_end(&self->symbols[map->type]); if (self->kernel == DSO_TYPE_GUEST_KERNEL) self->origin = DSO__ORIG_GUEST_KERNEL; else @@ -839,8 +868,11 @@ static int dso__synthesize_plt_symbols(struct dso *self, struct map *map, char sympltname[1024]; Elf *elf; int nr = 0, symidx, fd, err = 0; + char name[PATH_MAX]; - fd = open(self->long_name, O_RDONLY); + snprintf(name, sizeof(name), "%s%s", + symbol_conf.symfs, self->long_name); + fd = open(name, O_RDONLY); if (fd < 0) goto out; @@ -1452,16 +1484,19 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) self->origin++) { switch (self->origin) { case DSO__ORIG_BUILD_ID_CACHE: - if (dso__build_id_filename(self, name, size) == NULL) + /* skip the locally configured cache if a symfs is given */ + if (symbol_conf.symfs[0] || + (dso__build_id_filename(self, name, size) == NULL)) { continue; + } break; case DSO__ORIG_FEDORA: - snprintf(name, size, "/usr/lib/debug%s.debug", - self->long_name); + snprintf(name, size, "%s/usr/lib/debug%s.debug", + symbol_conf.symfs, self->long_name); break; case DSO__ORIG_UBUNTU: - snprintf(name, size, "/usr/lib/debug%s", - self->long_name); + snprintf(name, size, "%s/usr/lib/debug%s", + symbol_conf.symfs, self->long_name); break; case DSO__ORIG_BUILDID: { char build_id_hex[BUILD_ID_SIZE * 2 + 1]; @@ -1473,19 +1508,26 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) sizeof(self->build_id), build_id_hex); snprintf(name, size, - "/usr/lib/debug/.build-id/%.2s/%s.debug", - build_id_hex, build_id_hex + 2); + "%s/usr/lib/debug/.build-id/%.2s/%s.debug", + symbol_conf.symfs, build_id_hex, build_id_hex + 2); } break; case DSO__ORIG_DSO: - snprintf(name, size, "%s", self->long_name); + snprintf(name, size, "%s%s", + symbol_conf.symfs, self->long_name); break; case DSO__ORIG_GUEST_KMODULE: if (map->groups && map->groups->machine) root_dir = map->groups->machine->root_dir; else root_dir = ""; - snprintf(name, size, "%s%s", root_dir, self->long_name); + snprintf(name, size, "%s%s%s", symbol_conf.symfs, + root_dir, self->long_name); + break; + + case DSO__ORIG_KMODULE: + snprintf(name, size, "%s%s", symbol_conf.symfs, + self->long_name); break; default: @@ -1784,17 +1826,20 @@ int dso__load_vmlinux(struct dso *self, struct map *map, const char *vmlinux, symbol_filter_t filter) { int err = -1, fd; + char symfs_vmlinux[PATH_MAX]; - fd = open(vmlinux, O_RDONLY); + snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s/%s", + symbol_conf.symfs, vmlinux); + fd = open(symfs_vmlinux, O_RDONLY); if (fd < 0) return -1; dso__set_loaded(self, map->type); - err = dso__load_sym(self, map, vmlinux, fd, filter, 0, 0); + err = dso__load_sym(self, map, symfs_vmlinux, fd, filter, 0, 0); close(fd); if (err > 0) - pr_debug("Using %s for symbols\n", vmlinux); + pr_debug("Using %s for symbols\n", symfs_vmlinux); return err; } @@ -1836,8 +1881,8 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, const char *kallsyms_filename = NULL; char *kallsyms_allocated_filename = NULL; /* - * Step 1: if the user specified a vmlinux filename, use it and only - * it, reporting errors to the user if it cannot be used. + * Step 1: if the user specified a kallsyms or vmlinux filename, use + * it and only it, reporting errors to the user if it cannot be used. * * For instance, try to analyse an ARM perf.data file _without_ a * build-id, or if the user specifies the wrong path to the right @@ -1850,6 +1895,11 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, * validation in dso__load_vmlinux and will bail out if they don't * match. */ + if (symbol_conf.kallsyms_name != NULL) { + kallsyms_filename = symbol_conf.kallsyms_name; + goto do_kallsyms; + } + if (symbol_conf.vmlinux_name != NULL) { err = dso__load_vmlinux(self, map, symbol_conf.vmlinux_name, filter); @@ -1867,6 +1917,10 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, goto out_fixup; } + /* do not try local files if a symfs was given */ + if (symbol_conf.symfs[0] != 0) + return -1; + /* * Say the kernel DSO was created when processing the build-id header table, * we have a build-id, so check if it is the same as the running kernel, @@ -2136,7 +2190,7 @@ struct process_args { }; static int symbol__in_kernel(void *arg, const char *name, - char type __used, u64 start) + char type __used, u64 start, u64 end __used) { struct process_args *args = arg; @@ -2257,9 +2311,6 @@ static int vmlinux_path__init(void) struct utsname uts; char bf[PATH_MAX]; - if (uname(&uts) < 0) - return -1; - vmlinux_path = malloc(sizeof(char *) * 5); if (vmlinux_path == NULL) return -1; @@ -2272,6 +2323,14 @@ static int vmlinux_path__init(void) if (vmlinux_path[vmlinux_path__nr_entries] == NULL) goto out_fail; ++vmlinux_path__nr_entries; + + /* only try running kernel version if no symfs was given */ + if (symbol_conf.symfs[0] != 0) + return 0; + + if (uname(&uts) < 0) + return -1; + snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release); vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); if (vmlinux_path[vmlinux_path__nr_entries] == NULL) @@ -2331,6 +2390,8 @@ static int setup_list(struct strlist **list, const char *list_str, int symbol__init(void) { + const char *symfs; + if (symbol_conf.initialized) return 0; @@ -2359,6 +2420,18 @@ int symbol__init(void) symbol_conf.sym_list_str, "symbol") < 0) goto out_free_comm_list; + /* + * A path to symbols of "/" is identical to "" + * reset here for simplicity. + */ + symfs = realpath(symbol_conf.symfs, NULL); + if (symfs == NULL) + symfs = symbol_conf.symfs; + if (strcmp(symfs, "/") == 0) + symbol_conf.symfs = ""; + if (symfs != symbol_conf.symfs) + free((void *)symfs); + symbol_conf.initialized = true; return 0; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 6c6eafdb932d..670cd1c88f54 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -72,6 +72,7 @@ struct symbol_conf { show_cpu_utilization, initialized; const char *vmlinux_name, + *kallsyms_name, *source_prefix, *field_sep; const char *default_guest_vmlinux_name, @@ -85,6 +86,7 @@ struct symbol_conf { struct strlist *dso_list, *comm_list, *sym_list; + const char *symfs; }; extern struct symbol_conf symbol_conf; @@ -215,7 +217,7 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits); int build_id__sprintf(const u8 *self, int len, char *bf); int kallsyms__parse(const char *filename, void *arg, int (*process_symbol)(void *arg, const char *name, - char type, u64 start)); + char type, u64 start, u64 end)); void machine__destroy_kernel_maps(struct machine *self); int __machine__create_kernel_maps(struct machine *self, struct dso *kernel); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 8c72d888e449..00f4eade2e3e 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -16,35 +16,50 @@ static int filter(const struct dirent *dir) return 1; } -int find_all_tid(int pid, pid_t ** all_tid) +struct thread_map *thread_map__new_by_pid(pid_t pid) { + struct thread_map *threads; char name[256]; int items; struct dirent **namelist = NULL; - int ret = 0; int i; sprintf(name, "/proc/%d/task", pid); items = scandir(name, &namelist, filter, NULL); if (items <= 0) - return -ENOENT; - *all_tid = malloc(sizeof(pid_t) * items); - if (!*all_tid) { - ret = -ENOMEM; - goto failure; - } - - for (i = 0; i < items; i++) - (*all_tid)[i] = atoi(namelist[i]->d_name); + return NULL; - ret = items; + threads = malloc(sizeof(*threads) + sizeof(pid_t) * items); + if (threads != NULL) { + for (i = 0; i < items; i++) + threads->map[i] = atoi(namelist[i]->d_name); + threads->nr = items; + } -failure: for (i=0; i<items; i++) free(namelist[i]); free(namelist); - return ret; + return threads; +} + +struct thread_map *thread_map__new_by_tid(pid_t tid) +{ + struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t)); + + if (threads != NULL) { + threads->map[0] = tid; + threads->nr = 1; + } + + return threads; +} + +struct thread_map *thread_map__new(pid_t pid, pid_t tid) +{ + if (pid != -1) + return thread_map__new_by_pid(pid); + return thread_map__new_by_tid(tid); } static struct thread *thread__new(pid_t pid) diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 688500ff826f..d7574101054a 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -18,11 +18,24 @@ struct thread { int comm_len; }; +struct thread_map { + int nr; + int map[]; +}; + struct perf_session; void thread__delete(struct thread *self); -int find_all_tid(int pid, pid_t ** all_tid); +struct thread_map *thread_map__new_by_pid(pid_t pid); +struct thread_map *thread_map__new_by_tid(pid_t tid); +struct thread_map *thread_map__new(pid_t pid, pid_t tid); + +static inline void thread_map__delete(struct thread_map *threads) +{ + free(threads); +} + int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index b1572601286c..35729f4c40cb 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -34,11 +34,13 @@ #include <ctype.h> #include <errno.h> #include <stdbool.h> +#include <linux/list.h> #include <linux/kernel.h> #include "../perf.h" #include "trace-event.h" #include "debugfs.h" +#include "evsel.h" #define VERSION "0.5" @@ -469,16 +471,17 @@ out: } static struct tracepoint_path * -get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) +get_tracepoints_path(struct list_head *pattrs) { struct tracepoint_path path, *ppath = &path; - int i, nr_tracepoints = 0; + struct perf_evsel *pos; + int nr_tracepoints = 0; - for (i = 0; i < nb_events; i++) { - if (pattrs[i].type != PERF_TYPE_TRACEPOINT) + list_for_each_entry(pos, pattrs, node) { + if (pos->attr.type != PERF_TYPE_TRACEPOINT) continue; ++nr_tracepoints; - ppath->next = tracepoint_id_to_path(pattrs[i].config); + ppath->next = tracepoint_id_to_path(pos->attr.config); if (!ppath->next) die("%s\n", "No memory to alloc tracepoints list"); ppath = ppath->next; @@ -487,21 +490,21 @@ get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) return nr_tracepoints > 0 ? path.next : NULL; } -bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events) +bool have_tracepoints(struct list_head *pattrs) { - int i; + struct perf_evsel *pos; - for (i = 0; i < nb_events; i++) - if (pattrs[i].type == PERF_TYPE_TRACEPOINT) + list_for_each_entry(pos, pattrs, node) + if (pos->attr.type == PERF_TYPE_TRACEPOINT) return true; return false; } -int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) +int read_tracing_data(int fd, struct list_head *pattrs) { char buf[BUFSIZ]; - struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events); + struct tracepoint_path *tps = get_tracepoints_path(pattrs); /* * What? No tracepoints? No sense writing anything here, bail out. @@ -545,14 +548,13 @@ int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) return 0; } -ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs, - int nb_events) +ssize_t read_tracing_data_size(int fd, struct list_head *pattrs) { ssize_t size; int err = 0; calc_data_size = 1; - err = read_tracing_data(fd, pattrs, nb_events); + err = read_tracing_data(fd, pattrs); size = calc_data_size - 1; calc_data_size = 0; diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index b3e86b1e4444..b5f12ca24d99 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -262,9 +262,8 @@ raw_field_value(struct event *event, const char *name, void *data); void *raw_field_ptr(struct event *event, const char *name, void *data); unsigned long long eval_flag(const char *flag); -int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events); -ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs, - int nb_events); +int read_tracing_data(int fd, struct list_head *pattrs); +ssize_t read_tracing_data_size(int fd, struct list_head *pattrs); /* taken from kernel/trace/trace.h */ enum trace_flag_type { diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c index 056c69521a38..7b5a8926624e 100644 --- a/tools/perf/util/ui/util.c +++ b/tools/perf/util/ui/util.c @@ -104,10 +104,24 @@ out_destroy_form: return rc; } -static const char yes[] = "Yes", no[] = "No"; +static const char yes[] = "Yes", no[] = "No", + warning_str[] = "Warning!", ok[] = "Ok"; bool ui__dialog_yesno(const char *msg) { /* newtWinChoice should really be accepting const char pointers... */ return newtWinChoice(NULL, (char *)yes, (char *)no, (char *)msg) == 1; } + +void ui__warning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + if (use_browser > 0) + newtWinMessagev((char *)warning_str, (char *)ok, + (char *)format, args); + else + vfprintf(stderr, format, args); + va_end(args); +} diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 214265674ddd..5b3ea49aa63e 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -114,3 +114,20 @@ unsigned long convert_unit(unsigned long value, char *unit) return value; } + +int readn(int fd, void *buf, size_t n) +{ + void *buf_start = buf; + + while (n) { + int ret = read(fd, buf, n); + + if (ret <= 0) + return ret; + + n -= ret; + buf += ret; + } + + return buf - buf_start; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 7562707ddd1c..e833f26f3bfc 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -265,6 +265,7 @@ void argv_free(char **argv); bool strglobmatch(const char *str, const char *pat); bool strlazymatch(const char *str, const char *pat); unsigned long convert_unit(unsigned long value, char *unit); +int readn(int fd, void *buf, size_t size); #define _STR(x) #x #define STR(x) _STR(x) diff --git a/tools/perf/util/xyarray.c b/tools/perf/util/xyarray.c new file mode 100644 index 000000000000..22afbf6c536a --- /dev/null +++ b/tools/perf/util/xyarray.c @@ -0,0 +1,20 @@ +#include "xyarray.h" +#include "util.h" + +struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size) +{ + size_t row_size = ylen * entry_size; + struct xyarray *xy = zalloc(sizeof(*xy) + xlen * row_size); + + if (xy != NULL) { + xy->entry_size = entry_size; + xy->row_size = row_size; + } + + return xy; +} + +void xyarray__delete(struct xyarray *xy) +{ + free(xy); +} diff --git a/tools/perf/util/xyarray.h b/tools/perf/util/xyarray.h new file mode 100644 index 000000000000..c488a07275dd --- /dev/null +++ b/tools/perf/util/xyarray.h @@ -0,0 +1,20 @@ +#ifndef _PERF_XYARRAY_H_ +#define _PERF_XYARRAY_H_ 1 + +#include <sys/types.h> + +struct xyarray { + size_t row_size; + size_t entry_size; + char contents[]; +}; + +struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size); +void xyarray__delete(struct xyarray *xy); + +static inline void *xyarray__entry(struct xyarray *xy, int x, int y) +{ + return &xy->contents[x * xy->row_size + y * xy->entry_size]; +} + +#endif /* _PERF_XYARRAY_H_ */ |